URL https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [sh/] [lib/] [memcpy-sh4.S] - Blame information for rev 17

Go to most recent revision | Details | Compare with Previous | View Log

Line No.	Rev	Author	Line
1	3	xianfeng	`/*`
2			`* "memcpy" implementation of SuperH`
3			`*`
4			`* Copyright (C) 1999 Niibe Yutaka`
5			`* Copyright (c) 2002 STMicroelectronics Ltd`
6			`* Modified from memcpy.S and micro-optimised for SH4`
7			`* Stuart Menefy (stuart.menefy@st.com)`
8			`*`
9			`*/`
10			`#include <linux/linkage.h>`
11
12			`/*`
13			`* void *memcpy(void *dst, const void *src, size_t n);`
14			`*`
15			`* It is assumed that there is no overlap between src and dst.`
16			`* If there is an overlap, then the results are undefined.`
17			`*/`
18
19			`! .Lcase1: bulk copy for the case (src - dst) mod 4 == 1.`
20			`! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.`
21			`! Entry: r0 = end of the dst bytes still to copy (long aligned), r4 = dst, r5 = src - dst.`
22
23			`! Size is 16 or greater, and may have trailing bytes`
24
25			`.balign 32`
26			`.Lcase1:`
27			`! Read a long word and write a long word at once`
28			`! At the start of each iteration, r7 contains last long load`
29			`add #-1,r5 ! 79 EX ! r0+r5 is now long aligned`
30			`mov r4,r2 ! 5 MT (0 cycles latency)`
31
32			`mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)`
33			`add #-4,r5 ! 50 EX ! r0+r5 now addresses the long below the one held in r7`
34
35			`add #7,r2 ! 79 EX ! r2 = dst + 7, lower bound for the long loop`
36			`!`
37			`#ifdef CONFIG_CPU_LITTLE_ENDIAN`
38			`! 6 cycles, 4 bytes per iteration`
39			`3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK`
40			`mov r7, r3 ! 5 MT (latency=0) ! RQPO`
41
42			`cmp/hi r2,r0 ! 57 MT ! T = 1 while r0 (before the store) is above dst + 7`
43			`shll16 r3 ! 103 EX`
44
45			`mov r1,r6 ! 5 MT (latency=0)`
46			`shll8 r3 ! 102 EX ! Oxxx`
47
48			`shlr8 r6 ! 106 EX ! xNML`
49			`mov r1, r7 ! 5 MT (latency=0) ! carry this load into the next iteration`
50
51			`or r6,r3 ! 82 EX ! ONML`
52			`bt/s 3b ! 109 BR`
53
54			`mov.l r3,@-r0 ! 30 LS ! (delay slot) executed on every pass`
55			`#else`
56			`3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN`
57			`mov r7,r3 ! 5 MT (latency=0) ! OPQR`
58
59			`cmp/hi r2,r0 ! 57 MT ! T = 1 while r0 (before the store) is above dst + 7`
60			`shlr16 r3 ! 107 EX`
61
62			`shlr8 r3 ! 106 EX ! xxxO`
63			`mov r1,r6 ! 5 MT (latency=0)`
64
65			`shll8 r6 ! 102 EX ! LMNx`
66			`mov r1,r7 ! 5 MT (latency=0) ! carry this load into the next iteration`
67
68			`or r6,r3 ! 82 EX ! LMNO`
69			`bt/s 3b ! 109 BR`
70
71			`mov.l r3,@-r0 ! 30 LS ! (delay slot) executed on every pass`
72			`#endif`
73			`! Finally, copy a byte at once, if necessary`
74
75			`add #4,r5 ! 50 EX ! r5 = (src - dst) - 1, as needed by the byte loop`
76			`cmp/eq r4,r0 ! 54 MT ! nothing left when r0 is back at dst`
77
78			`add #-6,r2 ! 50 EX ! r2 = dst + 1`
79			`bt 9f ! 109 BR`
80
81			`8: cmp/hi r2,r0 ! 57 MT`
82			`mov.b @(r0,r5),r1 ! 20 LS (latency=2)`
83
84			`bt/s 8b ! 109 BR`
85
86			`mov.b r1,@-r0 ! 29 LS ! (delay slot)`
87
88			`9: rts`
89			`nop ! (delay slot)`
90
91
92			`! .Lcase3: bulk copy for the case (src - dst) mod 4 == 3.`
93			`! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...`
94			`! Entry: r0 = end of the dst bytes still to copy (long aligned), r4 = dst, r5 = src - dst.`
95
96			`! Size is 16 or greater, and may have trailing bytes`
97
98			`.balign 32`
99			`.Lcase3:`
100			`! Read a long word and write a long word at once`
101			`! At the start of each iteration, r7 contains last long load`
102			`add #-3,r5 ! 79 EX ! r0+r5 is now long aligned`
103			`mov r4,r2 ! 5 MT (0 cycles latency)`
104
105			`mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)`
106			`add #-4,r5 ! 50 EX ! r0+r5 now addresses the long below the one held in r7`
107
108			`add #7,r2 ! 79 EX ! r2 = dst + 7, lower bound for the long loop`
109			`!`
110			`#ifdef CONFIG_CPU_LITTLE_ENDIAN`
111			`! 6 cycles, 4 bytes per iteration`
112			`3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK`
113			`mov r7, r3 ! 5 MT (latency=0) ! RQPO`
114
115			`cmp/hi r2,r0 ! 57 MT`
116			`shll8 r3 ! 102 EX ! QPOx`
117
118			`mov r1,r6 ! 5 MT (latency=0)`
119			`shlr16 r6 ! 107 EX`
120
121			`shlr8 r6 ! 106 EX ! xxxN`
122			`mov r1, r7 ! 5 MT (latency=0) ! carry this load into the next iteration`
123
124			`or r6,r3 ! 82 EX ! QPON`
125			`bt/s 3b ! 109 BR`
126
127			`mov.l r3,@-r0 ! 30 LS ! (delay slot) executed on every pass`
128			`#else`
129			`3: mov r7,r3 ! OPQR (r7 = previous load; primed before the loop, so the first pass is valid too)`
130			`shlr8 r3 ! xOPQ`
131			`mov.l @(r0,r5),r7 ! KLMN (kept in r7 for the next iteration)`
132			`mov r7,r6`
133			`shll16 r6`
134			`shll8 r6 ! Nxxx`
135			`or r6,r3 ! NOPQ`
136			`cmp/hi r2,r0 ! T = 1 while r0 (before the store) is above dst + 7`
137			`bt/s 3b`
138			`mov.l r3,@-r0 ! (delay slot) executed on every pass`
139			`#endif`
140
141			`! Finally, copy a byte at once, if necessary`
142
143			`add #6,r5 ! 50 EX ! r5 = (src - dst) - 1, as needed by the byte loop`
144			`cmp/eq r4,r0 ! 54 MT ! nothing left when r0 is back at dst`
145
146			`add #-6,r2 ! 50 EX ! r2 = dst + 1`
147			`bt 9f ! 109 BR`
148
149			`8: cmp/hi r2,r0 ! 57 MT`
150			`mov.b @(r0,r5),r1 ! 20 LS (latency=2)`
151
152			`bt/s 8b ! 109 BR`
153
154			`mov.b r1,@-r0 ! 29 LS ! (delay slot)`
155
156			`9: rts`
157			`nop ! (delay slot)`
158
159			`ENTRY(memcpy)`
160
161			`! Calculate the invariants which will be used in the remainder`
162			`! of the code:`
163			`!`
164			`! r4 --> [ ... ] DST [ ... ] SRC`
165			`! [ ... ] [ ... ]`
166			`! : :`
167			`! r0 --> [ ... ] r0+r5 --> [ ... ]`
168			`! The copy runs backwards: r0 starts at dst + len and every store`
169			`! pre-decrements it (@-r0); loads use @(r0,r5) with r5 derived from src - dst.`
170
171			`! Short circuit the common case of src, dst and len being 32 bit aligned`
172			`! and test for zero length move`
173
174			`mov r6, r0 ! 5 MT (0 cycle latency)`
175			`or r4, r0 ! 82 EX`
176
177			`or r5, r0 ! 82 EX ! r0 = len | dst | src`
178			`tst r6, r6 ! 86 MT ! T = 1 if len == 0`
179
180			`bt/s 99f ! 111 BR (zero len)`
181			`tst #3, r0 ! 87 MT ! (delay slot) T = 1 if dst, src and len are all multiples of 4`
182
183			`mov r4, r0 ! 5 MT (0 cycle latency)`
184			`add r6, r0 ! 49 EX ! r0 = dst + len`
185
186			`mov #16, r1 ! 6 EX`
187			`bt/s .Lcase00 ! 111 BR (aligned)`
188
189			`sub r4, r5 ! 75 EX ! (delay slot) r5 = src - dst`
190
191			`! Arguments are not nicely long word aligned or zero len.`
192			`! Check for small copies, and if so do a simple byte at a time copy.`
193			`!`
194			`! Deciding on an exact value of 'small' is not easy, as the point at which`
195			`! using the optimised routines become worthwhile varies (these are the`
196			`! cycle counts for different sizes using byte-at-a-time vs. optimised):`
197			`! size byte-at-time long word byte`
198			`! 16 42 39-40 46-50 50-55`
199			`! 24 58 43-44 54-58 62-67`
200			`! 36 82 49-50 66-70 80-85`
201			`! However the penalty for getting it 'wrong' is much higher for long word`
202			`! aligned data (and this is more common), so use a value of 16.`
203
204			`cmp/gt r6,r1 ! 56 MT ! T = 1 if len < 16`
205
206			`add #-1,r5 ! 50 EX ! r0+r5 now addresses the src byte for dst byte r0-1`
207			`bf/s 6f ! 108 BR (not small)`
208
209			`mov r5, r3 ! 5 MT (latency=0) ! (delay slot)`
210			`shlr r6 ! 104 EX ! r6 = len / 2, T = 1 if len was odd`
211
212			`mov.b @(r0,r5),r1 ! 20 LS (latency=2)`
213			`bf/s 4f ! 111 BR ! even length: enter the loop with r1 already loaded`
214
215			`add #-1,r3 ! 50 EX ! (delay slot) r0+r3 addresses the src byte for dst byte r0-2`
216			`tst r6, r6 ! 86 MT`
217
218			`bt/s 98f ! 110 BR ! len == 1: only the odd byte to store`
219			`mov.b r1,@-r0 ! 29 LS ! (delay slot) store the odd byte`
220
221			`! 4 cycles, 2 bytes per iteration`
222			`3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)`
223
224			`4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)`
225			`dt r6 ! 67 EX`
226
227			`mov.b r1,@-r0 ! 29 LS`
228			`bf/s 3b ! 111 BR`
229
230			`mov.b r2,@-r0 ! 29 LS ! (delay slot)`
231			`98:`
232			`rts`
233			`nop ! (delay slot)`
234
235			`99: rts`
236			`mov r4, r0 ! (delay slot) r0 = dst`
237
238			`! Size is not small, so it's worthwhile looking for optimisations.`
239			`! First align destination to a long word boundary.`
240			`!`
241			`! r5 = normal value -1`
242
243			`6: tst #3, r0 ! 87 MT`
244			`mov #3, r3 ! 6 EX`
245
246			`bt/s 2f ! 111 BR`
247			`and r0,r3 ! 78 EX ! (delay slot) r3 = r0 & 3, bytes to copy until r0 is long aligned`
248
249			`! 3 cycles, 1 byte per iteration`
250			`1: dt r3 ! 67 EX`
251			`mov.b @(r0,r5),r1 ! 19 LS (latency=2)`
252
253			`add #-1, r6 ! 79 EX ! keep r6 = bytes still to copy`
254			`bf/s 1b ! 109 BR`
255
256			`mov.b r1,@-r0 ! 28 LS ! (delay slot)`
257
258			`2: add #1, r5 ! 79 EX ! r5 back to its normal value (src - dst)`
259
260			`! Now select the appropriate bulk transfer code based on relative`
261			`! alignment of src and dst.`
262
263			`mov r0, r3 ! 5 MT (latency=0) ! park the dst cursor; r0 is needed for tst #imm`
264
265			`mov r5, r0 ! 5 MT (latency=0)`
266			`tst #1, r0 ! 87 MT`
267
268			`bf/s 1f ! 111 BR`
269			`mov #64, r7 ! 6 EX ! (delay slot)`
270
271			`! bit 0 clear`
272
273			`cmp/ge r7, r6 ! 55 MT ! T = 1 if the remaining length is >= 64`
274
275			`bt/s 2f ! 111 BR`
276			`tst #2, r0 ! 87 MT ! (delay slot) T = 1 if bit 1 of (src - dst) is clear`
277
278			`! small`
279			`bt/s .Lcase0`
280			`mov r3, r0 ! (delay slot) restore the dst cursor`
281
282			`bra .Lcase2`
283			`nop ! (delay slot)`
284
285			`! big`
286			`2: bt/s .Lcase0b`
287			`mov r3, r0 ! (delay slot) restore the dst cursor`
288
289			`bra .Lcase2b`
290			`nop ! (delay slot)`
291
292			`! bit 0 set`
293			`1: tst #2, r0 ! 87 MT`
294
295			`bt/s .Lcase1`
296			`mov r3, r0 ! (delay slot) restore the dst cursor`
297
298			`bra .Lcase3`
299			`nop ! (delay slot)`
300
301
302			`! .Lcase00: dst, src and len are all multiples of 4; lengths of 64 or more go to .Lcase00b.`
303			`! GHIJ KLMN OPQR --> GHIJ KLMN OPQR`
304			`! Entry: r0 = dst + len, r4 = dst, r5 = src - dst, r6 = len.`
305
306			`! src, dst and size are all long word aligned`
307			`! size is non-zero`
308
309			`.balign 32`
310			`.Lcase00:`
311			`mov #64, r1 ! 6 EX`
312			`mov r5, r3 ! 5 MT (latency=0)`
313
314			`cmp/gt r6, r1 ! 56 MT ! T = 1 if len < 64`
315			`add #-4, r5 ! 50 EX ! r0+r5 addresses the src long for dst long r0-4`
316
317			`bf .Lcase00b ! 108 BR (big loop)`
318			`shlr2 r6 ! 105 EX ! r6 = number of long words`
319
320			`shlr r6 ! 104 EX ! r6 = pairs of long words, T = 1 if the count was odd`
321			`mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
322
323			`bf/s 4f ! 111 BR ! even count: enter the loop with r1 already loaded`
324			`add #-8, r3 ! 50 EX ! (delay slot) r0+r3 addresses the src long for dst long r0-8`
325
326			`tst r6, r6 ! 86 MT`
327			`bt/s 5f ! 110 BR ! a single long word: store it and return`
328
329			`mov.l r1,@-r0 ! 30 LS ! (delay slot) store the odd long word`
330
331			`! 4 cycles, 2 long words per iteration`
332			`3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
333
334			`4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)`
335			`dt r6 ! 67 EX`
336
337			`mov.l r1, @-r0 ! 30 LS`
338			`bf/s 3b ! 109 BR`
339
340			`mov.l r2, @-r0 ! 30 LS ! (delay slot)`
341
342			`5: rts`
343			`nop ! (delay slot)`
344
345
346			`! Size is 16 or greater and less than 64, but may have trailing bytes`
347
348			`.balign 32`
349			`.Lcase0:`
350			`add #-4, r5 ! 50 EX ! r0+r5 addresses the src long for dst long r0-4`
351			`mov r4, r7 ! 5 MT (latency=0)`
352
353			`mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
354			`mov #4, r2 ! 6 EX`
355
356			`add #11, r7 ! 50 EX ! r7 = dst + 11, lower bound for the 2 long word loop`
357			`tst r2, r6 ! 86 MT ! T = 1 if bit 2 of the remaining length is clear`
358
359			`mov r5, r3 ! 5 MT (latency=0)`
360			`bt/s 4f ! 111 BR`
361
362			`add #-4, r3 ! 50 EX ! (delay slot) r0+r3 addresses the src long for dst long r0-8`
363			`mov.l r1,@-r0 ! 30 LS ! bit 2 set: store one long word before the paired loop`
364
365			`! 4 cycles, 2 long words per iteration`
366			`3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
367
368			`4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)`
369			`cmp/hi r7, r0 ! T = 1 while r0 (before the two stores) is above dst + 11`
370
371			`mov.l r1, @-r0 ! 30 LS`
372			`bt/s 3b ! 109 BR`
373
374			`mov.l r2, @-r0 ! 30 LS ! (delay slot)`
375
376			`! Copy the final 0-3 bytes`
377
378			`add #3,r5 ! 50 EX ! r5 = (src - dst) - 1, as needed by the byte loop`
379
380			`cmp/eq r0, r4 ! 54 MT ! nothing left when r0 is back at dst`
381			`add #-10, r7 ! 50 EX ! r7 = dst + 1`
382
383			`bt 9f ! 110 BR`
384
385			`! 3 cycles, 1 byte per iteration`
386			`1: mov.b @(r0,r5),r1 ! 19 LS`
387			`cmp/hi r7,r0 ! 57 MT`
388
389			`bt/s 1b ! 111 BR`
390			`mov.b r1,@-r0 ! 28 LS ! (delay slot)`
391
392			`9: rts`
393			`nop ! (delay slot)`
394
395			`! Size is at least 64 bytes, so will be going round the big loop at least once.`
396			`!`
397			`! r2 = rounded up r4`
398			`! r3 = rounded down r0`
399
400			`.balign 32`
401			`.Lcase0b:`
402			`add #-4, r5 ! 50 EX ! r5 = (src - dst) - 4; .Lcase00 has already done this before branching to .Lcase00b`
403
404			`.Lcase00b:`
405			`mov r0, r3 ! 5 MT (latency=0)`
406			`mov #(~0x1f), r1 ! 6 EX`
407
408			`and r1, r3 ! 78 EX ! r3 = r0 rounded down to a 32 byte boundary`
409			`mov r4, r2 ! 5 MT (latency=0)`
410
411			`cmp/eq r3, r0 ! 54 MT ! T = 1 if r0 is already 32 byte aligned`
412			`add #0x1f, r2 ! 50 EX`
413
414			`bt/s 1f ! 110 BR`
415			`and r1, r2 ! 78 EX ! (delay slot) r2 = dst rounded up to a 32 byte boundary`
416
417			`! copy initial words until cache line aligned`
418
419			`mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
420			`tst #4, r0 ! 87 MT`
421
422			`mov r5, r6 ! 5 MT (latency=0)`
423			`add #-4, r6 ! 50 EX ! r0+r6 addresses the src long for dst long r0-8`
424
425			`bt/s 4f ! 111 BR`
426			`add #8, r3 ! 50 EX ! (delay slot) the paired loop stops when r0 == r3 before its two stores`
427
428			`tst #0x18, r0 ! 87 MT`
429
430			`bt/s 1f ! 109 BR`
431			`mov.l r1,@-r0 ! 30 LS ! (delay slot) store a single long word`
432
433			`! 4 cycles, 2 long words per iteration`
434			`3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)`
435
436			`4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)`
437			`cmp/eq r3, r0 ! 54 MT`
438
439			`mov.l r1, @-r0 ! 30 LS`
440			`bf/s 3b ! 109 BR`
441
442			`mov.l r7, @-r0 ! 30 LS ! (delay slot)`
443
444			`! Copy the cache line aligned blocks`
445			`!`
446			`! In use: r0, r2, r4, r5`
447			`! Scratch: r1, r3, r6, r7`
448			`!`
449			`! We could do this with the four scratch registers, but if src`
450			`! and dest hit the same cache line, this will thrash, so make`
451			`! use of additional registers.`
452			`!`
453			`! We also need r0 as a temporary (for movca), so 'undo' the invariant:`
454			`! r5: src (was r0+r5)`
455			`! r1: dest (was r0)`
456			`! this can be reversed at the end, so we don't need to save any extra`
457			`! state.`
458			`!`
459			`1: mov.l r8, @-r15 ! 30 LS ! r8-r11 are pushed here and popped after the loop`
460			`add r0, r5 ! 49 EX`
461
462			`mov.l r9, @-r15 ! 30 LS`
463			`mov r0, r1 ! 5 MT (latency=0)`
464
465			`mov.l r10, @-r15 ! 30 LS`
466			`add #-0x1c, r5 ! 50 EX ! r5 = src address matching dst address r1 - 0x20`
467
468			`mov.l r11, @-r15 ! 30 LS`
469
470			`! 16 cycles, 32 bytes per iteration`
471			`2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)`
472			`add #-0x20, r1 ! 50 EX`
473			`mov.l @(0x04,r5),r3 ! 18 LS (latency=2)`
474			`mov.l @(0x08,r5),r6 ! 18 LS (latency=2)`
475			`mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)`
476			`mov.l @(0x10,r5),r8 ! 18 LS (latency=2)`
477			`mov.l @(0x14,r5),r9 ! 18 LS (latency=2)`
478			`mov.l @(0x18,r5),r10 ! 18 LS (latency=2)`
479			`mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)`
480			`movca.l r0,@r1 ! 40 LS (latency=3-7)`
481			`mov.l r3,@(0x04,r1) ! 33 LS`
482			`mov.l r6,@(0x08,r1) ! 33 LS`
483			`mov.l r7,@(0x0c,r1) ! 33 LS`
484
485			`mov.l r8,@(0x10,r1) ! 33 LS`
486			`add #-0x20, r5 ! 50 EX ! step src down to the next 32 byte block`
487
488			`mov.l r9,@(0x14,r1) ! 33 LS`
489			`cmp/eq r2,r1 ! 54 MT ! T = 1 once dst has reached the rounded up start (r2)`
490
491			`mov.l r10,@(0x18,r1) ! 33 LS`
492			`bf/s 2b ! 109 BR`
493
494			`mov.l r11,@(0x1c,r1) ! 33 LS ! (delay slot)`
495
496			`mov r1, r0 ! 5 MT (latency=0) ! r0 is the dst cursor again`
497
498			`mov.l @r15+, r11 ! 15 LS`
499			`sub r1, r5 ! 75 EX ! r5 is an offset relative to r0 again`
500
501			`mov.l @r15+, r10 ! 15 LS`
502			`cmp/eq r4, r0 ! 54 MT ! T = 1 if there are no trailing bytes`
503
504			`bf/s 1f ! 109 BR`
505			`mov.l @r15+, r9 ! 15 LS ! (delay slot)`
506
507			`rts`
508			`1: mov.l @r15+, r8 ! 15 LS ! delay slot of the rts above, and first instruction of the bf/s target`
509			`sub r4, r1 ! 75 EX (len remaining)`
510
511			`! number of trailing bytes is non-zero`
512			`!`
513			`! invariants restored (r5 already decremented by 4)`
514			`! also r1=num bytes remaining`
515
516			`mov #4, r2 ! 6 EX`
517			`mov r4, r7 ! 5 MT (latency=0)`
518
519			`add #0x1c, r5 ! 50 EX (back to -4)`
520			`cmp/hs r2, r1 ! 58 MT ! T = 1 if at least 4 bytes remain`
521
522			`bf/s 5f ! 108 BR`
523			`add #11, r7 ! 50 EX ! (delay slot) r7 = dst + 11`
524
525			`mov.l @(r0, r5), r6 ! 21 LS (latency=2)`
526			`tst r2, r1 ! 86 MT ! T = 1 if bit 2 of the remaining length is clear`
527
528			`mov r5, r3 ! 5 MT (latency=0)`
529			`bt/s 4f ! 111 BR`
530
531			`add #-4, r3 ! 50 EX ! (delay slot) r0+r3 addresses the src long for dst long r0-8`
532			`cmp/hs r2, r1 ! 58 MT`
533
534			`bt/s 5f ! 111 BR`
535			`mov.l r6,@-r0 ! 30 LS ! (delay slot) store one long word`
536
537			`! 4 cycles, 2 long words per iteration`
538			`3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)`
539
540			`4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)`
541			`cmp/hi r7, r0 ! T = 1 while r0 (before the two stores) is above dst + 11`
542
543			`mov.l r6, @-r0 ! 30 LS`
544			`bt/s 3b ! 109 BR`
545
546			`mov.l r2, @-r0 ! 30 LS ! (delay slot)`
547
548			`! Copy the final 0-3 bytes`
549
550			`5: cmp/eq r0, r4 ! 54 MT ! nothing left when r0 is back at dst`
551			`add #-10, r7 ! 50 EX ! r7 = dst + 1`
552
553			`bt 9f ! 110 BR`
554			`add #3,r5 ! 50 EX ! r5 = (src - dst) - 1, as needed by the byte loop`
555
556			`! 3 cycles, 1 byte per iteration`
557			`1: mov.b @(r0,r5),r1 ! 19 LS`
558			`cmp/hi r7,r0 ! 57 MT`
559
560			`bt/s 1b ! 111 BR`
561			`mov.b r1,@-r0 ! 28 LS ! (delay slot)`
562
563			`9: rts`
564			`nop ! (delay slot)`
565
566			`! .Lcase2: bulk copy for the case (src - dst) mod 4 == 2, remaining length below 64.`
567			`! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..`
568			`! Entry: r0 = end of the dst bytes still to copy (long aligned), r4 = dst, r5 = src - dst.`
569
570			`.balign 32`
571			`.Lcase2:`
572			`! Size is 16 or greater and less than 64, but may have trailing bytes`
573
574			`2: mov r5, r6 ! 5 MT (latency=0)`
575			`add #-2,r5 ! 50 EX ! r0+r5 addresses the src short for dst short r0-2`
576
577			`mov r4,r2 ! 5 MT (latency=0)`
578			`add #-4,r6 ! 50 EX ! r0+r6 addresses the src short for dst short r0-4`
579
580			`add #7,r2 ! 50 EX ! r2 = dst + 7, lower bound for the loop`
581			`3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)`
582
583			`mov.w @(r0,r6),r3 ! 20 LS (latency=2)`
584			`cmp/hi r2,r0 ! 57 MT`
585
586			`mov.w r1,@-r0 ! 29 LS`
587			`bt/s 3b ! 111 BR`
588
589			`mov.w r3,@-r0 ! 29 LS ! (delay slot)`
590
591			`bra 10f ! finish in the shared short word / byte tail`
592			`nop ! (delay slot)`
593
594
595			`.balign 32`
596			`.Lcase2b:`
597			`! Size is at least 64 bytes, so will be going round the big loop at least once.`
598			`!`
599			`! r2 = rounded up r4`
600			`! r3 = rounded down r0`
601
602			`mov r0, r3 ! 5 MT (latency=0)`
603			`mov #(~0x1f), r1 ! 6 EX`
604
605			`and r1, r3 ! 78 EX ! r3 = r0 rounded down to a 32 byte boundary`
606			`mov r4, r2 ! 5 MT (latency=0)`
607
608			`cmp/eq r3, r0 ! 54 MT ! T = 1 if r0 is already 32 byte aligned`
609			`add #0x1f, r2 ! 50 EX`
610
611			`add #-2, r5 ! 50 EX`
612			`bt/s 1f ! 110 BR`
613			`and r1, r2 ! 78 EX ! (delay slot) r2 = dst rounded up to a 32 byte boundary`
614
615			`! Copy a short word one at a time until we are cache line aligned`
616			`! Normal values: r0, r2, r3, r4`
617			`! Unused: r1, r6, r7`
618			`! Mod: r5 (=r5-2)`
619			`!`
620			`add #2, r3 ! 50 EX ! loop stops when r0 == r3 before its store`
621
622			`2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)`
623			`cmp/eq r3,r0 ! 54 MT`
624
625			`bf/s 2b ! 111 BR`
626
627			`mov.w r1,@-r0 ! 29 LS ! (delay slot)`
628
629			`! Copy the cache line aligned blocks`
630			`!`
631			`! In use: r0, r2, r4, r5 (=r5-2)`
632			`! Scratch: r1, r3, r6, r7`
633			`!`
634			`! We could do this with the four scratch registers, but if src`
635			`! and dest hit the same cache line, this will thrash, so make`
636			`! use of additional registers.`
637			`!`
638			`! We also need r0 as a temporary (for movca), so 'undo' the invariant:`
639			`! r5: src (was r0+r5)`
640			`! r1: dest (was r0)`
641			`! this can be reversed at the end, so we don't need to save any extra`
642			`! state.`
643			`!`
644			`1: mov.l r8, @-r15 ! 30 LS ! r8-r12 are pushed here and popped after the loop`
645			`add r0, r5 ! 49 EX`
646
647			`mov.l r9, @-r15 ! 30 LS`
648			`mov r0, r1 ! 5 MT (latency=0)`
649
650			`mov.l r10, @-r15 ! 30 LS`
651			`add #-0x1e, r5 ! 50 EX ! r5 = src address matching dst address r1 - 0x20`
652
653			`mov.l r11, @-r15 ! 30 LS`
654
655			`mov.l r12, @-r15 ! 30 LS`
656
657			`! 17 cycles, 32 bytes per iteration`
658			`#ifdef CONFIG_CPU_LITTLE_ENDIAN`
659			`2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI`
660			`add #-0x20, r1 ! 50 EX`
661
662			`mov.l @r5+, r3 ! 15 LS (latency=2) NMLK`
663
664			`mov.l @r5+, r6 ! 15 LS (latency=2) RQPO`
665			`shll16 r0 ! 103 EX JI..`
666
667			`mov.l @r5+, r7 ! 15 LS (latency=2)`
668			`xtrct r3, r0 ! 48 EX LKJI`
669
670			`mov.l @r5+, r8 ! 15 LS (latency=2)`
671			`xtrct r6, r3 ! 48 EX PONM`
672
673			`mov.l @r5+, r9 ! 15 LS (latency=2)`
674			`xtrct r7, r6 ! 48 EX`
675
676			`mov.l @r5+, r10 ! 15 LS (latency=2)`
677			`xtrct r8, r7 ! 48 EX`
678
679			`mov.l @r5+, r11 ! 15 LS (latency=2)`
680			`xtrct r9, r8 ! 48 EX`
681
682			`mov.w @r5+, r12 ! 15 LS (latency=2)`
683			`xtrct r10, r9 ! 48 EX`
684
685			`movca.l r0,@r1 ! 40 LS (latency=3-7)`
686			`xtrct r11, r10 ! 48 EX`
687
688			`mov.l r3, @(0x04,r1) ! 33 LS`
689			`xtrct r12, r11 ! 48 EX`
690
691			`mov.l r6, @(0x08,r1) ! 33 LS`
692
693			`mov.l r7, @(0x0c,r1) ! 33 LS`
694
695			`mov.l r8, @(0x10,r1) ! 33 LS`
696			`add #-0x40, r5 ! 50 EX ! undo the 0x20 of post-increments and step down one block`
697
698			`mov.l r9, @(0x14,r1) ! 33 LS`
699			`cmp/eq r2,r1 ! 54 MT`
700
701			`mov.l r10, @(0x18,r1) ! 33 LS`
702			`bf/s 2b ! 109 BR`
703
704			`mov.l r11, @(0x1c,r1) ! 33 LS ! (delay slot)`
705			`#else`
706			`2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) ! last short of the 32 byte src block`
707			`add #-2, r5 ! 50 EX ! r5 is now long aligned, 2 bytes below the block`
708
709			`mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)`
710			`add #-4, r1 ! 50 EX ! r1 = last long of the dst block`
711
712			`mov.l @(0x18,r5), r6 ! 18 LS (latency=2)`
713			`shll16 r0 ! 103 EX`
714
715			`mov.l @(0x14,r5), r7 ! 18 LS (latency=2)`
716			`xtrct r3, r0 ! 48 EX ! r0 = dst long at block offset 0x1c`
717
718			`mov.l @(0x10,r5), r8 ! 18 LS (latency=2)`
719			`xtrct r6, r3 ! 48 EX ! r3 = dst long at block offset 0x18`
720
721			`mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)`
722			`xtrct r7, r6 ! 48 EX`
723
724			`mov.l @(0x08,r5), r10 ! 18 LS (latency=2)`
725			`xtrct r8, r7 ! 48 EX`
726
727			`mov.l @(0x04,r5), r11 ! 18 LS (latency=2)`
728			`xtrct r9, r8 ! 48 EX`
729
730			`mov.l @(0x00,r5), r12 ! 18 LS (latency=2)`
731			`xtrct r10, r9 ! 48 EX`
732
733			`movca.l r0,@r1 ! 40 LS (latency=3-7) ! writes block offset 0x1c`
734			`add #-0x1c, r1 ! 50 EX ! r1 = start of the dst block`
735
736			`mov.l r3, @(0x18,r1) ! 33 LS`
737			`xtrct r11, r10 ! 48 EX`
738
739			`mov.l r6, @(0x14,r1) ! 33 LS`
740			`xtrct r12, r11 ! 48 EX ! r11 = dst long at block offset 0x00`
741
742			`mov.l r7, @(0x10,r1) ! 33 LS`
743
744			`mov.l r8, @(0x0c,r1) ! 33 LS`
745			`add #-0x1e, r5 ! 50 EX ! with the -2 above: step src down one 32 byte block`
746
747			`mov.l r9, @(0x08,r1) ! 33 LS`
748			`cmp/eq r2,r1 ! 54 MT`
749
750			`mov.l r10, @(0x04,r1) ! 33 LS`
751			`bf/s 2b ! 109 BR`
752
753			`mov.l r11, @(0x00,r1) ! 33 LS ! (delay slot)`
754			`#endif`
755
756			`mov.l @r15+, r12`
757			`mov r1, r0 ! 5 MT (latency=0) ! r0 is the dst cursor again`
758
759			`mov.l @r15+, r11 ! 15 LS`
760			`sub r1, r5 ! 75 EX ! r5 is an offset relative to r0 again`
761
762			`mov.l @r15+, r10 ! 15 LS`
763			`cmp/eq r4, r0 ! 54 MT ! T = 1 if there are no trailing bytes`
764
765			`bf/s 1f ! 109 BR`
766			`mov.l @r15+, r9 ! 15 LS ! (delay slot)`
767
768			`rts`
769			`1: mov.l @r15+, r8 ! 15 LS ! delay slot of the rts above, and first instruction of the bf/s target`
770
771			`add #0x1e, r5 ! 50 EX ! r5 = (src - dst) - 2 for the short word tail`
772
773			`! Finish off a short word at a time`
774			`! r5 must be invariant - 2`
775			`10: mov r4,r2 ! 5 MT (latency=0)`
776			`add #1,r2 ! 50 EX ! r2 = dst + 1`
777
778			`cmp/hi r2, r0 ! 57 MT ! T = 1 if at least 2 bytes remain`
779			`bf/s 1f ! 109 BR`
780
781			`add #2, r2 ! 50 EX ! (delay slot) r2 = dst + 3`
782
783			`3: mov.w @(r0,r5),r1 ! 20 LS`
784			`cmp/hi r2,r0 ! 57 MT`
785
786			`bt/s 3b ! 109 BR`
787
788			`mov.w r1,@-r0 ! 29 LS ! (delay slot)`
789			`1:`
790
791			`!`
792			`! Finally, copy the last byte if necessary`
793			`cmp/eq r4,r0 ! 54 MT ! nothing left when r0 is back at dst`
794			`bt/s 9b ! return through the rts at the preceding 9: label`
795			`add #1,r5 ! (delay slot) r5 = (src - dst) - 1 for the byte load`
796			`mov.b @(r0,r5),r1`
797			`rts`
798			`mov.b r1,@-r0 ! (delay slot) store the last byte`
799

Browse

Tools

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [sh/] [lib/] [memcpy-sh4.S] - Blame information for rev 17