OpenCores
URL https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

Subversion Repositories or1k_soc_on_altera_embedded_dev_kit

[/] [or1k_soc_on_altera_embedded_dev_kit/] [trunk/] [linux-2.6/] [linux-2.6.24/] [arch/] [sh/] [lib/] [memcpy-sh4.S] - Blame information for rev 17

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 3 xianfeng
/*
2
 * "memcpy" implementation of SuperH
3
 *
4
 * Copyright (C) 1999  Niibe Yutaka
5
 * Copyright (c) 2002  STMicroelectronics Ltd
6
 *   Modified from memcpy.S and micro-optimised for SH4
7
 *   Stuart Menefy (stuart.menefy@st.com)
8
 *
9
 */
10
#include <linux/linkage.h>
11
 
12
/*
13
 * void *memcpy(void *dst, const void *src, size_t n);
14
 *
15
 * It is assumed that there is no overlap between src and dst.
16
 * If there is an overlap, then the results are undefined.
17
 */
18
 
19
        !
20
        !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
21
        !
22
 
23
        ! Size is 16 or greater, and may have trailing bytes
24
 
25
        .balign 32
26
.Lcase1:
27
        ! Read a long word and write a long word at once
28
        ! At the start of each iteration, r7 contains last long load
29
        add     #-1,r5          !  79 EX
30
        mov     r4,r2           !   5 MT (0 cycles latency)
31
 
32
        mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
33
        add     #-4,r5          !  50 EX
34
 
35
        add     #7,r2           !  79 EX
36
        !
37
#ifdef CONFIG_CPU_LITTLE_ENDIAN
38
        ! 6 cycles, 4 bytes per iteration
39
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
40
        mov     r7, r3          !   5 MT (latency=0)    ! RQPO
41
 
42
        cmp/hi  r2,r0           !  57 MT
43
        shll16  r3              ! 103 EX
44
 
45
        mov     r1,r6           !   5 MT (latency=0)
46
        shll8   r3              ! 102 EX                ! Oxxx
47
 
48
        shlr8   r6              ! 106 EX                ! xNML
49
        mov     r1, r7          !   5 MT (latency=0)
50
 
51
        or      r6,r3           !  82 EX                ! ONML
52
        bt/s    3b              ! 109 BR
53
 
54
         mov.l  r3,@-r0         !  30 LS
55
#else
56
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
57
        mov     r7,r3           !   5 MT (latency=0)    ! OPQR
58
 
59
        cmp/hi  r2,r0           !  57 MT
60
        shlr16  r3              ! 107 EX
61
 
62
        shlr8   r3              ! 106 EX                ! xxxO
63
        mov     r1,r6           !   5 MT (latency=0)
64
 
65
        shll8   r6              ! 102 EX                ! LMNx
66
        mov     r1,r7           !   5 MT (latency=0)
67
 
68
        or      r6,r3           !  82 EX                ! LMNO
69
        bt/s    3b              ! 109 BR
70
 
71
         mov.l  r3,@-r0         !  30 LS
72
#endif
73
        ! Finally, copy a byte at once, if necessary
74
 
75
        add     #4,r5           !  50 EX
76
        cmp/eq  r4,r0           !  54 MT
77
 
78
        add     #-6,r2          !  50 EX
79
        bt      9f              ! 109 BR
80
 
81
8:      cmp/hi  r2,r0           !  57 MT
82
        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
83
 
84
        bt/s    8b              ! 109 BR
85
 
86
         mov.b  r1,@-r0         !  29 LS
87
 
88
9:      rts
89
         nop
90
 
91
 
92
        !
93
        !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
94
        !
95
 
96
        ! Size is 16 or greater, and may have trailing bytes
97
 
98
        .balign 32
99
.Lcase3:
        ! Bulk copy, working downwards from the end of both buffers, for
        ! the relative alignment (src - dst) & 3 == 3 (selected by the
        ! "bit 0 set" / bit 1 set dispatch in memcpy).
        !
        ! On entry:
        !   r0      = current destination end (long word aligned)
        !   r4      = destination start
        !   r0 + r5 = current source end
        ! Scratch: r1, r2, r3, r6, r7
        ! On exit: r0 == r4, i.e. the dst pointer memcpy returns.
        !
        ! Read a long word and write a long word at once
        ! At the start of each iteration, r7 contains last long load
        add     #-3,r5          ! 79 EX   (r0+r5 now long word aligned)
        mov     r4,r2           !  5 MT (0 cycles latency)

        mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
        add     #-4,r5          ! 50 EX   (r0+r5 -> next lower source long)

        add     #7,r2           !  79 EX  (loop while r0 > r4 + 7)
        !
#ifdef CONFIG_CPU_LITTLE_ENDIAN
        ! 6 cycles, 4 bytes per iteration
3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
        mov     r7, r3          !   5 MT (latency=0)    ! RQPO

        cmp/hi  r2,r0           !  57 MT
        shll8   r3              ! 102 EX                ! QPOx

        mov     r1,r6           !   5 MT (latency=0)
        shlr16  r6              ! 107 EX

        shlr8   r6              ! 106 EX                ! xxxN
        mov     r1, r7          !   5 MT (latency=0)

        or      r6,r3           !  82 EX                ! QPON
        bt/s    3b              ! 109 BR

         mov.l  r3,@-r0         !  30 LS
#else
        ! The previously loaded long must be carried in r7, matching the
        ! initial load above.  Using r1 here would merge whatever r1 held
        ! on entry into the first long written, corrupting three bytes.
3:      mov     r7,r3           ! OPQR  (previous, higher-addressed long)
        shlr8   r3              ! xOPQ
        mov.l   @(r0,r5),r7     ! KLMN  (becomes "previous" next time round)
        mov     r7,r6
        shll16  r6
        shll8   r6              ! Nxxx
        or      r6,r3           ! NOPQ
        cmp/hi  r2,r0
        bt/s    3b
         mov.l  r3,@-r0
#endif

        ! Finally, copy a byte at once, if necessary

        add     #6,r5           !  50 EX  (r0+r5 = source end - 1 for @-r0 stores)
        cmp/eq  r4,r0           !  54 MT  (nothing left?)

        add     #-6,r2          !  50 EX  (r2 = r4 + 1)
        bt      9f              ! 109 BR

8:      cmp/hi  r2,r0           !  57 MT
        mov.b   @(r0,r5),r1     !  20 LS (latency=2)

        bt/s    8b              ! 109 BR

         mov.b  r1,@-r0         !  29 LS

9:      rts
         nop
158
 
159
ENTRY(memcpy)
160
 
161
        ! Calculate the invariants which will be used in the remainder
162
        ! of the code:
163
        !
164
        !      r4   -->  [ ...  ] DST             [ ...  ] SRC
165
        !                [ ...  ]                 [ ...  ]
166
        !                  :                        :
167
        !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
168
        !
169
        !
170
 
171
        ! Short circuit the common case of src, dst and len being 32 bit aligned
172
        ! and test for zero length move
173
 
174
        mov     r6, r0          !   5 MT (0 cycle latency)
175
        or      r4, r0          !  82 EX
176
 
177
        or      r5, r0          !  82 EX
178
        tst     r6, r6          !  86 MT
179
 
180
        bt/s    99f             ! 111 BR                (zero len)
181
         tst    #3, r0          !  87 MT
182
 
183
        mov     r4, r0          !   5 MT (0 cycle latency)
184
        add     r6, r0          !  49 EX
185
 
186
        mov     #16, r1         !   6 EX
187
        bt/s    .Lcase00        ! 111 BR                (aligned)
188
 
189
         sub    r4, r5          !  75 EX
190
 
191
        ! Arguments are not nicely long word aligned or zero len.
192
        ! Check for small copies, and if so do a simple byte at a time copy.
193
        !
194
        ! Deciding on an exact value of 'small' is not easy, as the point at which
195
        ! using the optimised routines become worthwhile varies (these are the
196
        ! cycle counts for different sizes using byte-at-a-time vs. optimised):
197
        !       size    byte-at-time    long    word    byte
198
        !       16      42              39-40   46-50   50-55
199
        !       24      58              43-44   54-58   62-67
200
        !       36      82              49-50   66-70   80-85
201
        ! However the penalty for getting it 'wrong' is much higher for long word
202
        ! aligned data (and this is more common), so use a value of 16.
203
 
204
        cmp/gt  r6,r1           !  56 MT
205
 
206
        add     #-1,r5          !  50 EX
207
        bf/s    6f              ! 108 BR                (not small)
208
 
209
         mov    r5, r3          !   5 MT (latency=0)
210
        shlr    r6              ! 104 EX
211
 
212
        mov.b   @(r0,r5),r1     !  20 LS (latency=2)
213
        bf/s    4f              ! 111 BR
214
 
215
         add    #-1,r3          !  50 EX
216
        tst     r6, r6          !  86 MT
217
 
218
        bt/s    98f             ! 110 BR
219
         mov.b  r1,@-r0         !  29 LS
220
 
221
        ! 4 cycles, 2 bytes per iteration
222
3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
223
 
224
4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
225
        dt      r6              !  67 EX
226
 
227
        mov.b   r1,@-r0         !  29 LS
228
        bf/s    3b              ! 111 BR
229
 
230
         mov.b  r2,@-r0         !  29 LS
231
98:
232
        rts
233
         nop
234
 
235
99:     rts
236
         mov    r4, r0
237
 
238
        ! Size is not small, so it's worthwhile looking for optimisations.
239
        ! First align destination to a long word boundary.
240
        !
241
        ! r5 = normal value -1
242
 
243
6:      tst     #3, r0          !  87 MT
244
        mov     #3, r3          !   6 EX
245
 
246
        bt/s    2f              ! 111 BR
247
         and    r0,r3           !  78 EX
248
 
249
        ! 3 cycles, 1 byte per iteration
250
1:      dt      r3              !  67 EX
251
        mov.b   @(r0,r5),r1     !  19 LS (latency=2)
252
 
253
        add     #-1, r6         !  79 EX
254
        bf/s    1b              ! 109 BR
255
 
256
         mov.b  r1,@-r0         !  28 LS
257
 
258
2:      add     #1, r5          !  79 EX
259
 
260
        ! Now select the appropriate bulk transfer code based on relative
261
        ! alignment of src and dst.
262
 
263
        mov     r0, r3          !   5 MT (latency=0)
264
 
265
        mov     r5, r0          !   5 MT (latency=0)
266
        tst     #1, r0          !  87 MT
267
 
268
        bf/s    1f              ! 111 BR
269
         mov    #64, r7         !   6 EX
270
 
271
        ! bit 0 clear
272
 
273
        cmp/ge  r7, r6          !  55 MT
274
 
275
        bt/s    2f              ! 111 BR
276
         tst    #2, r0          !  87 MT
277
 
278
        ! small
279
        bt/s    .Lcase0
280
         mov    r3, r0
281
 
282
        bra     .Lcase2
283
         nop
284
 
285
        ! big
286
2:      bt/s    .Lcase0b
287
         mov    r3, r0
288
 
289
        bra     .Lcase2b
290
         nop
291
 
292
        ! bit 0 set
293
1:      tst     #2, r0          ! 87 MT
294
 
295
        bt/s    .Lcase1
296
         mov    r3, r0
297
 
298
        bra     .Lcase3
299
         nop
300
 
301
 
302
        !
303
        !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
304
        !
305
 
306
        ! src, dst and size are all long word aligned
307
        ! size is non-zero
308
 
309
        .balign 32
310
.Lcase00:
311
        mov     #64, r1         !   6 EX
312
        mov     r5, r3          !   5 MT (latency=0)
313
 
314
        cmp/gt  r6, r1          !  56 MT
315
        add     #-4, r5         !  50 EX
316
 
317
        bf      .Lcase00b       ! 108 BR                (big loop)
318
        shlr2   r6              ! 105 EX
319
 
320
        shlr    r6              ! 104 EX
321
        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
322
 
323
        bf/s    4f              ! 111 BR
324
         add    #-8, r3         !  50 EX
325
 
326
        tst     r6, r6          !  86 MT
327
        bt/s    5f              ! 110 BR
328
 
329
         mov.l  r1,@-r0         !  30 LS
330
 
331
        ! 4 cycles, 2 long words per iteration
332
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
333
 
334
4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
335
        dt      r6              !  67 EX
336
 
337
        mov.l   r1, @-r0        !  30 LS
338
        bf/s    3b              ! 109 BR
339
 
340
         mov.l  r2, @-r0        !  30 LS
341
 
342
5:      rts
343
         nop
344
 
345
 
346
        ! Size is 16 or greater and less than 64, but may have trailing bytes
347
 
348
        .balign 32
349
.Lcase0:
350
        add     #-4, r5         !  50 EX
351
        mov     r4, r7          !   5 MT (latency=0)
352
 
353
        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
354
        mov     #4, r2          !   6 EX
355
 
356
        add     #11, r7         !  50 EX
357
        tst     r2, r6          !  86 MT
358
 
359
        mov     r5, r3          !   5 MT (latency=0)
360
        bt/s    4f              ! 111 BR
361
 
362
         add    #-4, r3         !  50 EX
363
        mov.l   r1,@-r0         !  30 LS
364
 
365
        ! 4 cycles, 2 long words per iteration
366
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
367
 
368
4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
369
        cmp/hi  r7, r0
370
 
371
        mov.l   r1, @-r0        !  30 LS
372
        bt/s    3b              ! 109 BR
373
 
374
         mov.l  r2, @-r0        !  30 LS
375
 
376
        ! Copy the final 0-3 bytes
377
 
378
        add     #3,r5           !  50 EX
379
 
380
        cmp/eq  r0, r4          !  54 MT
381
        add     #-10, r7        !  50 EX
382
 
383
        bt      9f              ! 110 BR
384
 
385
        ! 3 cycles, 1 byte per iteration
386
1:      mov.b   @(r0,r5),r1     !  19 LS
387
        cmp/hi  r7,r0           !  57 MT
388
 
389
        bt/s    1b              ! 111 BR
390
         mov.b  r1,@-r0         !  28 LS
391
 
392
9:      rts
393
         nop
394
 
395
        ! Size is at least 64 bytes, so will be going round the big loop at least once.
396
        !
397
        !   r2 = rounded up r4
398
        !   r3 = rounded down r0
399
 
400
        .balign 32
401
.Lcase0b:
402
        add     #-4, r5         !  50 EX
403
 
404
.Lcase00b:
405
        mov     r0, r3          !   5 MT (latency=0)
406
        mov     #(~0x1f), r1    !   6 EX
407
 
408
        and     r1, r3          !  78 EX
409
        mov     r4, r2          !   5 MT (latency=0)
410
 
411
        cmp/eq  r3, r0          !  54 MT
412
        add     #0x1f, r2       !  50 EX
413
 
414
        bt/s    1f              ! 110 BR
415
         and    r1, r2          !  78 EX
416
 
417
        ! copy initial words until cache line aligned
418
 
419
        mov.l   @(r0, r5), r1   !  21 LS (latency=2)
420
        tst     #4, r0          !  87 MT
421
 
422
        mov     r5, r6          !   5 MT (latency=0)
423
        add     #-4, r6         !  50 EX
424
 
425
        bt/s    4f              ! 111 BR
426
         add    #8, r3          !  50 EX
427
 
428
        tst     #0x18, r0       !  87 MT
429
 
430
        bt/s    1f              ! 109 BR
431
         mov.l  r1,@-r0         !  30 LS
432
 
433
        ! 4 cycles, 2 long words per iteration
434
3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
435
 
436
4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
437
        cmp/eq  r3, r0          !  54 MT
438
 
439
        mov.l   r1, @-r0        !  30 LS
440
        bf/s    3b              ! 109 BR
441
 
442
         mov.l  r7, @-r0        !  30 LS
443
 
444
        ! Copy the cache line aligned blocks
445
        !
446
        ! In use: r0, r2, r4, r5
447
        ! Scratch: r1, r3, r6, r7
448
        !
449
        ! We could do this with the four scratch registers, but if src
450
        ! and dest hit the same cache line, this will thrash, so make
451
        ! use of additional registers.
452
        !
453
        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
454
        !   r5:  src (was r0+r5)
455
        !   r1:  dest (was r0)
456
        ! this can be reversed at the end, so we don't need to save any extra
457
        ! state.
458
        !
459
1:      mov.l   r8, @-r15       !  30 LS
460
        add     r0, r5          !  49 EX
461
 
462
        mov.l   r9, @-r15       !  30 LS
463
        mov     r0, r1          !   5 MT (latency=0)
464
 
465
        mov.l   r10, @-r15      !  30 LS
466
        add     #-0x1c, r5      !  50 EX
467
 
468
        mov.l   r11, @-r15      !  30 LS
469
 
470
        ! 16 cycles, 32 bytes per iteration
471
2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
472
        add     #-0x20, r1      ! 50 EX
473
        mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
474
        mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
475
        mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
476
        mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
477
        mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
478
        mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
479
        mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
480
        movca.l r0,@r1          ! 40 LS (latency=3-7)
481
        mov.l   r3,@(0x04,r1)   ! 33 LS
482
        mov.l   r6,@(0x08,r1)   ! 33 LS
483
        mov.l   r7,@(0x0c,r1)   ! 33 LS
484
 
485
        mov.l   r8,@(0x10,r1)   ! 33 LS
486
        add     #-0x20, r5      ! 50 EX
487
 
488
        mov.l   r9,@(0x14,r1)   ! 33 LS
489
        cmp/eq  r2,r1           ! 54 MT
490
 
491
        mov.l   r10,@(0x18,r1)  !  33 LS
492
        bf/s    2b              ! 109 BR
493
 
494
         mov.l  r11,@(0x1c,r1)  !  33 LS
495
 
496
        mov     r1, r0          !   5 MT (latency=0)
497
 
498
        mov.l   @r15+, r11      !  15 LS
499
        sub     r1, r5          !  75 EX
500
 
501
        mov.l   @r15+, r10      !  15 LS
502
        cmp/eq  r4, r0          !  54 MT
503
 
504
        bf/s    1f              ! 109 BR
505
         mov.l   @r15+, r9      !  15 LS
506
 
507
        rts
508
1:       mov.l  @r15+, r8       !  15 LS
509
        sub     r4, r1          !  75 EX                (len remaining)
510
 
511
        ! number of trailing bytes is non-zero
512
        !
513
        ! invariants restored (r5 already decremented by 4)
514
        ! also r1=num bytes remaining
515
 
516
        mov     #4, r2          !   6 EX
517
        mov     r4, r7          !   5 MT (latency=0)
518
 
519
        add     #0x1c, r5       !  50 EX                (back to -4)
520
        cmp/hs  r2, r1          !  58 MT
521
 
522
        bf/s    5f              ! 108 BR
523
         add     #11, r7        !  50 EX
524
 
525
        mov.l   @(r0, r5), r6   !  21 LS (latency=2)
526
        tst     r2, r1          !  86 MT
527
 
528
        mov     r5, r3          !   5 MT (latency=0)
529
        bt/s    4f              ! 111 BR
530
 
531
         add    #-4, r3         !  50 EX
532
        cmp/hs  r2, r1          !  58 MT
533
 
534
        bt/s    5f              ! 111 BR
535
         mov.l  r6,@-r0         !  30 LS
536
 
537
        ! 4 cycles, 2 long words per iteration
538
3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
539
 
540
4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
541
        cmp/hi  r7, r0
542
 
543
        mov.l   r6, @-r0        !  30 LS
544
        bt/s    3b              ! 109 BR
545
 
546
         mov.l  r2, @-r0        !  30 LS
547
 
548
        ! Copy the final 0-3 bytes
549
 
550
5:      cmp/eq  r0, r4          !  54 MT
551
        add     #-10, r7        !  50 EX
552
 
553
        bt      9f              ! 110 BR
554
        add     #3,r5           !  50 EX
555
 
556
        ! 3 cycles, 1 byte per iteration
557
1:      mov.b   @(r0,r5),r1     !  19 LS
558
        cmp/hi  r7,r0           !  57 MT
559
 
560
        bt/s    1b              ! 111 BR
561
         mov.b  r1,@-r0         !  28 LS
562
 
563
9:      rts
564
         nop
565
 
566
        !
567
        !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
568
        !
569
 
570
        .balign 32
571
.Lcase2:
572
        ! Size is 16 or greater and less than 64, but may have trailing bytes
573
 
574
2:      mov     r5, r6          !   5 MT (latency=0)
575
        add     #-2,r5          !  50 EX
576
 
577
        mov     r4,r2           !   5 MT (latency=0)
578
        add     #-4,r6          !  50 EX
579
 
580
        add     #7,r2           !  50 EX
581
3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
582
 
583
        mov.w   @(r0,r6),r3     !  20 LS (latency=2)
584
        cmp/hi  r2,r0           !  57 MT
585
 
586
        mov.w   r1,@-r0         !  29 LS
587
        bt/s    3b              ! 111 BR
588
 
589
         mov.w  r3,@-r0         !  29 LS
590
 
591
        bra     10f
592
         nop
593
 
594
 
595
        .balign 32
596
.Lcase2b:
597
        ! Size is at least 64 bytes, so will be going round the big loop at least once.
598
        !
599
        !   r2 = rounded up r4
600
        !   r3 = rounded down r0
601
 
602
        mov     r0, r3          !   5 MT (latency=0)
603
        mov     #(~0x1f), r1    !   6 EX
604
 
605
        and     r1, r3          !  78 EX
606
        mov     r4, r2          !   5 MT (latency=0)
607
 
608
        cmp/eq  r3, r0          !  54 MT
609
        add     #0x1f, r2       !  50 EX
610
 
611
        add     #-2, r5         !  50 EX
612
        bt/s    1f              ! 110 BR
613
         and    r1, r2          !  78 EX
614
 
615
        ! Copy a short word one at a time until we are cache line aligned
616
        !   Normal values: r0, r2, r3, r4
617
        !   Unused: r1, r6, r7
618
        !   Mod: r5 (=r5-2)
619
        !
620
        add     #2, r3          !  50 EX
621
 
622
2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
623
        cmp/eq  r3,r0           !  54 MT
624
 
625
        bf/s    2b              ! 111 BR
626
 
627
         mov.w  r1,@-r0         !  29 LS
628
 
629
        ! Copy the cache line aligned blocks
630
        !
631
        ! In use: r0, r2, r4, r5 (=r5-2)
632
        ! Scratch: r1, r3, r6, r7
633
        !
634
        ! We could do this with the four scratch registers, but if src
635
        ! and dest hit the same cache line, this will thrash, so make
636
        ! use of additional registers.
637
        !
638
        ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
639
        !   r5:  src (was r0+r5)
640
        !   r1:  dest (was r0)
641
        ! this can be reversed at the end, so we don't need to save any extra
642
        ! state.
643
        !
644
1:      mov.l   r8, @-r15       !  30 LS
645
        add     r0, r5          !  49 EX
646
 
647
        mov.l   r9, @-r15       !  30 LS
648
        mov     r0, r1          !   5 MT (latency=0)
649
 
650
        mov.l   r10, @-r15      !  30 LS
651
        add     #-0x1e, r5      !  50 EX
652
 
653
        mov.l   r11, @-r15      !  30 LS
654
 
655
        mov.l   r12, @-r15      !  30 LS
656
 
657
        ! 17 cycles, 32 bytes per iteration
658
#ifdef CONFIG_CPU_LITTLE_ENDIAN
659
2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
660
        add     #-0x20, r1      !  50 EX
661
 
662
        mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
663
 
664
        mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
665
        shll16  r0              ! 103 EX                        JI..
666
 
667
        mov.l   @r5+, r7        !  15 LS (latency=2)
668
        xtrct   r3, r0          !  48 EX                        LKJI
669
 
670
        mov.l   @r5+, r8        !  15 LS (latency=2)
671
        xtrct   r6, r3          !  48 EX                        PONM
672
 
673
        mov.l   @r5+, r9        !  15 LS (latency=2)
674
        xtrct   r7, r6          !  48 EX
675
 
676
        mov.l   @r5+, r10       !  15 LS (latency=2)
677
        xtrct   r8, r7          !  48 EX
678
 
679
        mov.l   @r5+, r11       !  15 LS (latency=2)
680
        xtrct   r9, r8          !  48 EX
681
 
682
        mov.w   @r5+, r12       !  15 LS (latency=2)
683
        xtrct   r10, r9         !  48 EX
684
 
685
        movca.l r0,@r1          !  40 LS (latency=3-7)
686
        xtrct   r11, r10        !  48 EX
687
 
688
        mov.l   r3, @(0x04,r1)  !  33 LS
689
        xtrct   r12, r11        !  48 EX
690
 
691
        mov.l   r6, @(0x08,r1)  !  33 LS
692
 
693
        mov.l   r7, @(0x0c,r1)  !  33 LS
694
 
695
        mov.l   r8, @(0x10,r1)  !  33 LS
696
        add     #-0x40, r5      !  50 EX
697
 
698
        mov.l   r9, @(0x14,r1)  !  33 LS
699
        cmp/eq  r2,r1           !  54 MT
700
 
701
        mov.l   r10, @(0x18,r1) !  33 LS
702
        bf/s    2b              ! 109 BR
703
 
704
         mov.l  r11, @(0x1c,r1) !  33 LS
705
#else
706
2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
707
        add     #-2, r5         !  50 EX
708
 
709
        mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
710
        add     #-4, r1         !  50 EX
711
 
712
        mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
713
        shll16  r0              ! 103 EX
714
 
715
        mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
716
        xtrct   r3, r0          !  48 EX
717
 
718
        mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
719
        xtrct   r6, r3          !  48 EX
720
 
721
        mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
722
        xtrct   r7, r6          !  48 EX
723
 
724
        mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
725
        xtrct   r8, r7          !  48 EX
726
 
727
        mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
728
        xtrct   r9, r8          !  48 EX
729
 
730
        mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
731
        xtrct   r10, r9         !  48 EX
732
 
733
        movca.l r0,@r1          !  40 LS (latency=3-7)
734
        add     #-0x1c, r1      !  50 EX
735
 
736
        mov.l   r3, @(0x1c,r1)  !  33 LS
737
        xtrct   r11, r10        !  48 EX
738
 
739
        mov.l   r6, @(0x18,r1)  !  33 LS
740
        xtrct   r12, r11        !  48 EX
741
 
742
        mov.l   r7, @(0x14,r1)  !  33 LS
743
 
744
        mov.l   r8, @(0x10,r1)  !  33 LS
745
        add     #-0x3e, r5      !  50 EX
746
 
747
        mov.l   r9, @(0x0c,r1)  !  33 LS
748
        cmp/eq  r2,r1           !  54 MT
749
 
750
        mov.l   r10, @(0x08,r1) !  33 LS
751
        bf/s    2b              ! 109 BR
752
 
753
         mov.l  r11, @(0x04,r1) !  33 LS
754
#endif
755
 
756
        mov.l   @r15+, r12
757
        mov     r1, r0          !   5 MT (latency=0)
758
 
759
        mov.l   @r15+, r11      !  15 LS
760
        sub     r1, r5          !  75 EX
761
 
762
        mov.l   @r15+, r10      !  15 LS
763
        cmp/eq  r4, r0          !  54 MT
764
 
765
        bf/s    1f              ! 109 BR
766
         mov.l   @r15+, r9      !  15 LS
767
 
768
        rts
769
1:       mov.l  @r15+, r8       !  15 LS
770
 
771
        add     #0x1e, r5       !  50 EX
772
 
773
        ! Finish off a short word at a time
774
        ! r5 must be invariant - 2
775
10:     mov     r4,r2           !   5 MT (latency=0)
776
        add     #1,r2           !  50 EX
777
 
778
        cmp/hi  r2, r0          !  57 MT
779
        bf/s    1f              ! 109 BR
780
 
781
         add    #2, r2          !  50 EX
782
 
783
3:      mov.w   @(r0,r5),r1     !  20 LS
784
        cmp/hi  r2,r0           !  57 MT
785
 
786
        bt/s    3b              ! 109 BR
787
 
788
         mov.w  r1,@-r0         !  29 LS
789
1:
790
 
791
        !
792
        ! Finally, copy the last byte if necessary
793
        cmp/eq  r4,r0           !  54 MT
794
        bt/s    9b
795
         add    #1,r5
796
        mov.b   @(r0,r5),r1
797
        rts
798
         mov.b  r1,@-r0
799
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.