OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [newlib-1.18.0/] [newlib/] [libc/] [machine/] [sh/] [memcpy.S] - Blame information for rev 297

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 207 jeremybenn
!
2
! Fast SH memcpy
3
!
4
! by Toshiyasu Morita (tm@netcom.com)
5
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
6
! SH5 code Copyright 2002 SuperH Ltd.
7
!
8
! Entry: ARG0: destination pointer
9
!        ARG1: source pointer
10
!        ARG2: byte count
11
!
12
! Exit:  RESULT: destination pointer
13
!        any other registers in the range r0-r7: trashed
14
!
15
! Notes: Usually one wants to do small reads and write a longword, but
16
!        unfortunately it is difficult in some cases to concatenate bytes
17
!        into a longword on the SH, so this does a longword read and small
18
!        writes.
19
!
20
! This implementation makes two assumptions about how it is called:
21
!
22
! 1.: If the byte count is nonzero, the address of the last byte to be
23
!     copied is unsigned greater than the address of the first byte to
24
!     be copied.  This could be easily swapped for a signed comparison,
25
!     but the algorithm used needs some comparison.
26
!
27
! 2.: When there are two or three bytes in the last word of an 11-or-more
28
!     bytes memory chunk to be copied, the rest of the word can be read
29
!     without side effects.
30
!     This could be easily changed by increasing the minimum size of
31
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
32
!     however, this would cost a few extra cycles on average.
33
!     For SHmedia, the assumption is that any quadword can be read in its
34
!     entirety if at least one byte is included in the copy.
35
!
36
 
37
#include "asm.h"
38
 
39
ENTRY(memcpy)
40
 
41
#if __SHMEDIA__
42
 
43
/* Unaligned load/store helpers.  Each macro expands to a lo/hi instruction
   pair: the "lo" instruction is given offset O and the "hi" instruction is
   given the offset of the last byte of the access (O+7 for the quadword
   forms, O+3 for the longword forms).  In the code visible in this file
   only LDUAQ and LDUAL are invoked, and the two loaded halves D0 and D1
   are then combined with an "or" instruction.  */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
44
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
45
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
46
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
47
 
48
        /* SHmedia entry and size dispatch.
           As used below: r2 = destination, r3 = source, r4 = byte count.
           Counts of 25 or more branch to Large.  Smaller counts jump into
           the handler table that starts at L1; the slot is selected from
           nsb(count) scaled by 32 (shlli ...,5), so every size class must
           occupy exactly one 32-byte slot.
           On arrival in a handler: r5 = destination end, r6 = source end,
           tr1 = return target taken from r18.  */
        ld.b r3,0,r63   /* read first source byte, result goes to r63 -- presumably only a cache touch; confirm */
49
        pta/l Large,tr0
50
        movi 25,r0
51
        bgeu/u r4,r0,tr0        /* count >= 25 (unsigned): use the Large path */
52
        nsb r4,r0               /* NOTE(review): assumed to yield 63 for count 0, 62 for 1, 61 for 2..3, ... -- confirm against the SH-5 manual */
53
        shlli r0,5,r0           /* scale by the 32-byte slot size */
54
        movi (L1-L0+63*32 + 1) & 0xffff,r1      /* the +1 is presumably the SHmedia mode bit of the target -- confirm */
55
        sub r1, r0, r0          /* r0 = (L1 - L0) + (63 - nsb) * 32 + 1 */
56
L0:     ptrel r0,tr0            /* tr0 = L0-relative target, i.e. the selected slot after L1 */
57
        add r2,r4,r5            /* r5 = destination end (one past the last byte) */
58
        ptabs r18,tr1           /* tr1 = return address held in r18 */
59
        add r3,r4,r6            /* r6 = source end (one past the last byte) */
60
        blink tr0,r63           /* jump to the size-class handler */
61
 
62
        /* Handler table for counts 0..24, entered through the computed
           branch before L1.  The dispatch code multiplies the slot index by
           32, so the instruction order and count here must not change: the
           "continued" fragments (L4_7, L2_3, L8_15) are placed where they
           pad the short 0-byte and 1-byte handlers out to a full slot
           (NOTE(review): this assumes 4-byte instructions -- confirm).
           Register state on entry: r2 = destination, r3 = source,
           r5 = destination end, r6 = source end, tr1 = return target.  */
        .balign 8
63
L1:
64
        /* 0 byte memcpy */
65
        blink tr1,r63
66
 
67
L4_7:   /* 4..7 byte memcpy continued: r0 = first 4 source bytes, r6/r7 = halves of the last 4 source bytes */
68
        stlo.l r2, 0, r0        /* finish the head store started with sthi.l in the 4..7 handler */
69
        or r6, r7, r6           /* merge the two halves of the tail longword */
70
        sthi.l r5, -1, r6       /* tail store covers destination end-4 .. end-1; it may overlap the head */
71
        stlo.l r5, -4, r6
72
        blink tr1,r63
73
 
74
L2_3:   /* 2 or 3 byte memcpy continued: r6 = last source byte */
75
        st.b r5,-1,r6           /* store the last byte at destination end-1 */
76
        blink tr1,r63
77
 
78
        /* 1 byte memcpy */
79
        ld.b r3,0,r0
80
        st.b r2,0,r0
81
        blink tr1,r63
82
 
83
L8_15:  /* 8..15 byte memcpy continued: r0 = first 8 source bytes, r6/r7 = halves of the last 8 source bytes */
84
        stlo.q r2, 0, r0        /* finish the head store started with sthi.q in the 8..15 handler */
85
        or r6, r7, r6           /* merge the two halves of the tail quadword */
86
        sthi.q r5, -1, r6       /* tail store covers destination end-8 .. end-1; it may overlap the head */
87
        stlo.q r5, -8, r6
88
        blink tr1,r63
89
 
90
        /* 2 or 3 byte memcpy: copy bytes 0 and 1 here, the last byte at L2_3 */
91
        ld.b r3,0,r0
92
        ld.b r2,0,r63           /* read of the destination, result goes to r63 -- presumably a cache touch; confirm */
93
        ld.b r3,1,r1
94
        st.b r2,0,r0
95
        pta/l L2_3,tr0
96
        ld.b r6,-1,r6           /* r6 = last source byte (source end-1) */
97
        st.b r2,1,r1
98
        blink tr0, r63
99
 
100
        /* 4 .. 7 byte memcpy: load head and tail longwords, finish at L4_7 */
101
        LDUAL (r3, 0, r0, r1)
102
        pta L4_7, tr0
103
        ldlo.l r6, -4, r7       /* low part of the last 4 source bytes */
104
        or r0, r1, r0           /* r0 = first 4 source bytes */
105
        sthi.l r2, 3, r0
106
        ldhi.l r6, -1, r6       /* high part of the last 4 source bytes */
107
        blink tr0, r63
108
 
109
        /* 8 .. 15 byte memcpy: load head and tail quadwords, finish at L8_15 */
110
        LDUAQ (r3, 0, r0, r1)
111
        pta L8_15, tr0
112
        ldlo.q r6, -8, r7       /* low part of the last 8 source bytes */
113
        or r0, r1, r0           /* r0 = first 8 source bytes */
114
        sthi.q r2, 7, r0
115
        ldhi.q r6, -1, r6       /* high part of the last 8 source bytes */
116
        blink tr0, r63
117
 
118
        /* 16 .. 24 byte memcpy: first 16 bytes plus the last 8 bytes, which may overlap */
119
        LDUAQ (r3, 0, r0, r1)
120
        LDUAQ (r3, 8, r8, r9)
121
        or r0, r1, r0           /* r0 = source bytes 0..7 */
122
        sthi.q r2, 7, r0
123
        or r8, r9, r8           /* r8 = source bytes 8..15 */
124
        sthi.q r2, 15, r8
125
        ldlo.q r6, -8, r7       /* r7/r6 = halves of the last 8 source bytes */
126
        ldhi.q r6, -1, r6
127
        stlo.q r2, 8, r8
128
        stlo.q r2, 0, r0
129
        or r6, r7, r6
130
        sthi.q r5, -1, r6       /* tail store covers destination end-8 .. end-1 */
131
        stlo.q r5, -8, r6
132
        blink tr1,r63
133
 
134
/* Large: SHmedia copy for counts of 25 bytes or more.
   Entry: r2 = destination, r3 = source, r4 = byte count, r18 = return
   address.
   Working registers set up below:
     r6  = source - destination, so ldx.q r22,r6 reads the source bytes
           that belong at destination address r22
     r22 = destination cursor; its initial value makes r22 + r6 the first
           8-byte boundary above the source pointer, so the ldx.q loads in
           the loops are at 8-byte-aligned source addresses
     r5  = destination end - 16, the bound for Loop_ua
     r0  = quadword loaded but not yet stored
   Stores go through stlo.q/sthi.q pairs because the destination cursor
   need not be aligned.  */
Large:
135
        ld.b r2, 0, r63         /* read of the destination, result goes to r63 -- presumably a cache touch; confirm */
136
        pta/l  Loop_ua, tr1
137
        ori r3, -8, r7          /* r7 = (source & 7) - 8, a value in -8 .. -1 */
138
        sub r2, r7, r22         /* r22 = destination + 8 - (source & 7) */
139
        sub r3, r2, r6          /* r6 = source - destination */
140
        add r2, r4, r5          /* r5 = destination end */
141
        ldlo.q r3, 0, r0        /* leading source bytes up to the next 8-byte boundary */
142
        addi r5, -16, r5        /* r5 = destination end - 16 */
143
        movi 64+8, r27 // could subtract r7 from that.
144
        stlo.q r2, 0, r0        /* write the head at destination .. destination+7 */
145
        sthi.q r2, 7, r0        /* NOTE(review): bytes past the leading part appear to be rewritten by the stores below -- confirm */
146
        ldx.q r22, r6, r0       /* r0 = first aligned source quadword */
147
        bgtu/l r27, r4, tr1     /* count < 72: skip the 32-byte loop, go to Loop_ua */
148
 
149
        addi r5, -48, r27       /* r27 = destination end - 64, the bound for Loop_line */
150
        pta/l Loop_line, tr0
151
        addi r6, 64, r36        /* index used for the read-ahead load in Loop_line */
152
        addi r6, -24, r19       /* r19/r20/r21 index the 2nd/3rd/4th source quadword once r22 has been advanced by 32 */
153
        addi r6, -16, r20
154
        addi r6, -8, r21
155
 
156
/* Loop_line: copies 32 bytes per iteration.  */
Loop_line:
157
        ldx.q r22, r36, r63     /* load 64 bytes ahead in the source, result goes to r63 -- presumably a prefetch; confirm */
158
        alloco r22, 32          /* NOTE(review): presumably allocates the destination cache line without fetching it -- confirm */
159
        addi r22, 32, r22
160
        ldx.q r22, r19, r23
161
        sthi.q r22, -25, r0     /* store the pending quadword r0 at old r22 .. old r22+7 */
162
        ldx.q r22, r20, r24
163
        ldx.q r22, r21, r25
164
        stlo.q r22, -32, r0
165
        ldx.q r22, r6,  r0      /* next pending quadword */
166
        sthi.q r22, -17, r23
167
        sthi.q r22,  -9, r24
168
        sthi.q r22,  -1, r25
169
        stlo.q r22, -24, r23
170
        stlo.q r22, -16, r24
171
        stlo.q r22,  -8, r25
172
        bgeu r27, r22, tr0      /* repeat while r22 <= destination end - 64 */
173
 
174
/* Loop_ua: copies 8 bytes per iteration.  */
Loop_ua:
175
        addi r22, 8, r22
176
        sthi.q r22, -1, r0      /* store the pending quadword at old r22 .. old r22+7 */
177
        stlo.q r22, -8, r0
178
        ldx.q r22, r6, r0       /* next pending quadword */
179
        bgtu/l r5, r22, tr1     /* repeat while r22 < destination end - 16 */
180
 
        /* Tail: store the pending quadword, then the last 8 source bytes
           at destination end-8 .. end-1 (this may overlap the previous
           store).  */
181
        add r3, r4, r7          /* r7 = source end */
182
        ldlo.q r7, -8, r1
183
        sthi.q r22, 7, r0
184
        ldhi.q r7, -1, r7
185
        ptabs r18,tr1           /* tr1 = return address held in r18 */
186
        stlo.q r22, 0, r0
187
        or r1, r7, r1           /* r1 = last 8 source bytes */
188
        sthi.q r5, 15, r1       /* r5 + 15 = destination end - 1 */
189
        stlo.q r5, 8, r1        /* r5 + 8 = destination end - 8 */
190
        blink tr1, r63
191
 
192
#else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
193
 
194
/* Register roles for the SHcompact implementation below.
   With __SH5__ defined the arguments are in r2/r3/r4, scratch registers are
   r5/r6, and RESULT is r2; otherwise the arguments are in r4/r5/r6, scratch
   registers are r2/r3, and RESULT is r0.
   r0, r1 and r7 are also used directly by the code, independent of these
   names.  */
#ifdef __SH5__
195
#define DST r2
196
#define SRC r3
197
#define COUNT r4
198
#define TMP0 r5
199
#define TMP1 r6
200
#define RESULT r2
201
#else
202
#define DST r4
203
#define SRC r5
204
#define COUNT r6
205
#define TMP0 r2
206
#define TMP1 r3
207
#define RESULT r0
208
#endif
209
 
210
#ifdef __LITTLE_ENDIAN__
211
        ! Little endian version copies with increasing addresses.
        /* Register roles on this path, as set up below:
             TMP1  = original DST, moved to RESULT at L_ready
             COUNT = source end address (SRC + byte count)
             r7    = source end address minus a bias; bound for the main loops
             r0    = destination-minus-source offset, so that @(r0,SRC)
                     addresses the destination (aligned-destination loops and
                     the byte cleanup loop)
           Fewer than 11 bytes are copied by the byte loop only.
           NOTE(review): SL(branch, label, insn) comes from asm.h, which is
           not visible here.  The comments below assume insn is executed
           whether or not the branch is taken -- confirm.  */
211
        mov DST,TMP1    ! Save return value
212
        mov #11,r0      ! Check if small number of bytes
213
        cmp/hs r0,COUNT
214
                        ! COUNT becomes src end address
215
        SL(bf, L_small, add SRC,COUNT)
216
        mov #1,r1
217
        tst r1,SRC      ! check if source even
218
        SL(bt, L_even, mov COUNT,r7)    /* r7 = source end address */
219
        mov.b @SRC+,r0  ! no, make it even.
220
        mov.b r0,@DST
221
        add #1,DST
222
L_even: tst r1,DST      ! check if destination is even
223
        add #-3,r7      /* r7 = source end - 3 */
224
        SL(bf, L_odddst, mov #2,r1)     /* r1 = 2 for the 4-byte alignment tests */
225
        tst r1,DST      ! check if destination is 4-byte aligned
226
        mov DST,r0
227
        SL(bt, L_al4dst, sub SRC,r0)    /* r0 = DST - SRC */
228
        mov.w @SRC+,TMP0        /* copy one 16-bit word to make the destination 4-byte aligned */
229
        mov.w TMP0,@DST
230
        ! add #2,DST  DST is dead here.
231
L_al4dst:
232
        tst r1,SRC              /* is the source 4-byte aligned as well? */
233
        bt L_al4both
        /* Source is only 2-byte aligned: read two bytes ahead and build
           each destination longword from two source longwords with xtrct.  */
234
        mov.w @SRC+,r1
235
        swap.w r1,r1            /* move the two bytes read ahead into the upper half of r1 */
236
        add #-6,r0              /* compensate: SRC is 2 + 4 bytes ahead at the first store */
237
        add #-6,r7      ! r7 := src end address minus 9.
238
        .align 2
239
L_2l_loop:
240
        mov.l @SRC+,TMP0 ! Read & write two longwords per iteration
241
        xtrct TMP0,r1
242
        mov.l r1,@(r0,SRC)
243
        cmp/hs r7,SRC           /* T = (SRC >= r7); tested by the bf at the end of the loop */
244
        mov.l @SRC+,r1
245
        xtrct r1,TMP0
246
        mov.l TMP0,@(r0,SRC)
247
        bf L_2l_loop
248
        add #-2,SRC             /* step back over the two bytes read but not yet stored */
249
        bra  L_cleanup
250
        add #5,r0               /* r0 = destination - source - 1, as L_cleanup_loop expects */
251
L_al4both:
252
        add #-4,r0              /* compensate for the post-increment before each store */
253
        .align 2
254
L_al4both_loop:
255
        mov.l @SRC+,DST   ! Read longword, write longword per iteration
256
        cmp/hs r7,SRC
257
        SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))
258
 
259
        bra L_cleanup
260
        add #3,r0               /* r0 = destination - source - 1, as L_cleanup_loop expects */
261
 
        /* Odd destination: DST is biased by -1 so that the byte/word/byte
           stores below can use the displacements 1, 2 and 4.  */
262
L_odddst:
263
        tst r1,SRC              /* r1 = 2: is the source 4-byte aligned? */
264
        SL(bt, L_al4src, add #-1,DST)
265
        mov.w @SRC+,r0          /* copy two bytes to make the source 4-byte aligned */
266
        mov.b r0,@(1,DST)
267
        shlr8 r0
268
        mov.b r0,@(2,DST)
269
        add #2,DST
270
L_al4src:
271
        .align 2
272
L_odd_loop:
273
        mov.l @SRC+,r0   ! Read longword, write byte, word, byte per iteration
274
        cmp/hs r7,SRC
275
        mov.b r0,@(1,DST)
276
        shlr8 r0
277
        mov.w r0,@(2,DST)
278
        shlr16 r0
279
        mov.b r0,@(4,DST)
280
        SL(bf, L_odd_loop, add #4,DST)
281
        .align 2 ! avoid nop in more frequently executed code.
        /* L_cleanup2: entered with DST biased by -1; derive r0 from it.  */
282
L_cleanup2:
283
        mov     DST,r0
284
        sub     SRC,r0          /* r0 = (destination - 1) - source */
        /* L_cleanup: copy the remaining bytes one at a time until SRC
           reaches the source end address held in COUNT.  */
285
L_cleanup:
286
        cmp/eq  COUNT,SRC
287
        bt      L_ready
288
        .align 2
289
L_cleanup_loop:
290
        mov.b   @SRC+,r1
291
        cmp/eq  COUNT,SRC
292
        mov.b   r1,@(r0,SRC)    /* SRC is already incremented, hence the -1 folded into r0 */
293
        bf      L_cleanup_loop
294
L_ready:
295
        rts
296
        mov     TMP1,RESULT     /* return the original destination pointer (delay slot of rts) */
297
L_small:
298
        bra L_cleanup2
299
        add #-1,DST             /* apply the -1 bias that L_cleanup2 expects */
301
#else /* ! __LITTLE_ENDIAN__ */
302
        ! Big endian version copies with decreasing addresses.
        /* Register roles on this path, as set up below:
             r0   = destination cursor, starts at the destination end and is
                    pre-decremented by every store (@-r0)
             SRC  = source-minus-destination offset plus a bias, so that
                    @(r0,SRC) reads the source bytes just below the cursor
             TMP1 = source end position, shifted right to test its low
                    address bits one at a time (shlr sets T)
             r7   = DST plus a bias; lower bound for the main loops
           Fewer than 11 bytes are copied by the byte loop only.
           NOTE(review): SL(branch, label, insn) comes from asm.h, which is
           not visible here.  The comments below assume insn is executed
           whether or not the branch is taken -- confirm.  */
303
        mov DST,r0
304
        add COUNT,r0            /* r0 = destination end */
305
        sub DST,SRC             /* SRC = source - destination */
306
        mov #11,r1
307
        cmp/hs r1,COUNT         /* T = (COUNT >= 11), unsigned */
308
        SL(bf, L_small, add #-1,SRC)
309
        mov SRC,TMP1
310
        add r0,TMP1             /* TMP1 = address of the last source byte */
311
        shlr TMP1               /* T = bit 0 of that address */
312
        SL(bt, L_even,
313
        mov DST,r7)
314
        mov.b @(r0,SRC),TMP0    /* copy one byte so the source end becomes even */
315
        add #-1,TMP1
316
        mov.b TMP0,@-r0
317
L_even:
318
        tst #1,r0               /* is the destination cursor even? */
319
        add #-1,SRC             /* bias SRC for 16-bit reads below the cursor */
320
        SL(bf, L_odddst, add #8,r7)
321
        tst #2,r0               /* is the destination cursor 4-byte aligned? */
322
        bt L_al4dst
323
        add #-1,TMP1
324
        mov.w @(r0,SRC),r1      /* copy one 16-bit word to align the destination cursor */
325
        mov.w r1,@-r0
326
L_al4dst:
327
        shlr TMP1               /* T = next alignment bit of the source position */
328
        bt L_al4both
        /* Source is only 2-byte aligned: read two bytes ahead and build
           each destination longword from two source longwords with xtrct.  */
329
        mov.w @(r0,SRC),r1
330
        swap.w r1,r1
331
        add #4,r7
332
        add #-4,SRC
333
        .align 2
334
L_2l_loop:
335
        mov.l @(r0,SRC),TMP0
336
        xtrct TMP0,r1
337
        mov.l r1,@-r0
338
        cmp/hs r7,r0            /* T = (r0 >= r7); tested by the bt at the end of the loop */
339
        mov.l @(r0,SRC),r1
340
        xtrct r1,TMP0
341
        mov.l TMP0,@-r0
342
        bt L_2l_loop
343
        bra L_cleanup
344
        add #5,SRC              /* restore the byte-read bias for L_cleanup_loop */
345
 
346
        nop ! avoid nop in executed code.
347
L_al4both:
348
        add #-2,SRC             /* bias SRC for 32-bit reads below the cursor */
349
        .align 2
350
L_al4both_loop:
351
        mov.l @(r0,SRC),r1
352
        cmp/hs r7,r0
353
        SL(bt, L_al4both_loop,
354
        mov.l r1,@-r0)
355
        bra L_cleanup
356
        add #3,SRC              /* restore the byte-read bias for L_cleanup_loop */
357
 
358
        nop ! avoid nop in executed code.
359
L_odddst:
360
        shlr TMP1               /* T = next alignment bit of the source position */
361
        bt L_al4src
362
        mov.w @(r0,SRC),r1      /* copy two bytes, stored one at a time because the destination cursor is odd */
363
        mov.b r1,@-r0
364
        shlr8 r1
365
        mov.b r1,@-r0
366
L_al4src:
367
        add #-2,SRC             /* bias SRC for 32-bit reads below the cursor */
368
        .align 2
369
L_odd_loop:
        /* Read a longword, write it as byte, word, byte per iteration.  */
370
        mov.l @(r0,SRC),TMP0
371
        cmp/hs r7,r0
372
        mov.b TMP0,@-r0
373
        shlr8 TMP0
374
        mov.w TMP0,@-r0
375
        shlr16 TMP0
376
        mov.b TMP0,@-r0
377
        bt L_odd_loop
378
 
379
        add #3,SRC              /* restore the byte-read bias for L_cleanup_loop */
        /* L_cleanup / L_small: copy the remaining bytes one at a time, from
           high to low addresses, until r0 is back at DST.  */
380
L_cleanup:
381
L_small:
382
        cmp/eq DST,r0
383
        bt L_ready
384
        add #1,DST              /* the loop compares before the pre-decrement, so test against DST + 1 */
385
        .align 2
386
L_cleanup_loop:
387
        mov.b @(r0,SRC),TMP0
388
        cmp/eq DST,r0
389
        mov.b TMP0,@-r0
390
        bf L_cleanup_loop
391
L_ready:
392
        rts
393
        mov r0,RESULT           /* r0 is back at the destination start; return it (delay slot of rts) */
394
#endif /* ! __LITTLE_ENDIAN__ */
395
#endif /* ! SHMEDIA */

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.