URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [newlib/] [newlib/] [libc/] [machine/] [sh/] [memcpy.S] - Blame information for rev 1774

Go to most recent revision | Details | Compare with Previous | View Log


!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (amylaar@cygnus.co.uk) ("o for o-umlaut)
!
! Entry: r4: destination pointer
!        r5: source pointer
!        r6: byte count
!
! Exit:  r0: destination pointer
!        r1-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
!     however, this would cost a few extra cycles on average.
!
 
#include "asm.h"
 
! memcpy: copy r6 bytes from the source at r5 to the destination at r4.
!
! Two independent implementations are selected at preprocessing time:
!   - little endian: copies with increasing addresses; the destination
!     pointer is saved in r3 and moved to r0 on return.
!   - big endian: copies with decreasing addresses; r0 starts at the
!     destination end and finishes equal to the destination start.
! In both, a count below 11 goes straight to a byte-by-byte loop.  Larger
! counts first copy a byte and/or a word to align the pointers, then use
! one of three longword-reading loops chosen by the relative alignment of
! source and destination, and finish in the same byte-by-byte loop.
!
! Both versions address one side through an indexed mode (r0 + r5), with
! one of the two registers holding a biased (dst - src) or (src - dst)
! offset; the small add instructions around the loops re-bias that offset.
!
! NOTE(review): SL is a macro from asm.h, which is not shown here.  This
! code relies on its third argument being executed whether or not the
! branch is taken (the small-count paths use the value computed there) -
! confirm against asm.h.
ENTRY(memcpy)
#ifdef __LITTLE_ENDIAN__
        ! Little endian version copies with increasing addresses.
        mov r4,r3       ! Save return value
        mov #11,r0      ! Check if small number of bytes
        cmp/hs r0,r6    ! T = (count >= 11), unsigned
                        ! r6 becomes src end address
        SL(bf, L_small, add r5,r6)
        mov #1,r1
        tst r1,r5       ! check if source even
        ! r7 := src end address; the loop limit is derived from it below.
        SL(bt, L_even, mov r6,r7)
        mov.b @r5+,r0   ! no, make it even.
        mov.b r0,@r4
        add #1,r4
L_even: tst r1,r4       ! check if destination is even
        add #-3,r7      ! r7 := src end address minus 3; T from tst is used below
        ! r1 := 2 for the 4-byte alignment tests; an odd destination goes
        ! to the byte, word, byte loop.
        SL(bf, L_odddst, mov #2,r1)
        tst r1,r4       ! check if destination is 4-byte aligned
        mov r4,r0
        ! r0 := dst - src, so that r0 + r5 tracks the destination.
        SL(bt, L_al4dst, sub r5,r0)
        mov.w @r5+,r2   ! no, copy one word to align it.
        mov.w r2,@r4
        ! add #2,r4  r4 is dead here.
L_al4dst:
        tst r1,r5       ! check if source is 4-byte aligned as well
        bt L_al4both
        mov.w @r5+,r1   ! no: read one word ahead ...
        swap.w r1,r1    ! ... and move it to the upper half for xtrct
        add #-6,r0      ! stores land 6 bytes behind the post-incremented r5
        add #-6,r7      ! r7 := src end address minus 9.
        .align 2
L_2l_loop:
        mov.l @r5+,r2 ! Read & write two longwords per iteration
        xtrct r2,r1     ! r1 := middle 32 bits of r2:r1
        mov.l r1,@(r0,r5)
        cmp/hs r7,r5    ! T = (r5 >= r7); tested by bf below
        mov.l @r5+,r1
        xtrct r1,r2     ! r2 := middle 32 bits of r1:r2
        mov.l r2,@(r0,r5)
        bf L_2l_loop
        add #-2,r5      ! step back over the half word read but not yet stored
        bra  L_cleanup
        add #5,r0       ! (delay slot) r0 := dst - src - 1 for the byte loop
L_al4both:
        add #-4,r0      ! stores land 4 bytes behind the post-incremented r5
        .align 2
L_al4both_loop:
        mov.l @r5+,r4   ! Read longword, write longword per iteration
        cmp/hs r7,r5    ! T = (r5 >= src end - 3): fewer than 4 bytes left
        SL(bf, L_al4both_loop, mov.l r4,@(r0,r5))
 
        bra L_cleanup
        add #3,r0       ! (delay slot) r0 := dst - src - 1 for the byte loop
 
L_odddst:
        tst r1,r5       ! check if source is 4-byte aligned
        ! r4 := dst - 1, so that the stores below use displacements 1..4.
        SL(bt, L_al4src, add #-1,r4)
        mov.w @r5+,r0   ! no: copy one word as two bytes to align the source
        mov.b r0,@(1,r4)
        shlr8 r0
        mov.b r0,@(2,r4)
        add #2,r4
L_al4src:
        .align 2
L_odd_loop:
        mov.l @r5+,r0   ! Read longword, write byte, word, byte per iteration
        cmp/hs r7,r5    ! T = (r5 >= src end - 3): fewer than 4 bytes left
        mov.b r0,@(1,r4)
        shlr8 r0
        mov.w r0,@(2,r4)
        shlr16 r0
        mov.b r0,@(4,r4)
        SL(bf, L_odd_loop, add #4,r4)
        .align 2 ! avoid nop in more frequently executed code.
L_cleanup2:
        mov     r4,r0   ! r4 is (next destination byte - 1) on entry here
        sub     r5,r0   ! r0 := dst - src - 1
L_cleanup:
        cmp/eq  r6,r5   ! source already at the src end address?
        bt      L_ready
        .align 2
L_cleanup_loop:
        mov.b   @r5+,r1 ! copy the remaining bytes one at a time
        cmp/eq  r6,r5
        mov.b   r1,@(r0,r5)
        bf      L_cleanup_loop
L_ready:
        rts
        mov     r3,r0   ! (delay slot) return the saved destination pointer
L_small:
        bra L_cleanup2
        add #-1,r4      ! (delay slot) bias r4 the way L_cleanup2 expects
#else
        ! Big endian version copies with decreasing addresses.
        mov r4,r0
        add r6,r0       ! r0 := dst end address
        sub r4,r5       ! r5 := src - dst
        mov #11,r1
        cmp/hs r1,r6    ! T = (count >= 11), unsigned
        ! r5 := src - dst - 1, so that r0 + r5 is the last source byte not
        ! yet copied; small counts go straight to the byte loop.
        SL(bf, L_small, add #-1,r5)
        mov r5,r3
        add r0,r3       ! r3 := src end address minus 1
        shlr r3         ! T := bit 0 of that, i.e. set when the src end is even
        ! r7 := dst start address; the loop limit is derived from it below.
        SL(bt, L_even,
        mov r4,r7)
        mov.b @(r0,r5),r2 ! odd src end: copy one byte to make it even
        add #-1,r3      ! keep r3 in step (it counts the src end in half words)
        mov.b r2,@-r0
L_even:
        tst #1,r0       ! check if destination end is even
        add #-1,r5      ! r5 := src - dst - 2; T from tst is used below
        ! r7 := dst start + 8; an odd destination end goes to the
        ! byte, word, byte loop.
        SL(bf, L_odddst, add #8,r7)
        tst #2,r0       ! check if destination end is 4-byte aligned
        bt L_al4dst
        add #-1,r3      ! no: copy one word to align it, keeping r3 in step
        mov.w @(r0,r5),r1
        mov.w r1,@-r0
L_al4dst:
        shlr r3         ! T set when the src end is 4-byte aligned as well
        bt L_al4both
        mov.w @(r0,r5),r1 ! no: read one word ahead ...
        swap.w r1,r1    ! ... and move it to the upper half for xtrct
        add #4,r7       ! r7 := dst start + 12
        add #-4,r5      ! r5 := src - dst - 6
        .align 2
L_2l_loop:
        mov.l @(r0,r5),r2 ! read & write two longwords per iteration
        xtrct r2,r1     ! r1 := middle 32 bits of r2:r1
        mov.l r1,@-r0
        cmp/hs r7,r0    ! T = (r0 >= r7); tested by bt below
        mov.l @(r0,r5),r1
        xtrct r1,r2     ! r2 := middle 32 bits of r1:r2
        mov.l r2,@-r0
        bt L_2l_loop
        bra L_cleanup
        add #5,r5       ! (delay slot) r5 := src - dst - 1 for the byte loop
 
        nop ! avoid nop in executed code.
L_al4both:
        add #-2,r5      ! r5 := src - dst - 4
        .align 2
L_al4both_loop:
        mov.l @(r0,r5),r1 ! read longword, write longword per iteration
        cmp/hs r7,r0    ! T = (r0 >= dst start + 8), taken before the store
        SL(bt, L_al4both_loop,
        mov.l r1,@-r0)
        bra L_cleanup
        add #3,r5       ! (delay slot) r5 := src - dst - 1 for the byte loop
 
        nop ! avoid nop in executed code.
L_odddst:
        shlr r3         ! T set when the src end is 4-byte aligned
        bt L_al4src
        mov.w @(r0,r5),r1 ! no: copy one word as two bytes to align the source
        mov.b r1,@-r0
        shlr8 r1
        mov.b r1,@-r0
L_al4src:
        add #-2,r5      ! r5 := src - dst - 4
        .align 2
L_odd_loop:
        mov.l @(r0,r5),r2 ! read longword, write byte, word, byte per iteration
        cmp/hs r7,r0    ! T = (r0 >= dst start + 8), taken before the stores
        mov.b r2,@-r0
        shlr8 r2
        mov.w r2,@-r0
        shlr16 r2
        mov.b r2,@-r0
        bt L_odd_loop
 
        add #3,r5       ! r5 := src - dst - 1 for the byte loop
L_cleanup:
L_small:
        cmp/eq r4,r0    ! already down at the dst start address?
        bt L_ready
        add #1,r4       ! the loop test below runs before the pre-decrement store
        .align 2
L_cleanup_loop:
        mov.b @(r0,r5),r2 ! copy the remaining bytes one at a time
        cmp/eq r4,r0
        mov.b r2,@-r0
        bf L_cleanup_loop
L_ready:
        rts             ! r0 equals the dst start here, which is the return value
        nop
#endif

Line No.	Rev	Author	Line
1	39	lampret	`!`
2			`! Fast SH memcpy`
3			`!`
4			`! by Toshiyasu Morita (tm@netcom.com)`
5			`! hacked by J"orn Rernnecke (amylaar@cygnus.co.uk) ("o for o-umlaut)`
6			`!`
7			`! Entry: r4: destination pointer`
8			`! r5: source pointer`
9			`! r6: byte count`
10			`!`
11			`! Exit: r0: destination pointer`
12			`! r1-r7: trashed`
13			`!`
14			`! Notes: Usually one wants to do small reads and write a longword, but`
15			`! unfortunately it is difficult in some cases to concatanate bytes`
16			`! into a longword on the SH, so this does a longword read and small`
17			`! writes.`
18			`!`
19			`! This implementation makes two assumptions about how it is called:`
20			`!`
21			`! 1.: If the byte count is nonzero, the address of the last byte to be`
22			`! copied is unsigned greater than the address of the first byte to`
23			`! be copied. This could be easily swapped for a signed comparison,`
24			`! but the algorithm used needs some comparison.`
25			`!`
26			`! 2.: When there are two or three bytes in the last word of an 11-or-bore`
27			`! bytes memory chunk to b copied, the rest of the word can be read`
28			`! without size effects.`
29			`! This could be easily changed by increasing the minumum size of`
30			`! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,`
31			`! however, this would cost a few extra cyles on average.`
32			`!`
33
34			`#include "asm.h"`
35
36			`ENTRY(memcpy)`
37			`#ifdef __LITTLE_ENDIAN__`
38			`! Little endian version copies with increasing addresses.`
39			`mov r4,r3 ! Save return value`
40			`mov #11,r0 ! Check if small number of bytes`
41			`cmp/hs r0,r6`
42			`! r6 becomes src end address`
43			`SL(bf, L_small, add r5,r6)`
44			`mov #1,r1`
45			`tst r1,r5 ! check if source even`
46			`SL(bt, L_even, mov r6,r7)`
47			`mov.b @r5+,r0 ! no, make it even.`
48			`mov.b r0,@r4`
49			`add #1,r4`
50			`L_even: tst r1,r4 ! check if destination is even`
51			`add #-3,r7`
52			`SL(bf, L_odddst, mov #2,r1)`
53			`tst r1,r4 ! check if destination is 4-byte aligned`
54			`mov r4,r0`
55			`SL(bt, L_al4dst, sub r5,r0)`
56			`mov.w @r5+,r2`
57			`mov.w r2,@r4`
58			`! add #2,r4 r4 is dead here.`
59			`L_al4dst:`
60			`tst r1,r5`
61			`bt L_al4both`
62			`mov.w @r5+,r1`
63			`swap.w r1,r1`
64			`add #-6,r0`
65			`add #-6,r7 ! r7 := src end address minus 9.`
66			`.align 2`
67			`L_2l_loop:`
68			`mov.l @r5+,r2 ! Read & write two longwords per iteration`
69			`xtrct r2,r1`
70			`mov.l r1,@(r0,r5)`
71			`cmp/hs r7,r5`
72			`mov.l @r5+,r1`
73			`xtrct r1,r2`
74			`mov.l r2,@(r0,r5)`
75			`bf L_2l_loop`
76			`add #-2,r5`
77			`bra L_cleanup`
78			`add #5,r0`
79			`L_al4both:`
80			`add #-4,r0`
81			`.align 2`
82			`L_al4both_loop:`
83			`mov.l @r5+,r4 ! Read longword, write longword per iteration`
84			`cmp/hs r7,r5`
85			`SL(bf, L_al4both_loop, mov.l r4,@(r0,r5))`
86
87			`bra L_cleanup`
88			`add #3,r0`
89
90			`L_odddst:`
91			`tst r1,r5`
92			`SL(bt, L_al4src, add #-1,r4)`
93			`mov.w @r5+,r0`
94			`mov.b r0,@(1,r4)`
95			`shlr8 r0`
96			`mov.b r0,@(2,r4)`
97			`add #2,r4`
98			`L_al4src:`
99			`.align 2`
100			`L_odd_loop:`
101			`mov.l @r5+,r0 ! Read longword, write byte, word, byte per iteration`
102			`cmp/hs r7,r5`
103			`mov.b r0,@(1,r4)`
104			`shlr8 r0`
105			`mov.w r0,@(2,r4)`
106			`shlr16 r0`
107			`mov.b r0,@(4,r4)`
108			`SL(bf, L_odd_loop, add #4,r4)`
109			`.align 2 ! avoid nop in more frequently executed code.`
110			`L_cleanup2:`
111			`mov r4,r0`
112			`sub r5,r0`
113			`L_cleanup:`
114			`cmp/eq r6,r5`
115			`bt L_ready`
116			`.align 2`
117			`L_cleanup_loop:`
118			`mov.b @r5+,r1`
119			`cmp/eq r6,r5`
120			`mov.b r1,@(r0,r5)`
121			`bf L_cleanup_loop`
122			`L_ready:`
123			`rts`
124			`mov r3,r0`
125			`L_small:`
126			`bra L_cleanup2`
127			`add #-1,r4`
128			`#else`
129			`! Big endian version copies with decreasing addresses.`
130			`mov r4,r0`
131			`add r6,r0`
132			`sub r4,r5`
133			`mov #11,r1`
134			`cmp/hs r1,r6`
135	56	joel	`SL(bf, L_small, add #-1,r5)`
136	39	lampret	`mov r5,r3`
137			`add r0,r3`
138			`shlr r3`
139			`SL(bt, L_even,`
140			`mov r4,r7)`
141			`mov.b @(r0,r5),r2`
142			`add #-1,r3`
143			`mov.b r2,@-r0`
144			`L_even:`
145			`tst #1,r0`
146			`add #-1,r5`
147	56	joel	`SL(bf, L_odddst, add #8,r7)`
148	39	lampret	`tst #2,r0`
149			`bt L_al4dst`
150			`add #-1,r3`
151			`mov.w @(r0,r5),r1`
152			`mov.w r1,@-r0`
153			`L_al4dst:`
154			`shlr r3`
155			`bt L_al4both`
156			`mov.w @(r0,r5),r1`
157			`swap.w r1,r1`
158			`add #4,r7`
159			`add #-4,r5`
160			`.align 2`
161			`L_2l_loop:`
162			`mov.l @(r0,r5),r2`
163			`xtrct r2,r1`
164			`mov.l r1,@-r0`
165			`cmp/hs r7,r0`
166			`mov.l @(r0,r5),r1`
167			`xtrct r1,r2`
168			`mov.l r2,@-r0`
169			`bt L_2l_loop`
170			`bra L_cleanup`
171			`add #5,r5`
172
173			`nop ! avoid nop in executed code.`
174			`L_al4both:`
175			`add #-2,r5`
176			`.align 2`
177			`L_al4both_loop:`
178			`mov.l @(r0,r5),r1`
179			`cmp/hs r7,r0`
180			`SL(bt, L_al4both_loop,`
181			`mov.l r1,@-r0)`
182			`bra L_cleanup`
183			`add #3,r5`
184
185			`nop ! avoid nop in executed code.`
186			`L_odddst:`
187			`shlr r3`
188			`bt L_al4src`
189			`mov.w @(r0,r5),r1`
190			`mov.b r1,@-r0`
191			`shlr8 r1`
192			`mov.b r1,@-r0`
193			`L_al4src:`
194			`add #-2,r5`
195			`.align 2`
196			`L_odd_loop:`
197			`mov.l @(r0,r5),r2`
198			`cmp/hs r7,r0`
199			`mov.b r2,@-r0`
200			`shlr8 r2`
201			`mov.w r2,@-r0`
202			`shlr16 r2`
203			`mov.b r2,@-r0`
204			`bt L_odd_loop`
205
206			`add #3,r5`
207			`L_cleanup:`
208			`L_small:`
209			`cmp/eq r4,r0`
210			`bt L_ready`
211			`add #1,r4`
212			`.align 2`
213			`L_cleanup_loop:`
214			`mov.b @(r0,r5),r2`
215			`cmp/eq r4,r0`
216			`mov.b r2,@-r0`
217			`bf L_cleanup_loop`
218			`L_ready:`
219			`rts`
220			`nop`
221			`#endif`

Browse

Tools

Subversion Repositories or1k

[/] [or1k/] [trunk/] [newlib/] [newlib/] [libc/] [machine/] [sh/] [memcpy.S] - Blame information for rev 1774