OpenCores
URL https://opencores.org/ocsvn/or1k_old/or1k_old/trunk

Subversion Repositories or1k_old

[/] [or1k_old/] [trunk/] [linux/] [uClibc/] [libc/] [string/] [sh64/] [memcpy.S] - Blame information for rev 1782

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1325 phoenix
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2
/* Modified by SuperH, Inc. September 2003 */
3
!
4
! Fast SH memcpy
5
!
6
! by Toshiyasu Morita (tm@netcom.com)
7
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8
! SH5 code Copyright 2002 SuperH Ltd.
9
!
10
! Entry: ARG0: destination pointer
11
!        ARG1: source pointer
12
!        ARG2: byte count
13
!
14
! Exit:  RESULT: destination pointer
15
!        any other registers in the range r0-r7: trashed
16
!
17
! Notes: Usually one wants to do small reads and write a longword, but
18
!        unfortunately it is difficult in some cases to concatenate bytes
19
!        into a longword on the SH, so this does a longword read and small
20
!        writes.
21
!
22
! This implementation makes two assumptions about how it is called:
23
!
24
! 1.: If the byte count is nonzero, the address of the last byte to be
25
!     copied is unsigned greater than the address of the first byte to
26
!     be copied.  This could be easily swapped for a signed comparison,
27
!     but the algorithm used needs some comparison.
28
!
29
! 2.: When there are two or three bytes in the last word of an 11-or-more
30
!     bytes memory chunk to be copied, the rest of the word can be read
31
!     without side effects.
32
!     This could be easily changed by increasing the minimum size of
33
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34
!     however, this would cost a few extra cycles on average.
35
!     For SHmedia, the assumption is that any quadword can be read in its
36
!     entirety if at least one byte is included in the copy.
37
!
38
 
39
        .section .text..SHmedia32,"ax"
40
        .globl  memcpy
41
        .type   memcpy, @function
42
 
43
        .align  5
44
/*
 * memcpy (SHmedia)
 *
 * Register usage as seen in the code below:
 *   r2  = destination pointer (never written, so it is also the result)
 *   r3  = source pointer
 *   r4  = byte count
 *   r5  = r2 + r4 (one past the last destination byte) on the small paths;
 *         r2 + r4 - 16 on the Large path
 *   r6  = r3 + r4 (one past the last source byte) on the small paths;
 *         r3 - r2 (source/destination distance) on the Large path
 *   r18 = presumably the caller's return address (it is only ever fed to
 *         ptabs and then branched to) -- confirm against the SH-5 ABI
 *   r63 = used as a discard destination for loads and blink
 *
 * Counts 0..24 are dispatched through a table of handlers spaced 32 bytes
 * apart starting at L1; counts >= 25 go to Large.
 *
 * WARNING: the dispatch depends on the exact byte layout between L1 and
 * Large.  Do not add, remove or reorder instructions in that region
 * without re-deriving the slot offsets.
 */
memcpy:
45
 
46
/*
 * Unaligned access helpers.  Each expands to a lo/hi instruction pair for a
 * quadword (Q: O and O+7) or a longword (L: O and O+3).  The load variants
 * fill two registers (D0, D1) which the callers below combine with "or".
 * STUAQ/STUAL are defined but not used in the code visible here.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
 
51
        ld.b r3,0,r63           /* read first source byte, result discarded (presumably a cache touch) */
52
        pta/l Large,tr0
53
        movi 25,r0
54
        bgeu/u r4,r0,tr0        /* count >= 25 (unsigned): take the Large path */
55
        /*
         * Small copy: compute the handler address.
         *   r0 = (L1 - L0 + 63*32 + 1) - 32 * nsb(count)
         * ptrel then branches relative to L0, so the target is
         * L1 + 32 * (63 - nsb(count)) (+1).  The handler comments below show
         * the resulting size classes: 0, 1, 2..3, 4..7, 8..15, 16..24.
         * NOTE(review): the "+ 1" is presumably the SHmedia mode bit of the
         * branch target -- confirm against the ISA manual.
         */
        nsb r4,r0
56
        shlli r0,5,r0           /* scale by the 32-byte handler stride */
57
        movi (L1-L0+63*32 + 1) & 0xffff,r1
58
        sub r1, r0, r0
59
L0:     ptrel r0,tr0
60
        add r2,r4,r5            /* r5 = one past the last destination byte */
61
        ptabs r18,tr1           /* tr1 = return target for the handlers */
62
        add r3,r4,r6            /* r6 = one past the last source byte */
63
        blink tr0,r63           /* jump to the selected handler */
64
 
65
/* Rearranged to make cut2 safe */
66
        .balign 8
67
L4_7:   /* 4..7 byte memcpy cntd. */
68
        /* On entry: r0 = first 4 source bytes, r6/r7 = hi/lo parts of the last 4. */
        stlo.l r2, 0, r0
69
        or r6, r7, r6
70
        sthi.l r5, -1, r6       /* last 4 bytes end at dst + count - 1; they may */
71
        stlo.l r5, -4, r6       /* overlap the first 4 already stored            */
72
        blink tr1,r63
73
 
74
        .balign 8
75
L1:     /* 0 byte memcpy */
76
        /* Base of the handler table: slot 0.  The nops pad the table so that
           the following handlers land on their 32-byte slots. */
        nop
77
        blink tr1,r63
78
        nop
79
        nop
80
        nop
81
        nop
82
 
83
L2_3:   /* 2 or 3 byte memcpy cntd. */
84
        st.b r5,-1,r6           /* store the last byte (r6 was reloaded with it) */
85
        blink tr1,r63
86
 
87
        /* 1 byte memcpy */
88
        ld.b r3,0,r0
89
        st.b r2,0,r0
90
        blink tr1,r63
91
 
92
L8_15:  /* 8..15 byte memcpy cntd. */
93
        /* On entry: r0 = first 8 source bytes, r6/r7 = hi/lo parts of the last 8. */
        stlo.q r2, 0, r0
94
        or r6, r7, r6
95
        sthi.q r5, -1, r6       /* last 8 bytes end at dst + count - 1; they may */
96
        stlo.q r5, -8, r6       /* overlap the first 8 already stored            */
97
        blink tr1,r63
98
 
99
        /* 2 or 3 byte memcpy */
100
        ld.b r3,0,r0            /* byte 0 */
101
        ld.b r2,0,r63           /* read destination, result discarded */
102
        ld.b r3,1,r1            /* byte 1 */
103
        st.b r2,0,r0
104
        pta/l L2_3,tr0
105
        ld.b r6,-1,r6           /* last source byte (same as byte 1 when count == 2) */
106
        st.b r2,1,r1
107
        blink tr0, r63          /* finish in L2_3 */
108
 
109
        /* 4 .. 7 byte memcpy */
110
        LDUAL (r3, 0, r0, r1)   /* first 4 source bytes, as two parts */
111
        pta L4_7, tr0
112
        ldlo.l r6, -4, r7       /* low part of the last 4 source bytes */
113
        or r0, r1, r0
114
        sthi.l r2, 3, r0
115
        ldhi.l r6, -1, r6       /* high part of the last 4 source bytes */
116
        blink tr0, r63          /* finish in L4_7 */
117
 
118
        /* 8 .. 15 byte memcpy */
119
        LDUAQ (r3, 0, r0, r1)   /* first 8 source bytes, as two parts */
120
        pta L8_15, tr0
121
        ldlo.q r6, -8, r7       /* low part of the last 8 source bytes */
122
        or r0, r1, r0
123
        sthi.q r2, 7, r0
124
        ldhi.q r6, -1, r6       /* high part of the last 8 source bytes */
125
        blink tr0, r63          /* finish in L8_15 */
126
 
127
        /* 16 .. 24 byte memcpy */
128
        /* Copies source bytes 0..15 as two quadwords, then the last 8 source
           bytes (which may overlap the second quadword). */
        LDUAQ (r3, 0, r0, r1)
129
        LDUAQ (r3, 8, r8, r9)
130
        or r0, r1, r0
131
        sthi.q r2, 7, r0
132
        or r8, r9, r8
133
        sthi.q r2, 15, r8
134
        ldlo.q r6, -8, r7
135
        ldhi.q r6, -1, r6
136
        stlo.q r2, 8, r8
137
        stlo.q r2, 0, r0
138
        or r6, r7, r6
139
        sthi.q r5, -1, r6
140
        stlo.q r5, -8, r6
141
        blink tr1,r63
142
 
143
/*
 * Large: count >= 25.
 *
 * Setup:
 *   r7  = r3 | -8, i.e. (src & 7) - 8
 *   r22 = r2 - r7 = dst + 8 - (src & 7): destination cursor, chosen so that
 *         r22 + r6 is an 8-byte aligned source address
 *   r6  = src - dst
 *   r5  = dst + count - 16: limit for Loop_ua
 * The first 8 destination bytes are written straight away from the first
 * source quadword; r0 then always holds the next aligned source quadword,
 * which is stored to the (possibly unaligned) destination with an
 * sthi.q/stlo.q pair.
 */
Large:
144
        ld.b r2, 0, r63         /* read destination, result discarded */
145
        pta/l  Loop_ua, tr1
146
        ori r3, -8, r7
147
        sub r2, r7, r22
148
        sub r3, r2, r6
149
        add r2, r4, r5
150
        ldlo.q r3, 0, r0
151
        addi r5, -16, r5
152
        movi 64+8, r27 // could subtract r7 from that.
153
        stlo.q r2, 0, r0
154
        sthi.q r2, 7, r0
155
        ldx.q r22, r6, r0       /* r0 = first aligned source quadword */
156
        bgtu/l r27, r4, tr1     /* count < 72: skip the 32-byte loop */
157
 
158
        /* Constants for Loop_line: r27 is its limit (r5 - 48); r19/r20/r21
           are r6 - 24/-16/-8 so that, once r22 has been advanced by 32, they
           index the three source quadwords following the one held in r0;
           r36 = r6 + 64 indexes the source 64 bytes ahead. */
        addi r5, -48, r27
159
        pta/l Loop_line, tr0
160
        addi r6, 64, r36
161
        addi r6, -24, r19
162
        addi r6, -16, r20
163
        addi r6, -8, r21
164
 
165
/* Copies 32 bytes per iteration; loops while r27 >= r22 (unsigned). */
Loop_line:
166
        ldx.q r22, r36, r63     /* source read 64 bytes ahead, result discarded (presumably a prefetch) */
167
        alloco r22, 32          /* NOTE(review): presumably allocates the destination cache line without fetching it -- confirm */
168
        addi r22, 32, r22
169
        ldx.q r22, r19, r23
170
        sthi.q r22, -25, r0     /* store the quadword carried over in r0 */
171
        ldx.q r22, r20, r24
172
        ldx.q r22, r21, r25
173
        stlo.q r22, -32, r0
174
        ldx.q r22, r6,  r0      /* carry the next source quadword into the next iteration */
175
        sthi.q r22, -17, r23
176
        sthi.q r22,  -9, r24
177
        sthi.q r22,  -1, r25
178
        stlo.q r22, -24, r23
179
        stlo.q r22, -16, r24
180
        stlo.q r22,  -8, r25
181
        bgeu r27, r22, tr0
182
 
183
/* Copies 8 bytes per iteration; loops while r5 > r22 (unsigned).
   tr1 still points here (set at Large). */
Loop_ua:
184
        addi r22, 8, r22
185
        sthi.q r22, -1, r0
186
        stlo.q r22, -8, r0
187
        ldx.q r22, r6, r0
188
        bgtu/l r5, r22, tr1
189
 
190
        /* Tail: store the pending quadword in r0 at r22, then copy the last
           8 source bytes (unaligned load from src + count - 8) to
           r5 + 8 .. r5 + 15, i.e. the last 8 destination bytes. */
        add r3, r4, r7
191
        ldlo.q r7, -8, r1
192
        sthi.q r22, 7, r0
193
        ldhi.q r7, -1, r7
194
        ptabs r18,tr1           /* tr1 = return target */
195
        stlo.q r22, 0, r0
196
        or r1, r7, r1
197
        sthi.q r5, 15, r1
198
        stlo.q r5, 8, r1
199
        blink tr1, r63
200
 
201
        .size memcpy, . - memcpy
202
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.