OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [sh64/] [lib/] [copy_user_memcpy.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
!
2
! Fast SH memcpy
3
!
4
! by Toshiyasu Morita (tm@netcom.com)
5
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
6
! SH5 code Copyright 2002 SuperH Ltd.
7
!
8
! Entry: ARG0: destination pointer
9
!        ARG1: source pointer
10
!        ARG2: byte count
11
!
12
! Exit:  RESULT: destination pointer
13
!        any other registers in the range r0-r7: trashed
14
!
15
! Notes: Usually one wants to do small reads and write a longword, but
16
!        unfortunately it is difficult in some cases to concatenate bytes
17
!        into a longword on the SH, so this does a longword read and small
18
!        writes.
19
!
20
! This implementation makes two assumptions about how it is called:
21
!
22
! 1.: If the byte count is nonzero, the address of the last byte to be
23
!     copied is unsigned greater than the address of the first byte to
24
!     be copied.  This could be easily swapped for a signed comparison,
25
!     but the algorithm used needs some comparison.
26
!
27
! 2.: When there are two or three bytes in the last word of an 11-or-more
28
!     bytes memory chunk to be copied, the rest of the word can be read
29
!     without side effects.
30
!     This could be easily changed by increasing the minimum size of
31
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
32
!     however, this would cost a few extra cycles on average.
33
!     For SHmedia, the assumption is that any quadword can be read in its
34
!     entirety if at least one byte is included in the copy.
35
 
36
/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
37
   __copy_user function in the general case, so it has to be a distinct
38
   function from intra-kernel memcpy to allow for exception fix-ups in the
39
   event that the user pointer is bad somewhere in the copy (e.g. due to
40
   running off the end of the vma).
41
 
42
   Note, this algorithm will be slightly wasteful in the case where the source
43
   and destination pointers are equally aligned, because the stlo/sthi pairs
44
   could then be merged back into single stores.  If there are a lot of cache
45
   misses, this is probably offset by the stall lengths on the preloads.
46
 
47
*/
48
 
49
        .section .text..SHmedia32,"ax"
50
        .little
51
        .balign 32
52
        .global copy_user_memcpy
53
        .global copy_user_memcpy_end
54
/*
 * copy_user_memcpy entry and size dispatch.
 *
 * Register use as seen in this routine:
 *   r2 = destination pointer (base of the data stores)
 *   r3 = source pointer      (base of the data loads)
 *   r4 = byte count
 *   r18 = return address (turned into a branch target with ptabs)
 *
 * Counts of 25 or more branch to Large.  Smaller counts take a computed
 * branch into a table of 32-byte code slots that starts at L1; the slot
 * comments name the size classes: 0, 1, 2..3, 4..7, 8..15 and 16..24.
 */
copy_user_memcpy:
55
 
56
/*
 * Unaligned access helpers: each expands to a lo/hi instruction pair,
 * the "hi" half addressing the last byte of the datum (O+7 for a
 * quadword, O+3 for a longword).  D0 and D1 receive / supply the two
 * halves.  Only LDUAQ and LDUAL are invoked in the code visible here.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
57
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
58
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
59
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
60
 
61
        /* Read the first source byte into r63.  NOTE(review): r63 is
           presumably the always-zero register, so the value is dropped
           and the load only touches the source - confirm. */
        ld.b r3,0,r63
62
        /* Counts >= 25 (unsigned compare) are handled by Large. */
        pta/l Large,tr0
63
        movi 25,r0
64
        bgeu/u r4,r0,tr0
65
        /* r0 = nsb(count) * 32.  Subtracting it from the constant in r1
           leaves the distance from L0 to the 32-byte slot for this
           count, with slot 0 at L1.  NOTE(review): the "+ 1" presumably
           keeps the SHmedia mode bit set in the ptrel target - confirm
           against the SH-5 manual. */
        nsb r4,r0
66
        shlli r0,5,r0
67
        movi (L1-L0+63*32 + 1) & 0xffff,r1
68
        sub r1, r0, r0
69
L0:     ptrel r0,tr0
70
        /* r5 = destination end (one past the last byte to write). */
        add r2,r4,r5
71
        /* tr1 = return target taken from r18; the slots return with
           "blink tr1,r63". */
        ptabs r18,tr1
72
        /* r6 = source end (one past the last byte to read). */
        add r3,r4,r6
73
        /* Jump into the slot selected above. */
        blink tr0,r63
74
 
75
/* Rearranged to make cut2 safe */
76
        .balign 8
77
/*
 * Small-copy slot table and its continuation stubs.
 *
 * On entry to any slot: r2 = destination, r3 = source, r5 = destination
 * end, r6 = source end, tr1 = return target.
 *
 * LAYOUT CONSTRAINT: the dispatcher scales the slot index by 32 bytes
 * and uses L1 as slot 0, so from L1 onwards every slot must hold exactly
 * eight instructions (NOTE(review): assumes 4-byte SHmedia instructions
 * - confirm).  The continuation stubs L2_3 and L8_15 are placed so that
 * they fill the unused tail of slot 0 and slot 1.  Do not add, remove or
 * reorder instructions here without preserving that spacing.
 *
 * The 4..7, 8..15 and 16..24 byte copies write the head and the tail of
 * the buffer with separate unaligned stores; the two ranges may overlap.
 */
L4_7:   /* 4..7 byte memcpy cntd. */
78
        /* Finish the head: low part of the first source longword (the
           high part was already stored by the 4..7 slot). */
        stlo.l r2, 0, r0
79
        /* Merge the two halves of the last source longword. */
        or r6, r7, r6
80
        /* Store it so that it ends at the destination end (r5). */
        sthi.l r5, -1, r6
81
        stlo.l r5, -4, r6
82
        blink tr1,r63
83
 
84
        .balign 8
85
L1:     /* 0 byte memcpy */
86
        /* Slot 0: nothing to copy, just return.  The nops pad the slot
           so that L2_3 and its return complete the 32 bytes. */
        nop
87
        blink tr1,r63
88
        nop
89
        nop
90
        nop
91
        nop
92
 
93
L2_3:   /* 2 or 3 byte memcpy cntd. */
94
        /* r6 holds the last source byte (loaded by the 2 or 3 byte
           slot); store it as the last destination byte. */
        st.b r5,-1,r6
95
        blink tr1,r63
96
 
97
        /* 1 byte memcpy */
98
        /* Slot 1: copy the single byte via r0. */
        ld.b r3,0,r0
99
        st.b r2,0,r0
100
        blink tr1,r63
101
 
102
L8_15:  /* 8..15 byte memcpy cntd. */
103
        /* Same scheme as L4_7, with quadwords: finish the head, then
           store the merged last source quadword ending at r5. */
        stlo.q r2, 0, r0
104
        or r6, r7, r6
105
        sthi.q r5, -1, r6
106
        stlo.q r5, -8, r6
107
        blink tr1,r63
108
 
109
        /* 2 or 3 byte memcpy */
110
        /* Slot 2: copy bytes 0 and 1 here; the last byte is loaded into
           r6 and stored by L2_3.  For a 2 byte copy the last byte is
           byte 1 again, so it is simply written twice. */
        ld.b r3,0,r0
111
        /* Read of the first destination byte into r63; the value is not
           used.  NOTE(review): presumably touches the destination before
           the first store - confirm. */
        ld.b r2,0,r63
112
        ld.b r3,1,r1
113
        st.b r2,0,r0
114
        pta/l L2_3,tr0
115
        ld.b r6,-1,r6
116
        st.b r2,1,r1
117
        blink tr0, r63
118
 
119
        /* 4 .. 7 byte memcpy */
120
        /* Slot 3: r0/r1 = halves of the first source longword. */
        LDUAL (r3, 0, r0, r1)
121
        pta L4_7, tr0
122
        /* r7 / r6 = halves of the last source longword (ends at r6). */
        ldlo.l r6, -4, r7
123
        or r0, r1, r0
124
        sthi.l r2, 3, r0
125
        ldhi.l r6, -1, r6
126
        /* The remaining stores are done at L4_7. */
        blink tr0, r63
127
 
128
        /* 8 .. 15 byte memcpy */
129
        /* Slot 4: quadword version of the 4..7 byte slot; continues at
           L8_15. */
        LDUAQ (r3, 0, r0, r1)
130
        pta L8_15, tr0
131
        ldlo.q r6, -8, r7
132
        or r0, r1, r0
133
        sthi.q r2, 7, r0
134
        ldhi.q r6, -1, r6
135
        blink tr0, r63
136
 
137
        /* 16 .. 24 byte memcpy */
138
        /* Slot 5 (last slot, so its length is not constrained by the
           table): copy the first 16 bytes as two unaligned quadwords,
           then the last 8 bytes as one unaligned quadword ending at the
           destination end. */
        LDUAQ (r3, 0, r0, r1)
139
        LDUAQ (r3, 8, r8, r9)
140
        or r0, r1, r0
141
        sthi.q r2, 7, r0
142
        or r8, r9, r8
143
        sthi.q r2, 15, r8
144
        /* r7 / r6 = halves of the last source quadword (ends at r6). */
        ldlo.q r6, -8, r7
145
        ldhi.q r6, -1, r6
146
        stlo.q r2, 8, r8
147
        stlo.q r2, 0, r0
148
        or r6, r7, r6
149
        sthi.q r5, -1, r6
150
        stlo.q r5, -8, r6
151
        blink tr1,r63
152
 
153
/*
 * Large: copy of 25 bytes or more (r2 = destination, r3 = source,
 * r4 = count).
 *
 * Scheme: the first source bytes up to the next 8-byte source boundary
 * are written with one unaligned store at the destination start.  The
 * body then reads the source with ldx.q at r22 + r6, where r6 is
 * (source - destination) and r22 walks the destination, and writes each
 * quadword with an sthi.q/stlo.q pair.  The last 8 source bytes are
 * copied with a final unaligned load/store pair ending at the
 * destination end, which may overlap bytes already written.
 *
 * r0 always holds the quadword that has been loaded but not yet stored.
 */
Large:
154
        /* Read of the first destination byte into r63; the value is not
           used.  NOTE(review): presumably touches the destination before
           the stores - confirm. */
        ld.b r2, 0, r63
155
        pta/l  Loop_ua, tr1
156
        /* r7 = (r3 & 7) - 8, i.e. minus the byte count up to the next
           8-byte boundary of the source (range -8 .. -1). */
        ori r3, -8, r7
157
        /* r22 = destination address that corresponds to that source
           boundary. */
        sub r2, r7, r22
158
        /* r6 = source - destination; r22 + r6 is the matching source
           address for any destination position r22. */
        sub r3, r2, r6
159
        add r2, r4, r5
160
        /* Low part of the first source quadword (bytes from r3 on). */
        ldlo.q r3, 0, r0
161
        /* r5 = destination end - 16: exit bound for Loop_ua. */
        addi r5, -16, r5
162
        movi 64+8, r27 ! could subtract r7 from that.
163
        /* Write 8 bytes at the destination start.  NOTE(review): only
           the bytes below r22 appear to carry valid data at this point;
           the rest is rewritten by the following stores - confirm. */
        stlo.q r2, 0, r0
164
        sthi.q r2, 7, r0
165
        /* Prime r0 with the source quadword for destination r22. */
        ldx.q r22, r6, r0
166
        /* Fewer than 72 bytes: skip the 32-bytes-per-pass loop. */
        bgtu/l r27, r4, tr1
167
 
168
        /* r27 = destination end - 64: exit bound for Loop_line. */
        addi r5, -48, r27
169
        pta/l Loop_line, tr0
170
        /* Index registers for ldx.q relative to r22: r36 reaches 64
           bytes ahead in the source, r19/r20/r21 reach 24/16/8 bytes
           behind it. */
        addi r6, 64, r36
171
        addi r6, -24, r19
172
        addi r6, -16, r20
173
        addi r6, -8, r21
174
 
175
/* Main loop: 32 bytes per pass.  Loads and stores are interleaved, so
   the statement order matters. */
Loop_line:
176
        /* Load from 64 bytes ahead in the source into r63; the value is
           not used.  NOTE(review): presumably a source preload -
           confirm. */
        ldx.q r22, r36, r63
177
        /* NOTE(review): alloco presumably allocates the destination
           cache line at r22 + 32 without fetching it - confirm against
           the SH-5 manual. */
        alloco r22, 32
178
        addi r22, 32, r22
179
        /* r23, r24, r25 = the three source quadwords after the pending
           one in r0; r0 itself is stored at r22 - 32. */
        ldx.q r22, r19, r23
180
        sthi.q r22, -25, r0
181
        ldx.q r22, r20, r24
182
        ldx.q r22, r21, r25
183
        stlo.q r22, -32, r0
184
        /* Next pending quadword, for destination r22. */
        ldx.q r22, r6,  r0
185
        sthi.q r22, -17, r23
186
        sthi.q r22,  -9, r24
187
        sthi.q r22,  -1, r25
188
        stlo.q r22, -24, r23
189
        stlo.q r22, -16, r24
190
        stlo.q r22,  -8, r25
191
        /* Repeat while r27 >= r22 (unsigned). */
        bgeu r27, r22, tr0
192
 
193
/* Remainder loop: 8 bytes per pass (tr1 still points here). */
Loop_ua:
194
        addi r22, 8, r22
195
        /* Store the pending quadword at r22 - 8, then load the next. */
        sthi.q r22, -1, r0
196
        stlo.q r22, -8, r0
197
        ldx.q r22, r6, r0
198
        /* Repeat while r5 > r22 (unsigned), r5 = destination end - 16. */
        bgtu/l r5, r22, tr1
199
 
200
        /* Tail: r7 = source end; r1 / r7 = halves of the last source
           quadword. */
        add r3, r4, r7
201
        ldlo.q r7, -8, r1
202
        /* Store the last pending quadword at r22. */
        sthi.q r22, 7, r0
203
        ldhi.q r7, -1, r7
204
        /* tr1 was used for Loop_ua; reload it with the return target
           from r18. */
        ptabs r18,tr1
205
        stlo.q r22, 0, r0
206
        or r1, r7, r1
207
        /* r5 + 16 is the destination end, so this pair writes the last
           8 destination bytes. */
        sthi.q r5, 15, r1
208
        stlo.q r5, 8, r1
209
        blink tr1, r63
210
/* End marker, exported with .global.  NOTE(review): presumably used to
   delimit the routine's address range for user-copy fault fix-ups -
   confirm against the code that references it. */
copy_user_memcpy_end:
211
        nop

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.