OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [sparc64/] [lib/] [U3copy_in_user.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/* $Id: U3copy_in_user.S,v 1.1.1.1 2004-04-15 01:33:49 phoenix Exp $
2
 * U3copy_in_user.S: UltraSparc-III optimized copy within userspace.
3
 *
4
 * Copyright (C) 1999, 2000 David S. Miller (davem@redhat.com)
5
 */
6
 
7
#ifdef __KERNEL__
8
#include <asm/visasm.h>
9
#include <asm/asi.h>
10
#undef SMALL_COPY_USES_FPU
11
/* EXNV(x,y,a,b): emit the instruction "x,y" at local label 98 and
 * record the address pair (98b, 99b) in section __ex_table.  The
 * fixup at label 99 (section .fixup) returns with retl and executes
 * "a, b, %o0" in the delay slot, so %o0 holds that result on return.
 * No VISExitHalf is issued by this variant.
 */
#define EXNV(x,y,a,b)   \
12
98:     x,y;                            \
13
        .section .fixup;                \
14
        .align 4;                       \
15
99:     retl;                           \
16
         a, b, %o0;                     \
17
        .section __ex_table;            \
18
        .align 4;                       \
19
        .word 98b, 99b;                 \
20
        .text;                          \
21
        .align 4;
22
/* EXNV2(x,y,a,b): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup first executes "a, b, %o0" and then returns
 * with "add %o0, 1, %o0" in the retl delay slot, i.e. the value left
 * in %o0 is the result of "a, b" plus 1.
 */
#define EXNV2(x,y,a,b)  \
23
98:     x,y;                            \
24
        .section .fixup;                \
25
        .align 4;                       \
26
99:     a, b, %o0;                      \
27
        retl;                           \
28
         add %o0, 1, %o0;               \
29
        .section __ex_table;            \
30
        .align 4;                       \
31
        .word 98b, 99b;                 \
32
        .text;                          \
33
        .align 4;
34
/* EXNV3(x,y,a,b): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup first executes "a, b, %o0" and then returns
 * with "add %o0, 8, %o0" in the retl delay slot, i.e. the value left
 * in %o0 is the result of "a, b" plus 8.
 */
#define EXNV3(x,y,a,b)  \
35
98:     x,y;                            \
36
        .section .fixup;                \
37
        .align 4;                       \
38
99:     a, b, %o0;                      \
39
        retl;                           \
40
         add %o0, 8, %o0;               \
41
        .section __ex_table;            \
42
        .align 4;                       \
43
        .word 98b, 99b;                 \
44
        .text;                          \
45
        .align 4;
46
/* EX(x,y,a,b): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup runs VISExitHalf first, then returns with
 * retl and executes "a, b, %o0" in the delay slot, so %o0 holds that
 * result on return.
 */
#define EX(x,y,a,b)                     \
47
98:     x,y;                            \
48
        .section .fixup;                \
49
        .align 4;                       \
50
99:     VISExitHalf;                    \
51
        retl;                           \
52
         a, b, %o0;                     \
53
        .section __ex_table;            \
54
        .align 4;                       \
55
        .word 98b, 99b;                 \
56
        .text;                          \
57
        .align 4;
58
/* EXBLK1(x,y): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup runs VISExitHalf and returns with
 * %o0 = %o4 + 0x1c0 + (%o2 & 0x3f).  The fixup overwrites %o1 and %o2.
 */
#define EXBLK1(x,y)                     \
59
98:     x,y;                            \
60
        .section .fixup;                \
61
        .align 4;                       \
62
99:     VISExitHalf;                    \
63
        add %o4, 0x1c0, %o1;            \
64
        and %o2, (0x40 - 1), %o2;       \
65
        retl;                           \
66
         add %o1, %o2, %o0;             \
67
        .section __ex_table;            \
68
        .align 4;                       \
69
        .word 98b, 99b;                 \
70
        .text;                          \
71
        .align 4;
72
/* EXBLK2(x,y): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup runs VISExitHalf and returns with
 * %o0 = (%o3 << 6) + 0x80 + (%o2 & 0x3f).  The fixup overwrites
 * %o1, %o2 and %o3.
 */
#define EXBLK2(x,y)                     \
73
98:     x,y;                            \
74
        .section .fixup;                \
75
        .align 4;                       \
76
99:     VISExitHalf;                    \
77
        sll %o3, 6, %o3;                \
78
        and %o2, (0x40 - 1), %o2;       \
79
        add %o3, 0x80, %o1;             \
80
        retl;                           \
81
         add %o1, %o2, %o0;             \
82
        .section __ex_table;            \
83
        .align 4;                       \
84
        .word 98b, 99b;                 \
85
        .text;                          \
86
        .align 4;
87
/* EXBLK3(x,y): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup runs VISExitHalf and returns with
 * %o0 = (%o2 & 0x3f) + 0x80.  The fixup overwrites %o2.
 */
#define EXBLK3(x,y)                     \
88
98:     x,y;                            \
89
        .section .fixup;                \
90
        .align 4;                       \
91
99:     VISExitHalf;                    \
92
        and %o2, (0x40 - 1), %o2;       \
93
        retl;                           \
94
         add %o2, 0x80, %o0;            \
95
        .section __ex_table;            \
96
        .align 4;                       \
97
        .word 98b, 99b;                 \
98
        .text;                          \
99
        .align 4;
100
/* EXBLK4(x,y): emit "x,y" at label 98 with an __ex_table entry
 * (98b, 99b).  The fixup runs VISExitHalf and returns with
 * %o0 = (%o2 & 0x3f) + 0x40.  The fixup overwrites %o2.
 */
#define EXBLK4(x,y)                     \
101
98:     x,y;                            \
102
        .section .fixup;                \
103
        .align 4;                       \
104
99:     VISExitHalf;                    \
105
        and %o2, (0x40 - 1), %o2;       \
106
        retl;                           \
107
         add %o2, 0x40, %o0;            \
108
        .section __ex_table;            \
109
        .align 4;                       \
110
        .word 98b, 99b;                 \
111
        .text;                          \
112
        .align 4;
113
#else
114
/* Build without __KERNEL__: the ASI and FPRS constants and the
 * VISEntryHalf/VISExitHalf sequences are defined locally, and every
 * EX* macro expands to just the instruction "x,y" with no fixup code
 * and no __ex_table entry.  VISEntryHalf saves %fprs in %o5 and then
 * writes FPRS_FEF to %fprs; VISExitHalf masks %o5 with FPRS_FEF and
 * writes the result back to %fprs.
 */
#define ASI_AIUS 0x80
115
#define ASI_BLK_AIUS 0xf0
116
#define FPRS_FEF  0x04
117
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
118
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
119
#define SMALL_COPY_USES_FPU
120
#define EXNV(x,y,a,b)   x,y;
121
#define EXNV2(x,y,a,b)  x,y;
122
#define EXNV3(x,y,a,b)  x,y;
123
#define EX(x,y,a,b)     x,y;
124
#define EXBLK1(x,y)     x,y;
125
#define EXBLK2(x,y)     x,y;
126
#define EXBLK3(x,y)     x,y;
127
#define EXBLK4(x,y)     x,y;
128
#endif
129
 
130
        /* Special/non-trivial issues of this code:
131
         *
132
         * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
133
         * 2) Only low 32 FPU registers are used so that only the
134
         *    lower half of the FPU register set is dirtied by this
135
         *    code.  This is especially important in the kernel.
136
         * 3) This code never prefetches cachelines past the end
137
         *    of the source buffer.
138
         *
139
         *    XXX Actually, Cheetah can buffer up to 8 concurrent
140
         *    XXX prefetches, revisit this...
141
         */
142
 
143
        .text
144
        .align  32
145
 
146
        /* The cheetah's flexible spine, oversized liver, enlarged heart,
147
         * slender muscular body, and claws make it the swiftest hunter
148
         * in Africa and the fastest animal on land.  Can reach speeds
149
         * of up to 2.4GB per second.
150
         */
151
 
152
        .globl          U3copy_in_user
153
U3copy_in_user: /* %o0=dst, %o1=src, %o2=len */
        /* Entry point.  If %asi is not ASI_AIUS the call is handed to
         * U3memcpy.  Otherwise the length is classified with signed
         * 32-bit compares (%icc): len <= 0 returns at once, len <= 31
         * uses the byte loop, len >= 0x100 takes the block-copy path,
         * and anything in between goes to U3copy_in_user_toosmall.
         * Each cmp below sits in the delay slot of the preceding branch.
         */
154
        /* Writing to %asi is _expensive_ so we hardcode it.
155
         * Reading %asi to check for KERNEL_DS is comparatively
156
         * cheap.
157
         */
158
        rd              %asi, %g1                       ! MS    Group   (4 cycles)
159
        cmp             %g1, ASI_AIUS                   ! A0    Group
160
        bne             U3memcpy                        ! BR
161
         nop                                            ! A1
162
#ifndef __KERNEL__
163
        /* Save away original 'dst' for memcpy return value. */
164
        mov             %o0, %g3                        ! A0    Group
165
#endif
166
        /* Anything to copy at all? */
167
        cmp             %o2, 0                          ! A1
168
        ble,pn          %icc, U3copy_in_user_short_ret  ! BR
169
 
170
        /* Extremely small copy? */
171
         cmp            %o2, 31                         ! A0    Group
172
        ble,pn          %icc, U3copy_in_user_short      ! BR
173
 
174
        /* Large enough to use unrolled prefetch loops? */
175
         cmp            %o2, 0x100                      ! A1
176
        bge,a,pt        %icc, U3copy_in_user_enter      ! BR    Group
177
         andcc          %o0, 0x3f, %g2                  ! A0
178
 
179
        /* Not taken above (the annulled andcc was skipped): set up
         * %g2 = dst & 7 and the condition codes for the mid-size path.
         */
        ba,pt           %xcc, U3copy_in_user_toosmall   ! BR    Group
180
         andcc          %o0, 0x7, %g2                   ! A0
181
 
182
        .align          32
183
U3copy_in_user_short:
184
        /* Copy %o2 bytes from src to dst, one byte at a time.
         * The store sits in the delay slot of the loop branch, after
         * %o0 and %o2 have already been stepped, hence the -1 offset
         * and the "add %o2, 1" in its fixup.
         */
185
        EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g0)! MS    Group
186
        add             %o1, 0x1, %o1                   ! A0
187
        add             %o0, 0x1, %o0                   ! A1
188
        subcc           %o2, 1, %o2                     ! A0    Group
189
 
190
        bg,pt           %icc, U3copy_in_user_short      ! BR
191
         EXNV(stba %o3, [%o0 + -1] %asi, add %o2, 1)    ! MS    Group (1-cycle stall)
192
 
193
U3copy_in_user_short_ret:
        /* Normal exit.  The __KERNEL__ build returns 0 in %o0; the
         * other build returns the original 'dst' that was saved in %g3
         * on entry.
         */
194
#ifdef __KERNEL__
195
        retl                                            ! BR    Group (0-4 cycle stall)
196
         clr            %o0                             ! A0
197
#else
198
        retl                                            ! BR    Group (0-4 cycle stall)
199
         mov            %g3, %o0                        ! A0
200
#endif
201
 
202
        /* Here len >= 0x100 (4 * 64) and condition codes reflect execution
203
         * of "andcc %o0, 0x3f, %g2", done by caller.
204
         */
205
        .align          64
206
U3copy_in_user_enter:
207
        /* Is 'dst' already aligned on a 64-byte boundary? */
208
        be,pt           %xcc, 2f                        ! BR
209
 
210
        /* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
211
         * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
212
         * subtract this from 'len'.
213
         */
214
         sub            %g2, 0x40, %g2                  ! A0    Group
215
        sub             %g0, %g2, %g2                   ! A0    Group
216
        sub             %o2, %g2, %o2                   ! A0    Group
217
 
218
        /* Copy %g2 bytes from src to dst, one byte at a time.
         * The load's fixup (EXNV) returns %o2 + %g2; the store is in the
         * branch delay slot, after %g2 was decremented, so its fixup
         * (EXNV2) adds 1 back.
         */
219
1:      EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS    (Group)
220
        add             %o1, 0x1, %o1                   ! A1
221
        add             %o0, 0x1, %o0                   ! A0    Group
222
        subcc           %g2, 0x1, %g2                   ! A1
223
 
224
        bg,pt           %icc, 1b                        ! BR    Group
225
         EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS    Group
226
 
227
        /* Save (src & 0x7) in %g1 before alignaddr rounds %o1 down;
         * %g1 is used again by the tail code.
         */
2:      VISEntryHalf                                    ! MS+MS
228
        and             %o1, 0x7, %g1                   ! A1
229
        ba,pt           %xcc, U3copy_in_user_begin      ! BR
230
         alignaddr      %o1, %g0, %o1                   ! MS          (Break-after)
231
 
232
        .align          64
233
U3copy_in_user_begin:
        /* Prime the pipeline.  %o4 = len rounded down to a multiple of
         * 64.  Four prefetches are always issued; the prefetches at
         * +0x100, +0x140 and +0x180 sit in annulled delay slots and are
         * only issued when %o4 >= 0x140, 0x180 and 0x1c0 respectively.
         * Nine doublewords (+0x00 .. +0x40) are loaded into %f0-%f14 and
         * %f0 again, and six aligned doublewords are formed in %f16-%f26.
         */
234
        prefetcha       [%o1 + 0x000] %asi, #one_read   ! MS    Group1
235
        prefetcha       [%o1 + 0x040] %asi, #one_read   ! MS    Group2
236
        andn            %o2, (0x40 - 1), %o4            ! A0
237
        prefetcha       [%o1 + 0x080] %asi, #one_read   ! MS    Group3
238
        cmp             %o4, 0x140                      ! A0
239
        prefetcha       [%o1 + 0x0c0] %asi, #one_read   ! MS    Group4
240
        EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0)  ! MS    Group5 (%f0 results at G8)
241
        bge,a,pt        %icc, 1f                        ! BR
242
 
243
         prefetcha      [%o1 + 0x100] %asi, #one_read   ! MS    Group6
244
1:      EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0)  ! AX           (%f2 results at G9)
245
        cmp             %o4, 0x180                      ! A1
246
        bge,a,pt        %icc, 1f                        ! BR
247
         prefetcha      [%o1 + 0x140] %asi, #one_read   ! MS    Group7
248
1:      EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0)  ! AX           (%f4 results at G10)
249
        cmp             %o4, 0x1c0                      ! A1
250
        bge,a,pt        %icc, 1f                        ! BR
251
 
252
         prefetcha      [%o1 + 0x180] %asi, #one_read   ! MS    Group8
253
1:      faligndata      %f0, %f2, %f16                  ! FGA   Group9 (%f16 at G12)
254
        EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0)  ! AX           (%f6 results at G12)
255
        faligndata      %f2, %f4, %f18                  ! FGA   Group10 (%f18 results at G13)
256
        EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0)  ! MS            (%f8 results at G13)
257
        faligndata      %f4, %f6, %f20                  ! FGA   Group12 (1-cycle stall,%f20 at G15)
258
        EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0) ! MS            (%f10 results at G15)
259
        faligndata      %f6, %f8, %f22                  ! FGA   Group13 (%f22 results at G16)
260
 
261
        EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0) ! MS            (%f12 results at G16)
262
        faligndata      %f8, %f10, %f24                 ! FGA   Group15 (1-cycle stall,%f24 at G18)
263
        EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0) ! MS            (%f14 results at G18)
264
        faligndata      %f10, %f12, %f26                ! FGA   Group16 (%f26 results at G19)
265
        EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0)  ! MS            (%f0 results at G19)
266
 
267
        /* We only use the first loop if len > (7 * 64).
         * The "add %o1, 0x40" in the delay slot runs on both paths.
         */
268
        subcc           %o4, 0x1c0, %o4                 ! A0    Group17
269
        bg,pt           %icc, U3copy_in_user_loop1      ! BR
270
         add            %o1, 0x40, %o1                  ! A1
271
 
272
        /* Otherwise %o3 = (whole 64-byte blocks) - 2 becomes the trip
         * count for the second loop.
         */
        add             %o4, 0x140, %o4                 ! A0    Group18
273
        ba,pt           %xcc, U3copy_in_user_loop2      ! BR
274
         srl            %o4, 6, %o3                     ! A0    Group19
275
        /* NOTE(review): the nops below are presumably alignment padding
         * for the loop that follows -- confirm before removing.
         */
        nop
276
        nop
277
        nop
278
        nop
279
        nop
280
 
281
        nop
282
        nop
283
 
284
        /* This loop performs the copy and queues new prefetches.
285
         * We drop into the second loop once the block count in %o4
286
         * runs out.  Note that (7 * 64) was subtracted from %o4
287
         * before entry (see "subcc %o4, 0x1c0, %o4").
288
         */
289
U3copy_in_user_loop1:
        /* Per iteration: finish aligning the previous block into
         * %f28/%f30, block-store %f16-%f30 to dst, load the next 64
         * source bytes into %f2-%f14 and %f0, and start aligning them
         * into %f16-%f26.
         */
290
        EXBLK1(ldda [%o1 + 0x008] %asi, %f2)            ! MS    Group2  (%f2 results at G5)
291
        faligndata      %f12, %f14, %f28                ! FGA           (%f28 results at G5)
292
        EXBLK1(ldda [%o1 + 0x010] %asi, %f4)            ! MS    Group3  (%f4 results at G6)
293
        faligndata      %f14, %f0, %f30                 ! FGA   Group4  (1-cycle stall, %f30 at G7)
294
        EXBLK1(stda %f16, [%o0] ASI_BLK_AIUS)           ! MS
295
        EXBLK1(ldda [%o1 + 0x018] %asi, %f6)            ! AX            (%f6 results at G7)
296
 
297
        faligndata      %f0, %f2, %f16                  ! FGA   Group12 (7-cycle stall)
298
        EXBLK1(ldda [%o1 + 0x020] %asi, %f8)            ! MS            (%f8 results at G15)
299
        faligndata      %f2, %f4, %f18                  ! FGA   Group13 (%f18 results at G16)
300
        EXBLK1(ldda [%o1 + 0x028] %asi, %f10)           ! MS            (%f10 results at G16)
301
        faligndata      %f4, %f6, %f20                  ! FGA   Group14 (%f20 results at G17)
302
        EXBLK1(ldda [%o1 + 0x030] %asi, %f12)           ! MS            (%f12 results at G17)
303
        faligndata      %f6, %f8, %f22                  ! FGA   Group15 (%f22 results at G18)
304
        EXBLK1(ldda [%o1 + 0x038] %asi, %f14)           ! MS            (%f14 results at G18)
305
 
306
        faligndata      %f8, %f10, %f24                 ! FGA   Group16 (%f24 results at G19)
307
        EXBLK1(ldda [%o1 + 0x040] %asi, %f0)            ! AX            (%f0 results at G19)
308
        prefetcha       [%o1 + 0x180] %asi, #one_read   ! MS
309
        faligndata      %f10, %f12, %f26                ! FGA   Group17 (%f26 results at G20)
310
        subcc           %o4, 0x40, %o4                  ! A0
311
        add             %o1, 0x40, %o1                  ! A1
312
        bg,pt           %xcc, U3copy_in_user_loop1              ! BR
313
         add            %o0, 0x40, %o0                  ! A0    Group18
314
 
315
U3copy_in_user_loop2_enter:
        /* Reached by falling out of the first loop: run the second
         * loop five times (%o3 is the trip counter).
         */
315
        mov             5, %o3                          ! A1
316
 
317
        /* This loop performs only the copy, no new prefetches are
318
         * queued.  We do things this way so that we do not perform
319
         * any spurious prefetches past the end of the src buffer.
320
         */
321
U3copy_in_user_loop2:
322
        EXBLK2(ldda [%o1 + 0x008] %asi, %f2)            ! MS
323
        faligndata      %f12, %f14, %f28                ! FGA   Group2
324
        EXBLK2(ldda [%o1 + 0x010] %asi, %f4)            ! MS
325
        faligndata      %f14, %f0, %f30                 ! FGA   Group4  (1-cycle stall)
326
        EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS)           ! MS
327
        EXBLK2(ldda [%o1 + 0x018] %asi, %f6)            ! AX
328
        faligndata      %f0, %f2, %f16                  ! FGA   Group12 (7-cycle stall)
329
 
330
        EXBLK2(ldda [%o1 + 0x020] %asi, %f8)            ! MS
331
        faligndata      %f2, %f4, %f18                  ! FGA   Group13
332
        EXBLK2(ldda [%o1 + 0x028] %asi, %f10)           ! MS
333
        faligndata      %f4, %f6, %f20                  ! FGA   Group14
334
        EXBLK2(ldda [%o1 + 0x030] %asi, %f12)           ! MS
335
        faligndata      %f6, %f8, %f22                  ! FGA   Group15
336
        EXBLK2(ldda [%o1 + 0x038] %asi, %f14)           ! MS
337
        faligndata      %f8, %f10, %f24                 ! FGA   Group16
338
 
339
        EXBLK2(ldda [%o1 + 0x040] %asi, %f0)            ! AX
340
        faligndata      %f10, %f12, %f26                ! FGA   Group17
341
        subcc           %o3, 0x01, %o3                  ! A0
342
        add             %o1, 0x40, %o1                  ! A1
343
        bg,pt           %xcc, U3copy_in_user_loop2      ! BR
344
         add            %o0, 0x40, %o0                  ! A0    Group18
346
 
347
        /* Finally we store the last two full 64-byte blocks: the one
         * already being aligned in %f16-%f30 and the one loaded here.
         * The trailing load from +0x40 is skipped when %g1 == 0, i.e.
         * when src was 8-byte aligned (presumably so that nothing is
         * read beyond the last source doubleword -- confirm).
         */
348
U3copy_in_user_loopfini:
349
        EXBLK3(ldda [%o1 + 0x008] %asi, %f2)            ! MS
350
        faligndata      %f12, %f14, %f28                ! FGA
351
        EXBLK3(ldda [%o1 + 0x010] %asi, %f4)            ! MS    Group19
352
        faligndata      %f14, %f0, %f30                 ! FGA
353
        EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS)           ! MS    Group20
354
        EXBLK4(ldda [%o1 + 0x018] %asi, %f6)            ! AX
355
        faligndata      %f0, %f2, %f16                  ! FGA   Group11 (7-cycle stall)
356
        EXBLK4(ldda [%o1 + 0x020] %asi, %f8)            ! MS
357
        faligndata      %f2, %f4, %f18                  ! FGA   Group12
358
        EXBLK4(ldda [%o1 + 0x028] %asi, %f10)           ! MS
359
        faligndata      %f4, %f6, %f20                  ! FGA   Group13
360
        EXBLK4(ldda [%o1 + 0x030] %asi, %f12)           ! MS
361
        faligndata      %f6, %f8, %f22                  ! FGA   Group14
362
        EXBLK4(ldda [%o1 + 0x038] %asi, %f14)           ! MS
363
        faligndata      %f8, %f10, %f24                 ! FGA   Group15
364
        cmp             %g1, 0                          ! A0
365
        be,pt           %icc, 1f                        ! BR
366
         add            %o0, 0x40, %o0                  ! A1
367
        EXBLK4(ldda [%o1 + 0x040] %asi, %f0)            ! MS
368
1:      faligndata      %f10, %f12, %f26                ! FGA   Group16
369
        faligndata      %f12, %f14, %f28                ! FGA   Group17
370
        faligndata      %f14, %f0, %f30                 ! FGA   Group18
371
        EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS)           ! MS
372
        add             %o0, 0x40, %o0                  ! A0
373
        add             %o1, 0x40, %o1                  ! A1
374
        membar          #Sync                           ! MS    Group26 (7-cycle stall)
375
 
376
        /* Now we copy the (len modulo 64) bytes at the end.
377
         * Note how we borrow the %f0 loaded above.
378
         *
379
         * Also notice how this code is careful not to perform a
380
         * load past the end of the src buffer just like similar
381
         * code found in U3copy_in_user_toosmall processing.
382
         */
383
U3copy_in_user_loopend:
        /* %o2 = len & 0x3f.  %g2 = (%o2 & 0x38) - 8 is the number of
         * bytes moved 8 at a time below; if that is zero or %o2 has no
         * whole doubleword, everything is left to the byte loop.
         */
384
        and             %o2, 0x3f, %o2                  ! A0    Group
385
        andcc           %o2, 0x38, %g2                  ! A0    Group
386
        be,pn           %icc, U3copy_in_user_endcruft   ! BR
387
         subcc          %g2, 0x8, %g2                   ! A1
388
        be,pn           %icc, U3copy_in_user_endcruft   ! BR    Group
389
         cmp            %g1, 0                          ! A0
390
 
391
        /* The annulled delay-slot load only executes when %g1 == 0;
         * otherwise the %f0 loaded earlier is reused.
         */
        be,a,pt         %icc, 1f                        ! BR    Group
392
         EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0)  ! MS
393
 
394
1:      EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0)   ! MS    Group
395
        add             %o1, 0x8, %o1                   ! A0
396
        sub             %o2, 0x8, %o2                   ! A1
397
        subcc           %g2, 0x8, %g2                   ! A0    Group
398
        faligndata      %f0, %f2, %f8                   ! FGA   Group
399
        EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)   ! MS    (XXX does it stall here? XXX)
400
        be,pn           %icc, U3copy_in_user_endcruft   ! BR
401
         add            %o0, 0x8, %o0                   ! A0
402
        EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0)   ! MS    Group
403
        add             %o1, 0x8, %o1                   ! A0
404
        sub             %o2, 0x8, %o2                   ! A1
405
        subcc           %g2, 0x8, %g2                   ! A0    Group
406
        faligndata      %f2, %f0, %f8                   ! FGA
407
        EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)   ! MS    (XXX does it stall here? XXX)
408
        bne,pn          %icc, 1b                        ! BR
409
         add            %o0, 0x8, %o0                   ! A0    Group
410
 
411
        /* If anything is left, we copy it one byte at a time.
412
         * Note that %g1 is (src & 0x7) saved above before the
413
         * alignaddr was performed; it is added back to %o1 here.
414
         */
415
U3copy_in_user_endcruft:
416
        cmp             %o2, 0
417
        add             %o1, %g1, %o1
418
        VISExitHalf
419
        be,pn           %icc, U3copy_in_user_short_ret
420
         nop
421
        ba,a,pt         %xcc, U3copy_in_user_short
422
 
423
        /* If we get here, then 32 <= len < 0x100 (4 * 64) and the
         * condition codes reflect "andcc %o0, 0x7, %g2", done by the
         * caller (only the SMALL_COPY_USES_FPU variant relies on them).
         *
         * Fixup convention used below: a load faults before any counter
         * has been stepped, so its fixup (EXNV/EX) returns the plain
         * sum; a store in a branch delay slot faults after the counter
         * was decremented, so its fixup (EXNV2/EXNV3) adds the step
         * back.  Using the "+1"/"+8" variants on a load would report
         * more bytes than were requested.
         */
424
U3copy_in_user_toosmall:
425
 
426
#ifdef SMALL_COPY_USES_FPU
427
 
428
        /* Is 'dst' already aligned on an 8-byte boundary? */
429
        be,pt           %xcc, 2f                        ! BR    Group
430
 
431
        /* Compute abs((dst & 7) - 8) into %g2.  This is the number
432
         * of bytes to copy to make 'dst' 8-byte aligned.  We pre-
433
         * subtract this from 'len'.
434
         */
435
         sub            %g2, 0x8, %g2                   ! A0
436
        sub             %g0, %g2, %g2                   ! A0    Group (reg-dep)
437
        sub             %o2, %g2, %o2                   ! A0    Group (reg-dep)
438
 
439
        /* Copy %g2 bytes from src to dst, one byte at a time. */
440
1:      EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)! MS    (Group) (%o3 in 3 cycles)
441
        add             %o1, 0x1, %o1                   ! A1
442
        add             %o0, 0x1, %o0                   ! A0    Group
443
        subcc           %g2, 0x1, %g2                   ! A1
444
 
445
        bg,pt           %icc, 1b                        ! BR    Group
446
         EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2) ! MS    Group
447
 
448
2:      VISEntryHalf                                    ! MS+MS
449
 
450
        /* Compute (len - (len % 8)) into %g2.  This is guaranteed
451
         * to be nonzero.
452
         */
453
        andn            %o2, 0x7, %g2                   ! A0    Group
454
 
455
        /* You may read this and believe that it allows reading
456
         * one 8-byte longword past the end of src.  It actually
457
         * does not, as %g2 is subtracted as loads are done from
458
         * src, so we always stop before running off the end.
459
         * Also, we are guaranteed to have at least 0x10 bytes
460
         * to move here.
461
         */
462
        sub             %g2, 0x8, %g2                   ! A0    Group (reg-dep)
463
        alignaddr       %o1, %g0, %g1                   ! MS          (Break-after)
464
        EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0)   ! MS    Group (1-cycle stall)
465
        add             %g1, 0x8, %g1                   ! A0
466
 
467
1:      EX(ldda [%g1 + 0x00] %asi, %f2, add %o2, %g0)   ! MS    Group
468
        add             %g1, 0x8, %g1                   ! A0
469
        sub             %o2, 0x8, %o2                   ! A1
470
        subcc           %g2, 0x8, %g2                   ! A0    Group
471
 
472
        faligndata      %f0, %f2, %f8                   ! FGA   Group (1-cycle stall)
473
        EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)   ! MS    Group (2-cycle stall)
474
        add             %o1, 0x8, %o1                   ! A0
475
        be,pn           %icc, 2f                        ! BR
476
 
477
         add            %o0, 0x8, %o0                   ! A1
478
        EX(ldda [%g1 + 0x00] %asi, %f0, add %o2, %g0)   ! MS    Group
479
        add             %g1, 0x8, %g1                   ! A0
480
        sub             %o2, 0x8, %o2                   ! A1
481
 
482
        subcc           %g2, 0x8, %g2                   ! A0    Group
483
        faligndata      %f2, %f0, %f8                   ! FGA   Group (1-cycle stall)
484
        EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)   ! MS    Group (2-cycle stall)
485
        add             %o1, 0x8, %o1                   ! A0
486
 
487
        bne,pn          %icc, 1b                        ! BR
488
         add            %o0, 0x8, %o0                   ! A1
489
 
490
        /* Nothing left to copy? */
491
2:      cmp             %o2, 0                          ! A0    Group
492
        VISExitHalf                                     ! A0+MS
493
        be,pn           %icc, U3copy_in_user_short_ret  ! BR    Group
494
         nop                                            ! A0
495
        ba,a,pt         %xcc, U3copy_in_user_short      ! BR    Group
496
 
497
#else /* !(SMALL_COPY_USES_FPU) */
498
 
499
        /* Integer-only variant.  If src and dst differ in their low
         * three bits they can never both be 8-byte aligned, so fall
         * back to the byte loop.  The andcc in the delay slot leaves
         * %g2 = src & 7 for the alignment step below.
         */
        xor             %o1, %o0, %g2
500
        andcc           %g2, 0x7, %g0
501
        bne,pn          %icc, U3copy_in_user_short
502
         andcc          %o1, 0x7, %g2
503
 
504
        /* Already 8-byte aligned?  Otherwise %g2 = 8 - (src & 7) bytes
         * are copied singly and pre-subtracted from 'len'.
         */
        be,pt           %xcc, 2f
505
         sub            %g2, 0x8, %g2
506
        sub             %g0, %g2, %g2
507
        sub             %o2, %g2, %o2
508
 
509
1:      EXNV(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
510
        add             %o1, 0x1, %o1
511
        add             %o0, 0x1, %o0
512
        subcc           %g2, 0x1, %g2
513
        bg,pt           %icc, 1b
514
         EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
515
 
516
        /* %g2 = bytes to move 8 at a time, %o2 = leftover (len % 8). */
2:      andn            %o2, 0x7, %g2
517
        sub             %o2, %g2, %o2
518
 
519
3:      EXNV(ldxa [%o1 + 0x00] %asi, %o3, add %o2, %g2)
520
        add             %o1, 0x8, %o1
521
        add             %o0, 0x8, %o0
522
        subcc           %g2, 0x8, %g2
523
        bg,pt           %icc, 3b
524
         EXNV3(stxa %o3, [%o0 + -8] %asi, add %o2, %g2)
525
 
526
        /* Finish any leftover bytes with the byte loop. */
        cmp             %o2, 0
527
        bne,pn          %icc, U3copy_in_user_short
528
         nop
529
        ba,a,pt         %xcc, U3copy_in_user_short_ret
530
 
531
#endif /* !(SMALL_COPY_USES_FPU) */

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.