OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [sparc64/] [lib/] [VIScsumcopyusr.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/* $Id: VIScsumcopyusr.S,v 1.1.1.1 2004-04-15 01:33:52 phoenix Exp $
2
 * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous
3
 *                   copying utilizing the UltraSparc Visual Instruction Set.
4
 *
5
 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6
 * Copyright (C) 2000 David S. Miller (davem@redhat.com)
7
 *
8
 * Based on older sparc32/sparc64 checksum.S, which is:
9
 *
10
 *      Copyright(C) 1995 Linus Torvalds
11
 *      Copyright(C) 1995 Miguel de Icaza
12
 *      Copyright(C) 1996,1997 David S. Miller
13
 *    derived from:
14
 *        Linux/Alpha checksum c-code
15
 *        Linux/ix86 inline checksum assembly
16
 *        RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
17
 *        David Mosberger-Tang for optimized reference c-code
18
 *        BSD4.4 portable checksum routine
19
 */
20
 
21
/*
 * STACKOFF: byte offset from %sp of the 8-byte scratch slot that
 * END_THE_TRICK2 uses to move a double FP register into an integer
 * register (std to [%sp + STACKOFF], then ldx from the same address).
 * It is 0x7ff+128 when __sparc_v9__ is defined and 64 otherwise.
 * NOTE(review): 0x7ff is presumably the V9 stack bias and 128 the register
 * window save area - confirm against the ABI.  The V9 value is not
 * parenthesised, so it is only safe in additive contexts such as
 * [%sp + STACKOFF].
 */
#ifdef __sparc_v9__
22
#define STACKOFF        0x7ff+128
23
#else
24
#define STACKOFF        64
25
#endif
26
 
27
/*
 * ASI and FPRS constants.
 *
 * Kernel build: ASI_P / ASI_BLK_P are expected to come from the included
 * headers; only the derived values are defined here:
 *   ASI_BLK_XOR   - value XORed into %g2 by "wr %g2, ASI_BLK_XOR, %asi" at
 *                   vis0s (0, i.e. no change, in the kernel build).
 *   ASI_BLK_XOR1  - constant used by the kernel STBLK_XORASI variant.
 *   ASI_BLK_OR    - bits ORed into the saved %asi value before vis0s.
 * Non-kernel build: the raw ASI numbers and FPRS bit masks are defined
 * locally, and ASI_BLK_XOR toggles between ASI_P and ASI_BLK_P.
 *
 * FPRS_FEF was originally misspelled FRPS_FEF; the correct name is now the
 * primary definition and the old spelling is kept as an alias so that any
 * existing user of it still builds.
 *
 * NOTE(review): the four #include directives below have no target in this
 * copy of the file (the <...> names were probably stripped as markup) -
 * restore them from the pristine source before building.
 */
#ifdef __KERNEL__
28
#include 
29
#include 
30
#include 
31
#include 
32
#define ASI_BLK_XOR     0
33
#define ASI_BLK_XOR1    (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
34
#define ASI_BLK_OR      (ASI_BLK_P & ~ASI_P)
35
#else
36
#define ASI_P           0x80
37
#define ASI_BLK_P       0xf0
38
#define FPRS_FEF        0x04
#define FRPS_FEF        FPRS_FEF        /* legacy misspelling, kept for compatibility */
39
#define FPRS_DU         0x02
40
#define FPRS_DL         0x01
41
#define ASI_BLK_XOR     (ASI_BLK_P ^ ASI_P)
42
#endif
43
 
44
/*
 * Symbolic register names.  They are written with a leading '%' at the use
 * site (e.g. %src expands to %o0).
 *   src, dst, len, sum - source pointer, destination pointer, remaining
 *                        byte count and running checksum (%o0..%o3).
 *                        NOTE(review): presumably the incoming arguments -
 *                        confirm against the C prototype.
 *   x1..x8             - integer scratch registers that receive fcmpgt32
 *                        results in the macros below and are then folded
 *                        into %sum.  Note that x3 and x8 alias %o4 / %o5.
 */
#define src             o0
45
#define dst             o1
46
#define len             o2
47
#define sum             o3
48
#define x1              g1
49
#define x2              g2
50
#define x3              o4
51
#define x4              g4
52
#define x5              g5
53
#define x6              g7
54
#define x7              g3
55
#define x8              o5
56
 
57
/* Good night, SunSoft engineers. Sleep sweetly.
58
 * This has a couple of tricks in it, and those
59
 * tricks are UltraLinux trade secrets :))
60
 * Once AGAIN, the SunSoft engineers are caught
61
 * asleep at the keyboard :)).
62
 * The main loop does about 20 superscalar cycles
63
 * per 64 bytes checksummed/copied.
64
 */
65
 
66
/*
 * LDBLK(O0): load from the address in %src using ASI_BLK_P into the FP
 * register group starting at %O0 (O0 is passed without the leading '%').
 * NOTE(review): ASI_BLK_P is presumably the 64-byte VIS block-load ASI,
 * which would match the "64bytes" per iteration mentioned above - confirm.
 */
#define LDBLK(O0)                                                                               \
67
        ldda            [%src] ASI_BLK_P, %O0   /*  Load        Group                   */
68
 
69
/*
 * STBLK: store the FP register group starting at %f48 to the address in
 * %dst, using whatever ASI is currently held in the %asi register.
 */
#define STBLK                                                                                   \
70
        stda            %f48, [%dst] %asi       /*  Store                               */
71
 
72
/*
 * STBLK_XORASI(tmpreg1, tmpreg2): perform the same store as STBLK and then
 * rewrite the %asi register.  Both temporaries are clobbered in the kernel
 * variant; only tmpreg1 is used in the non-kernel variant.
 *
 * "wr rs1, operand, %asi" writes rs1 XOR operand (SPARC V9), so:
 *   kernel:     %asi = (%asi ^ ASI_BLK_XOR1) ^ (%asi >> 3)
 *   non-kernel: %asi = %asi ^ ASI_BLK_XOR
 * With ASI_BLK_P = 0xf0 and ASI_P = 0x80 (the values in the non-kernel
 * branch of this file) both forms turn 0xf0 into 0x80.
 * NOTE(review): the intent appears to be switching %asi from the block ASI
 * back to the matching non-block ASI once the last block store has been
 * issued - confirm against the ASI numbering in the kernel headers.
 */
#ifdef __KERNEL__
73
#define STBLK_XORASI(tmpreg1,tmpreg2)                                                           \
74
        stda            %f48, [%dst] %asi       /*  Store                               */;     \
75
        rd              %asi, %tmpreg1;                                                         \
76
        srl             %tmpreg1, 3, %tmpreg2;                                                  \
77
        xor             %tmpreg1, ASI_BLK_XOR1, %tmpreg1;                                       \
78
        wr              %tmpreg1, %tmpreg2, %asi;
79
#else
80
#define STBLK_XORASI(tmpreg1,tmpreg2)                                                           \
81
        stda            %f48, [%dst] %asi       /*  Store                               */;     \
82
        rd              %asi, %tmpreg1;                                                         \
83
        wr              %tmpreg1, ASI_BLK_XOR, %asi;
84
#endif
85
 
86
/*
 * ST(fx, off): store FP register %fx to [%dst + off] using the ASI currently
 * held in %asi.  fx is passed without the leading '%'.
 */
#define ST(fx,off)                                                                              \
87
        stda            %fx, [%dst + off] %asi  /*  Store                               */
88
 
89
/*
 * SYNC: emit a "membar #Sync" barrier.
 */
#define SYNC                                                                                    \
90
        membar          #Sync
91
 
92
 
93
/*
 * DO_THE_TRICK: one 64-byte step of the combined copy + checksum loop.
 *
 * All parameters are substituted textually; register names are passed
 * without the leading '%'.
 *   f0..f14        - FP registers used as the second fpadd32 operand and as
 *                    the first fcmpgt32 operand (the values being added in).
 *   F0..F14        - FP registers holding the data for this step; each one
 *                    is first fed to faligndata and then overwritten with
 *                    F + f by fpadd32.
 *   A0..A12        - faligndata destinations for the pairs (F0,F2) ..
 *                    (F12,F14).
 *   A14            - updated in place with faligndata(A14, F0).
 *   B14            - receives a copy of F14 (fmovd) before F14 is
 *                    overwritten by its fpadd32.
 *   LOAD, STORE1..STORE8, BRANCH
 *                  - instruction slots: the text passed in is emitted at
 *                    that point.  BRANCH is a GNU cpp named variadic
 *                    parameter ("BRANCH..."), so it may contain commas.
 *   DUMMY1..DUMMY3 - not referenced in the body.
 *
 * Integer side: every fcmpgt32 writes a comparison mask (f > F after the
 * add) into one of %x1..%x8; the mask is later folded into %sum with the
 * triple "inc / srl 1 / add".  NOTE(review): per the VIS definition of
 * fcmpgt32 the mask has two bits, so (mask + 1) >> 1 is the number of set
 * bits - presumably the number of 32-bit lanes whose add wrapped; confirm.
 *
 * The macro is software-pipelined: the leading "inc %x5", "add %sum, %x4"
 * etc. consume masks produced by the previous expansion, and the masks
 * produced at the end of this expansion are left for the next one (or for
 * END_THE_TRICK).
 *
 * Side effects visible here: %src and %dst advance by 64, %len drops by 64
 * via subcc (setting the condition codes tested by BRANCH), and the final
 * fcmpgt32 follows BRANCH, i.e. it sits in the delay slot when BRANCH is a
 * delayed control transfer.
 */
#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)  \
94
        LOAD                                    /*  Load        (Group)         */;     \
95
        faligndata      %A14, %F0, %A14         /*  FPA         Group           */;     \
96
        inc             %x5                     /*  IEU0                        */;     \
97
        STORE1                                  /*  Store (optional)            */;     \
98
        faligndata      %F0, %F2, %A0           /*  FPA         Group           */;     \
99
        srl             %x5, 1, %x5             /*  IEU0                        */;     \
100
        add             %sum, %x4, %sum         /*  IEU1                        */;     \
101
        fpadd32         %F0, %f0, %F0           /*  FPA         Group           */;     \
102
        inc             %x6                     /*  IEU0                        */;     \
103
        STORE2                                  /*  Store (optional)            */;     \
104
        faligndata      %F2, %F4, %A2           /*  FPA         Group           */;     \
105
        srl             %x6, 1, %x6             /*  IEU0                        */;     \
106
        add             %sum, %x5, %sum         /*  IEU1                        */;     \
107
        fpadd32         %F2, %f2, %F2           /*  FPA         Group           */;     \
108
        add             %src, 64, %src          /*  IEU0                        */;     \
109
        fcmpgt32        %f0, %F0, %x1           /*  FPM                         */;     \
110
        add             %dst, 64, %dst          /*  IEU1        Group           */;     \
111
        inc             %x7                     /*  IEU0                        */;     \
112
        STORE3                                  /*  Store (optional)            */;     \
113
        faligndata      %F4, %F6, %A4           /*  FPA                         */;     \
114
        fpadd32         %F4, %f4, %F4           /*  FPA         Group           */;     \
115
        add             %sum, %x6, %sum         /*  IEU1                        */;     \
116
        fcmpgt32        %f2, %F2, %x2           /*  FPM                         */;     \
117
        srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
118
        inc             %x8                     /*  IEU1                        */;     \
119
        STORE4                                  /*  Store (optional)            */;     \
120
        faligndata      %F6, %F8, %A6           /*  FPA                         */;     \
121
        fpadd32         %F6, %f6, %F6           /*  FPA         Group           */;     \
122
        srl             %x8, 1, %x8             /*  IEU0                        */;     \
123
        fcmpgt32        %f4, %F4, %x3           /*  FPM                         */;     \
124
        add             %sum, %x7, %sum         /*  IEU0        Group           */;     \
125
        inc             %x1                     /*  IEU1                        */;     \
126
        STORE5                                  /*  Store (optional)            */;     \
127
        faligndata      %F8, %F10, %A8          /*  FPA                         */;     \
128
        fpadd32         %F8, %f8, %F8           /*  FPA         Group           */;     \
129
        srl             %x1, 1, %x1             /*  IEU0                        */;     \
130
        fcmpgt32        %f6, %F6, %x4           /*  FPM                         */;     \
131
        add             %sum, %x8, %sum         /*  IEU0        Group           */;     \
132
        inc             %x2                     /*  IEU1                        */;     \
133
        STORE6                                  /*  Store (optional)            */;     \
134
        faligndata      %F10, %F12, %A10        /*  FPA                         */;     \
135
        fpadd32         %F10, %f10, %F10        /*  FPA         Group           */;     \
136
        srl             %x2, 1, %x2             /*  IEU0                        */;     \
137
        fcmpgt32        %f8, %F8, %x5           /*  FPM                         */;     \
138
        add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
139
        inc             %x3                     /*  IEU1                        */;     \
140
        STORE7                                  /*  Store (optional)            */;     \
141
        faligndata      %F12, %F14, %A12        /*  FPA                         */;     \
142
        fpadd32         %F12, %f12, %F12        /*  FPA         Group           */;     \
143
        srl             %x3, 1, %x3             /*  IEU0                        */;     \
144
        fcmpgt32        %f10, %F10, %x6         /*  FPM                         */;     \
145
        add             %sum, %x2, %sum         /*  IEU0        Group           */;     \
146
        inc             %x4                     /*  IEU1                        */;     \
147
        STORE8                                  /*  Store (optional)            */;     \
148
        fmovd           %F14, %B14              /*  FPA                         */;     \
149
        fpadd32         %F14, %f14, %F14        /*  FPA         Group           */;     \
150
        srl             %x4, 1, %x4             /*  IEU0                        */;     \
151
        fcmpgt32        %f12, %F12, %x7         /*  FPM                         */;     \
152
        add             %sum, %x3, %sum         /*  IEU0        Group           */;     \
153
        subcc           %len, 64, %len          /*  IEU1                        */;     \
154
        BRANCH                                  /*  CTI                         */;     \
155
        fcmpgt32        %f14, %F14, %x8         /*  FPM         Group           */;
156
 
157
/*
 * END_THE_TRICK: drain the software pipeline left by DO_THE_TRICK and start
 * reducing the eight partial FP sums to one, then branch to "ett".
 *
 * Parameters (register names without the leading '%'):
 *   f0..f14      - the eight partial sums to be combined.
 *   FA, FB       - FA is copied to FB with fmovd near the end.
 *   S0..S3       - receive the pairwise sums f2+f0, f6+f4, f10+f8, f14+f12.
 *   T0, T1       - receive S0+S1 and S2+S3.
 *   U0           - receives T0+T1.
 *   fz           - zeroed with fzero and used as the first operand of the
 *                  "fcmpgt32 %fz, ..." comparisons against f2, f6, f10, f14.
 *
 * Integer side: the opening inc/srl/add sequence on %x5..%x8 and %x4
 * finishes folding the masks left by the preceding DO_THE_TRICK.  The masks
 * from this macro's own "old sum > new sum" comparisons (%x1..%x5) are
 * folded into %sum here; %x6 is only incremented and shifted here, not yet
 * added.  On exit %x7 has been incremented in the branch delay slot, and
 * %x8, %x1, %x2 hold raw "fz > f" masks; these are left for the code at
 * ett, which is not visible in this part of the file.
 * NOTE(review): END_THE_TRICK2 begins with exactly the matching
 * continuation (srl %x7, add %x6, inc %x8, ...), so it is presumably what
 * runs at ett - confirm.
 */
#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
158
        inc             %x5                     /*  IEU0        Group           */;     \
159
        fpadd32         %f2, %f0, %S0           /*  FPA                         */;     \
160
        add             %sum, %x4, %sum         /*  IEU1                        */;     \
161
        srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
162
        fpadd32         %f6, %f4, %S1           /*  FPA                         */;     \
163
        inc             %x6                     /*  IEU1                        */;     \
164
        fpadd32         %f10, %f8, %S2          /*  FPA         Group           */;     \
165
        add             %sum, %x5, %sum         /*  IEU0                        */;     \
166
        fcmpgt32        %f0, %S0, %x1           /*  FPM                         */;     \
167
        fpadd32         %f14, %f12, %S3         /*  FPA         Group           */;     \
168
        srl             %x6, 1, %x6             /*  IEU0                        */;     \
169
        fcmpgt32        %f4, %S1, %x2           /*  FPM                         */;     \
170
        add             %sum, %x6, %sum         /*  IEU0        Group           */;     \
171
        fzero           %fz                     /*  FPA                         */;     \
172
        fcmpgt32        %f8, %S2, %x3           /*  FPM                         */;     \
173
        inc             %x7                     /*  IEU0        Group           */;     \
174
        inc             %x8                     /*  IEU1                        */;     \
175
        srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
176
        inc             %x1                     /*  IEU1                        */;     \
177
        fpadd32         %S0, %S1, %T0           /*  FPA                         */;     \
178
        fpadd32         %S2, %S3, %T1           /*  FPA         Group           */;     \
179
        add             %sum, %x7, %sum         /*  IEU0                        */;     \
180
        fcmpgt32        %f12, %S3, %x4          /*  FPM                         */;     \
181
        srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
182
        inc             %x2                     /*  IEU1                        */;     \
183
        srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
184
        add             %sum, %x8, %sum         /*  IEU1                        */;     \
185
        add             %sum, %x1, %sum         /*  IEU0        Group           */;     \
186
        fcmpgt32        %S0, %T0, %x5           /*  FPM                         */;     \
187
        srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
188
        fcmpgt32        %S2, %T1, %x6           /*  FPM                         */;     \
189
        inc             %x3                     /*  IEU0        Group           */;     \
190
        add             %sum, %x2, %sum         /*  IEU1                        */;     \
191
        srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
192
        inc             %x4                     /*  IEU1                        */;     \
193
        fpadd32         %T0, %T1, %U0           /*  FPA         Group           */;     \
194
        add             %sum, %x3, %sum         /*  IEU0                        */;     \
195
        fcmpgt32        %fz, %f2, %x7           /*  FPM                         */;     \
196
        srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
197
        fcmpgt32        %fz, %f6, %x8           /*  FPM                         */;     \
198
        inc             %x5                     /*  IEU0        Group           */;     \
199
        add             %sum, %x4, %sum         /*  IEU1                        */;     \
200
        srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
201
        fcmpgt32        %fz, %f10, %x1          /*  FPM                         */;     \
202
        inc             %x6                     /*  IEU0        Group           */;     \
203
        add             %sum, %x5, %sum         /*  IEU1                        */;     \
204
        fmovd           %FA, %FB                /*  FPA         Group           */;     \
205
        fcmpgt32        %fz, %f14, %x2          /*  FPM                         */;     \
206
        srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
207
        ba,pt           %xcc, ett               /*  CTI                         */;     \
208
         inc            %x7                     /*  IEU1                        */;
209
 
210
/*
 * END_THE_TRICK1: END_THE_TRICK with the temporaries fixed to
 * S0..S3 = %f48..%f54, T0/T1 = %f56/%f58, U0 = %f60 and fz = %f62.
 * The caller supplies only the eight partial sums and the FA/FB pair.
 */
#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)                                \
211
        END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
212
 
213
/*
 * END_THE_TRICK2: final stage of the reduction - combine the last two FP
 * partial sums and fold everything into the integer register %sum.
 *
 * Parameters (register names without the leading '%'):
 *   U0, U1  - the two remaining partial sums; V0 receives U0 + U1.
 *   S1, S3, T1, U1, V0
 *           - each compared against fz ("fcmpgt32 %fz, X"); the resulting
 *             masks are subtracted from %sum.
 *   T0      - compared against U0 ("fcmpgt32 %T0, %U0"); this mask and the
 *             "U0 > V0" mask are added to %sum.
 *   fz      - FP register used as the first operand of the fz comparisons.
 *             NOTE(review): expected to still hold zero from END_THE_TRICK
 *             - confirm.
 *   S0, S2  - not referenced in the body.
 *
 * On entry the macro continues the pipeline left by END_THE_TRICK: %x6 is
 * added as is, %x7 only needs its srl, and %x8, %x1, %x2 still need
 * inc + srl; the %x7/%x8/%x1/%x2 values are then subtracted.
 *
 * V0 is moved to the integer side through memory: std to [%sp + STACKOFF]
 * followed by ldx from the same slot into %x8.  The last step adds %x8 to
 * %sum with addcc; "bcs,a" has the annul bit set, so the "add %sum, 1" in
 * its delay slot executes only when the carry is set (end-around carry).
 * 33 is a numeric local label, so the macro can be expanded more than once.
 */
#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)                                   \
214
        fpadd32         %U0, %U1, %V0           /*  FPA         Group           */;     \
215
        srl             %x7, 1, %x7             /*  IEU0                        */;     \
216
        add             %sum, %x6, %sum         /*  IEU1                        */;     \
217
        std             %V0, [%sp + STACKOFF]   /*  Store       Group           */;     \
218
        inc             %x8                     /*  IEU0                        */;     \
219
        sub             %sum, %x7, %sum         /*  IEU1                        */;     \
220
        srl             %x8, 1, %x8             /*  IEU0        Group           */;     \
221
        fcmpgt32        %fz, %S1, %x3           /*  FPM                         */;     \
222
        inc             %x1                     /*  IEU0        Group           */;     \
223
        fcmpgt32        %fz, %S3, %x4           /*  FPM                         */;     \
224
        srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
225
        sub             %sum, %x8, %sum         /*  IEU1                        */;     \
226
        ldx             [%sp + STACKOFF], %x8   /*  Load        Group           */;     \
227
        inc             %x2                     /*  IEU0                        */;     \
228
        sub             %sum, %x1, %sum         /*  IEU1                        */;     \
229
        srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
230
        fcmpgt32        %fz, %T1, %x5           /*  FPM                         */;     \
231
        inc             %x3                     /*  IEU0        Group           */;     \
232
        fcmpgt32        %T0, %U0, %x6           /*  FPM                         */;     \
233
        srl             %x3, 1, %x3             /*  IEU0        Group           */;     \
234
        sub             %sum, %x2, %sum         /*  IEU1                        */;     \
235
        inc             %x4                     /*  IEU0        Group           */;     \
236
        sub             %sum, %x3, %sum         /*  IEU1                        */;     \
237
        srl             %x4, 1, %x4             /*  IEU0        Group           */;     \
238
        fcmpgt32        %fz, %U1, %x7           /*  FPM                         */;     \
239
        inc             %x5                     /*  IEU0        Group           */;     \
240
        fcmpgt32        %U0, %V0, %x1           /*  FPM                         */;     \
241
        srl             %x5, 1, %x5             /*  IEU0        Group           */;     \
242
        sub             %sum, %x4, %sum         /*  IEU1                        */;     \
243
        sub             %sum, %x5, %sum         /*  IEU0        Group           */;     \
244
        fcmpgt32        %fz, %V0, %x2           /*  FPM                         */;     \
245
        inc             %x6                     /*  IEU0        Group           */;     \
246
        inc             %x7                     /*  IEU1                        */;     \
247
        srl             %x6, 1, %x6             /*  IEU0        Group           */;     \
248
        inc             %x1                     /*  IEU1                        */;     \
249
        srl             %x7, 1, %x7             /*  IEU0        Group           */;     \
250
        add             %sum, %x6, %sum         /*  IEU1                        */;     \
251
        srl             %x1, 1, %x1             /*  IEU0        Group           */;     \
252
        sub             %sum, %x7, %sum         /*  IEU1                        */;     \
253
        inc             %x2                     /*  IEU0        Group           */;     \
254
        add             %sum, %x1, %sum         /*  IEU1                        */;     \
255
        srl             %x2, 1, %x2             /*  IEU0        Group           */;     \
256
        sub             %sum, %x2, %sum         /*  IEU0        Group           */;     \
257
        addcc           %sum, %x8, %sum         /*  IEU1        Group           */;     \
258
        bcs,a,pn        %xcc, 33f               /*  CTI                         */;     \
259
         add            %sum, 1, %sum           /*  IEU0        (Group)         */;     \
260
33:                                             /*  That's it                   */;
261
 
262
        .text
263
        .globl          csum_partial_copy_user_vis
264
        .align          32
265
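/* %asi should be either ASI_P or ASI_AIUS, for csum_partial_copy and
265
 * csum_partial_copy_from_user respectively.
266
 * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
267
 * NOTE(review): in this routine %asi qualifies the stores to %dst
 * (stha/stwa/stda ... %asi) while %src is read with ordinary loads, so
 * the name "csum_partial_copy_from_user" above looks stale - confirm.
 */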
269
csum_partial_copy_user_vis:
270
        andcc           %dst, 7, %g0            /*  IEU1        Group           */
271
        be,pt           %icc, 4f                /*  CTI                         */
272
         and            %dst, 0x38, %o4         /*  IEU0                        */
273
        mov             1, %g5                  /*  IEU0        Group           */
274
        andcc           %dst, 2, %g0            /*  IEU1                        */
275
        be,pt           %icc, 1f                /*  CTI                         */
276
         and            %dst, 4, %g7            /*  IEU0        Group           */
277
        lduh            [%src], %g2             /*  Load                        */
278
        sub             %len, 2, %len           /*  IEU0        Group           */
279
        add             %dst, 2, %dst           /*  IEU1                        */
280
        andcc           %dst, 4, %g7            /*  IEU1        Group           */
281
        sll             %g5, 16, %g5            /*  IEU0                        */
282
        stha            %g2, [%dst - 2] %asi    /*  Store       Group           */
283
        sll             %g2, 16, %g2            /*  IEU0                        */
284
        add             %src, 2, %src           /*  IEU1                        */
285
        addcc           %g2, %sum, %sum         /*  IEU1        Group           */
286
        bcs,a,pn        %icc, 1f                /*  CTI                         */
287
         add            %sum, %g5, %sum         /*  IEU0                        */
288
1:      lduw            [%src], %g2             /*  Load                        */
289
        brz,a,pn        %g7, 4f                 /*  CTI+IEU1    Group           */
290
         and            %dst, 0x38, %o4         /*  IEU0                        */
291
        add             %dst, 4, %dst           /*  IEU0        Group           */
292
        sub             %len, 4, %len           /*  IEU1                        */
293
        addcc           %g2, %sum, %sum         /*  IEU1        Group           */
294
        bcs,a,pn        %icc, 1f                /*  CTI                         */
295
         add            %sum, 1, %sum           /*  IEU0                        */
296
1:      and             %dst, 0x38, %o4         /*  IEU0        Group           */
297
        stwa            %g2, [%dst - 4] %asi    /*  Store                       */
298
        add             %src, 4, %src           /*  IEU1                        */
299
4:
300
#ifdef __KERNEL__
301
        VISEntry
302
#endif
303
        mov             %src, %g7               /*  IEU1        Group           */
304
        fzero           %f48                    /*  FPA                         */
305
        alignaddr       %src, %g0, %src         /*  Single      Group           */
306
        subcc           %g7, %src, %g7          /*  IEU1        Group           */
307
        be,pt           %xcc, 1f                /*  CTI                         */
308
         mov            0x40, %g1               /*  IEU0                        */
309
        lduw            [%src], %g2             /*  Load        Group           */
310
        subcc           %sum, %g2, %sum         /*  IEU1        Group+load stall*/
311
        bcs,a,pn        %icc, 1f                /*  CTI                         */
312
         sub            %sum, 1, %sum           /*  IEU0                        */
313
1:      srl             %sum, 0, %sum           /*  IEU0        Group           */
314
        clr             %g5                     /*  IEU1                        */
315
        brz,pn          %o4, 3f                 /*  CTI+IEU1    Group           */
316
         sub            %g1, %o4, %g1           /*  IEU0                        */
317
        ldd             [%src], %f0             /*  Load                        */
318
        clr             %o4                     /*  IEU0        Group           */
319
        andcc           %dst, 8, %g0            /*  IEU1                        */
320
        be,pn           %icc, 1f                /*  CTI                         */
321
         ldd            [%src + 8], %f2         /*  Load        Group           */
322
        add             %src, 8, %src           /*  IEU0                        */
323
        sub             %len, 8, %len           /*  IEU1                        */
324
        fpadd32         %f0, %f48, %f50         /*  FPA                         */
325
        addcc           %dst, 8, %dst           /*  IEU1        Group           */
326
        faligndata      %f0, %f2, %f16          /*  FPA                         */
327
        fcmpgt32        %f48, %f50, %o4         /*  FPM         Group           */
328
        fmovd           %f2, %f0                /*  FPA         Group           */
329
        ldd             [%src + 8], %f2         /*  Load                        */
330
        stda            %f16, [%dst - 8] %asi   /*  Store                       */
331
        fmovd           %f50, %f48              /*  FPA                         */
332
1:      andcc           %g1, 0x10, %g0          /*  IEU1        Group           */
333
        be,pn           %icc, 1f                /*  CTI                         */
334
         and            %g1, 0x20, %g1          /*  IEU0                        */
335
        fpadd32         %f0, %f48, %f50         /*  FPA                         */
336
        ldd             [%src + 16], %f4        /*  Load        Group           */
337
        add             %src, 16, %src          /*  IEU0                        */
338
        add             %dst, 16, %dst          /*  IEU1                        */
339
        faligndata      %f0, %f2, %f16          /*  FPA                         */
340
        fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
341
        sub             %len, 16, %len          /*  IEU0                        */
342
        inc             %o4                     /*  IEU1                        */
343
        stda            %f16, [%dst - 16] %asi  /*  Store       Group           */
344
        fpadd32         %f2, %f50, %f48         /*  FPA                         */
345
        srl             %o4, 1, %o5             /*  IEU0                        */
346
        faligndata      %f2, %f4, %f18          /*  FPA         Group           */
347
        stda            %f18, [%dst - 8] %asi   /*  Store                       */
348
        fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
349
        add             %o5, %sum, %sum         /*  IEU0                        */
350
        ldd             [%src + 8], %f2         /*  Load                        */
351
        fmovd           %f4, %f0                /*  FPA                         */
352
1:      brz,a,pn        %g1, 4f                 /*  CTI+IEU1    Group           */
353
         rd             %asi, %g2               /*  LSU         Group + 4 bubbles*/
354
        inc             %g5                     /*  IEU0                        */
355
        fpadd32         %f0, %f48, %f50         /*  FPA                         */
356
        ldd             [%src + 16], %f4        /*  Load        Group           */
357
        srl             %g5, 1, %g5             /*  IEU0                        */
358
        add             %dst, 32, %dst          /*  IEU1                        */
359
        faligndata      %f0, %f2, %f16          /*  FPA                         */
360
        fcmpgt32        %f48, %f50, %o5         /*  FPM         Group           */
361
        inc             %o4                     /*  IEU0                        */
362
        ldd             [%src + 24], %f6        /*  Load                        */
363
        srl             %o4, 1, %o4             /*  IEU0        Group           */
364
        add             %g5, %sum, %sum         /*  IEU1                        */
365
        ldd             [%src + 32], %f8        /*  Load                        */
366
        fpadd32         %f2, %f50, %f48         /*  FPA                         */
367
        faligndata      %f2, %f4, %f18          /*  FPA         Group           */
368
        sub             %len, 32, %len          /*  IEU0                        */
369
        stda            %f16, [%dst - 32] %asi  /*  Store                       */
370
        fcmpgt32        %f50, %f48, %g3         /*  FPM         Group           */
371
        inc             %o5                     /*  IEU0                        */
372
        add             %o4, %sum, %sum         /*  IEU1                        */
373
        fpadd32         %f4, %f48, %f50         /*  FPA                         */
374
        faligndata      %f4, %f6, %f20          /*  FPA         Group           */
375
        srl             %o5, 1, %o5             /*  IEU0                        */
376
        fcmpgt32        %f48, %f50, %g5         /*  FPM         Group           */
377
        add             %o5, %sum, %sum         /*  IEU0                        */
378
        stda            %f18, [%dst - 24] %asi  /*  Store                       */
379
        fpadd32         %f6, %f50, %f48         /*  FPA                         */
380
        inc             %g3                     /*  IEU0        Group           */
381
        stda            %f20, [%dst - 16] %asi  /*  Store                       */
382
        add             %src, 32, %src          /*  IEU1                        */
383
        faligndata      %f6, %f8, %f22          /*  FPA                         */
384
        fcmpgt32        %f50, %f48, %o4         /*  FPM         Group           */
385
        srl             %g3, 1, %g3             /*  IEU0                        */
386
        stda            %f22, [%dst - 8] %asi   /*  Store                       */
387
        add             %g3, %sum, %sum         /*  IEU0        Group           */
388
/*
 * Computed dispatch into one of the eight entry points vis0s..vis7s.
 *
 * The target is vis0s + ((%src & 0x38) << 8), i.e. vis0s plus 2048 times the
 * doubleword index of %src within its 64-byte block.  Every visNs label is
 * preceded by ".align 2048"; the scheme assumes each section assembles to at
 * most 2048 bytes so that consecutive entries are exactly 2048 bytes apart.
 *
 * On the way, the values in %g5 and %o4 are reduced with inc/srl 1 (which
 * maps 0,1,2,3 to 0,1,1,2) and added to %sum -- presumably these hold
 * fcmpgt32 results produced by the code before label 3.  %len is lowered by
 * 0xc0 (192); the visNe* exits add "192 - k*8" back.
 */
3:      rd              %asi, %g2               /*  LSU         Group + 4 bubbles*/
389
#ifdef __KERNEL__
390
/* Kernel build: absolute address of vis0s via sethi/%lo, and OR ASI_BLK_OR into the saved %asi value. */
4:      sethi           %hi(vis0s), %g7         /*  IEU0        Group           */
391
        or              %g2, ASI_BLK_OR, %g2    /*  IEU1                        */
392
#else
393
/* Non-kernel build: PC-relative; %g7 = address of this instruction (label 4). */
4:      rd              %pc, %g7                /*  LSU         Group + 4 bubbles*/
394
#endif
395
        inc             %g5                     /*  IEU0        Group           */
396
        and             %src, 0x38, %g3         /*  IEU1                        */
397
        membar          #StoreLoad              /*  LSU         Group           */
398
        srl             %g5, 1, %g5             /*  IEU0                        */
399
        inc             %o4                     /*  IEU1                        */
400
        /* %g3 = (%src & 0x38) << 8: byte offset of the selected entry from vis0s. */
        sll             %g3, 8, %g3             /*  IEU0        Group           */
401
        sub             %len, 0xc0, %len        /*  IEU1                        */
402
        addcc           %g5, %sum, %sum         /*  IEU1        Group           */
403
        srl             %o4, 1, %o4             /*  IEU0                        */
404
        add             %g7, %g3, %g7           /*  IEU0        Group           */
405
        add             %o4, %sum, %sum         /*  IEU1                        */
406
#ifdef __KERNEL__
407
        jmpl            %g7 + %lo(vis0s), %g0   /*  CTI+IEU1    Group           */
408
#else
409
        jmpl            %g7 + (vis0s - 4b), %g0 /*  CTI+IEU1    Group           */
410
#endif
411
         /* Delay slot: %f32 is zero on entry to every visNs section. */
         fzero          %f32                    /*  FPA                         */
412
 
413
        /*
         * vis0s: dispatch slot 0, reached when (%src & 0x38) == 0.
         *
         * Sets %asi to %g2 ^ ASI_BLK_XOR, block-loads two 64-byte blocks into
         * f0-f14 and f16-f30 and advances %src by 128.  The incoming f48 is
         * saved in f62 and added into f0 (fpadd32); fcmpgt32 %f62, %f0 -> x3
         * compares the old value with the new sum.  f48..f60 are rebuilt with
         * faligndata from adjacent source doublewords and f62 then receives
         * f14.  x1, x2 and x3 are reduced with inc/srl 1 (0,1,2,3 -> 0,1,1,2)
         * and added to %sum here; x4 is reduced but not added, and x5..x8 are
         * left as raw fcmpgt32 results -- presumably consumed by DO_THE_TRICK
         * (macro defined outside this view).
         *
         * vis0: is a three-stage unrolled loop rotating the f0/f16/f32
         * register banks.  The exits vis0e3/vis0e1/vis0e2 run one more
         * DO_THE_TRICK, advance %dst by 56, add 192 - 8*8 to %len and branch
         * to e2/e3/e1 respectively.
         */
        .align          2048
414
vis0s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
415
        ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
416
        add             %src, 64, %src          /*  IEU0        Group           */
417
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
418
        add             %src, 64, %src          /*  IEU0        Group           */
419
        fmovd           %f48, %f62              /*  FPA         Group   f0 available*/
420
        faligndata      %f0, %f2, %f48          /*  FPA         Group   f2 available*/
421
        fcmpgt32        %f32, %f2, %x1          /*  FPM         Group   f4 available*/
422
        fpadd32         %f0, %f62, %f0          /*  FPA                         */
423
        fcmpgt32        %f32, %f4, %x2          /*  FPM         Group   f6 available*/
424
        faligndata      %f2, %f4, %f50          /*  FPA                         */
425
        fcmpgt32        %f62, %f0, %x3          /*  FPM         Group   f8 available*/
426
        faligndata      %f4, %f6, %f52          /*  FPA                         */
427
        fcmpgt32        %f32, %f6, %x4          /*  FPM         Group   f10 available*/
428
        inc             %x1                     /*  IEU0                        */
429
        faligndata      %f6, %f8, %f54          /*  FPA                         */
430
        fcmpgt32        %f32, %f8, %x5          /*  FPM         Group   f12 available*/
431
        srl             %x1, 1, %x1             /*  IEU0                        */
432
        inc             %x2                     /*  IEU1                        */
433
        faligndata      %f8, %f10, %f56         /*  FPA                         */
434
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group   f14 available*/
435
        srl             %x2, 1, %x2             /*  IEU0                        */
436
        add             %sum, %x1, %sum         /*  IEU1                        */
437
        faligndata      %f10, %f12, %f58        /*  FPA                         */
438
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
439
        inc             %x3                     /*  IEU0                        */
440
        add             %sum, %x2, %sum         /*  IEU1                        */
441
        faligndata      %f12, %f14, %f60        /*  FPA                         */
442
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
443
        srl             %x3, 1, %x3             /*  IEU0                        */
444
        inc             %x4                     /*  IEU1                        */
445
        fmovd           %f14, %f62              /*  FPA                         */
446
        srl             %x4, 1, %x4             /*  IEU0        Group           */
447
        add             %sum, %x3, %sum         /*  IEU1                        */
448
vis0:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
449
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
450
                        ,LDBLK(f32),    STBLK,,,,,,,,
451
                        ,bcs,pn %icc, vis0e1)
452
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
453
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
454
                        ,LDBLK(f0),     STBLK,,,,,,,,
455
                        ,bcs,pn %icc, vis0e2)
456
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
457
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
458
                        ,LDBLK(f16),    STBLK,,,,,,,,
459
                        ,bcc,pt %icc, vis0)
460
vis0e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
461
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
462
                        ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
463
                        ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
464
vis0e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
465
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
466
                        ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
467
                        ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
468
vis0e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
469
                        ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
470
                        ,SYNC,          STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
471
                        ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
472
        /*
         * vis1s: dispatch slot 1, reached when (%src & 0x38) == 0x08.
         *
         * %src is moved back by 8 before the two 64-byte block loads
         * (f0-f14, f16-f30), then advanced by 128 in total.  The loaded f0 is
         * copied to f58 and f0 is replaced by the incoming f48.  f48..f58 are
         * built with faligndata from f2..f14 and f60 receives f14.  x2 and x3
         * are reduced with inc/srl 1 and added to %sum; x4 is reduced but not
         * added; x5..x8 stay as raw fcmpgt32 results (presumably for
         * DO_THE_TRICK, which is defined outside this view).
         *
         * vis1: is the three-stage unrolled loop; compared with vis0 the
         * alignment register list is rotated by one (f62,f48,...,f60) and
         * STBLK sits one argument position later.  Exits vis1e3/vis1e1/vis1e2
         * advance %dst by 48, add 192 - 7*8 to %len and branch to e2/e3/e1.
         */
        .align          2048
473
vis1s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
474
        sub             %src, 8, %src           /*  IEU0        Group           */
475
        ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
476
        add             %src, 64, %src          /*  IEU0        Group           */
477
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
478
        add             %src, 64, %src          /*  IEU0        Group           */
479
        fmovd           %f0, %f58               /*  FPA         Group           */
480
        fmovd           %f48, %f0               /*  FPA         Group           */
481
        fcmpgt32        %f32, %f2, %x2          /*  FPM         Group           */
482
        faligndata      %f2, %f4, %f48          /*  FPA                         */
483
        fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
484
        faligndata      %f4, %f6, %f50          /*  FPA                         */
485
        fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
486
        faligndata      %f6, %f8, %f52          /*  FPA                         */
487
        fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
488
        inc             %x2                     /*  IEU1                        */
489
        faligndata      %f8, %f10, %f54         /*  FPA                         */
490
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
491
        srl             %x2, 1, %x2             /*  IEU0                        */
492
        faligndata      %f10, %f12, %f56        /*  FPA                         */
493
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
494
        inc             %x3                     /*  IEU0                        */
495
        add             %sum, %x2, %sum         /*  IEU1                        */
496
        faligndata      %f12, %f14, %f58        /*  FPA                         */
497
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
498
        srl             %x3, 1, %x3             /*  IEU0                        */
499
        inc             %x4                     /*  IEU1                        */
500
        fmovd           %f14, %f60              /*  FPA                         */
501
        srl             %x4, 1, %x4             /*  IEU0        Group           */
502
        add             %sum, %x3, %sum         /*  IEU1                        */
503
vis1:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
504
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
505
                        ,LDBLK(f32),    ,STBLK,,,,,,,
506
                        ,bcs,pn %icc, vis1e1)
507
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
508
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
509
                        ,LDBLK(f0),     ,STBLK,,,,,,,
510
                        ,bcs,pn %icc, vis1e2)
511
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
512
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
513
                        ,LDBLK(f16),    ,STBLK,,,,,,,
514
                        ,bcc,pt %icc, vis1)
515
vis1e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
516
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
517
                        ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
518
                        ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
519
vis1e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
520
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
521
                        ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
522
                        ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
523
vis1e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
524
                        ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
525
                        ,SYNC,          ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
526
                        ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
527
        /*
         * vis2s: dispatch slot 2, reached when (%src & 0x38) == 0x10.
         *
         * %src is moved back by 16 before the two 64-byte block loads, then
         * advanced by 128 in total; %dst is moved back by 64.  The loaded f0
         * is copied to f56, f0 is replaced by the incoming f48, and f2 is
         * cleared with fpsub32 %f2, %f2, %f2.  f48..f56 are built with
         * faligndata from f4..f14 and f58 receives f14.  x3 is reduced with
         * inc/srl 1 and added to %sum; x4 is reduced but not added; x5..x8
         * stay as raw fcmpgt32 results.
         *
         * vis2: is the three-stage unrolled loop with the alignment register
         * list rotated by two (f60,f62,f48,...).  Exits vis2e3/vis2e1/vis2e2
         * advance %dst by 104, add 192 - 6*8 to %len and branch to e2/e3/e1.
         */
        .align          2048
528
vis2s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
529
        sub             %src, 16, %src          /*  IEU0        Group           */
530
        ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
531
        add             %src, 64, %src          /*  IEU0        Group           */
532
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
533
        add             %src, 64, %src          /*  IEU0        Group           */
534
        fmovd           %f0, %f56               /*  FPA         Group           */
535
        fmovd           %f48, %f0               /*  FPA         Group           */
536
        sub             %dst, 64, %dst          /*  IEU0                        */
537
        fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
538
        fcmpgt32        %f32, %f4, %x3          /*  FPM         Group           */
539
        faligndata      %f4, %f6, %f48          /*  FPA                         */
540
        fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
541
        faligndata      %f6, %f8, %f50          /*  FPA                         */
542
        fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
543
        faligndata      %f8, %f10, %f52         /*  FPA                         */
544
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
545
        faligndata      %f10, %f12, %f54        /*  FPA                         */
546
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
547
        inc             %x3                     /*  IEU0                        */
548
        faligndata      %f12, %f14, %f56        /*  FPA                         */
549
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
550
        srl             %x3, 1, %x3             /*  IEU0                        */
551
        inc             %x4                     /*  IEU1                        */
552
        fmovd           %f14, %f58              /*  FPA                         */
553
        srl             %x4, 1, %x4             /*  IEU0        Group           */
554
        add             %sum, %x3, %sum         /*  IEU1                        */
555
vis2:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
556
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
557
                        ,LDBLK(f32),    ,,STBLK,,,,,,
558
                        ,bcs,pn %icc, vis2e1)
559
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
560
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
561
                        ,LDBLK(f0),     ,,STBLK,,,,,,
562
                        ,bcs,pn %icc, vis2e2)
563
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
564
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
565
                        ,LDBLK(f16),    ,,STBLK,,,,,,
566
                        ,bcc,pt %icc, vis2)
567
vis2e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
568
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
569
                        ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
570
                        ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
571
vis2e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
572
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
573
                        ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
574
                        ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
575
vis2e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
576
                        ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
577
                        ,SYNC,          ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
578
                        ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
579
        /*
         * vis3s: dispatch slot 3, reached when (%src & 0x38) == 0x18.
         *
         * %src is moved back by 24 before the two 64-byte block loads, then
         * advanced by 128 in total; %dst is moved back by 64.  The loaded f0
         * is copied to f54, f0 is replaced by the incoming f48, and f2 and f4
         * are cleared with fpsub32.  f48..f54 are built with faligndata from
         * f6..f14 and f56 receives f14.  x4 is reduced with inc/srl 1 but not
         * added to %sum here; x5..x8 stay as raw fcmpgt32 results.
         *
         * vis3: is the three-stage unrolled loop with the alignment register
         * list rotated by three (f58,f60,f62,f48,...).  Exits
         * vis3e3/vis3e1/vis3e2 advance %dst by 96, add 192 - 5*8 to %len and
         * branch to e2/e3/e1.
         */
        .align          2048
580
vis3s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
581
        sub             %src, 24, %src          /*  IEU0        Group           */
582
        ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
583
        add             %src, 64, %src          /*  IEU0        Group           */
584
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
585
        add             %src, 64, %src          /*  IEU0        Group           */
586
        fmovd           %f0, %f54               /*  FPA         Group           */
587
        fmovd           %f48, %f0               /*  FPA         Group           */
588
        sub             %dst, 64, %dst          /*  IEU0                        */
589
        fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
590
        fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
591
        fcmpgt32        %f32, %f6, %x4          /*  FPM         Group           */
592
        faligndata      %f6, %f8, %f48          /*  FPA                         */
593
        fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
594
        faligndata      %f8, %f10, %f50         /*  FPA                         */
595
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
596
        faligndata      %f10, %f12, %f52        /*  FPA                         */
597
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
598
        faligndata      %f12, %f14, %f54        /*  FPA                         */
599
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
600
        fmovd           %f14, %f56              /*  FPA                         */
601
        inc             %x4                     /*  IEU0                        */
602
        srl             %x4, 1, %x4             /*  IEU0        Group           */
603
vis3:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
604
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
605
                        ,LDBLK(f32),    ,,,STBLK,,,,,
606
                        ,bcs,pn %icc, vis3e1)
607
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
608
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
609
                        ,LDBLK(f0),     ,,,STBLK,,,,,
610
                        ,bcs,pn %icc, vis3e2)
611
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
612
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
613
                        ,LDBLK(f16),    ,,,STBLK,,,,,
614
                        ,bcc,pt %icc, vis3)
615
vis3e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
616
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
617
                        ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
618
                        ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
619
vis3e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
620
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
621
                        ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
622
                        ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
623
vis3e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
624
                        ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
625
                        ,SYNC,          ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
626
                        ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
627
        /*
         * vis4s: dispatch slot 4, reached when (%src & 0x38) == 0x20.
         *
         * %src is moved back by 32 before the two 64-byte block loads, then
         * advanced by 128 in total; %dst is moved back by 64.  The loaded f0
         * is copied to f52, f0 is replaced by the incoming f48, and f2, f4
         * and f6 are cleared with fpsub32.  x4 is cleared outright, while
         * x5..x8 receive raw fcmpgt32 results for f8..f14.  f48..f52 are built
         * with faligndata from f8..f14 and f54 receives f14.
         *
         * vis4: is the three-stage unrolled loop with the alignment register
         * list rotated by four (f56,...,f62,f48,...).  Exits
         * vis4e3/vis4e1/vis4e2 advance %dst by 88, add 192 - 4*8 to %len and
         * branch to e2/e3/e1.
         */
        .align          2048
628
vis4s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
629
        sub             %src, 32, %src          /*  IEU0        Group           */
630
        ldda            [%src] ASI_BLK_P, %f0   /*  Load        Group           */
631
        add             %src, 64, %src          /*  IEU0        Group           */
632
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
633
        add             %src, 64, %src          /*  IEU0        Group           */
634
        fmovd           %f0, %f52               /*  FPA         Group           */
635
        fmovd           %f48, %f0               /*  FPA         Group           */
636
        sub             %dst, 64, %dst          /*  IEU0                        */
637
        fpsub32         %f2, %f2, %f2           /*  FPA         Group           */
638
        fpsub32         %f4, %f4, %f4           /*  FPA         Group           */
639
        fpsub32         %f6, %f6, %f6           /*  FPA         Group           */
640
        clr             %x4                     /*  IEU0                        */
641
        fcmpgt32        %f32, %f8, %x5          /*  FPM         Group           */
642
        faligndata      %f8, %f10, %f48         /*  FPA                         */
643
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
644
        faligndata      %f10, %f12, %f50        /*  FPA                         */
645
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
646
        faligndata      %f12, %f14, %f52        /*  FPA                         */
647
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
648
        fmovd           %f14, %f54              /*  FPA                         */
649
vis4:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
650
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
651
                        ,LDBLK(f32),    ,,,,STBLK,,,,
652
                        ,bcs,pn %icc, vis4e1)
653
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
654
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
655
                        ,LDBLK(f0),     ,,,,STBLK,,,,
656
                        ,bcs,pn %icc, vis4e2)
657
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
658
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
659
                        ,LDBLK(f16),    ,,,,STBLK,,,,
660
                        ,bcc,pt %icc, vis4)
661
vis4e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
662
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
663
                        ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
664
                        ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
665
vis4e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
666
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
667
                        ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
668
                        ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
669
vis4e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
670
                        ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
671
                        ,SYNC,          ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
672
                        ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
673
        /*
         * vis5s: dispatch slot 5, reached when (%src & 0x38) == 0x28.
         *
         * Unlike slots 0-4, the first three source doublewords are fetched
         * with plain ldd into f10, f12 and f14; %src then advances by 24 and
         * only one 64-byte block load (f16-f30) follows, advancing %src by a
         * further 64.  %dst is moved back by 64.  f0 is replaced by the
         * incoming f48; f2..f8 are produced from %f32 with fmuld/faddd --
         * %f32 is zeroed in the delay slot of the dispatching jmpl, so these
         * yield zero (the alternation presumably lets FPM and FPA issue
         * together, as the column comments indicate).  x4 and x5 are
         * cleared; x6..x8 receive raw fcmpgt32 results for f10..f14.
         * f48/f50 are built with faligndata and f52 receives f14.
         *
         * vis5: is the three-stage unrolled loop with the alignment register
         * list rotated by five.  Exits vis5e3/vis5e1/vis5e2 advance %dst by
         * 80, add 192 - 3*8 to %len and branch to e2/e3/e1.
         */
        .align          2048
674
vis5s:  ldd             [%src+0], %f10          /*  Load        Group           */
675
        ldd             [%src+8], %f12          /*  Load        Group           */
676
        ldd             [%src+16], %f14         /*  Load        Group           */
677
        add             %src, 24, %src          /*  IEU0        Group           */
678
        wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
679
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
680
        add             %src, 64, %src          /*  IEU0        Group           */
681
        fmovd           %f48, %f0               /*  FPA         Group           */
682
        fmuld           %f32, %f32, %f2         /*  FPM                         */
683
        clr             %x4                     /*  IEU0                        */
684
        faddd           %f32, %f32, %f4         /*  FPA         Group           */
685
        fmuld           %f32, %f32, %f6         /*  FPM                         */
686
        clr             %x5                     /*  IEU0                        */
687
        faddd           %f32, %f32, %f8         /*  FPA         Group           */
688
        fcmpgt32        %f32, %f10, %x6         /*  FPM         Group           */
689
        sub             %dst, 64, %dst          /*  IEU0                        */
690
        faligndata      %f10, %f12, %f48        /*  FPA                         */
691
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
692
        faligndata      %f12, %f14, %f50        /*  FPA                         */
693
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
694
        fmovd           %f14, %f52              /*  FPA                         */
695
vis5:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
696
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
697
                        ,LDBLK(f32),    ,,,,,STBLK,,,
698
                        ,bcs,pn %icc, vis5e1)
699
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
700
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
701
                        ,LDBLK(f0),     ,,,,,STBLK,,,
702
                        ,bcs,pn %icc, vis5e2)
703
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
704
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
705
                        ,LDBLK(f16),    ,,,,,STBLK,,,
706
                        ,bcc,pt %icc, vis5)
707
vis5e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
708
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
709
                        ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
710
                        ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
711
vis5e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
712
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
713
                        ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
714
                        ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
715
vis5e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
716
                        ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
717
                        ,SYNC,          ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
718
                        ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
719
        /*
         * vis6s: dispatch slot 6, reached when (%src & 0x38) == 0x30.
         *
         * The first two source doublewords are fetched with plain ldd into
         * f12 and f14; %src advances by 16, then one 64-byte block load fills
         * f16-f30 and %src advances by a further 64.  %dst is moved back by
         * 64.  f0 is replaced by the incoming f48; f2..f10 are produced from
         * %f32 (zeroed in the delay slot of the dispatching jmpl) with
         * fmuld/faddd.  x4, x5 and x6 are cleared; x7 and x8 receive raw
         * fcmpgt32 results for f12 and f14.  f48 is built with faligndata and
         * f50 receives f14.
         *
         * vis6: is the three-stage unrolled loop with the alignment register
         * list rotated by six.  Exits vis6e3/vis6e1/vis6e2 advance %dst by
         * 72, add 192 - 2*8 to %len and branch to e2/e3/e1.
         */
        .align          2048
720
vis6s:  ldd             [%src+0], %f12          /*  Load        Group           */
721
        ldd             [%src+8], %f14          /*  Load        Group           */
722
        add             %src, 16, %src          /*  IEU0        Group           */
723
        wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
724
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
725
        add             %src, 64, %src          /*  IEU0        Group           */
726
        fmovd           %f48, %f0               /*  FPA         Group           */
727
        fmuld           %f32, %f32, %f2         /*  FPM                         */
728
        clr             %x4                     /*  IEU0                        */
729
        faddd           %f32, %f32, %f4         /*  FPA         Group           */
730
        fmuld           %f32, %f32, %f6         /*  FPM                         */
731
        clr             %x5                     /*  IEU0                        */
732
        faddd           %f32, %f32, %f8         /*  FPA         Group           */
733
        fmuld           %f32, %f32, %f10        /*  FPM                         */
734
        clr             %x6                     /*  IEU0                        */
735
        fcmpgt32        %f32, %f12, %x7         /*  FPM         Group           */
736
        sub             %dst, 64, %dst          /*  IEU0                        */
737
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
738
        faligndata      %f12, %f14, %f48        /*  FPA                         */
739
        fmovd           %f14, %f50              /*  FPA         Group           */
740
vis6:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
741
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
742
                        ,LDBLK(f32),    ,,,,,,STBLK,,
743
                        ,bcs,pn %icc, vis6e1)
744
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
745
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
746
                        ,LDBLK(f0),     ,,,,,,STBLK,,
747
                        ,bcs,pn %icc, vis6e2)
748
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
749
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
750
                        ,LDBLK(f16),    ,,,,,,STBLK,,
751
                        ,bcc,pt %icc, vis6)
752
vis6e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
753
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
754
                        ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
755
                        ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
756
vis6e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
757
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
758
                        ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
759
                        ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
760
vis6e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
761
                        ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
762
                        ,SYNC,          ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
763
                        ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
764
        /*
         * vis7s: dispatch slot 7, reached when (%src & 0x38) == 0x38.
         *
         * A single source doubleword is fetched with plain ldd into f14;
         * %src advances by 8, then one 64-byte block load fills f16-f30 and
         * %src advances by a further 64.  %dst is moved back by 64.  f0 is
         * replaced by the incoming f48; f2..f12 are produced from %f32
         * (zeroed in the delay slot of the dispatching jmpl) with
         * fmuld/faddd.  x4..x7 are cleared; x8 receives the raw fcmpgt32
         * result for f14, and f48 receives f14.
         *
         * vis7: is the three-stage unrolled loop with the alignment register
         * list rotated by seven.  Exits vis7e3/vis7e1/vis7e2 advance %dst by
         * 64, add 192 - 1*8 to %len and branch to e2/e3/e1.
         */
        .align          2048
765
vis7s:  ldd             [%src+0], %f14          /*  Load        Group           */
766
        add             %src, 8, %src           /*  IEU0        Group           */
767
        wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group           */
768
        ldda            [%src] ASI_BLK_P, %f16  /*  Load        Group           */
769
        add             %src, 64, %src          /*  IEU0        Group           */
770
        fmovd           %f48, %f0               /*  FPA         Group           */
771
        fmuld           %f32, %f32, %f2         /*  FPM                         */
772
        clr             %x4                     /*  IEU0                        */
773
        faddd           %f32, %f32, %f4         /*  FPA         Group           */
774
        fmuld           %f32, %f32, %f6         /*  FPM                         */
775
        clr             %x5                     /*  IEU0                        */
776
        faddd           %f32, %f32, %f8         /*  FPA         Group           */
777
        fmuld           %f32, %f32, %f10        /*  FPM                         */
778
        clr             %x6                     /*  IEU0                        */
779
        faddd           %f32, %f32, %f12        /*  FPA         Group           */
780
        clr             %x7                     /*  IEU0                        */
781
        fcmpgt32        %f32, %f14, %x8         /*  FPM         Group           */
782
        sub             %dst, 64, %dst          /*  IEU0                        */
783
        fmovd           %f14, %f48              /*  FPA                         */
784
vis7:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
785
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
786
                        ,LDBLK(f32),    ,,,,,,,STBLK,
787
                        ,bcs,pn %icc, vis7e1)
788
        DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
789
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
790
                        ,LDBLK(f0),     ,,,,,,,STBLK,
791
                        ,bcs,pn %icc, vis7e2)
792
        DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
793
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
794
                        ,LDBLK(f16),    ,,,,,,,STBLK,
795
                        ,bcc,pt %icc, vis7)
796
vis7e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
797
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
798
                        ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
799
                        ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
800
vis7e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
801
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
802
                        ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
803
                        ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
804
vis7e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
805
                        ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
806
                        ,SYNC,          ,,,,,,,STBLK_XORASI(x7,x8),
807
                        ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
808
/*
 * Common tail reached from the visNe* exits.
 *
 * e1/e2/e3 invoke END_THE_TRICK1 on the f0, f16 and f32 register banks
 * respectively (the macro is defined outside this view; its last argument is
 * f6 in all three cases).
 *
 * ett: drains the remaining whole doublewords one at a time, stores the
 * final pending doubleword (or its low word), runs END_THE_TRICK2, leaves
 * VIS mode in kernel builds and then either folds %sum into a 32-bit result
 * in %o0 (src) and returns, or branches to label 26 when %len is still
 * non-zero.
 */
e1:     END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
808
e2:     END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
809
e3:     END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
810
/* x3 = low three bits of %gsr (presumably the faligndata byte offset -- confirm against the VIS spec). */
ett:    rd              %gsr, %x3               /*  LSU         Group+4bubbles  */
811
        andcc           %x3, 7, %x3             /*  IEU1        Group           */
812
        add             %dst, 8, %dst           /*  IEU0                        */
813
        bne,pn          %icc, 1f                /*  CTI                         */
814
         fzero          %f10                    /*  FPA                         */
815
        /* Offset is zero here: if nothing is left, store f6 (annulled delay slot, runs only when taken) and finish. */
        brz,a,pn        %len, 2f                /*  CTI+IEU1    Group           */
816
         stda           %f6, [%dst - 8] %asi    /*  Store                       */
817
/* Fewer than 8 bytes left: skip the loop.  The delay slot (src -= 64) is not annulled, so it always executes. */
1:      cmp             %len, 8                 /*  IEU1                        */
818
        blu,pn          %icc, 3f                /*  CTI                         */
819
         sub            %src, 64, %src          /*  IEU0        Group           */
820
/*
 * Per-doubleword loop: load 8 bytes into f2, accumulate into f10 via f12
 * (fpadd32), emit faligndata(f6, f2) at the previous destination slot, and
 * add the reduced fcmpgt32 %f10, %f12 result ((x5 + 1) >> 1) to %sum.
 * Repeats while %len >= 8 (unsigned compare).
 */
1:      ldd             [%src], %f2             /*  Load        Group           */
821
        fpadd32         %f10, %f2, %f12         /*  FPA         Group+load stall*/
822
        add             %src, 8, %src           /*  IEU0                        */
823
        add             %dst, 8, %dst           /*  IEU1                        */
824
        faligndata      %f6, %f2, %f14          /*  FPA         Group           */
825
        fcmpgt32        %f10, %f12, %x5         /*  FPM         Group           */
826
        stda            %f14, [%dst - 16] %asi  /*  Store                       */
827
        fmovd           %f2, %f6                /*  FPA                         */
828
        fmovd           %f12, %f10              /*  FPA         Group           */
829
        sub             %len, 8, %len           /*  IEU1                        */
830
        fzero           %f16                    /*  FPA         Group - FPU nop */
831
        fzero           %f18                    /*  FPA         Group - FPU nop */
832
        inc             %x5                     /*  IEU0                        */
833
        srl             %x5, 1, %x5             /*  IEU0        Group (regdep)  */
834
        cmp             %len, 8                 /*  IEU1                        */
835
        bgeu,pt         %icc, 1b                /*  CTI                         */
836
         add            %x5, %sum, %sum         /*  IEU0        Group           */
837
/*
 * x3 == 0: store all of f6 (annulled delay slot) and go to 2.
 * x3 != 0: store only f7 (the low word of the f6 pair) as 4 bytes, then move
 * %dst back by 4 and add 4 to %len.
 */
3:      brz,a,pt        %x3, 2f                 /*  CTI+IEU1                    */
838
         stda           %f6, [%dst - 8] %asi    /*  Store       Group           */
839
        sta             %f7, [%dst - 8] %asi    /*  Store       Group           */
840
        sub             %dst, 4, %dst           /*  IEU0                        */
841
        add             %len, 4, %len           /*  IEU1                        */
842
2:
843
#ifdef __KERNEL__
844
        /* Kernel build: %sp is lowered by 8 across END_THE_TRICK2 and VISExit, then restored. */
        sub             %sp, 8, %sp             /*  IEU0        Group           */
845
#endif
846
        END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
847
        membar          #Sync                   /*  LSU         Group           */
848
#ifdef __KERNEL__
849
        VISExit
850
        add             %sp, 8, %sp             /*  IEU0        Group           */
851
#endif
852
/* Bytes still pending -> label 26.  The sllx in the delay slot is not annulled and always executes. */
23:     brnz,pn         %len, 26f               /*  CTI+IEU1    Group           */
853
24:      sllx           %sum, 32, %g1           /*  IEU0                        */
854
/*
 * Fold the 64-bit %sum to 32 bits: the upper half of sum + (sum << 32) is
 * low32(sum) + high32(sum); the 64-bit carry (%xcc) is added back by the
 * annulled delay slot, which runs only when the branch is taken.
 */
25:     addcc           %sum, %g1, %src         /*  IEU1        Group           */
855
        srlx            %src, 32, %src          /*  IEU0        Group (regdep)  */
856
        bcs,a,pn        %xcc, 1f                /*  CTI                         */
857
         add            %src, 1, %src           /*  IEU1                        */
858
#ifndef __KERNEL__
859
/* Non-kernel build: return with the result in %src (%o0), zero-extended from 32 bits by srl ..., 0. */
1:      retl                                    /*  CTI         Group brk forced*/
860
         srl            %src, 0, %src           /*  IEU0                        */
861
#else
862
/* Kernel build: reload %g4 (used as scratch x4 in this file) with %uhi(PAGE_OFFSET) << 32 before returning. */
1:      sethi           %uhi(PAGE_OFFSET), %g4  /*  IEU0        Group           */
863
        retl                                    /*  CTI         Group brk forced*/
864
         sllx           %g4, 32, %g4            /*  IEU0                        */
865
#endif
867
26:     andcc           %len, 8, %g0            /*  IEU1        Group           */
868
        be,pn           %icc, 1f                /*  CTI                         */
869
         lduw           [%src], %o4             /*  Load                        */
870
        lduw            [%src+4], %g2           /*  Load        Group           */
871
        add             %src, 8, %src           /*  IEU0                        */
872
        add             %dst, 8, %dst           /*  IEU1                        */
873
        sllx            %o4, 32, %g5            /*  IEU0        Group           */
874
        stwa            %o4, [%dst - 8] %asi    /*  Store                       */
875
        or              %g5, %g2, %g5           /*  IEU0        Group           */
876
        stwa            %g2, [%dst - 4] %asi    /*  Store                       */
877
        addcc           %g5, %sum, %sum         /*  IEU1        Group           */
878
        bcs,a,pn        %xcc, 1f                /*  CTI                         */
879
         add            %sum, 1, %sum           /*  IEU0                        */
880
1:      andcc           %len, 4, %g0            /*  IEU1        Group           */
881
        be,a,pn         %icc, 1f                /*  CTI                         */
882
         clr            %g2                     /*  IEU0                        */
883
        lduw            [%src], %g7             /*  Load                        */
884
        add             %src, 4, %src           /*  IEU0        Group           */
885
        add             %dst, 4, %dst           /*  IEU1                        */
886
        sllx            %g7, 32, %g2            /*  IEU0        Group           */
887
        stwa            %g7, [%dst - 4] %asi    /*  Store                       */
888
1:      andcc           %len, 2, %g0            /*  IEU1                        */
889
        be,a,pn         %icc, 1f                /*  CTI                         */
890
         clr            %g3                     /*  IEU0        Group           */
891
        lduh            [%src], %g7             /*  Load                        */
892
        add             %src, 2, %src           /*  IEU1                        */
893
        add             %dst, 2, %dst           /*  IEU0        Group           */
894
        sll             %g7, 16, %g3            /*  IEU0        Group           */
895
        stha            %g7, [%dst - 2] %asi    /*  Store                       */
896
1:      andcc           %len, 1, %g0            /*  IEU1                        */
897
        be,a,pn         %icc, 1f                /*  CTI                         */
898
         clr            %o5                     /*  IEU0        Group           */
899
        ldub            [%src], %g7             /*  Load                        */
900
        sll             %g7, 8, %o5             /*  IEU0        Group           */
901
        stba            %g7, [%dst] %asi        /*  Store                       */
902
1:      or              %g2, %g3, %g3           /*  IEU1                        */
903
        or              %o5, %g3, %g3           /*  IEU0        Group (regdep)  */
904
        addcc           %g3, %sum, %sum         /*  IEU1        Group (regdep)  */
905
        bcs,a,pn        %xcc, 1f                /*  CTI                         */
906
         add            %sum, 1, %sum           /*  IEU0                        */
907
1:      ba,pt           %xcc, 25b               /*  CTI         Group           */
908
         sllx           %sum, 32, %g1           /*  IEU0                        */
909
 
910
#ifdef __KERNEL__
911
/* `end` marks the current location, i.e. the address immediately after the
 * last instruction of the code that precedes it in this file.
 */
end:
912
 
913
/* Switch to the __ex_table section and emit a single record made of four
 * .word values, in this order:
 *   csum_partial_copy_user_vis, 0, end, cpc_handler
 * cpc_handler is not defined in the visible part of this file.
 * NOTE(review): the 0 in the second word, bracketed by the routine's start
 * symbol and `end`, presumably marks this as a range entry covering the
 * whole routine with cpc_handler as the fixup target -- confirm against the
 * exception-table lookup code for this architecture.
 */
        .section        __ex_table
914
        .align          4
915
        .word           csum_partial_copy_user_vis, 0, end, cpc_handler
916
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.