OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [include/] [asm-i386/] [xor.h] - Blame information for rev 1774

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
 * include/asm-i386/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * High-speed RAID5 checksumming functions utilizing MMX instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */
21
/*
 * Save the FPU state into the caller's local fpu_save[] buffer so the
 * MMX registers may be clobbered.  If the current task was not using
 * the FPU (PF_USEDFPU clear), clear CR0.TS first so fsave does not
 * trap.  Requires a "char fpu_save[108]" in the enclosing scope.
 */
#define FPU_SAVE                                                        \
  do {                                                                  \
        if (!(current->flags & PF_USEDFPU))                             \
                __asm__ __volatile__ (" clts;\n");                      \
        __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0]));     \
  } while (0)
27
 
28
/*
 * Restore the FPU state saved by FPU_SAVE.  If the current task was
 * not using the FPU, set CR0.TS again (stts) so the next FPU use by
 * user space traps and reloads its own state.
 */
#define FPU_RESTORE                                                     \
  do {                                                                  \
        __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0]));          \
        if (!(current->flags & PF_USEDFPU))                             \
                stts();                                                 \
  } while (0)
34
 
35
/*
 * Building blocks for the unrolled MMX loops below.  'x' is a quadword
 * (8-byte) index, 'y' an mm register number.  %1..%5 are the p1..p5
 * pointer operands of the enclosing asm statement, so:
 *   LD  loads p1[x] into mmY, ST stores mmY back to p1[x], and
 *   XO1..XO4 xor p2[x]..p5[x] into mmY respectively.
 */
#define LD(x,y)         "       movq   8*("#x")(%1), %%mm"#y"   ;\n"
#define ST(x,y)         "       movq %%mm"#y",   8*("#x")(%1)   ;\n"
#define XO1(x,y)        "       pxor   8*("#x")(%2), %%mm"#y"   ;\n"
#define XO2(x,y)        "       pxor   8*("#x")(%3), %%mm"#y"   ;\n"
#define XO3(x,y)        "       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
#define XO4(x,y)        "       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
41
 
42
 
43
/*
 * p1[] ^= p2[] over 'bytes' bytes, using MMX registers mm0-mm3.
 * Processes 128 bytes (four 4-quadword BLOCKs) per loop iteration;
 * 'bytes' is assumed to be a multiple of 128 (lines = bytes >> 7).
 */
static void
xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 7;       /* 128 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                  \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                         \
        ST(i,0)                                  \
                XO1(i+1,1)                      \
                ST(i+1,1)                       \
                        XO1(i+2,2)              \
                        ST(i+2,2)               \
                                XO1(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        FPU_RESTORE;
}
86
 
87
/*
 * p1[] ^= p2[] ^ p3[] over 'bytes' bytes (multiple of 128), using MMX
 * mm0-mm3.  Same structure as xor_pII_mmx_2 with one extra xor stage.
 */
static void
xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3)
{
        unsigned long lines = bytes >> 7;       /* 128 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                  \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                         \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                         \
        ST(i,0)                                  \
                XO2(i+1,1)                      \
                ST(i+1,1)                       \
                        XO2(i+2,2)              \
                        ST(i+2,2)               \
                                XO2(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory");

        FPU_RESTORE;
}
136
 
137
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] over 'bytes' bytes (multiple of 128),
 * using MMX mm0-mm3 with three xor stages before the store.
 */
static void
xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 7;       /* 128 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                  \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                         \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                         \
                XO2(i+1,1)                      \
                        XO2(i+2,2)              \
                                XO2(i+3,3)      \
        XO3(i,0)                         \
        ST(i,0)                                  \
                XO3(i+1,1)                      \
                ST(i+1,1)                       \
                        XO3(i+2,2)              \
                        ST(i+2,2)               \
                                XO3(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        FPU_RESTORE;
}
191
 
192
 
193
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[] over 'bytes' bytes (multiple of
 * 128), using MMX mm0-mm3 with four xor stages before the store.
 * p4/p5 are read-only ("r") asm inputs yet advanced by addl inside the
 * loop, so they are saved/restored on the stack with pushl/popl.
 */
static void
xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
              unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 7;       /* 128 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        /* need to save/restore p4/p5 manually otherwise gcc's 10 argument
           limit gets exceeded (+ counts as two arguments) */
        __asm__ __volatile__ (
                "  pushl %4\n"
                "  pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
        LD(i,0)                                  \
                LD(i+1,1)                       \
                        LD(i+2,2)               \
                                LD(i+3,3)       \
        XO1(i,0)                         \
                XO1(i+1,1)                      \
                        XO1(i+2,2)              \
                                XO1(i+3,3)      \
        XO2(i,0)                         \
                XO2(i+1,1)                      \
                        XO2(i+2,2)              \
                                XO2(i+3,3)      \
        XO3(i,0)                         \
                XO3(i+1,1)                      \
                        XO3(i+2,2)              \
                                XO3(i+3,3)      \
        XO4(i,0)                         \
        ST(i,0)                                  \
                XO4(i+1,1)                      \
                ST(i+1,1)                       \
                        XO4(i+2,2)              \
                        ST(i+2,2)               \
                                XO4(i+3,3)      \
                                ST(i+3,3)

        " .align 32                     ;\n"
        " 1:                            ;\n"

        BLOCK(0)
        BLOCK(4)
        BLOCK(8)
        BLOCK(12)

        "       addl $128, %1         ;\n"
        "       addl $128, %2         ;\n"
        "       addl $128, %3         ;\n"
        "       addl $128, %4         ;\n"
        "       addl $128, %5         ;\n"
        "       decl %0               ;\n"
        "       jnz 1b                ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        FPU_RESTORE;
}
258
 
259
/* The MMX fragment macros are local to the functions above. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
266
 
267
/*
 * p1[] ^= p2[] over 'bytes' bytes (multiple of 64), scheduled for the
 * Pentium (p5) pipeline: a straight-line load/xor/store interleave
 * over all eight MMX registers, 64 bytes per loop iteration.
 */
static void
xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 6;       /* 64 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32                  ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        FPU_RESTORE;
}
314
 
315
/*
 * p1[] ^= p2[] ^ p3[] over 'bytes' bytes (multiple of 64), Pentium
 * scheduled, 64 bytes per loop iteration across mm0-mm7.
 */
static void
xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3)
{
        unsigned long lines = bytes >> 6;       /* 64 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        :
        : "memory" );

        FPU_RESTORE;
}
372
 
373
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] over 'bytes' bytes (multiple of 64),
 * Pentium scheduled, 64 bytes per loop iteration across mm0-mm7.
 */
static void
xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 6;       /* 64 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        __asm__ __volatile__ (
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory");

        FPU_RESTORE;
}
439
 
440
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[] over 'bytes' bytes (multiple of
 * 64), Pentium scheduled, 64 bytes per loop iteration.  As in
 * xor_pII_mmx_5, p4/p5 are "r" inputs modified inside the asm, so they
 * are preserved with pushl/popl around the loop.
 */
static void
xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
             unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 6;       /* 64 bytes per iteration */
        char fpu_save[108];                     /* fsave image buffer */

        FPU_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
        "       pushl %4\n"
        "       pushl %5\n"
        " .align 32,0x90             ;\n"
        " 1:                         ;\n"
        "       movq   (%1), %%mm0   ;\n"
        "       movq  8(%1), %%mm1   ;\n"
        "       pxor   (%2), %%mm0   ;\n"
        "       pxor  8(%2), %%mm1   ;\n"
        "       movq 16(%1), %%mm2   ;\n"
        "       pxor   (%3), %%mm0   ;\n"
        "       pxor  8(%3), %%mm1   ;\n"
        "       pxor 16(%2), %%mm2   ;\n"
        "       pxor   (%4), %%mm0   ;\n"
        "       pxor  8(%4), %%mm1   ;\n"
        "       pxor 16(%3), %%mm2   ;\n"
        "       movq 24(%1), %%mm3   ;\n"
        "       pxor   (%5), %%mm0   ;\n"
        "       pxor  8(%5), %%mm1   ;\n"
        "       movq %%mm0,   (%1)   ;\n"
        "       pxor 16(%4), %%mm2   ;\n"
        "       pxor 24(%2), %%mm3   ;\n"
        "       movq %%mm1,  8(%1)   ;\n"
        "       pxor 16(%5), %%mm2   ;\n"
        "       pxor 24(%3), %%mm3   ;\n"
        "       movq 32(%1), %%mm4   ;\n"
        "       movq %%mm2, 16(%1)   ;\n"
        "       pxor 24(%4), %%mm3   ;\n"
        "       pxor 32(%2), %%mm4   ;\n"
        "       movq 40(%1), %%mm5   ;\n"
        "       pxor 24(%5), %%mm3   ;\n"
        "       pxor 32(%3), %%mm4   ;\n"
        "       pxor 40(%2), %%mm5   ;\n"
        "       movq %%mm3, 24(%1)   ;\n"
        "       pxor 32(%4), %%mm4   ;\n"
        "       pxor 40(%3), %%mm5   ;\n"
        "       movq 48(%1), %%mm6   ;\n"
        "       movq 56(%1), %%mm7   ;\n"
        "       pxor 32(%5), %%mm4   ;\n"
        "       pxor 40(%4), %%mm5   ;\n"
        "       pxor 48(%2), %%mm6   ;\n"
        "       pxor 56(%2), %%mm7   ;\n"
        "       movq %%mm4, 32(%1)   ;\n"
        "       pxor 48(%3), %%mm6   ;\n"
        "       pxor 56(%3), %%mm7   ;\n"
        "       pxor 40(%5), %%mm5   ;\n"
        "       pxor 48(%4), %%mm6   ;\n"
        "       pxor 56(%4), %%mm7   ;\n"
        "       movq %%mm5, 40(%1)   ;\n"
        "       pxor 48(%5), %%mm6   ;\n"
        "       pxor 56(%5), %%mm7   ;\n"
        "       movq %%mm6, 48(%1)   ;\n"
        "       movq %%mm7, 56(%1)   ;\n"

        "       addl $64, %1         ;\n"
        "       addl $64, %2         ;\n"
        "       addl $64, %3         ;\n"
        "       addl $64, %4         ;\n"
        "       addl $64, %5         ;\n"
        "       decl %0              ;\n"
        "       jnz 1b               ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+g" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        FPU_RESTORE;
}
520
 
521
static struct xor_block_template xor_block_pII_mmx = {
522
        name: "pII_mmx",
523
        do_2: xor_pII_mmx_2,
524
        do_3: xor_pII_mmx_3,
525
        do_4: xor_pII_mmx_4,
526
        do_5: xor_pII_mmx_5,
527
};
528
 
529
static struct xor_block_template xor_block_p5_mmx = {
530
        name: "p5_mmx",
531
        do_2: xor_p5_mmx_2,
532
        do_3: xor_p5_mmx_3,
533
        do_4: xor_p5_mmx_4,
534
        do_5: xor_p5_mmx_5,
535
};
536
 
537
/* The fsave/frstor helpers are only needed by the MMX routines. */
#undef FPU_SAVE
#undef FPU_RESTORE

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */
544
 
545
/*
 * Save CR0 (so its TS bit can be restored later) and the current
 * xmm0-xmm3 contents into the caller's xmm_save[] buffer; clts lets
 * the SSE instructions below run without faulting.  Requires local
 * variables 'cr0' (int) and 'xmm_save' in the enclosing scope.
 */
#define XMMS_SAVE                               \
        __asm__ __volatile__ (                  \
                "movl %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory")
556
 
557
/*
 * Counterpart of XMMS_SAVE: sfence orders the preceding (non-temporal)
 * stores, xmm0-xmm3 are reloaded from xmm_save[], and CR0 is restored,
 * which also restores the original TS bit.
 */
#define XMMS_RESTORE                            \
        __asm__ __volatile__ (                  \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movl   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory")
568
 
569
/* 16-byte alignment for the xmm spill buffer (movaps-compatible). */
#define ALIGN16 __attribute__((aligned(16)))

/*
 * Building blocks for the unrolled SSE loops below.  'x' is a 16-byte
 * index and 'y' an xmm register number; %1..%6 are the pointer
 * operands.  PFn prefetches (non-temporal) 256 bytes ahead in the
 * n-th source; LD/ST move p1[x] through xmmY; XOn xors the n-th extra
 * source into xmmY.
 */
#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%1)            ;\n"
#define LD(x,y)         "       movaps   "OFFS(x)"(%1), %%xmm"#y"       ;\n"
#define ST(x,y)         "       movaps %%xmm"#y",   "OFFS(x)"(%1)       ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%2)            ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%3)            ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%4)            ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%5)            ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%6)            ;\n"
#define XO1(x,y)        "       xorps   "OFFS(x)"(%2), %%xmm"#y"        ;\n"
#define XO2(x,y)        "       xorps   "OFFS(x)"(%3), %%xmm"#y"        ;\n"
#define XO3(x,y)        "       xorps   "OFFS(x)"(%4), %%xmm"#y"        ;\n"
#define XO4(x,y)        "       xorps   "OFFS(x)"(%5), %%xmm"#y"        ;\n"
#define XO5(x,y)        "       xorps   "OFFS(x)"(%6), %%xmm"#y"        ;\n"
586
 
587
 
588
/*
 * p1[] ^= p2[] over 'bytes' bytes (multiple of 256) using SSE
 * xmm0-xmm3, with prefetchnta ahead of both streams to avoid cache
 * pollution.  movaps requires both buffers 16-byte aligned.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;       /* 256 bytes per iteration */
        char xmm_save[16*4] ALIGN16;            /* spill area for xmm0-xmm3 */
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                PF1(i)                                  \
                                PF1(i+2)                \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2)
        :
        : "memory");

        XMMS_RESTORE;
}
640
 
641
/*
 * p1[] ^= p2[] ^ p3[] over 'bytes' bytes (multiple of 256) using SSE
 * xmm0-xmm3 with non-temporal prefetch of all three streams.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;       /* 256 bytes per iteration */
        char xmm_save[16*4] ALIGN16;            /* spill area for xmm0-xmm3 */
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                XO2(i,0)                         \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r"(p2), "+r"(p3)
        :
        : "memory" );

        XMMS_RESTORE;
}
701
 
702
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] over 'bytes' bytes (multiple of 256)
 * using SSE xmm0-xmm3 with non-temporal prefetch of all four streams.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;       /* 256 bytes per iteration */
        char xmm_save[16*4] ALIGN16;            /* spill area for xmm0-xmm3 */
        int cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO2(i,0)                         \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                XO3(i,0)                         \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
        :
        : "memory" );

        XMMS_RESTORE;
}
769
 
770
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[] over 'bytes' bytes (multiple of
 * 256) using SSE xmm0-xmm3.  As in the MMX 5-source version, p4/p5
 * are "r" inputs modified inside the asm and therefore preserved with
 * pushl/popl around the loop.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;       /* 256 bytes per iteration */
        char xmm_save[16*4] ALIGN16;            /* spill area for xmm0-xmm3 */
        int cr0;

        XMMS_SAVE;

        /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */
        __asm__ __volatile__ (
                " pushl %4\n"
                " pushl %5\n"
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i+2)                \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                PF2(i+2)                \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                                PF3(i+2)                \
                XO2(i,0)                         \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                PF4(i)                                  \
                                PF4(i+2)                \
                PF0(i+4)                                \
                                PF0(i+6)                \
                XO3(i,0)                         \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                XO4(i,0)                         \
                        XO4(i+1,1)                      \
                                XO4(i+2,2)              \
                                        XO4(i+3,3)      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       addl $256, %1           ;\n"
        "       addl $256, %2           ;\n"
        "       addl $256, %3           ;\n"
        "       addl $256, %4           ;\n"
        "       addl $256, %5           ;\n"
        "       decl %0                 ;\n"
        "       jnz 1b                  ;\n"
        "       popl %5\n"
        "       popl %4\n"
        : "+r" (lines),
          "+r" (p1), "+r" (p2), "+r" (p3)
        : "r" (p4), "r" (p5)
        : "memory");

        XMMS_RESTORE;
}
849
 
850
static struct xor_block_template xor_block_pIII_sse = {
851
        name: "pIII_sse",
852
        do_2: xor_sse_2,
853
        do_3: xor_sse_3,
854
        do_4: xor_sse_4,
855
        do_5: xor_sse_5,
856
};
857
 
858
/* Also try the generic routines.  */
#include <asm-generic/xor.h>

#undef XOR_TRY_TEMPLATES
/*
 * Benchmark the candidate implementations: the generic register
 * versions always, the SSE version only when the CPU has XMM support,
 * and the two MMX versions only when MMX is present.
 */
#define XOR_TRY_TEMPLATES                               \
        do {                                            \
                xor_speed(&xor_block_8regs);            \
                xor_speed(&xor_block_32regs);           \
                if (cpu_has_xmm)                        \
                        xor_speed(&xor_block_pIII_sse); \
                if (md_cpu_has_mmx()) {                 \
                        xor_speed(&xor_block_pII_mmx);  \
                        xor_speed(&xor_block_p5_mmx);   \
                }                                       \
        } while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.