OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [include/] [asm-x86_64/] [xor.h] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 */
33
/* 16-byte-aligned 16-byte slot; an array of four of these is used below
   to spill/restore the contents of xmm0-xmm3 by hand. */
typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
34
 
35
/* Doesn't use gcc to save the XMM registers, because there is no easy way to
   tell it to do a clts before the register saving. */
/*
 * Reads %cr0 into the local variable `cr0`, clears CR0.TS with clts so
 * SSE instructions can execute without faulting, then spills xmm0-xmm3
 * into the buffer pointed to by `xmm_save`.
 * NOTE(review): expands in-place; requires an `unsigned long cr0` and a
 * 64-byte `xmm_save` buffer (xmm_store_t xmm_save[4]) in scope at the
 * expansion site, and pairs with XMMS_RESTORE below.
 */
#define XMMS_SAVE                               \
        asm volatile (                  \
                "movq %%cr0,%0          ;\n\t"  \
                "clts                   ;\n\t"  \
                "movups %%xmm0,(%1)     ;\n\t"  \
                "movups %%xmm1,0x10(%1) ;\n\t"  \
                "movups %%xmm2,0x20(%1) ;\n\t"  \
                "movups %%xmm3,0x30(%1) ;\n\t"  \
                : "=&r" (cr0)                   \
                : "r" (xmm_save)                \
                : "memory")
48
 
49
/*
 * Counterpart of XMMS_SAVE: fences the outstanding non-temporal stores
 * (sfence), reloads xmm0-xmm3 from `xmm_save`, and writes the saved
 * `cr0` value back (restoring the original CR0.TS state).
 */
#define XMMS_RESTORE                            \
        asm volatile (                  \
                "sfence                 ;\n\t"  \
                "movups (%1),%%xmm0     ;\n\t"  \
                "movups 0x10(%1),%%xmm1 ;\n\t"  \
                "movups 0x20(%1),%%xmm2 ;\n\t"  \
                "movups 0x30(%1),%%xmm3 ;\n\t"  \
                "movq   %0,%%cr0        ;\n\t"  \
                :                               \
                : "r" (cr0), "r" (xmm_save)     \
                : "memory")
60
 
61
/*
 * Assembly-fragment building blocks for the xor_sse_* loops below.
 * `x` is a 16-byte-line index inside the current 128-byte chunk and
 * `y` an xmm register number; source pointers are referenced by the
 * named asm operands [p1]..[p6] of the enclosing asm statement.
 */
#define OFFS(x)         "16*("#x")"     /* byte offset of line x */
#define PF_OFFS(x)      "320+16*("#x")" /* prefetch 320 bytes ahead of line x */
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x,y)         "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
#define ST(x,y)         "       movntdq %%xmm"#y",   "OFFS(x)"(%[p1])   ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define PF5(x)          "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
#define XO1(x,y)        "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
#define XO2(x,y)        "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
#define XO3(x,y)        "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
#define XO4(x,y)        "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
#define XO5(x,y)        "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
76
 
77
/*
 * p1[i] ^= p2[i] over `bytes` bytes using SSE (movaps/xorps) with
 * cache-bypassing movntdq stores and prefetchnta ahead of both streams.
 * Each loop pass consumes one 128-byte chunk (8 x 16-byte lines), so
 * the caller must pass bytes as a non-zero multiple of 128 — the loop
 * body runs before the first decl/jnz test.  The interleaved BLOCK
 * scheduling is deliberate; do not reorder.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned int lines = bytes >> 7;        /* number of 128-byte chunks */
        unsigned long cr0;
        xmm_store_t xmm_save[4];

        XMMS_SAVE;

        asm volatile (
#undef BLOCK
#define BLOCK(i) \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                PF1(i)                                  \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF0(i+4)                                \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)

        " .p2align 4                    ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)

        "       decl %[cnt]\n"
        "       leaq 128(%[p1]),%[p1]\n"
        "       leaq 128(%[p2]),%[p2]\n"
        "       jnz 1b\n"
        : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
        :
        : "memory");

        XMMS_RESTORE;
}
123
 
124
/*
 * p1[i] ^= p2[i] ^ p3[i] over `bytes` bytes using SSE, with
 * non-temporal stores and prefetchnta on all three streams.
 * Same contract as xor_sse_2: bytes must be a non-zero multiple of
 * 128 (one chunk per loop pass; body runs before the decl/jnz test).
 * Instruction interleaving in BLOCK is deliberate scheduling.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned int lines = bytes >> 7;        /* number of 128-byte chunks */
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                PF0(i+4)                                \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                XO2(i,0)                         \
                        XO2(i+1,1)                      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)

        " .p2align 4                    ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)

        "       decl %[cnt]\n"
        "       leaq 128(%[p1]),%[p1]\n"
        "       leaq 128(%[p2]),%[p2]\n"
        "       leaq 128(%[p3]),%[p3]\n"
        "       jnz  1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        :
        : "memory");
        XMMS_RESTORE;
}
177
 
178
static void
179
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180
          unsigned long *p3, unsigned long *p4)
181
{
182
        unsigned int lines = bytes >> 7;
183
        xmm_store_t xmm_save[4];
184
        unsigned long cr0;
185
 
186
        XMMS_SAVE;
187
 
188
        __asm__ __volatile__ (
189
#undef BLOCK
190
#define BLOCK(i) \
191
                PF1(i)                                  \
192
                LD(i,0)                                  \
193
                        LD(i+1,1)                       \
194
                XO1(i,0)                         \
195
                        XO1(i+1,1)                      \
196
                                LD(i+2,2)               \
197
                                        LD(i+3,3)       \
198
                PF2(i)                                  \
199
                                XO1(i+2,2)              \
200
                                        XO1(i+3,3)      \
201
                PF3(i)                                  \
202
                PF0(i+4)                                \
203
                XO2(i,0)                         \
204
                        XO2(i+1,1)                      \
205
                                XO2(i+2,2)              \
206
                                        XO2(i+3,3)      \
207
                XO3(i,0)                         \
208
                        XO3(i+1,1)                      \
209
                ST(i,0)                                  \
210
                        ST(i+1,1)                       \
211
                                XO3(i+2,2)              \
212
                                        XO3(i+3,3)      \
213
                                ST(i+2,2)               \
214
                                        ST(i+3,3)       \
215
 
216
 
217
                PF0(0)
218
 
219
        " .align 32                     ;\n"
220
        " 1:                            ;\n"
221
 
222
                BLOCK(0)
223
                BLOCK(4)
224
 
225
        "       decl %[cnt]\n"
226
        "       leaq 128(%[p1]),%[p1]\n"
227
        "       leaq 128(%[p2]),%[p2]\n"
228
        "       leaq 128(%[p3]),%[p3]\n"
229
        "       leaq 128(%[p4]),%[p4]\n"
230
        "       jnz  1b"
231
        : [cnt] "+r" (lines),
232
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
233
        :
234
        : "memory" );
235
 
236
        XMMS_RESTORE;
237
}
238
 
239
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over `bytes` bytes using SSE,
 * with non-temporal stores and prefetchnta on all five streams.
 * Same contract as xor_sse_2: bytes must be a non-zero multiple of
 * 128 (one chunk per loop pass; body runs before the decl/jnz test).
 * Instruction interleaving in BLOCK is deliberate scheduling.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned int lines = bytes >> 7;        /* number of 128-byte chunks */
        xmm_store_t xmm_save[4];
        unsigned long cr0;

        XMMS_SAVE;

        __asm__ __volatile__ (
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                LD(i,0)                                  \
                        LD(i+1,1)                       \
                XO1(i,0)                         \
                        XO1(i+1,1)                      \
                                LD(i+2,2)               \
                                        LD(i+3,3)       \
                PF2(i)                                  \
                                XO1(i+2,2)              \
                                        XO1(i+3,3)      \
                PF3(i)                                  \
                XO2(i,0)                         \
                        XO2(i+1,1)                      \
                                XO2(i+2,2)              \
                                        XO2(i+3,3)      \
                PF4(i)                                  \
                PF0(i+4)                                \
                XO3(i,0)                         \
                        XO3(i+1,1)                      \
                                XO3(i+2,2)              \
                                        XO3(i+3,3)      \
                XO4(i,0)                         \
                        XO4(i+1,1)                      \
                ST(i,0)                                  \
                        ST(i+1,1)                       \
                                XO4(i+2,2)              \
                                        XO4(i+3,3)      \
                                ST(i+2,2)               \
                                        ST(i+3,3)       \


                PF0(0)

        " .p2align 4                    ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)

        "       decl %[cnt]\n"
        "       leaq 128(%[p1]),%[p1]\n"
        "       leaq 128(%[p2]),%[p2]\n"
        "       leaq 128(%[p3]),%[p3]\n"
        "       leaq 128(%[p4]),%[p4]\n"
        "       leaq 128(%[p5]),%[p5]\n"
        "       jnz  1b"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
          [p5] "+r" (p5)
        :
        : "memory");

        XMMS_RESTORE;
}
306
 
307
/*
 * STORE_NTI(x, mem): non-temporal (cache-bypassing) store of x into the
 * lvalue mem, via the movnti builtin where gcc >= 3.3 provides it, and
 * hand-written movnti asm otherwise.
 * BUG FIX: the version test used "__GNUC__MINOR__" (undefined; an #if
 * treats it as 0) instead of the predefined macro __GNUC_MINOR__, so
 * gcc 3.3/3.4 fell through to the asm fallback.
 * NOTE(review): callers below pass unsigned long values — confirm the
 * builtin's operand width matches on the target compiler.
 */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define STORE_NTI(x,mem) __builtin_ia32_movnti(&(mem), (x))
#else
#define STORE_NTI(x,mem)  asm("movnti %1,%0" : "=m" (mem) : "r" (x))
#endif
312
 
313
 
314
/*
 * p1[i] ^= p2[i] over `bytes` bytes using plain 64-bit integer
 * registers, writing results back with non-temporal stores (STORE_NTI)
 * and prefetching ahead on both streams.  Works in 64-byte chunks
 * (8 longs); the do-while body runs once before the count is tested,
 * so bytes is assumed to be a non-zero multiple of 64.
 * The manual 8-way unroll is intentional — keep the burst pattern.
 */
static void
xor_64regs_stream_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        long lines = bytes / (sizeof (long)) / 8;

        do {
                register long r0, r1, r2, r3, r4, r5, r6, r7;
                /* Burst-load one 64-byte chunk of the destination. */
                r0 = p1[0];
                r1 = p1[1];
                r2 = p1[2];
                r3 = p1[3];
                r4 = p1[4];
                r5 = p1[5];
                r6 = p1[6];
                r7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* Fold in the source block. */
                r0 ^= p2[0];
                r1 ^= p2[1];
                r2 ^= p2[2];
                r3 ^= p2[3];
                r4 ^= p2[4];
                r5 ^= p2[5];
                r6 ^= p2[6];
                r7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* Stream the results back past the cache. */
                STORE_NTI(r0, p1[0]);
                STORE_NTI(r1, p1[1]);
                STORE_NTI(r2, p1[2]);
                STORE_NTI(r3, p1[3]);
                STORE_NTI(r4, p1[4]);
                STORE_NTI(r5, p1[5]);
                STORE_NTI(r6, p1[6]);
                STORE_NTI(r7, p1[7]);
                p1 += 8;
                p2 += 8;
        } while (--lines > 0);
}
351
 
352
/*
 * p1[i] ^= p2[i] ^ p3[i] over `bytes` bytes using 64-bit integer
 * registers with non-temporal stores and prefetching on every stream.
 * Same contract as xor_64regs_stream_2: bytes is assumed to be a
 * non-zero multiple of 64 (the do-while body runs at least once).
 * The manual 8-way unroll is intentional — keep the burst pattern.
 */
static void
xor_64regs_stream_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3)
{
        long lines = bytes / (sizeof (long)) / 8;

        do {
                register long r0, r1, r2, r3, r4, r5, r6, r7;
                /* Burst-load one 64-byte chunk of the destination. */
                r0 = p1[0];
                r1 = p1[1];
                r2 = p1[2];
                r3 = p1[3];
                r4 = p1[4];
                r5 = p1[5];
                r6 = p1[6];
                r7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* Fold in the second block. */
                r0 ^= p2[0];
                r1 ^= p2[1];
                r2 ^= p2[2];
                r3 ^= p2[3];
                r4 ^= p2[4];
                r5 ^= p2[5];
                r6 ^= p2[6];
                r7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* Fold in the third block. */
                r0 ^= p3[0];
                r1 ^= p3[1];
                r2 ^= p3[2];
                r3 ^= p3[3];
                r4 ^= p3[4];
                r5 ^= p3[5];
                r6 ^= p3[6];
                r7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                /* Stream the results back past the cache. */
                STORE_NTI(r0, p1[0]);
                STORE_NTI(r1, p1[1]);
                STORE_NTI(r2, p1[2]);
                STORE_NTI(r3, p1[3]);
                STORE_NTI(r4, p1[4]);
                STORE_NTI(r5, p1[5]);
                STORE_NTI(r6, p1[6]);
                STORE_NTI(r7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
        } while (--lines > 0);
}
400
 
401
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over `bytes` bytes using 64-bit
 * integer registers with non-temporal stores and prefetching on every
 * stream.  Same contract as xor_64regs_stream_2: bytes is assumed to
 * be a non-zero multiple of 64 (the do-while body runs at least once).
 * The manual 8-way unroll is intentional — keep the burst pattern.
 */
static void
xor_64regs_stream_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3, unsigned long *p4)
{
        long lines = bytes / (sizeof (long)) / 8;

        do {
                register long r0, r1, r2, r3, r4, r5, r6, r7;
                /* Burst-load one 64-byte chunk of the destination. */
                r0 = p1[0];
                r1 = p1[1];
                r2 = p1[2];
                r3 = p1[3];
                r4 = p1[4];
                r5 = p1[5];
                r6 = p1[6];
                r7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* Fold in the second block. */
                r0 ^= p2[0];
                r1 ^= p2[1];
                r2 ^= p2[2];
                r3 ^= p2[3];
                r4 ^= p2[4];
                r5 ^= p2[5];
                r6 ^= p2[6];
                r7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* Fold in the third block. */
                r0 ^= p3[0];
                r1 ^= p3[1];
                r2 ^= p3[2];
                r3 ^= p3[3];
                r4 ^= p3[4];
                r5 ^= p3[5];
                r6 ^= p3[6];
                r7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                /* Fold in the fourth block. */
                r0 ^= p4[0];
                r1 ^= p4[1];
                r2 ^= p4[2];
                r3 ^= p4[3];
                r4 ^= p4[4];
                r5 ^= p4[5];
                r6 ^= p4[6];
                r7 ^= p4[7];
                __builtin_prefetch(p4 + 5*64, 0, 0);
                /* Stream the results back past the cache. */
                STORE_NTI(r0, p1[0]);
                STORE_NTI(r1, p1[1]);
                STORE_NTI(r2, p1[2]);
                STORE_NTI(r3, p1[3]);
                STORE_NTI(r4, p1[4]);
                STORE_NTI(r5, p1[5]);
                STORE_NTI(r6, p1[6]);
                STORE_NTI(r7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
        } while (--lines > 0);
}
459
 
460
/*
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over `bytes` bytes using
 * 64-bit integer registers with non-temporal stores and prefetching on
 * every stream.  Same contract as xor_64regs_stream_2: bytes is
 * assumed to be a non-zero multiple of 64 (body runs at least once).
 * The manual 8-way unroll is intentional — keep the burst pattern.
 */
static void
xor_64regs_stream_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
            unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        long lines = bytes / (sizeof (long)) / 8;

        do {
                register long r0, r1, r2, r3, r4, r5, r6, r7;
                /* Burst-load one 64-byte chunk of the destination. */
                r0 = p1[0];
                r1 = p1[1];
                r2 = p1[2];
                r3 = p1[3];
                r4 = p1[4];
                r5 = p1[5];
                r6 = p1[6];
                r7 = p1[7];
                __builtin_prefetch(p1 + 5*64, 0, 0);
                /* Fold in the second block. */
                r0 ^= p2[0];
                r1 ^= p2[1];
                r2 ^= p2[2];
                r3 ^= p2[3];
                r4 ^= p2[4];
                r5 ^= p2[5];
                r6 ^= p2[6];
                r7 ^= p2[7];
                __builtin_prefetch(p2 + 5*64, 0, 0);
                /* Fold in the third block. */
                r0 ^= p3[0];
                r1 ^= p3[1];
                r2 ^= p3[2];
                r3 ^= p3[3];
                r4 ^= p3[4];
                r5 ^= p3[5];
                r6 ^= p3[6];
                r7 ^= p3[7];
                __builtin_prefetch(p3 + 5*64, 0, 0);
                /* Fold in the fourth block. */
                r0 ^= p4[0];
                r1 ^= p4[1];
                r2 ^= p4[2];
                r3 ^= p4[3];
                r4 ^= p4[4];
                r5 ^= p4[5];
                r6 ^= p4[6];
                r7 ^= p4[7];
                __builtin_prefetch(p4 + 5*64, 0, 0);
                /* Fold in the fifth block. */
                r0 ^= p5[0];
                r1 ^= p5[1];
                r2 ^= p5[2];
                r3 ^= p5[3];
                r4 ^= p5[4];
                r5 ^= p5[5];
                r6 ^= p5[6];
                r7 ^= p5[7];
                __builtin_prefetch(p5 + 5*64, 0, 0);
                /* Stream the results back past the cache. */
                STORE_NTI(r0, p1[0]);
                STORE_NTI(r1, p1[1]);
                STORE_NTI(r2, p1[2]);
                STORE_NTI(r3, p1[3]);
                STORE_NTI(r4, p1[4]);
                STORE_NTI(r5, p1[5]);
                STORE_NTI(r6, p1[6]);
                STORE_NTI(r7, p1[7]);
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
                p5 += 8;
        } while (--lines > 0);
}
528
 
529
 
530
/* Dispatch table entry for the SSE implementation (128 bytes per
   inner-loop pass).  Uses old gcc "label:" initializer syntax, as the
   rest of this file targets gcc 2.x/3.x. */
static struct xor_block_template xor_block_sse = {
        name: "128byte sse streaming",
        do_2: xor_sse_2,
        do_3: xor_sse_3,
        do_4: xor_sse_4,
        do_5: xor_sse_5,
};
537
 
538
/* Dispatch table entry for the integer-register implementation
   (64 bytes per inner-loop pass, non-temporal stores). */
static struct xor_block_template xor_block_64regs_stream = {
        name: "64byte int streaming",
        do_2: xor_64regs_stream_2,
        do_3: xor_64regs_stream_3,
        do_4: xor_64regs_stream_4,
        do_5: xor_64regs_stream_5,
};
545
 
546
/* AK: the speed test is useless: it only tests cache hot */
#undef XOR_TRY_TEMPLATES
/* Benchmark both candidate implementations (expanded by the generic
   xor code, which provides xor_speed()). */
#define XOR_TRY_TEMPLATES                               \
        do {                                            \
                xor_speed(&xor_block_sse);      \
                xor_speed(&xor_block_64regs_stream);    \
        } while (0)

/* Simply use whichever template the benchmark found fastest — no
   forced override on x86-64. */
#define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.