OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [rs6000/] [si2vmx.h] - Blame information for rev 749

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 709 jeremybenn
/* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2
   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
3
 
4
   This file is free software; you can redistribute it and/or modify it under
5
   the terms of the GNU General Public License as published by the Free
6
   Software Foundation; either version 3 of the License, or (at your option)
7
   any later version.
8
 
9
   This file is distributed in the hope that it will be useful, but WITHOUT
10
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12
   for more details.
13
 
14
   Under Section 7 of GPL version 3, you are granted additional
15
   permissions described in the GCC Runtime Library Exception, version
16
   3.1, as published by the Free Software Foundation.
17
 
18
   You should have received a copy of the GNU General Public License and
19
   a copy of the GCC Runtime Library Exception along with this program;
20
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
21
   <http://www.gnu.org/licenses/>.  */
22
 
23
#ifndef _SI2VMX_H_
24
#define _SI2VMX_H_      1
25
 
26
#ifndef __SPU__
27
 
28
#include <stdlib.h>
29
#include <vec_types.h>
30
 
31
 
32
/* Default halt action, used in this header by the compare-and-halt
 * helpers (si_heq, si_heqi, si_hgt, si_hgti, si_hlgt, si_hlgti).
 * Define SPU_HALT_ACTION before including this header to override it.
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION         abort()
#endif
39
 
40
/* Default stop action for the spu_stop intrinsic.
 * Define SPU_STOP_ACTION before including this header to override it.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION         abort()
#endif
47
 
48
 
49
/* Default action taken for an unsupported intrinsic.
 * Define SPU_UNSUPPORTED_ACTION before including this header to
 * override it.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION  abort()
#endif
56
 
57
 
58
/* Casting intrinsics - from scalar to quadword
59
 */
60
 
61
/* Build a quadword holding C in byte element 3.  Only that byte is
   written; the remaining fifteen bytes are left unset.  */
static __inline qword si_from_uchar(unsigned char c)
{
  union {
    qword vec;
    unsigned char lane[16];
  } pun;

  pun.lane[3] = c;
  return pun.vec;
}
69
 
70
/* Build a quadword holding C in byte element 3.  Only that byte is
   written; the remaining fifteen bytes are left unset.  */
static __inline qword si_from_char(signed char c)
{
  union {
    qword vec;
    signed char lane[16];
  } pun;

  pun.lane[3] = c;
  return pun.vec;
}
78
 
79
/* Build a quadword holding S in halfword element 1.  Only that element
   is written; the other seven halfwords are left unset.  */
static __inline qword si_from_ushort(unsigned short s)
{
  union {
    qword vec;
    unsigned short lane[8];
  } pun;

  pun.lane[1] = s;
  return pun.vec;
}
87
 
88
/* Build a quadword holding S in halfword element 1.  Only that element
   is written; the other seven halfwords are left unset.  */
static __inline qword si_from_short(short s)
{
  union {
    qword vec;
    short lane[8];
  } pun;

  pun.lane[1] = s;
  return pun.vec;
}
96
 
97
 
98
/* Build a quadword holding I in word element 0.  Only that element is
   written; the other three words are left unset.  */
static __inline qword si_from_uint(unsigned int i)
{
  union {
    qword vec;
    unsigned int lane[4];
  } pun;

  pun.lane[0] = i;
  return pun.vec;
}
106
 
107
/* Build a quadword holding I in word element 0.  Only that element is
   written; the other three words are left unset.  */
static __inline qword si_from_int(int i)
{
  union {
    qword vec;
    int lane[4];
  } pun;

  pun.lane[0] = i;
  return pun.vec;
}
115
 
116
/* Build a quadword holding L in doubleword element 0.  Only that
   element is written; the second doubleword is left unset.  */
static __inline qword si_from_ullong(unsigned long long l)
{
  union {
    qword vec;
    unsigned long long lane[2];
  } pun;

  pun.lane[0] = l;
  return pun.vec;
}
124
 
125
/* Build a quadword holding L in doubleword element 0.  Only that
   element is written; the second doubleword is left unset.  */
static __inline qword si_from_llong(long long l)
{
  union {
    qword vec;
    long long lane[2];
  } pun;

  pun.lane[0] = l;
  return pun.vec;
}
133
 
134
/* Build a quadword holding F in float element 0.  Only that element is
   written; the other three floats are left unset.  */
static __inline qword si_from_float(float f)
{
  union {
    qword vec;
    float lane[4];
  } pun;

  pun.lane[0] = f;
  return pun.vec;
}
142
 
143
/* Build a quadword holding D in double element 0.  Only that element
   is written; the second double is left unset.  */
static __inline qword si_from_double(double d)
{
  union {
    qword vec;
    double lane[2];
  } pun;

  pun.lane[0] = d;
  return pun.vec;
}
151
 
152
/* Build a quadword whose leading bytes hold the pointer PTR.  Only the
   pointer member of the union is written.  */
static __inline qword si_from_ptr(void *ptr)
{
  union {
    qword vec;
    void *addr;
  } pun;

  pun.addr = ptr;
  return pun.vec;
}
160
 
161
 
162
/* Casting intrinsics - from quadword to scalar
163
 */
164
/* Return byte element 3 of Q as an unsigned char.  */
static __inline unsigned char si_to_uchar(qword q)
{
  union {
    qword vec;
    unsigned char lane[16];
  } pun;

  pun.vec = q;
  return pun.lane[3];
}
172
 
173
/* Return byte element 3 of Q as a signed char.  */
static __inline signed char si_to_char(qword q)
{
  union {
    qword vec;
    signed char lane[16];
  } pun;

  pun.vec = q;
  return pun.lane[3];
}
181
 
182
/* Return halfword element 1 of Q as an unsigned short.  */
static __inline unsigned short si_to_ushort(qword q)
{
  union {
    qword vec;
    unsigned short lane[8];
  } pun;

  pun.vec = q;
  return pun.lane[1];
}
190
 
191
/* Return halfword element 1 of Q as a short.  */
static __inline short si_to_short(qword q)
{
  union {
    qword vec;
    short lane[8];
  } pun;

  pun.vec = q;
  return pun.lane[1];
}
199
 
200
/* Return word element 0 of Q as an unsigned int.  */
static __inline unsigned int si_to_uint(qword q)
{
  union {
    qword vec;
    unsigned int lane[4];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
208
 
209
/* Return word element 0 of Q as an int.  */
static __inline int si_to_int(qword q)
{
  union {
    qword vec;
    int lane[4];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
217
 
218
/* Return doubleword element 0 of Q as an unsigned long long.  */
static __inline unsigned long long si_to_ullong(qword q)
{
  union {
    qword vec;
    unsigned long long lane[2];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
226
 
227
/* Return doubleword element 0 of Q as a long long.  */
static __inline long long si_to_llong(qword q)
{
  union {
    qword vec;
    long long lane[2];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
235
 
236
/* Return float element 0 of Q.  */
static __inline float si_to_float(qword q)
{
  union {
    qword vec;
    float lane[4];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
244
 
245
/* Return double element 0 of Q.  */
static __inline double si_to_double(qword q)
{
  union {
    qword vec;
    double lane[2];
  } pun;

  pun.vec = q;
  return pun.lane[0];
}
253
 
254
/* Return the pointer stored in the leading bytes of Q.  */
static __inline void * si_to_ptr(qword q)
{
  union {
    qword vec;
    void *addr;
  } pun;

  pun.vec = q;
  return pun.addr;
}
262
 
263
 
264
/* Absolute difference
265
 */
266
/* Per-byte absolute difference of A and B, both treated as sixteen
   unsigned bytes: a-b where a > b, otherwise b-a.  */
static __inline qword si_absdb(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);
  vec_uchar16 rhs_minus_lhs = vec_sub(rhs, lhs);
  vec_uchar16 lhs_minus_rhs = vec_sub(lhs, rhs);

  /* Select lhs-rhs in the lanes where lhs is the larger operand.  */
  return ((qword)(vec_sel(rhs_minus_lhs, lhs_minus_rhs, vec_cmpgt(lhs, rhs))));
}
276
 
277
/* Add intrinsics
278
 */
279
/* Word add: both operands are viewed as four unsigned 32-bit elements.  */
#define si_a(_x, _y) \
  ((qword)(vec_add((vec_uint4)(_x), (vec_uint4)(_y))))
280
 
281
/* Halfword add: both operands are viewed as eight unsigned 16-bit
   elements.  */
#define si_ah(_x, _y) \
  ((qword)(vec_add((vec_ushort8)(_x), (vec_ushort8)(_y))))
282
 
283
/* Add the scalar B to each of the four word elements of A.  */
static __inline qword si_ai(qword a, int b)
{
  /* si_from_int puts B in word 0; replicate it into every word.  */
  vec_int4 addend = vec_splat((vec_int4)(si_from_int(b)), 0);

  return ((qword)(vec_add((vec_int4)(a), addend)));
}
288
 
289
 
290
/* Add the scalar B to each of the eight halfword elements of A.  */
static __inline qword si_ahi(qword a, short b)
{
  /* si_from_short puts B in halfword 1; replicate it into every lane.  */
  vec_short8 addend = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_add((vec_short8)(a), addend)));
}
295
 
296
 
297
/* Single-precision add: both operands are viewed as four floats.  */
#define si_fa(_x, _y) \
  ((qword)(vec_add((vec_float4)(_x), (vec_float4)(_y))))
298
 
299
 
300
/* Double-precision add.  A and B are each viewed as two doubles, and
   the two sums are computed with scalar arithmetic through a union.  */
static __inline qword si_dfa(qword a, qword b)
{
  union {
    vec_double2 v;
    double d[2];
  } lhs, rhs, sum;
  int k;

  lhs.v = (vec_double2)(a);
  rhs.v = (vec_double2)(b);

  for (k = 0; k < 2; k++)
    sum.d[k] = lhs.d[k] + rhs.d[k];

  return ((qword)(sum.v));
}
314
 
315
/* Add word extended
316
 */
317
/* Per-word sum of the first two operands plus the least significant
   bit of each word of the third operand.  */
#define si_addx(_x, _y, _cin)                                           \
  ((qword)(vec_add(vec_add((vec_uint4)(_x), (vec_uint4)(_y)),           \
                   vec_and((vec_uint4)(_cin), vec_splat_u32(1)))))
319
 
320
 
321
/* Bit-wise AND
322
 */
323
/* Bit-wise AND of the two operands.  */
#define si_and(_x, _y) \
  ((qword)(vec_and((vec_uint4)(_x), (vec_uint4)(_y))))
324
 
325
 
326
/* AND every byte element of A with the scalar B.  */
static __inline qword si_andbi(qword a, signed char b)
{
  /* si_from_char puts B in byte 3; replicate it into every byte.  */
  vec_char16 mask = vec_splat((vec_char16)(si_from_char(b)), 3);

  return ((qword)(vec_and((vec_char16)(a), mask)));
}
331
 
332
/* AND every halfword element of A with the scalar B.  */
static __inline qword si_andhi(qword a, signed short b)
{
  /* si_from_short puts B in halfword 1; replicate it into every lane.  */
  vec_short8 mask = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_and((vec_short8)(a), mask)));
}
337
 
338
 
339
/* AND every word element of A with the scalar B.  */
static __inline qword si_andi(qword a, signed int b)
{
  /* si_from_int puts B in word 0; replicate it into every word.  */
  vec_int4 mask = vec_splat((vec_int4)(si_from_int(b)), 0);

  return ((qword)(vec_and((vec_int4)(a), mask)));
}
344
 
345
 
346
/* Bit-wise AND with complement
347
 */
348
/* Maps directly onto vec_andc with both operands viewed as unsigned
   bytes.  */
#define si_andc(_x, _y) \
  ((qword)(vec_andc((vec_uchar16)(_x), (vec_uchar16)(_y))))
349
 
350
 
351
/* Average byte vectors
352
 */
353
/* Maps directly onto vec_avg with both operands viewed as unsigned
   bytes.  */
#define si_avgb(_x, _y) \
  ((qword)(vec_avg((vec_uchar16)(_x), (vec_uchar16)(_y))))
354
 
355
 
356
/* Branch indirect and set link on external data
357
 */
358
/* Not mappable: each of these three macros expands to nothing.  */
#define si_bisled(_fn)
#define si_bisledd(_fn)
#define si_bislede(_fn)
361
 
362
 
363
/* Borrow generate
364
 */
365
/* Borrow generate: vec_subc applied to (second operand, first operand),
   both viewed as unsigned words.  Note the swapped operand order.  */
#define si_bg(_x, _y) \
  ((qword)(vec_subc((vec_uint4)(_y), (vec_uint4)(_x))))
366
 
367
/* Extended borrow generate.  Per unsigned word the result is 1 when
   the second operand is greater than the first, or when they are equal
   and the least significant bit of the third operand is set; else 0.  */
#define si_bgx(_x, _y, _bin)                                                    \
  ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_y), (vec_uint4)(_x)),          \
                          vec_and(vec_cmpeq((vec_uint4)(_y), (vec_uint4)(_x)),  \
                                  (vec_uint4)(_bin))),                          \
                   vec_splat_u32(1))))
370
 
371
/* Compare absolute equal
372
 */
373
/* Single-precision compare of magnitudes for equality: the sign bit of
   every float lane is cleared in both operands before vec_cmpeq.  */
static __inline qword si_fcmeq(qword a, qword b)
{
  vec_float4 sign_bits = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
  vec_float4 a_mag = vec_andc((vec_float4)(a), sign_bits);
  vec_float4 b_mag = vec_andc((vec_float4)(b), sign_bits);

  return ((qword)(vec_cmpeq(a_mag, b_mag)));
}
380
 
381
/* Double-precision compare of magnitudes for equality.  Each quadword
   is handled as two doubles made of a (high word, low word) pair.  A
   lane compares true when both words match after the sign bit is
   cleared and A is not a NaN; the verdict is produced in the high word
   and then copied across the whole 64-bit lane.  */
static __inline qword si_dfcmeq(qword a, qword b)
{
  vec_uint4 abs_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 inf_bits = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hi_to_both = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  union {
    vec_uchar16 v;
    int i[4];
  } shift;

  vec_uint4 a_mag, b_mag;
  vec_uint4 same;
  vec_uint4 above_inf, equals_inf, a_is_nan;
  vec_uint4 verdict;

  /* vec_slo shift count: 4 bytes, expressed in bits.  */
  shift.i[3] = 4 << 3;

  /* Strip the sign bits.  */
  a_mag = vec_and((vec_uint4)a, abs_mask);
  b_mag = vec_and((vec_uint4)b, abs_mask);

  /* A) Word-wise equality; AND the low-word result into the high word.  */
  same = (vec_uint4) vec_cmpeq((vec_uint4)a_mag, (vec_uint4)b_mag);
  same = vec_and(same, (vec_uint4)vec_slo((vec_uchar16)same, shift.v));

  /* B) NaN test on A: high word above the infinity pattern, or high
     word equal to it with a non-zero low word.  */
  above_inf = (vec_uint4)vec_cmpgt(a_mag, inf_bits);
  equals_inf = (vec_uint4)vec_cmpeq(a_mag, inf_bits);
  a_is_nan = (vec_uint4)vec_or(above_inf,
                               vec_and((vec_uint4)vec_slo((vec_uchar16)above_inf, shift.v),
                                       equals_inf));

  /* Equal and not NaN.  */
  verdict = vec_andc(same, a_is_nan);

  /* Copy each high word over its low word.  */
  return ((qword)(vec_perm((vec_uchar16)verdict, (vec_uchar16)verdict, hi_to_both)));
}
431
 
432
 
433
/* Compare absolute greater than
434
 */
435
/* Single-precision compare of magnitudes, greater than: the sign bit
   of every float lane is cleared in both operands before vec_cmpgt.  */
static __inline qword si_fcmgt(qword a, qword b)
{
  vec_float4 sign_bits = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
  vec_float4 a_mag = vec_andc((vec_float4)(a), sign_bits);
  vec_float4 b_mag = vec_andc((vec_float4)(b), sign_bits);

  return ((qword)(vec_cmpgt(a_mag, b_mag)));
}
442
 
443
/* Double-precision compare of magnitudes, greater than.  Each quadword
   is handled as two doubles made of a (high word, low word) pair.  A
   lane is true when |a| > |b|, judged on the high words first and on
   the low words when the high words are equal, and neither input is a
   NaN.  The verdict is replicated across the whole 64-bit lane.  */
static __inline qword si_dfcmgt(qword a, qword b)
{
  vec_uchar16 dup_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 inf_bits = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 abs_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } shift;

  /* vec_slo shift count: 4 bytes, expressed in bits.  */
  shift.i[3] = 4 << 3;

  /* Magnitudes of both operands.  */
  vec_uint4 a_mag = vec_and((vec_uint4)a, abs_mask);
  vec_uint4 b_mag = vec_and((vec_uint4)b, abs_mask);

  /* NaN test on a, spread over both words of each lane.  */
  vec_uint4 a_eq_inf = (vec_uint4)vec_cmpeq(a_mag, inf_bits);
  vec_uint4 a_is_nan = (vec_uint4)vec_cmpgt(a_mag, inf_bits);
  a_is_nan = vec_or(a_is_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_is_nan, shift.v), a_eq_inf));
  a_is_nan = (vec_uint4)vec_perm((vec_uchar16)a_is_nan, (vec_uchar16)a_is_nan, dup_hi);

  /* NaN test on b, spread over both words of each lane.  */
  vec_uint4 b_eq_inf = (vec_uint4)vec_cmpeq(b_mag, inf_bits);
  vec_uint4 b_is_nan = (vec_uint4)vec_cmpgt(b_mag, inf_bits);
  b_is_nan = vec_or(b_is_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_is_nan, shift.v), b_eq_inf));
  b_is_nan = (vec_uint4)vec_perm((vec_uchar16)b_is_nan, (vec_uchar16)b_is_nan, dup_hi);

  /* A) word-wise greater-than; the high-word result is used directly.  */
  vec_uint4 hi_gt = (vec_uint4)vec_cmpgt(a_mag, b_mag);

  /* B) high words equal and low word greater; the low-word result is
     shifted up into the high-word position.  */
  vec_uint4 lo_gt = (vec_uint4)vec_cmpgt((vec_uint4)a_mag, (vec_uint4)b_mag);
  vec_uint4 words_eq = (vec_uint4)vec_cmpeq(a_mag, b_mag);
  vec_uint4 eq_then_gt = vec_and(words_eq, vec_slo(lo_gt, shift.v));

  /* A or B, then replicate the high-word verdict over the lane.  */
  vec_uint4 verdict = vec_or(hi_gt, eq_then_gt);
  verdict = (vec_uint4)vec_perm((vec_uchar16)verdict, (vec_uchar16)verdict, dup_hi);

  /* Force lanes with a NaN input to false.  */
  return ((qword)vec_andc(verdict, vec_or(a_is_nan, b_is_nan)));
}
490
 
491
 
492
/* Compare equal
493
 */
494
/* Byte-wise compare equal (sixteen unsigned byte lanes).  */
static __inline qword si_ceqb(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);

  return ((qword)(vec_cmpeq(lhs, rhs)));
}
498
 
499
/* Halfword-wise compare equal (eight unsigned halfword lanes).  */
static __inline qword si_ceqh(qword a, qword b)
{
  vec_ushort8 lhs = (vec_ushort8)(a);
  vec_ushort8 rhs = (vec_ushort8)(b);

  return ((qword)(vec_cmpeq(lhs, rhs)));
}
503
 
504
/* Word-wise compare equal (four unsigned word lanes).  */
static __inline qword si_ceq(qword a, qword b)
{
  vec_uint4 lhs = (vec_uint4)(a);
  vec_uint4 rhs = (vec_uint4)(b);

  return ((qword)(vec_cmpeq(lhs, rhs)));
}
508
 
509
/* Single-precision compare equal (four float lanes).  */
static __inline qword si_fceq(qword a, qword b)
{
  vec_float4 lhs = (vec_float4)(a);
  vec_float4 rhs = (vec_float4)(b);

  return ((qword)(vec_cmpeq(lhs, rhs)));
}
513
 
514
/* Compare every signed byte element of A for equality with B.  */
static __inline qword si_ceqbi(qword a, signed char b)
{
  /* si_from_char puts B in byte 3; replicate it into every byte.  */
  vec_char16 imm = vec_splat((vec_char16)(si_from_char(b)), 3);

  return ((qword)(vec_cmpeq((vec_char16)(a), imm)));
}
519
 
520
/* Compare every signed halfword element of A for equality with B.  */
static __inline qword si_ceqhi(qword a, signed short b)
{
  /* si_from_short puts B in halfword 1; replicate it into every lane.  */
  vec_short8 imm = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_cmpeq((vec_short8)(a), imm)));
}
525
 
526
/* Compare every signed word element of A for equality with B.  */
static __inline qword si_ceqi(qword a, signed int b)
{
  /* si_from_int puts B in word 0; replicate it into every word.  */
  vec_int4 imm = vec_splat((vec_int4)(si_from_int(b)), 0);

  return ((qword)(vec_cmpeq((vec_int4)(a), imm)));
}
531
 
532
/* Double-precision compare equal.  Each quadword is handled as two
   doubles made of a (high word, low word) pair.  A lane is true when
   the operands are bit-identical, or both are zero of either sign, and
   A is not a NaN.  The verdict is produced in the high word and then
   copied across the whole 64-bit lane.  */
static __inline qword si_dfceq(qword a, qword b)
{
  vec_uint4 abs_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 inf_bits = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  vec_uchar16 hi_to_both = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  union {
    vec_uchar16 v;
    int i[4];
  } shift;

  vec_uint4 same;
  vec_uint4 a_mag, b_mag;
  vec_uint4 above_inf, equals_inf, a_is_nan;
  vec_uint4 both_zero;
  vec_uint4 verdict;

  /* vec_slo shift count: 4 bytes, expressed in bits.  */
  shift.i[3] = 4 << 3;

  /* A) Word-wise bit equality; AND the low-word result into the high
     word.  */
  same = (vec_uint4) vec_cmpeq((vec_uint4)a, (vec_uint4)b);
  same = vec_and(same, (vec_uint4)vec_slo((vec_uchar16)same, shift.v));

  /* Strip the sign bits.  */
  a_mag = vec_and((vec_uint4)a, abs_mask);
  b_mag = vec_and((vec_uint4)b, abs_mask);

  /* B) NaN test on A: high word above the infinity pattern, or high
     word equal to it with a non-zero low word.  */
  above_inf = (vec_uint4)vec_cmpgt(a_mag, inf_bits);
  equals_inf = (vec_uint4)vec_cmpeq(a_mag, inf_bits);
  a_is_nan = (vec_uint4)vec_or(above_inf,
                               vec_and((vec_uint4)vec_slo((vec_uchar16)above_inf, shift.v),
                                       equals_inf));

  /* C) +0 equals -0: both magnitudes are zero in both words.  */
  both_zero = (vec_uint4)vec_cmpeq((vec_uint4)vec_or(a_mag, b_mag), (vec_uint4)vec_splat_u32(0));
  both_zero = vec_and(both_zero, (vec_uint4)vec_slo((vec_uchar16)both_zero, shift.v));

  /* (A or C) and not B.  */
  verdict = vec_or(same, both_zero);
  verdict = vec_andc(verdict, a_is_nan);

  /* Copy each high word over its low word.  */
  return ((qword)(vec_perm((vec_uchar16)verdict, (vec_uchar16)verdict, hi_to_both)));
}
588
 
589
 
590
/* Compare greater than
591
 */
592
/* Signed byte-wise compare greater than.  */
static __inline qword si_cgtb(qword a, qword b)
{
  vec_char16 lhs = (vec_char16)(a);
  vec_char16 rhs = (vec_char16)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
596
 
597
/* Signed halfword-wise compare greater than.  */
static __inline qword si_cgth(qword a, qword b)
{
  vec_short8 lhs = (vec_short8)(a);
  vec_short8 rhs = (vec_short8)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
601
 
602
/* Signed word-wise compare greater than.  */
static __inline qword si_cgt(qword a, qword b)
{
  vec_int4 lhs = (vec_int4)(a);
  vec_int4 rhs = (vec_int4)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
606
 
607
/* Unsigned byte-wise compare greater than.  */
static __inline qword si_clgtb(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
611
 
612
/* Unsigned halfword-wise compare greater than.  */
static __inline qword si_clgth(qword a, qword b)
{
  vec_ushort8 lhs = (vec_ushort8)(a);
  vec_ushort8 rhs = (vec_ushort8)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
616
 
617
/* Unsigned word-wise compare greater than.  */
static __inline qword si_clgt(qword a, qword b)
{
  vec_uint4 lhs = (vec_uint4)(a);
  vec_uint4 rhs = (vec_uint4)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
621
 
622
/* Single-precision compare greater than (four float lanes).  */
static __inline qword si_fcgt(qword a, qword b)
{
  vec_float4 lhs = (vec_float4)(a);
  vec_float4 rhs = (vec_float4)(b);

  return ((qword)(vec_cmpgt(lhs, rhs)));
}
626
 
627
/* Double-precision compare greater than.  Each quadword is handled as
   two doubles made of a (high word, low word) pair.  Every operand is
   first turned into a 64-bit integer key: its magnitude when the sign
   bit is clear, the 64-bit negation of its magnitude when it is set.
   The keys are compared with a signed test on the high words and an
   unsigned test on the low words.  Lanes with a NaN input are forced
   to false, and the verdict is replicated over the 64-bit lane.  */
static __inline qword si_dfcgt(qword a, qword b)
{
  vec_uchar16 dup_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uchar16 borrow_ctl = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 inf_bits = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 abs_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } shift;

  /* vec_slo shift count: 4 bytes, expressed in bits.  */
  shift.i[3] = 4 << 3;

  /* Magnitudes of both operands.  */
  vec_uint4 a_mag = vec_and((vec_uint4)a, abs_mask);
  vec_uint4 b_mag = vec_and((vec_uint4)b, abs_mask);

  /* NaN test on a, spread over both words of each lane.  */
  vec_uint4 a_eq_inf = (vec_uint4)vec_cmpeq(a_mag, inf_bits);
  vec_uint4 a_is_nan = (vec_uint4)vec_cmpgt(a_mag, inf_bits);
  a_is_nan = vec_or(a_is_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_is_nan, shift.v), a_eq_inf));
  a_is_nan = (vec_uint4)vec_perm((vec_uchar16)a_is_nan, (vec_uchar16)a_is_nan, dup_hi);

  /* NaN test on b, spread over both words of each lane.  */
  vec_uint4 b_eq_inf = (vec_uint4)vec_cmpeq(b_mag, inf_bits);
  vec_uint4 b_is_nan = (vec_uint4)vec_cmpgt(b_mag, inf_bits);
  b_is_nan = vec_or(b_is_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_is_nan, shift.v), b_eq_inf));
  b_is_nan = (vec_uint4)vec_perm((vec_uchar16)b_is_nan, (vec_uchar16)b_is_nan, dup_hi);

  /* Sign mask of a: all ones in both words of a lane whose sign bit is
     set.  */
  vec_uint4 a_sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  a_sign = (vec_uint4)vec_perm((vec_uchar16)a_sign, (vec_uchar16)a_sign, dup_hi);

  /* Sign mask of b, built the same way.  */
  vec_uint4 b_sign = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  b_sign = (vec_uint4)vec_perm((vec_uchar16)b_sign, (vec_uchar16)b_sign, dup_hi);

  /* 64-bit negation of |a|: complement both words, add one to the low
     word, and add the low word's carry-out into the high word.  The two
     permutes move the low-word carry up and put all-ones in the
     low-word slot.  */
  vec_uint4 a_carry = vec_subc((vec_uint4)vec_splat_u32(0), a_mag);
  vec_uchar16 carry_pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
                                  vec_sr(borrow_ctl, vec_splat_u8(3)),
                                  vec_sra(borrow_ctl, vec_splat_u8(7)));
  a_carry = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)a_carry, (vec_uchar16)a_carry, borrow_ctl),
                                 ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
                                 carry_pat));
  vec_uint4 a_negated = vec_add(vec_add(vec_splat_u32(0), vec_nor(a_mag, a_mag)),
                                vec_and(a_carry, vec_splat_u32(1)));

  /* Key for a: magnitude, or its negation where the sign is set.  */
  vec_int4 a_key = (vec_int4)vec_sel((vec_uchar16)a_mag, (vec_uchar16)a_negated, (vec_uchar16)a_sign);

  /* 64-bit negation of |b|, same scheme.  */
  vec_uint4 b_carry = vec_subc((vec_uint4)vec_splat_u32(0), b_mag);
  b_carry = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)b_carry, (vec_uchar16)b_carry, borrow_ctl),
                                 ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
                                 carry_pat));
  vec_uint4 b_negated = vec_add(vec_nor(b_mag, b_mag), vec_and(b_carry, vec_splat_u32(1)));

  /* Key for b.  */
  vec_int4 b_key = (vec_int4)vec_sel((vec_uchar16)b_mag, (vec_uchar16)b_negated, (vec_uchar16)b_sign);

  /* A) signed greater-than; the high-word result is used directly.  */
  vec_uint4 hi_gt = (vec_uint4)vec_cmpgt(a_key, b_key);

  /* B) high words equal and low word greater (unsigned); the low-word
     result is shifted up into the high-word position.  */
  vec_uint4 lo_gt = (vec_uint4)vec_cmpgt((vec_uint4)a_key, (vec_uint4)b_key);
  vec_uint4 words_eq = (vec_uint4)vec_cmpeq(a_key, b_key);
  vec_uint4 eq_then_gt = vec_and(words_eq, vec_slo(lo_gt, shift.v));

  /* A or B, then replicate the high-word verdict over the lane.  */
  vec_uint4 verdict = vec_or(hi_gt, eq_then_gt);
  verdict = (vec_uint4)vec_perm((vec_uchar16)verdict, (vec_uchar16)verdict, dup_hi);

  /* Force lanes with a NaN input to false.  */
  return ((qword)vec_andc(verdict, vec_or(a_is_nan, b_is_nan)));
}
700
 
701
/* Signed compare: every byte element of A greater than B.  */
static __inline qword si_cgtbi(qword a, signed char b)
{
  /* si_from_char puts B in byte 3; replicate it into every byte.  */
  vec_char16 imm = vec_splat((vec_char16)(si_from_char(b)), 3);

  return ((qword)(vec_cmpgt((vec_char16)(a), imm)));
}
706
 
707
/* Signed compare: every halfword element of A greater than B.  */
static __inline qword si_cgthi(qword a, signed short b)
{
  /* si_from_short puts B in halfword 1; replicate it into every lane.  */
  vec_short8 imm = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_cmpgt((vec_short8)(a), imm)));
}
712
 
713
/* Signed compare: every word element of A greater than B.  */
static __inline qword si_cgti(qword a, signed int b)
{
  /* si_from_int puts B in word 0; replicate it into every word.  */
  vec_int4 imm = vec_splat((vec_int4)(si_from_int(b)), 0);

  return ((qword)(vec_cmpgt((vec_int4)(a), imm)));
}
718
 
719
/* Unsigned compare: every byte element of A greater than B.  */
static __inline qword si_clgtbi(qword a, unsigned char b)
{
  /* si_from_uchar puts B in byte 3; replicate it into every byte.  */
  vec_uchar16 imm = vec_splat((vec_uchar16)(si_from_uchar(b)), 3);

  return ((qword)(vec_cmpgt((vec_uchar16)(a), imm)));
}
724
 
725
/* Unsigned compare: every halfword element of A greater than B.  */
static __inline qword si_clgthi(qword a, unsigned short b)
{
  /* si_from_ushort puts B in halfword 1; replicate it into every lane.  */
  vec_ushort8 imm = vec_splat((vec_ushort8)(si_from_ushort(b)), 1);

  return ((qword)(vec_cmpgt((vec_ushort8)(a), imm)));
}
730
 
731
/* Unsigned compare: every word element of A greater than B.  */
static __inline qword si_clgti(qword a, unsigned int b)
{
  /* si_from_uint puts B in word 0; replicate it into every word.  */
  vec_uint4 imm = vec_splat((vec_uint4)(si_from_uint(b)), 0);

  return ((qword)(vec_cmpgt((vec_uint4)(a), imm)));
}
736
 
737
/* Double-precision test for special values.  A is handled as two
   doubles made of a (high word, low word) pair.  B selects the classes
   to test for; a 64-bit lane of the result is all ones when that double
   falls into any selected class.  Bits of B as used below:
     0x40 NaN       0x20 +infinity   0x10 -infinity
     0x08 +0        0x04 -0
     0x02 +denorm   0x01 -denorm  */
static __inline qword si_dftsv(qword a, char b)
{
  vec_uchar16 dup_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 abs_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 hits = (vec_uint4){0};

  /* All ones over a 64-bit lane whose sign bit is set.  */
  vec_uint4 neg = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  neg = (vec_uint4)vec_perm((vec_uchar16)neg, (vec_uchar16)neg, dup_hi);

  vec_uint4 a_mag = vec_and((vec_uint4)a, abs_mask);

  union {
    vec_uchar16 v;
    int i[4];
  } shift;

  /* vec_slo shift count: 4 bytes, expressed in bits.  */
  shift.i[3] = 4 << 3;

  /* NaN, +infinity or -infinity requested.  */
  if (b & 0x70)
    {
      vec_uint4 inf_bits = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
      vec_uint4 is_inf = (vec_uint4)vec_cmpeq(a_mag, inf_bits);

      /* NaN */
      if (b & 0x40)
        {
          vec_uint4 is_nan = (vec_uint4)vec_cmpgt(a_mag, inf_bits);
          is_nan = vec_or(is_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)is_nan, shift.v), is_inf));
          is_nan = (vec_uint4)vec_perm((vec_uchar16)is_nan, (vec_uchar16)is_nan, dup_hi);
          hits = vec_or(hits, is_nan);
        }

      /* infinity of either sign */
      if (b & 0x30)
        {
          /* Both words must match the infinity pattern.  */
          is_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)is_inf, shift.v), is_inf);
          is_inf = (vec_uint4)vec_perm((vec_uchar16)is_inf, (vec_uchar16)is_inf, dup_hi);

          /* +inf */
          if (b & 0x20)
            hits = vec_or(vec_andc(is_inf, neg), hits);
          /* -inf */
          if (b & 0x10)
            hits = vec_or(vec_and(is_inf, neg), hits);
        }
    }

  /* zero or denormal requested.  */
  if (b & 0xF)
    {
      /* Both words of the magnitude are zero.  */
      vec_uint4 is_zero = (vec_uint4)vec_cmpeq(a_mag, (vec_uint4)vec_splat_u32(0));
      is_zero = vec_and(is_zero, (vec_uint4)vec_slo((vec_uchar16)is_zero, shift.v));

      /* denormal */
      if (b & 0x3)
        {
          vec_uint4 frac_max = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
          /* High word no larger than 0xFFFFF, and the value is not zero.  */
          vec_uint4 is_denorm = vec_nor((vec_uint4)vec_cmpgt(a_mag, frac_max), is_zero);
          is_denorm = (vec_uint4)vec_perm((vec_uchar16)is_denorm, (vec_uchar16)is_denorm, dup_hi);

          /* +denorm */
          if (b & 0x2)
            hits = vec_or(vec_andc(is_denorm, neg), hits);
          /* -denorm */
          if (b & 0x1)
            hits = vec_or(vec_and(is_denorm, neg), hits);
        }

      /* zero */
      if (b & 0xC)
        {
          is_zero = (vec_uint4)vec_perm((vec_uchar16)is_zero, (vec_uchar16)is_zero, dup_hi);

          /* +0 */
          if (b & 0x8)
            hits = vec_or(vec_andc(is_zero, neg), hits);
          /* -0 */
          if (b & 0x4)
            hits = vec_or(vec_and(is_zero, neg), hits);
        }
    }

  return ((qword)hits);
}
812
 
813
 
814
/* Carry generate
815
 */
816
/* Carry generate: maps directly onto vec_addc with both operands
   viewed as unsigned words.  */
#define si_cg(_x, _y) \
  ((qword)(vec_addc((vec_uint4)(_x), (vec_uint4)(_y))))
817
 
818
/* Extended carry generate: per unsigned word, the carry out of x + y,
   ORed with the carry out of adding the least significant bit of the
   third operand to that sum.  */
#define si_cgx(_x, _y, _cin)                                                    \
  ((qword)(vec_or(vec_addc((vec_uint4)(_x), (vec_uint4)(_y)),                   \
                  vec_addc(vec_add((vec_uint4)(_x), (vec_uint4)(_y)),           \
                           vec_and((vec_uint4)(_cin), vec_splat_u32(1))))))
821
 
822
 
823
/* Count ones for bytes
824
 */
825
/* Count the one bits in every byte of A.  Each byte is split into two
   nibbles, each nibble is looked up in a 16-entry bit-count table via
   vec_perm, and the two counts are added.  */
static __inline qword si_cntb(qword a)
{
  vec_uchar16 ones_in_nibble = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
  vec_uchar16 nibble_width = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
  vec_uchar16 bytes = (vec_uchar16)(a);
  vec_uchar16 low_count, high_count;

  /* Both table operands are the same vector, so only the low four bits
     of each index byte affect the lookup.  */
  low_count = vec_perm(ones_in_nibble, ones_in_nibble, bytes);
  high_count = vec_perm(ones_in_nibble, ones_in_nibble, vec_sr (bytes, nibble_width));

  return ((qword)(vec_add(low_count, high_count)));
}
836
 
837
/* Count leading zeros for words
838
 */
839
/* Count the leading zero bits of every 32-bit word of A.  A per-byte
   count is built from two nibble table lookups.  The byte counts are
   then chained from the most significant byte downwards: a lower
   byte's count is added only while everything above it is still all
   zero (running total exactly 8, 16, then 24).  The total ends up in
   the top byte of each word and is shifted down at the end.  */
static __inline qword si_clz(qword a)
{
  vec_uchar16 lz_of_nibble = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 c4  = vec_splat_u8(4);
  vec_uchar16 c8  = vec_splat_u8(8);
  vec_uchar16 c16 = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 c24 = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};
  vec_uchar16 bytes = (vec_uchar16)(a);
  vec_uchar16 hi_lz, lo_lz, lz;
  vec_uchar16 next1, next2, next3;

  /* Leading zeros of each nibble.  */
  hi_lz = vec_perm(lz_of_nibble, lz_of_nibble, vec_sr(bytes, c4));
  lo_lz = vec_perm(lz_of_nibble, lz_of_nibble, bytes);

  /* Per-byte count: the low nibble counts only if the high one is 0.  */
  lz = vec_add(hi_lz, vec_and(lo_lz, vec_cmpeq(hi_lz, c4)));

  /* Bring the counts of the next one, two and three bytes of each word
     up to the top byte position.  */
  next1 = (vec_uchar16)vec_sl((vec_uint4)(lz), (vec_uint4)(c8));
  next2 = (vec_uchar16)vec_sl((vec_uint4)(lz), (vec_uint4)(c16));
  next3 = (vec_uchar16)vec_sl((vec_uint4)(lz), (vec_uint4)(c24));

  lz = vec_add(lz, vec_and(next1, vec_cmpeq(lz, c8)));
  lz = vec_add(lz, vec_and(next2, vec_cmpeq(lz, c16)));
  lz = vec_add(lz, vec_and(next3, vec_cmpeq(lz, c24)));

  /* Move the top byte of each word down to the bottom.  */
  return (qword)((vec_sr((vec_uint4)(lz), (vec_uint4)(c24))));
}
866
 
867
/* Convert to float
868
 */
869
/* Integer to float conversions via vec_ctf.  si_cuflt views its first
   operand as unsigned words, si_csflt as signed words.  The second
   operand is passed through unchanged as the vec_ctf scale argument.  */
#define si_cuflt(_x, _scale)    ((qword)(vec_ctf((vec_uint4)(_x), _scale)))
#define si_csflt(_x, _scale)    ((qword)(vec_ctf((vec_int4)(_x), _scale)))
871
 
872
/* Convert to signed int
873
 */
874
/* Float to signed word conversion via vec_cts.  The second operand is
   passed through unchanged as the vec_cts scale argument.  */
#define si_cflts(_x, _scale)    ((qword)(vec_cts((vec_float4)(_x), _scale)))
875
 
876
/* Convert to unsigned int
877
 */
878
/* Float to unsigned word conversion via vec_ctu.  The second operand
   is passed through unchanged as the vec_ctu scale argument.  */
#define si_cfltu(_x, _scale)    ((qword)(vec_ctu((vec_float4)(_x), _scale)))
879
 
880
/* Synchronize
881
 */
882
/* The three synchronize intrinsics do nothing here: each macro expands
   to nothing.  */
#define si_dsync()
#define si_sync()
#define si_syncc()
885
 
886
 
887
/* Equivalence
888
 */
889
/* Bit-wise equivalence: the complement of A xor B.  */
static __inline qword si_eqv(qword a, qword b)
{
  vec_uchar16 differing = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));

  /* vec_nor of a value with itself is its complement.  */
  return ((qword)(vec_nor(differing, differing)));
}
896
 
897
/* Extend
898
 */
899
/* Sign-extend the odd-indexed byte of every halfword of A to a full
   halfword.  The eight odd bytes are gathered into the first half of
   the vector and then widened with vec_unpackh.  */
static __inline qword si_xsbh(qword a)
{
  vec_uchar16 gather_odd = (vec_uchar16){1, 3, 5, 7, 9,11,13,15,
                                         0, 0, 0, 0, 0, 0, 0, 0};
  vec_char16 bytes = (vec_char16)(a);

  return ((qword)(vec_unpackh(vec_perm(bytes, bytes, gather_odd))));
}
907
 
908
/* Sign-extend the odd-indexed halfword of every word of A to a full
   word.  The four odd halfwords are gathered into the first half of
   the vector and then widened with vec_unpackh.  */
static __inline qword si_xshw(qword a)
{
  vec_uchar16 gather_odd = (vec_uchar16){2, 3, 6, 7,
                                         10,11,14,15,
                                         0, 0, 0, 0,
                                         0, 0, 0, 0};
  vec_short8 halves = (vec_short8)(a);

  return ((qword)(vec_unpackh(vec_perm(halves, halves, gather_odd))));
}
918
 
919
/* Sign-extend the odd-indexed word of every doubleword of A to a full
   doubleword.  Word elements 1 and 3 are kept, and each is preceded by
   a word filled with its own sign bit.  */
static __inline qword si_xswd(qword a)
{
  vec_int4 words = (vec_int4)(a);
  /* Every word replaced by 32 copies of its sign bit.  */
  vec_int4 signs = vec_sra(words, ((vec_uint4){31,31,31,31}));
  /* Indices 16..31 pick from the second operand (the sign words).  */
  vec_uchar16 interleave = (vec_uchar16){20, 21, 22, 23,
                                          4,  5,  6,  7,
                                         28, 29, 30, 31,
                                         12, 13, 14, 15};

  return ((qword)(vec_perm(words, signs, interleave)));
}
930
 
931
/* Extend single to double precision: float elements 0 and 2 of A are
   converted to the two doubles of the result, using scalar conversion
   through unions.  */
static __inline qword si_fesd(qword a)
{
  union {
    float f[4];
    vec_float4 vf;
  } src;
  union {
    double d[2];
    vec_double2 vd;
  } dst;

  src.vf = (vec_float4)(a);

  dst.d[0] = (double)(src.f[0]);
  dst.d[1] = (double)(src.f[2]);

  return ((qword)(dst.vd));
}
947
 
948
/* Gather
949
 */
950
/* Gather the least significant bit of each of the sixteen bytes of A.
   Every bit is shifted to its weight within a group of eight, each
   group is summed into one byte, and the two group bytes are placed in
   bytes 2 and 3 of the result.  All other result bytes are zero.  */
static __inline qword si_gbb(qword a)
{
  vec_uchar16 weights = (vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
                                      7, 6, 5, 4, 3, 2, 1, 0};
  vec_uchar16 place = (vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
                                    0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 lsb;
  vec_uint4 packed;

  lsb = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), weights);

  /* vec_sum4s folds four bytes into a word; vec_sum2s then folds word
     pairs, leaving the two group totals in words 1 and 3.  */
  packed = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(lsb, ((vec_uint4){0}))), ((vec_int4){0}));

  return ((qword)(vec_perm(packed, packed, place)));
}
962
 
963
 
964
/* Gather the least significant bit of each of the eight halfwords of
   A.  Every bit is shifted to its weight, all eight are summed, and the
   total is rotated from word 3 into word 0.  */
static __inline qword si_gbh(qword a)
{
  vec_ushort8 weights = (vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0};
  vec_ushort8 lsb;
  vec_int4 pair_sums;
  vec_uint4 total;

  lsb = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), weights);

  /* Halfword pairs -> words, then all four words -> word 3.  */
  pair_sums = (vec_int4)(vec_sum4s((vec_short8)(lsb), (vec_int4){0}));
  total = (vec_uint4)vec_sums(pair_sums, (vec_int4){0});

  return ((qword)(vec_sld(total, total, 12)));
}
975
 
976
/* Gather the least significant bit of each of the four words of A.
   Every bit is shifted to its weight, the four are summed, and the
   total is rotated from word 3 into word 0.  */
static __inline qword si_gb(qword a)
{
  vec_uint4 weights = (vec_uint4){3, 2, 1, 0};
  vec_uint4 lsb;
  vec_uint4 total;

  lsb = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), weights);
  total = (vec_uint4)vec_sums((vec_int4)(lsb), ((vec_int4){0}));

  return ((qword)(vec_sld(total, total, 12)));
}
985
 
986
 
987
/* Compare and halt
988
 */
989
/* Run SPU_HALT_ACTION when element 0 of a equals element 0 of b
   (elements viewed as unsigned 32-bit integers).  */
static __inline void si_heq(qword a, qword b)
{
  union {
    unsigned int i[4];
    vector unsigned int v;
  } lhs, rhs;

  lhs.v = (vector unsigned int)(a);
  rhs.v = (vector unsigned int)(b);

  if (lhs.i[0] == rhs.i[0])
    {
      SPU_HALT_ACTION;
    }
}
1001
 
1002
/* Run SPU_HALT_ACTION when element 0 of a (as an unsigned 32-bit
   integer) equals the scalar b.  */
static __inline void si_heqi(qword a, unsigned int b)
{
  union {
    unsigned int i[4];
    vector unsigned int v;
  } lhs;

  lhs.v = (vector unsigned int)(a);

  if (lhs.i[0] == b)
    {
      SPU_HALT_ACTION;
    }
}
1013
 
1014
/* Run SPU_HALT_ACTION when element 0 of a is greater than element 0
   of b (elements viewed as signed 32-bit integers).  */
static __inline void si_hgt(qword a, qword b)
{
  union {
    signed int i[4];
    vector signed int v;
  } lhs, rhs;

  lhs.v = (vector signed int)(a);
  rhs.v = (vector signed int)(b);

  if (lhs.i[0] > rhs.i[0])
    {
      SPU_HALT_ACTION;
    }
}
1026
 
1027
/* Run SPU_HALT_ACTION when element 0 of a (as a signed 32-bit integer)
   is greater than the scalar b.  */
static __inline void si_hgti(qword a, signed int b)
{
  union {
    signed int i[4];
    vector signed int v;
  } lhs;

  lhs.v = (vector signed int)(a);

  if (lhs.i[0] > b)
    {
      SPU_HALT_ACTION;
    }
}
1038
 
1039
/* Run SPU_HALT_ACTION when element 0 of a is greater than element 0
   of b (elements viewed as unsigned 32-bit integers).  */
static __inline void si_hlgt(qword a, qword b)
{
  union {
    unsigned int i[4];
    vector unsigned int v;
  } lhs, rhs;

  lhs.v = (vector unsigned int)(a);
  rhs.v = (vector unsigned int)(b);

  if (lhs.i[0] > rhs.i[0])
    {
      SPU_HALT_ACTION;
    }
}
1051
 
1052
/* Run SPU_HALT_ACTION when element 0 of a (as an unsigned 32-bit
   integer) is greater than the scalar b.  */
static __inline void si_hlgti(qword a, unsigned int b)
{
  union {
    unsigned int i[4];
    vector unsigned int v;
  } lhs;

  lhs.v = (vector unsigned int)(a);

  if (lhs.i[0] > b)
    {
      SPU_HALT_ACTION;
    }
}
1063
 
1064
 
1065
/* Multiply and Add
1066
 */
1067
/* Multiply-sum of the halfwords of a and b accumulated onto the words
   of c, with the even-indexed halfwords of a forced to zero first.  */
static __inline qword si_mpya(qword a, qword b, qword c)
{
  /* Keeps the odd-indexed halfwords and clears the even-indexed ones.  */
  vec_short8 keep_odd = ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1});
  vec_short8 a_odd = vec_and((vec_short8)(a), keep_odd);

  return ((qword)(vec_msum(a_odd, (vec_short8)(b), (vec_int4)(c))));
}
1073
 
1074
/* Single-precision multiply-add: vec_madd applied to a, b and c.  */
static __inline qword si_fma(qword a, qword b, qword c)
{
  vec_float4 factor1 = (vec_float4)(a);
  vec_float4 factor2 = (vec_float4)(b);
  vec_float4 addend  = (vec_float4)(c);

  return ((qword)(vec_madd(factor1, factor2, addend)));
}
1078
 
1079
/* Double-precision a * b + c, computed one element at a time.  */
static __inline qword si_dfma(qword a, qword b, qword c)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, z, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);
  z.v = (vec_double2)(c);

  for (i = 0; i < 2; i++)
    result.d[i] = x.d[i] * y.d[i] + z.d[i];

  return ((qword)(result.v));
}
1093
 
1094
/* Form Mask
1095
 */
1096
/* Immediate variant of si_fsmb: wraps the integer _a with si_from_int
   and passes the result to si_fsmb.  */
#define si_fsmbi(_a)    si_fsmb(si_from_int(_a))
1097
 
1098
/* Form a byte mask: each result byte is all ones or all zeros according
   to one bit taken from bytes 2 and 3 of a.  */
static __inline qword si_fsmb(qword a)
{
  /* Replicate byte 2 into result bytes 0-7 and byte 3 into bytes 8-15.  */
  vec_uchar16 replicate = ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
                                         3, 3, 3, 3, 3, 3, 3, 3});
  /* Shift that moves the bit belonging to each byte into its top bit.  */
  vec_uchar16 to_msb = ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
                                      0, 1, 2, 3, 4, 5, 6, 7});
  vec_ushort8 src;
  vec_char16 spread;
  vec_char16 aligned;

  src = (vec_ushort8)(a);
  spread = (vec_char16)(vec_perm(src, src, replicate));
  aligned = vec_sl(spread, to_msb);

  /* Arithmetic shift right by 7 copies the top bit across the byte.  */
  return ((qword)(vec_sra(aligned, vec_splat_u8(7))));
}
1110
 
1111
 
1112
/* Form a halfword mask: each result halfword is all ones or all zeros
   according to one bit taken from byte 3 of a.  */
static __inline qword si_fsmh(qword a)
{
  /* Shift that moves the bit belonging to each halfword into its top bit.  */
  vec_ushort8 to_msb = ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7});
  vec_uchar16 src;
  vec_short8 spread;
  vec_short8 aligned;

  src = (vec_uchar16)(a);
  spread = (vec_short8)(vec_splat(src, 3));
  aligned = vec_sl(spread, to_msb);

  /* Arithmetic shift right by 15 copies the top bit across the halfword.  */
  return ((qword)(vec_sra(aligned, vec_splat_u16(15))));
}
1122
 
1123
/* Form a word mask: each result word is all ones or all zeros according
   to one bit taken from byte 3 of a.  */
static __inline qword si_fsm(qword a)
{
  /* Shift that moves the bit belonging to each word into its top bit.  */
  vec_uint4 to_msb = ((vec_uint4){28, 29, 30, 31});
  vec_uchar16 src;
  vec_int4 spread;
  vec_int4 aligned;

  src = (vec_uchar16)(a);
  spread = (vec_int4)(vec_splat(src, 3));
  aligned = vec_sl(spread, to_msb);

  /* Arithmetic shift right by 31 copies the top bit across the word.  */
  return ((qword)(vec_sra(aligned, ((vec_uint4){31,31,31,31}))));
}
1133
 
1134
/* Move from/to registers
1135
 */
1136
/* No register is read here: always expands to an all-zero quadword.  */
#define si_fscrrd()             ((qword)((vec_uint4){0}))
1137
/* Expands to nothing; the argument _a is discarded unevaluated.  */
#define si_fscrwr(_a)
1138
 
1139
/* Always expands to an all-zero quadword; _reg is ignored.  */
#define si_mfspr(_reg)          ((qword)((vec_uint4){0}))
1140
/* Expands to nothing; both arguments are discarded unevaluated.  */
#define si_mtspr(_reg, _a)
1141
 
1142
/* Multiply High High Add
1143
 */
1144
/* Signed vec_mule product of the halfwords of a and b, added to the
   words of c.  */
static __inline qword si_mpyhha(qword a, qword b, qword c)
{
  vec_int4 product = vec_mule((vec_short8)(a), (vec_short8)(b));

  return ((qword)(vec_add(product, (vec_int4)(c))));
}
1148
 
1149
/* Unsigned vec_mule product of the halfwords of a and b, added to the
   words of c.  */
static __inline qword si_mpyhhau(qword a, qword b, qword c)
{
  vec_uint4 product = vec_mule((vec_ushort8)(a), (vec_ushort8)(b));

  return ((qword)(vec_add(product, (vec_uint4)(c))));
}
1153
 
1154
/* Multiply Subtract
1155
 */
1156
/* Single-precision multiply-subtract, built from vec_madd by adding
   the value (0 - c).  */
static __inline qword si_fms(qword a, qword b, qword c)
{
  vec_float4 minus_c = vec_sub(((vec_float4){0.0f}), (vec_float4)(c));

  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), minus_c)));
}
1161
 
1162
/* Double-precision a * b - c, computed one element at a time.  */
static __inline qword si_dfms(qword a, qword b, qword c)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, z, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);
  z.v = (vec_double2)(c);

  for (i = 0; i < 2; i++)
    result.d[i] = x.d[i] * y.d[i] - z.d[i];

  return ((qword)(result.v));
}
1176
 
1177
/* Multiply
1178
 */
1179
/* Single-precision multiply, expressed as vec_madd with a zero addend.  */
static __inline qword si_fm(qword a, qword b)
{
  vec_float4 zero = ((vec_float4){0.0f});

  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), zero)));
}
1183
 
1184
/* Double-precision a * b, computed one element at a time.  */
static __inline qword si_dfm(qword a, qword b)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);

  for (i = 0; i < 2; i++)
    result.d[i] = x.d[i] * y.d[i];

  return ((qword)(result.v));
}
1197
 
1198
/* Multiply High
1199
 */
1200
/* Shift each word of b left by 16, multiply the even halfwords of a
   with the result (vec_mule), then shift each product left by 16.  */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 shift16 = (vec_uint4){16, 16, 16, 16};
  vec_short8 b_shifted;

  /* Moves the low halfword of every word of b into the high halfword.  */
  b_shifted = (vec_short8)(vec_sl((vec_uint4)(b), shift16));

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), b_shifted), shift16)));
}
1206
 
1207
 
1208
/* Multiply High High
1209
 */
1210
/* Signed vec_mule product of the halfwords of a and b.  */
static __inline qword si_mpyhh(qword a, qword b)
{
  vec_short8 lhs = (vec_short8)(a);
  vec_short8 rhs = (vec_short8)(b);

  return ((qword)(vec_mule(lhs, rhs)));
}
1214
 
1215
/* Unsigned vec_mule product of the halfwords of a and b.  */
static __inline qword si_mpyhhu(qword a, qword b)
{
  vec_ushort8 lhs = (vec_ushort8)(a);
  vec_ushort8 rhs = (vec_ushort8)(b);

  return ((qword)(vec_mule(lhs, rhs)));
}
1219
 
1220
/* Multiply Odd
1221
 */
1222
/* Signed vec_mulo product of the halfwords of a and b.  */
static __inline qword si_mpy(qword a, qword b)
{
  vec_short8 lhs = (vec_short8)(a);
  vec_short8 rhs = (vec_short8)(b);

  return ((qword)(vec_mulo(lhs, rhs)));
}
1226
 
1227
/* Unsigned vec_mulo product of the halfwords of a and b.  */
static __inline qword si_mpyu(qword a, qword b)
{
  vec_ushort8 lhs = (vec_ushort8)(a);
  vec_ushort8 rhs = (vec_ushort8)(b);

  return ((qword)(vec_mulo(lhs, rhs)));
}
1231
 
1232
/* Signed vec_mulo product of the halfwords of a with the scalar b
   replicated into every halfword.  */
static __inline qword si_mpyi(qword a, short b)
{
  /* Halfword 1 of si_from_short(b) is broadcast to all eight halfwords.  */
  vec_short8 multiplier = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_mulo((vec_short8)(a), multiplier)));
}
1237
 
1238
/* Unsigned vec_mulo product of the halfwords of a with the scalar b
   replicated into every halfword.  */
static __inline qword si_mpyui(qword a, unsigned short b)
{
  /* Halfword 1 of si_from_ushort(b) is broadcast to all eight halfwords.  */
  vec_ushort8 multiplier = vec_splat((vec_ushort8)(si_from_ushort(b)), 1);

  return ((qword)(vec_mulo((vec_ushort8)(a), multiplier)));
}
1243
 
1244
/* Multiply and Shift Right
1245
 */
1246
/* Signed vec_mulo product of the halfwords of a and b, arithmetically
   shifted right by 16 bits.  */
static __inline qword si_mpys(qword a, qword b)
{
  vec_int4 product = vec_mulo((vec_short8)(a), (vec_short8)(b));

  return ((qword)(vec_sra(product, ((vec_uint4){16,16,16,16}))));
}
1250
 
1251
/* Nand
1252
 */
1253
/* Bitwise NAND of a and b.  */
static __inline qword si_nand(qword a, qword b)
{
  vec_uchar16 both = vec_and((vec_uchar16)(a), (vec_uchar16)(b));

  /* NOR of a value with itself is its complement.  */
  return ((qword)(vec_nor(both, both)));
}
1260
 
1261
/* Negative Multiply Add
1262
 */
1263
/* Double-precision -c - a * b, computed one element at a time.  */
static __inline qword si_dfnma(qword a, qword b, qword c)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, z, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);
  z.v = (vec_double2)(c);

  for (i = 0; i < 2; i++)
    result.d[i] = -z.d[i] - x.d[i] * y.d[i];

  return ((qword)(result.v));
}
1277
 
1278
/* Negative Multiply and Subtract
1279
 */
1280
/* Single-precision negative multiply-subtract: vec_nmsub applied to
   a, b and c.  */
static __inline qword si_fnms(qword a, qword b, qword c)
{
  vec_float4 factor1 = (vec_float4)(a);
  vec_float4 factor2 = (vec_float4)(b);
  vec_float4 minuend = (vec_float4)(c);

  return ((qword)(vec_nmsub(factor1, factor2, minuend)));
}
1284
 
1285
/* Double-precision c - a * b, computed one element at a time.  */
static __inline qword si_dfnms(qword a, qword b, qword c)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, z, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);
  z.v = (vec_double2)(c);

  for (i = 0; i < 2; i++)
    result.d[i] = z.d[i] - x.d[i] * y.d[i];

  return ((qword)(result.v));
}
1299
 
1300
/* Nor
1301
 */
1302
/* Bitwise NOR of a and b.  */
static __inline qword si_nor(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);

  return ((qword)(vec_nor(lhs, rhs)));
}
1306
 
1307
/* Or
1308
 */
1309
/* Bitwise OR of a and b.  */
static __inline qword si_or(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);

  return ((qword)(vec_or(lhs, rhs)));
}
1313
 
1314
/* OR every byte of a with the scalar b.  */
static __inline qword si_orbi(qword a, unsigned char b)
{
  /* Byte 3 of si_from_uchar(b) is broadcast to all sixteen bytes.  */
  vec_uchar16 imm = vec_splat((vec_uchar16)(si_from_uchar(b)), 3);

  return ((qword)(vec_or((vec_uchar16)(a), imm)));
}
1319
 
1320
/* OR every halfword of a with the scalar b.  */
static __inline qword si_orhi(qword a, unsigned short b)
{
  /* Halfword 1 of si_from_ushort(b) is broadcast to all eight halfwords.  */
  vec_ushort8 imm = vec_splat((vec_ushort8)(si_from_ushort(b)), 1);

  return ((qword)(vec_or((vec_ushort8)(a), imm)));
}
1325
 
1326
/* OR every word of a with the scalar b.  */
static __inline qword si_ori(qword a, unsigned int b)
{
  /* Word 0 of si_from_uint(b) is broadcast to all four words.  */
  vec_uint4 imm = vec_splat((vec_uint4)(si_from_uint(b)), 0);

  return ((qword)(vec_or((vec_uint4)(a), imm)));
}
1331
 
1332
/* Or Complement
1333
 */
1334
/* Bitwise OR of a with the complement of b.  */
static __inline qword si_orc(qword a, qword b)
{
  vec_uchar16 operand = (vec_uchar16)(b);
  /* NOR of a value with itself is its complement.  */
  vec_uchar16 not_b = vec_nor(operand, operand);

  return ((qword)(vec_or((vec_uchar16)(a), not_b)));
}
1338
 
1339
 
1340
/* Or Across
1341
 */
1342
/* OR the four words of a together; the result is left in the first
   word and the remaining twelve bytes are cleared.  */
static __inline qword si_orx(qword a)
{
  /* Keeps bytes 0-3 and clears bytes 4-15.  */
  vec_uchar16 first_word = ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
                                          0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00});
  vec_uchar16 acc;

  acc = (vec_uchar16)(a);
  /* Fold the vector onto itself: rotate by 8 bytes, then by 4 bytes.  */
  acc = vec_or(acc, vec_sld(acc, acc, 8));
  acc = vec_or(acc, vec_sld(acc, acc, 4));

  return ((qword)(vec_and(acc, first_word)));
}
1351
 
1352
 
1353
/* Estimates
1354
 */
1355
/* Reciprocal estimate of each float element, via vec_re.  */
static __inline qword si_frest(qword a)
{
  vec_float4 value = (vec_float4)(a);

  return ((qword)(vec_re(value)));
}
1359
 
1360
/* Reciprocal square-root estimate of each float element, via vec_rsqrte.  */
static __inline qword si_frsqest(qword a)
{
  vec_float4 value = (vec_float4)(a);

  return ((qword)(vec_rsqrte(value)));
}
1364
 
1365
/* Expands to its second argument unchanged; _a is ignored.  */
#define si_fi(_a, _d)           (_d)
1366
 
1367
/* Channel Read and Write
1368
 */
1369
/* Channel reads have no counterpart here: always yields an all-zero
   quadword and ignores _channel.  */
#define si_rdch(_channel)               ((qword)(vec_splat_u8(0)))      /* not mappable */
1370
/* Channel counts have no counterpart here: always yields an all-zero
   quadword and ignores _channel.  */
#define si_rchcnt(_channel)             ((qword)(vec_splat_u8(0)))      /* not mappable */
1371
/* Expands to nothing; both arguments are discarded unevaluated.  */
#define si_wrch(_channel, _a)           /* not mappable */
1372
 
1373
/* Rotate Left
1374
 */
1375
/* Rotate each halfword of a left by the matching halfword of b.  */
static __inline qword si_roth(qword a, qword b)
{
  vec_ushort8 value  = (vec_ushort8)(a);
  vec_ushort8 amount = (vec_ushort8)(b);

  return ((qword)(vec_rl(value, amount)));
}
1379
 
1380
/* Rotate each word of a left by the matching word of b.  */
static __inline qword si_rot(qword a, qword b)
{
  vec_uint4 value  = (vec_uint4)(a);
  vec_uint4 amount = (vec_uint4)(b);

  return ((qword)(vec_rl(value, amount)));
}
1384
 
1385
/* Rotate each halfword of a left by the scalar count b.  */
static __inline qword si_rothi(qword a, int b)
{
  /* Halfword 1 of si_from_int(b) is broadcast to all eight halfwords.  */
  vec_ushort8 amount = vec_splat((vec_ushort8)(si_from_int(b)), 1);

  return ((qword)(vec_rl((vec_ushort8)(a), amount)));
}
1390
 
1391
/* Rotate each word of a left by the scalar count b.  */
static __inline qword si_roti(qword a, int b)
{
  /* Word 0 of si_from_int(b) is broadcast to all four words.  */
  vec_uint4 amount = vec_splat((vec_uint4)(si_from_int(b)), 0);

  return ((qword)(vec_rl((vec_uint4)(a), amount)));
}
1396
 
1397
/* Rotate Left with Mask
1398
 */
1399
/* Logical right shift of each halfword of a by the negated count in
   the matching halfword of b; the result is zeroed where bit 4 of the
   negated count is set.  */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 shifted;

  count = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  shifted = vec_sr((vec_ushort8)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1408
 
1409
/* Logical right shift of each word of a by the negated count in the
   matching word of b; the result is zeroed where bit 5 of the negated
   count is set.  */
static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 shifted;

  count = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  shifted = vec_sr((vec_uint4)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1418
 
1419
/* Logical right shift of each halfword of a by the negated scalar
   count b; the result is zeroed when bit 4 of the negated count is set.  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 shifted;

  /* Halfword 1 of si_from_int(-b) is broadcast to all eight halfwords.  */
  count = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  shifted = vec_sr((vec_ushort8)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1428
 
1429
/* Logical right shift of each word of a by the negated scalar count b;
   the result is zeroed when bit 5 of the negated count is set.  */
static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 shifted;

  /* Word 0 of si_from_int(-b) is broadcast to all four words.  */
  count = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  shifted = vec_sr((vec_uint4)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1438
 
1439
 
1440
/* Rotate Left Algebraic with Mask
1441
 */
1442
/* Arithmetic right shift of each halfword of a by the negated count in
   the matching halfword of b; when bit 4 of the negated count is set
   the shift count is forced to all ones.  */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 amount;

  count = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  amount = (vec_ushort8)vec_or(count, overflow);

  return ((qword)(vec_sra((vec_short8)(a), amount)));
}
1451
 
1452
/* Arithmetic right shift of each word of a by the negated count in the
   matching word of b; when bit 5 of the negated count is set the shift
   count is forced to all ones.  */
static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 amount;

  count = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  amount = (vec_uint4)vec_or(count, overflow);

  return ((qword)(vec_sra((vec_int4)(a), amount)));
}
1461
 
1462
 
1463
/* Arithmetic right shift of each halfword of a by the negated scalar
   count b; when bit 4 of the negated count is set the shift count is
   forced to all ones.  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 amount;

  /* Halfword 1 of si_from_int(-b) is broadcast to all eight halfwords.  */
  count = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  amount = (vec_ushort8)vec_or(count, overflow);

  return ((qword)(vec_sra((vec_short8)(a), amount)));
}
1472
 
1473
/* Arithmetic right shift of each word of a by the negated scalar count
   b; when bit 5 of the negated count is set the shift count is forced
   to all ones.  */
static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 amount;

  /* Word 0 of si_from_int(-b) is broadcast to all four words.  */
  count = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  amount = (vec_uint4)vec_or(count, overflow);

  return ((qword)(vec_sra((vec_int4)(a), amount)));
}
1482
 
1483
 
1484
/* Rotate Left Quadword by Bytes with Mask
1485
 */
1486
/* Shift the quadword a right by whole bytes.  The byte count is the
   negation of count; the result is all zeros when bit 4 (0x10) of the
   negated count is set.

   The negation and the conversion to a bit count use unsigned
   arithmetic, so a positive count no longer left-shifts a negative int
   (undefined behaviour in C).  The shift-control vector is cleared
   before its last word is written, so no uninitialized words are read.  */
static __inline qword si_rotqmbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int neg_count;
  vec_uchar16 mask;

  neg_count = 0U - (unsigned int)(count);

  /* Only word 3 carries the shift amount; the rest is zeroed.  */
  x.v = vec_splat_u8(0);
  x.i[3] = neg_count << 3;
  mask = (neg_count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1500
 
1501
 
1502
/* Shift the quadword a right by whole bytes.  The byte count is the
   negation of word 0 of count; the result is all zeros when bit 7
   (0x80) of the negated count times 8 is set.

   The negation and the multiply by 8 use unsigned arithmetic, which
   avoids signed overflow and the left shift of a negative int (both
   undefined behaviour in C) while producing the same bit pattern.  */
static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  cnt = (0U - x.i[0]) << 3;
  x.i[0] = cnt;

  /* Broadcast byte 3 (the low byte of word 0) to every byte.  */
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1519
 
1520
 
1521
/* Rotate Left Quadword by Bytes
1522
 */
1523
/* Rotate the quadword a left by count bytes, built as the OR of a left
   octet shift (vec_slo) and the complementary right octet shift
   (vec_sro).

   The count is converted to a bit count with unsigned arithmetic, so a
   negative count is not left-shifted as a signed int (undefined
   behaviour in C).  Both shift-control vectors are cleared before their
   last word is written, so no uninitialized words are read.  */
static __inline qword si_rotqbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } left, right;
  unsigned int bits;

  bits = (unsigned int)(count) << 3;

  /* Only word 3 carries the shift amount; the rest is zeroed.  */
  left.v = vec_splat_u8(0);
  right.v = vec_splat_u8(0);
  left.i[3] = bits;
  right.i[3] = 0U - bits;

  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
}
1535
 
1536
/* Rotate the quadword a left by whole bytes; the byte count comes from
   byte 3 of count.  */
static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 value;
  vec_uchar16 fwd, back;

  value = (vec_uchar16)(a);
  /* Byte count times 8, broadcast to every byte.  */
  fwd = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  /* Complementary amount for the right octet shift.  */
  back = vec_sub(vec_splat_u8(0), fwd);

  return ((qword)(vec_or(vec_slo(value, fwd), vec_sro(value, back))));
}
1544
 
1545
/* Rotate Left Quadword by Bytes Bit Count
1546
 */
1547
/* Rotate the quadword a left by whole bytes; byte 3 of count is used
   directly as the octet-shift control for vec_slo.  */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 value;
  vec_uchar16 fwd, back;

  value = (vec_uchar16)(a);
  fwd = vec_splat((vec_uchar16)(count), 3);
  /* Control for the complementary right shift, computed as 7 - fwd.  */
  back = vec_sub(vec_splat_u8(7), fwd);

  return ((qword)(vec_or(vec_slo(value, fwd), vec_sro(value, back))));
}
1555
 
1556
 
1557
/* Rotate Left Quadword by Bits
1558
 */
1559
/* Rotate the quadword a left by (count & 7) bits.  */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 bits;
  vec_uchar16 lead;
  vec_uchar16 wrapped;

  /* Bit count (0-7), broadcast to every byte.  */
  bits = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);

  /* Shift a right by 120 bits so that only its first byte remains, in
     the last byte position.  */
  lead = vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120})));

  /* Per-word right shift by (8 - bits) keeps the bits that wrap around.  */
  wrapped = (vec_uchar16)(vec_sr((vec_uint4)lead,
                                 (vec_uint4)vec_sub(vec_splat_u8(8), bits)));

  return ((qword)(vec_or(vec_sll((qword)(a), bits), wrapped)));
}
1570
 
1571
/* Rotate the quadword a left by the low 3 bits of byte 3 of count.  */
static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 bits;
  vec_uchar16 lead;
  vec_uchar16 wrapped;

  /* Bit count (0-7), broadcast to every byte.  */
  bits = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));

  /* Shift a right by 120 bits so that only its first byte remains, in
     the last byte position.  */
  lead = vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120})));

  /* Per-word right shift by (8 - bits) keeps the bits that wrap around.  */
  wrapped = (vec_uchar16)(vec_sr((vec_uint4)lead,
                                 (vec_uint4)vec_sub(vec_splat_u8(8), bits)));

  return ((qword)(vec_or(vec_sll((qword)(a), bits), wrapped)));
}
1583
 
1584
 
1585
/* Rotate Left Quadword and Mask by Bits
1586
 */
1587
/* Shift the quadword a right by bits (vec_srl); the control is the
   negated scalar count broadcast to every byte.  */
static __inline qword si_rotqmbii(qword a, int count)
{
  /* Byte 3 of si_from_int(0 - count) is broadcast to all sixteen bytes.  */
  vec_uchar16 amount = vec_splat((vec_uchar16)(si_from_int(0 - count)), 3);

  return ((qword)(vec_srl((vec_uchar16)(a), amount)));
}
1591
 
1592
/* Shift the quadword a right by bits (vec_srl); the control is the
   negation of byte 3 of count broadcast to every byte.  */
static __inline qword si_rotqmbi(qword a, qword count)
{
  vec_uchar16 splat = vec_splat((vec_uchar16)(count), 3);
  vec_uchar16 amount = vec_sub(vec_splat_u8(0), splat);

  return ((qword)(vec_srl((vec_uchar16)(a), amount)));
}
1596
 
1597
 
1598
/* Rotate Left Quadword and Mask by Bytes with Bit Count
1599
 */
1600
/* Shift the quadword a right by whole bytes.  The control value is the
   negation of word 0 of count with its low 3 bits cleared; the result
   is all zeros when bit 7 (0x80) of that control value is set.

   The negation uses unsigned arithmetic, so a word equal to INT_MIN no
   longer overflows a signed int (undefined behaviour in C).  The bit
   pattern produced is unchanged.  */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  cnt = 0U - (x.i[0] & ~7U);
  x.i[0] = cnt;

  /* Broadcast byte 3 (the low byte of word 0) to every byte.  */
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
1616
 
1617
 
1618
 
1619
 
1620
/* Round Double to Float
1621
 */
1622
/* Convert the two doubles of a to floats, stored in float elements 0
   and 2 of the result; elements 1 and 3 are zero.  */
static __inline qword si_frds(qword a)
{
  union {
    double d[2];
    vec_double2 v;
  } src;
  union {
    float f[4];
    vec_float4 v;
  } dst;

  src.v = (vec_double2)(a);

  /* Start from all zeros so the unused float elements are defined.  */
  dst.v = (vec_float4){0.0f};
  dst.f[0] = (float)src.d[0];
  dst.f[2] = (float)src.d[1];

  return ((qword)(dst.v));
}
1640
 
1641
/* Select Bits
1642
 */
1643
/* Bitwise select between a and b under the control of c, via vec_sel.  */
static __inline qword si_selb(qword a, qword b, qword c)
{
  vec_uchar16 if_clear = (vec_uchar16)(a);
  vec_uchar16 if_set   = (vec_uchar16)(b);
  vec_uchar16 selector = (vec_uchar16)(c);

  return ((qword)(vec_sel(if_clear, if_set, selector)));
}
1647
 
1648
 
1649
/* Shuffle Bytes
1650
 */
1651
/* Shuffle the bytes of a and b according to pattern.  Pattern bytes
   with the top bit clear go through vec_perm directly; pattern bytes
   with the top bit set select a constant from a small table instead.  */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  /* Identity permutation, used where the pattern byte's top bit is clear.  */
  vec_uchar16 identity = ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15});
  /* Constants addressed by (pattern >> 3) when the top bit is set.  */
  vec_uchar16 constants = ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
                                         0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80});
  vec_uchar16 table_index;
  vec_uchar16 top_bit_set;
  vec_uchar16 pat;

  table_index = vec_sr((vec_uchar16)(pattern), vec_splat_u8(3));
  /* Arithmetic shift by 7 yields all ones where the top bit is set.  */
  top_bit_set = vec_sra((vec_uchar16)(pattern), vec_splat_u8(7));
  pat = vec_sel(identity, table_index, top_bit_set);

  return ((qword)(vec_perm(vec_perm(a, b, pattern), constants, pat)));
}
1663
 
1664
 
1665
/* Shift Left
1666
 */
1667
/* Shift each halfword of a left by the matching halfword of b; the
   result is zeroed where bit 4 of the count is set.  */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 shifted;

  count = (vec_ushort8)(b);
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = (vec_ushort8)vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  shifted = vec_sl((vec_ushort8)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1674
 
1675
/* Shift each word of a left by the matching word of b; the result is
   zeroed where bit 5 of the count is set.  */
static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 shifted;

  count = (vec_uint4)(b);
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = (vec_uint4)vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  shifted = vec_sl((vec_uint4)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1682
 
1683
 
1684
/* Shift each halfword of a left by the scalar count b; the result is
   zeroed when bit 4 of the count is set.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 count;
  vec_ushort8 overflow;
  vec_ushort8 shifted;

  /* Halfword 1 of si_from_int(b) is broadcast to all eight halfwords.  */
  count = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  /* Move bit 4 of the count to the top bit, then smear it across the halfword.  */
  overflow = (vec_ushort8)vec_sra(vec_sl(count, vec_splat_u16(11)), vec_splat_u16(15));
  shifted = vec_sl((vec_ushort8)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1693
 
1694
/* Shift each word of a left by the scalar count b; the result is
   zeroed when bit 5 of the count is set.  */
static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 count;
  vec_uint4 overflow;
  vec_uint4 shifted;

  /* Word 0 of si_from_uint(b) is broadcast to all four words.  */
  count = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  /* Move bit 5 of the count to the top bit, then smear it across the word.  */
  overflow = (vec_uint4)vec_sra(vec_sl(count, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  shifted = vec_sl((vec_uint4)(a), count);

  return ((qword)(vec_andc(shifted, overflow)));
}
1703
 
1704
 
1705
/* Shift Left Quadword
1706
 */
1707
/* Shift the quadword a left by bits (vec_sll); the control is the
   scalar count broadcast to every byte.  */
static __inline qword si_shlqbii(qword a, unsigned int count)
{
  /* Byte 3 of si_from_uint(count) is broadcast to all sixteen bytes.  */
  vec_uchar16 amount = vec_splat((vec_uchar16)(si_from_uint(count)), 3);

  return ((qword)(vec_sll((vec_uchar16)(a), amount)));
}
1714
 
1715
/* Shift the quadword a left by bits (vec_sll); the control is byte 3
   of count broadcast to every byte.  */
static __inline qword si_shlqbi(qword a, qword count)
{
  vec_uchar16 amount = vec_splat((vec_uchar16)(count), 3);

  return ((qword)(vec_sll((vec_uchar16)(a), amount)));
}
1722
 
1723
 
1724
/* Shift Left Quadword by Bytes
1725
 */
1726
/* Shift the quadword a left by count bytes; the result is all zeros
   when bit 4 (0x10) of count is set.

   The shift-control vector is cleared before its last word is written,
   so vec_slo is never handed uninitialized words.  */
static __inline qword si_shlqbyi(qword a, unsigned int count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  vec_uchar16 mask;

  /* Only word 3 carries the shift amount; the rest is zeroed.  */
  x.v = vec_splat_u8(0);
  x.i[3] = count << 3;
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
1738
 
1739
/* Shift the quadword a left by whole bytes; the byte count comes from
   byte 3 of count.  The result is all zeros when bit 7 (0x80) of the
   count times 8 is set.  */
static __inline qword si_shlqby(qword a, qword count)
{
  union {
    unsigned int i[4];
    vec_uchar16 v;
  } ctl;
  unsigned int probe;
  vec_uchar16 keep;

  /* Byte count times 8, broadcast to every byte.  */
  ctl.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  probe = ctl.i[0];
  keep = (probe & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), ctl.v), keep)));
}
1753
 
1754
/* Shift Left Quadword by Bytes with Bit Count
1755
 */
1756
/* Shift the quadword a left by whole bytes; byte 3 of count is used
   directly as the octet-shift control.  The result is all zeros when
   bit 7 (0x80) of that byte is set.  */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    int i[4];
    vec_uchar16 v;
  } ctl;
  unsigned int probe;
  vec_uchar16 keep;

  ctl.v = vec_splat((vec_uchar16)(count), 3);
  probe = ctl.i[0];
  keep = (probe & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), ctl.v), keep)));
}
1770
 
1771
 
1772
/* Stop and Signal
1773
 */
1774
/* Expands to SPU_STOP_ACTION; the _type argument is ignored.  */
#define si_stop(_type)          SPU_STOP_ACTION
1775
/* Expands to SPU_STOP_ACTION; all three arguments are ignored.  */
#define si_stopd(a, b, c)       SPU_STOP_ACTION
1776
 
1777
 
1778
/* Subtract
1779
 */
1780
/* Halfword "subtract from": computes b - a per halfword.  */
static __inline qword si_sfh(qword a, qword b)
{
  vec_ushort8 minuend    = (vec_ushort8)(b);
  vec_ushort8 subtrahend = (vec_ushort8)(a);

  return ((qword)(vec_sub(minuend, subtrahend)));
}
1784
 
1785
/* Word "subtract from": computes b - a per word.  */
static __inline qword si_sf(qword a, qword b)
{
  vec_uint4 minuend    = (vec_uint4)(b);
  vec_uint4 subtrahend = (vec_uint4)(a);

  return ((qword)(vec_sub(minuend, subtrahend)));
}
1789
 
1790
/* Single-precision subtract: computes a - b per float element.  */
static __inline qword si_fs(qword a, qword b)
{
  vec_float4 minuend    = (vec_float4)(a);
  vec_float4 subtrahend = (vec_float4)(b);

  return ((qword)(vec_sub(minuend, subtrahend)));
}
1794
 
1795
/* Double-precision a - b, computed one element at a time.  */
static __inline qword si_dfs(qword a, qword b)
{
  union {
    double d[2];
    vec_double2 v;
  } x, y, result;
  int i;

  x.v = (vec_double2)(a);
  y.v = (vec_double2)(b);

  for (i = 0; i < 2; i++)
    result.d[i] = x.d[i] - y.d[i];

  return ((qword)(result.v));
}
1808
 
1809
/* Halfword "subtract from immediate": computes b - a per halfword,
   with the scalar b replicated into every halfword.  */
static __inline qword si_sfhi(qword a, short b)
{
  /* Halfword 1 of si_from_short(b) is broadcast to all eight halfwords.  */
  vec_short8 minuend = vec_splat((vec_short8)(si_from_short(b)), 1);

  return ((qword)(vec_sub(minuend, (vec_short8)(a))));
}
1814
 
1815
/* Word "subtract from immediate": computes b - a per word, with the
   scalar b replicated into every word.  */
static __inline qword si_sfi(qword a, int b)
{
  /* Word 0 of si_from_int(b) is broadcast to all four words.  */
  vec_int4 minuend = vec_splat((vec_int4)(si_from_int(b)), 0);

  return ((qword)(vec_sub(minuend, (vec_int4)(a))));
}
1820
 
1821
/* Subtract word extended
1822
 */
1823
/* Computes _b + ~_a + (_c & 1) per word.  Note that _a is evaluated
   twice by this macro.  */
#define si_sfx(_a, _b, _c) \
  ((qword)(vec_add(vec_add((vec_uint4)(_b), \
                           vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), \
                   vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1826
 
1827
 
1828
/* Sum Bytes into Shorts
1829
 */
1830
/* Sum the four bytes of each word of a and of b, and interleave the
   16-bit sums: in every result word the first halfword is the sum from
   b and the second halfword is the sum from a.  */
static __inline qword si_sumb(qword a, qword b)
{
  /* Picks the low halfword of each word: indices 16-31 address the
     second vec_perm operand, indices 0-15 the first.  */
  vec_uchar16 interleave = ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
                                          26, 27, 10, 11, 30, 31, 14, 15});
  vec_uint4 none = (vec_uint4){0};
  vec_ushort8 from_a, from_b;

  from_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), none);
  from_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), none);

  return ((qword)(vec_perm(from_a, from_b, interleave)));
}
1841
 
1842
/* Exclusive OR
1843
 */
1844
/* Bitwise exclusive OR of a and b.  */
static __inline qword si_xor(qword a, qword b)
{
  vec_uchar16 lhs = (vec_uchar16)(a);
  vec_uchar16 rhs = (vec_uchar16)(b);

  return ((qword)(vec_xor(lhs, rhs)));
}
1848
 
1849
/* Exclusive OR of every byte of a with the scalar b.  */
static __inline qword si_xorbi(qword a, unsigned char b)
{
  /* Byte 3 of si_from_uchar(b) is broadcast to all sixteen bytes.  */
  vec_uchar16 imm = vec_splat((vec_uchar16)(si_from_uchar(b)), 3);

  return ((qword)(vec_xor((vec_uchar16)(a), imm)));
}
1854
 
1855
/* Exclusive OR of every halfword of a with the scalar b.  */
static __inline qword si_xorhi(qword a, unsigned short b)
{
  /* Halfword 1 of si_from_ushort(b) is broadcast to all eight halfwords.  */
  vec_ushort8 imm = vec_splat((vec_ushort8)(si_from_ushort(b)), 1);

  return ((qword)(vec_xor((vec_ushort8)(a), imm)));
}
1860
 
1861
/* Exclusive OR of every word of a with the scalar b.  */
static __inline qword si_xori(qword a, unsigned int b)
{
  /* Word 0 of si_from_uint(b) is broadcast to all four words.  */
  vec_uint4 imm = vec_splat((vec_uint4)(si_from_uint(b)), 0);

  return ((qword)(vec_xor((vec_uint4)(a), imm)));
}
1866
 
1867
 
1868
/* Generate Controls for Sub-Quadword Insertion
1869
 */
1870
/* Build a byte-insertion control vector: start from the byte sequence
   0x10..0x1F and overwrite the byte selected by the low 4 bits of
   (si_to_uint(a) + imm) with 0x03.  */
static __inline qword si_cbd(qword a, int imm)
{
  union {
    unsigned char c[16];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = (si_to_uint(a) + (unsigned int)(imm)) & 0xF;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.c[slot] = 0x03;

  return ((qword)(ctl.v));
}
1881
 
1882
/* Build a doubleword-insertion control vector: start from the byte
   sequence 0x10..0x1F and overwrite the doubleword selected by bit 3
   of (si_to_uint(a) + imm) with 0x0001020304050607.  */
static __inline qword si_cdd(qword a, int imm)
{
  union {
    unsigned long long ll[2];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = ((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.ll[slot] = 0x0001020304050607ULL;

  return ((qword)(ctl.v));
}
1893
 
1894
/* Build a halfword-insertion control vector: start from the byte
   sequence 0x10..0x1F and overwrite the halfword selected by bits 1-3
   of (si_to_uint(a) + imm) with 0x0203.  */
static __inline qword si_chd(qword a, int imm)
{
  union {
    unsigned short s[8];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = ((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.s[slot] = 0x0203;

  return ((qword)(ctl.v));
}
1905
 
1906
/* Build a word-insertion control vector: start from the byte sequence
   0x10..0x1F and overwrite the word selected by bits 2-3 of
   (si_to_uint(a) + imm) with 0x00010203.  */
static __inline qword si_cwd(qword a, int imm)
{
  union {
    unsigned int i[4];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = ((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.i[slot] = 0x00010203;

  return ((qword)(ctl.v));
}
1917
 
1918
/* Indexed form of the byte-insertion control: the position is the low
   4 bits of si_to_uint applied to the word-wise sum a + b.  */
static __inline qword si_cbx(qword a, qword b)
{
  union {
    unsigned char c[16];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.c[slot] = 0x03;

  return ((qword)(ctl.v));
}
1929
 
1930
 
1931
/* Indexed form of the doubleword-insertion control: the position is
   bit 3 of si_to_uint applied to the word-wise sum a + b.  */
static __inline qword si_cdx(qword a, qword b)
{
  union {
    unsigned long long ll[2];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = (si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.ll[slot] = 0x0001020304050607ULL;

  return ((qword)(ctl.v));
}
1942
 
1943
/* Indexed form of the halfword-insertion control: the position is
   bits 1-3 of si_to_uint applied to the word-wise sum a + b.  */
static __inline qword si_chx(qword a, qword b)
{
  union {
    unsigned short s[8];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = (si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.s[slot] = 0x0203;

  return ((qword)(ctl.v));
}
1954
 
1955
/* Indexed form of the word-insertion control: the position is bits 2-3
   of si_to_uint applied to the word-wise sum a + b.  */
static __inline qword si_cwx(qword a, qword b)
{
  union {
    unsigned int i[4];
    vec_uint4 v;
  } ctl;
  unsigned int slot;

  slot = (si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3;

  ctl.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  ctl.i[slot] = 0x00010203;

  return ((qword)(ctl.v));
}
1966
 
1967
 
1968
/* Constant Formation
1969
 */
1970
static __inline qword si_il(signed short imm)
1971
{
1972
  return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1973
}
1974
 
1975
 
1976
static __inline qword si_ila(unsigned int imm)
1977
{
1978
  return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1979
}
1980
 
1981
static __inline qword si_ilh(signed short imm)
1982
{
1983
  return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1984
}
1985
 
1986
static __inline qword si_ilhu(signed short imm)
1987
{
1988
  return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1989
}
1990
 
1991
/* OR every word of a with imm zero-extended to 32 bits.  */
static __inline qword si_iohl(qword a, unsigned short imm)
{
  vec_uint4 low = vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0);

  return ((qword)(vec_or((vec_uint4)(a), low)));
}
1995
 
1996
/* No Operation
1997
 */
1998
/* Expands to nothing.  */
#define si_lnop()               /* do nothing */
1999
/* Expands to nothing.  */
#define si_nop()                /* do nothing */
2000
 
2001
 
2002
/* Memory Load and Store
2003
 */
2004
static __inline qword si_lqa(unsigned int imm)
2005
{
2006
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2007
}
2008
 
2009
/* Load a quadword with vec_ld: imm is the base address and
   si_to_uint(a), with its low 4 bits cleared, is the offset.  */
static __inline qword si_lqd(qword a, unsigned int imm)
{
  vector unsigned char *base = (vector unsigned char *)(imm);

  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, base)));
}
2013
 
2014
static __inline qword si_lqr(unsigned int imm)
2015
{
2016
  return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2017
}
2018
 
2019
/* Load a quadword with vec_ld from a null base pointer, offset by
   si_to_uint applied to the word-wise sum a + b.  */
static __inline qword si_lqx(qword a, qword b)
{
  vec_uint4 sum = vec_add((vec_uint4)(a), (vec_uint4)(b));

  return ((qword)(vec_ld(si_to_uint((qword)(sum)), (vector unsigned char *)(0))));
}
2023
 
2024
/* Store the quadword a with vec_st, using imm as the address.  */
static __inline void si_stqa(qword a, unsigned int imm)
{
  vector unsigned char *addr = (vector unsigned char *)(imm);

  vec_st((vec_uchar16)(a), 0, addr);
}
2028
 
2029
/* Store the quadword a with vec_st: imm is the base address and
   si_to_uint(b), with its low 4 bits cleared, is the offset.  */
static __inline void si_stqd(qword a, qword b, unsigned int imm)
{
  vector unsigned char *base = (vector unsigned char *)(imm);

  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, base);
}
2033
 
2034
/* Store the quadword a with vec_st, using imm as the address
   (identical to si_stqa in this mapping).  */
static __inline void si_stqr(qword a, unsigned int imm)
{
  vector unsigned char *addr = (vector unsigned char *)(imm);

  vec_st((vec_uchar16)(a), 0, addr);
}
2038
 
2039
/* Store the quadword a with vec_st to a null base pointer, offset by
   si_to_uint applied to the word-wise sum b + c.  */
static __inline void si_stqx(qword a, qword b, qword c)
{
  vec_uint4 sum = vec_add((vec_uint4)(b), (vec_uint4)(c));

  vec_st((vec_uchar16)(a), si_to_uint((qword)(sum)), (vector unsigned char *)(0));
}
2045
 
2046
#endif /* !__SPU__ */
2047
#endif /* !_SI2VMX_H_ */
2048
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.