/* Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_     1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */

/************************************************************************
 *                        INTRINSICS
 ************************************************************************/

/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
                                (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}
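
/* Illustrative sketch (not part of the original header): a hypothetical
 * helper showing how the vec_abs overloads above are used from generic
 * VMX-style code compiled for the SPU.
 */
static inline vec_int4 vmx2spu_example_abs_diff(vec_int4 a, vec_int4 b)
{
  /* |a - b| per element, using the wrapper defined above together with the
     native spu_sub intrinsic (overflow at INT_MIN is ignored here).  */
  return vec_abs(spu_sub(a, b));
}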


/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
                                spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b)        spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
                                          8, 24, 10, 26, 12, 28, 14, 30}));
  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}

static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}
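
/* Illustrative sketch (not part of the original header): a hypothetical
 * helper demonstrating the saturating behaviour implemented by the
 * vec_adds overloads above.
 */
static inline vec_int4 vmx2spu_example_accumulate_saturating(vec_int4 acc, vec_int4 x)
{
  /* Unlike vec_add, vec_adds clamps each element to INT_MIN/INT_MAX on
     overflow instead of wrapping around.  */
  return vec_adds(acc, x);
}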

/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a), b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}


/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a, b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a), b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
                               (vec_uchar16)(spu_and(spu_xor(a, b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}


/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
                 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b)       spu_cmpeq(_a, _b)


/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}


/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b)       spu_cmpgt(_a, _b)


/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}


/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b)       spu_cmpgt(_b, _a)


/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b)         spu_convtf(_a, _b)


/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b)         spu_convts(_a, _b)


/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b)         spu_convtu(_a, _b)


/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)


/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()


/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)


/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)


/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)


/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)


/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
                           frac, spu_splats(1.0f)), exp));
}


/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}
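
/* Illustrative sketch (not part of the original header): a hypothetical
 * helper using the vec_floor overload above to extract the fractional
 * part of each element.
 */
static inline vec_float4 vmx2spu_example_fraction(vec_float4 a)
{
  /* frac(a) = a - floor(a); lies in [0, 1) for finite inputs.  */
  return spu_sub(a, vec_floor(a));
}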


/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}


static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}


static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b)         vec_ld(_a, _b)


/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4 exp;
  vec_float4 frac;

  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
                   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}


/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
                               ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
                                              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}


/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
                                               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
                                (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c)    spu_madd(_a, _b, _c)


/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
                              (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
                              ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}

/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}


/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0));           /* not supported */
}


/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
                                            (vec_short8)(spu_rl((vec_uint4)(b), -16)),
                                            (vec_int4)(spu_rl((vec_uint4)(c), -16))),
                                   spu_madd(a, b, spu_extend(c)),
                                   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                                  10, 11, 26, 27, 14, 15, 30, 31}))));
}


static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}


/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}
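
/* Illustrative sketch (not part of the original header): a hypothetical
 * helper showing the Q15 fixed-point multiply pattern implemented by the
 * vec_mradds overload above.
 */
static inline vec_short8 vmx2spu_example_q15_scale(vec_short8 samples, vec_short8 gain)
{
  /* Each element is treated as a Q15 fraction; vec_mradds rounds the
     32-bit product, keeps the high 16 bits, and saturates the result.  */
  return vec_mradds(samples, gain, spu_splats((signed short)0));
}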


/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}


/* vec_msums (vector multiply sum saturate)
 * ========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)          /* not supported */


/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
                             (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
                             (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
                            (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}


static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}


/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
                             (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}


static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}


/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c)   spu_nmsub(_a, _b, _c)


/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b)         spu_nor(_a, _b)


/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}


/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                        17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                       17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                        18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                       18, 19, 22, 23, 26, 27, 30, 31})));
}


/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
                                           spu_sl(a, 13), x001F),
                                   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
                                           spu_sl(b, 13), x001F),
                                   ((vec_uchar16){ 0,  1,  4,  5,   8,  9, 12, 13,
                                                  16, 17, 20, 21, 24, 25, 28, 29}))));
}


/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
                                    spu_sel(b, max, spu_cmpgt(b, 255)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
                                    spu_sel(b, max, spu_cmpgt(b, max)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
                                   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));

  return (vec_packsu((vec_ushort8)(a), (vec_ushort8)(b)));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
                                   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}
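
/* Illustrative sketch (not part of the original header): a hypothetical
 * helper showing the classic VMX unaligned-load idiom built from the
 * vec_ld, vec_lvsl and vec_perm overloads defined above.
 */
static inline vec_float4 vmx2spu_example_load_unaligned(int offset, float *p)
{
  /* Load the two quadwords that straddle the (possibly unaligned) byte
     offset, then use the vec_lvsl permute pattern to extract the 16 bytes
     starting at that offset.  */
  vec_float4 lo = vec_ld(offset, p);
  vec_float4 hi = vec_ld(offset + 16, p);
  return vec_perm(lo, hi, vec_lvsl(offset, p));
}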
1546
 
1547
 
1548
/* vec_re (vector reciprocal estimate)
1549
 * ======
1550
 */
1551
#define vec_re(_a)      spu_re(_a)
1552
 
1553
 
1554
/* vec_rl (vector rotate left)
1555
 * ======
1556
 */
1557
static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558
{
1559
  vec_ushort8 r1, r2;
1560
 
1561
  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562
  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563
  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564
}
1565
 
1566
static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567
{
1568
  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569
}
1570
 
1571
static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572
{
1573
  return (spu_rl(a, (vec_short8)(b)));
1574
}
1575
 
1576
static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577
{
1578
  return (spu_rl(a, (vec_short8)(b)));
1579
}
1580
 
1581
static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582
{
1583
  return (spu_rl(a, (vec_int4)(b)));
1584
}
1585
 
1586
static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587
{
1588
  return (spu_rl(a, (vec_int4)(b)));
1589
}
1590
 
1591
 
1592
/* vec_round (vector round)
1593
 * =========
1594
 */
1595
static inline vec_float4 vec_round(vec_float4 a)
1596
{
1597
  vec_float4 s_half, s_one, d;
1598
  vec_uint4 odd;
1599
  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600
  vec_float4 half = spu_splats(0.5f);
1601
  vec_int4 exp;
1602
  vec_uint4 mask;
1603
 
1604
  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605
  a = spu_add(a, s_half);
1606
  s_one = spu_add(s_half, s_half);
1607
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611
 
1612
  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613
  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614
  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615
                                 (vec_float4)spu_cmpeq(odd, 1)));
1616
  d = spu_andc(a, (vec_float4)(mask));
1617
  d = spu_sub(d, s_one);
1618
  return (d);
1619
}
1620
 
1621
/* vec_rsqrte (vector reciprocal square root estimate)
1622
 * ==========
1623
 */
1624
#define vec_rsqrte(_a)  spu_rsqrte(_a)
1625
 
1626
 
1627
/* vec_sel (vector select)
1628
 * =======
1629
 */
1630
#define vec_sel(_a, _b, _c)     spu_sel(_a, _b, _c)
1631
 
1632
 
1633
/* vec_sl (vector shift left)
1634
 * ======
1635
 */
1636
static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637
{
1638
  vec_ushort8 hi, lo;
1639
 
1640
  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641
  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642
 
1643
  return ((vec_uchar16)(spu_or(hi, lo)));
1644
}
1645
 
1646
static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647
{
1648
  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649
}
1650
 
1651
static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652
{
1653
  return (spu_sl(a, spu_and(b, 15)));
1654
}
1655
 
1656
static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657
{
1658
  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659
}
1660
 
1661
static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662
{
1663
  return (spu_sl(a, spu_and(b, 31)));
1664
}
1665
 
1666
static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667
{
1668
  return (spu_sl(a, spu_and(b, 31)));
1669
}
1670
 
1671
 
1672
/* vec_sld (vector shift left double)
1673
 * =======
1674
 */
1675
#define vec_sld(_a, _b, _c)     spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c),  \
1676
                                                                    4+(_c),  5+(_c),  6+(_c),  7+(_c),  \
1677
                                                                    8+(_c),  9+(_c), 10+(_c), 11+(_c),  \
1678
                                                                   12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679
 
1680
 
1681
/* vec_sll (vector shift left long)
1682
 * =======
1683
 */
1684
#define vec_sll(_a, _b)         spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1685
 
1686
 
1687
/* vec_slo (vector shift left by octet)
1688
 * =======
1689
 */
1690
#define vec_slo(_a, _b)         spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691
 
1692
 
1693
/* vec_splat (vector splat)
1694
 * =========
1695
 */
1696
#define vec_splat(_a, _b)       spu_splats(spu_extract(_a, _b))
1697
 
1698
 
1699
/* vec_splat_s8 (vector splat signed byte)
1700
 * ============
1701
 */
1702
#define vec_splat_s8(_a)        spu_splats((signed char)(_a))
1703
 
1704
 
1705
/* vec_splat_s16 (vector splat signed half-word)
1706
 * =============
1707
 */
1708
#define vec_splat_s16(_a)       spu_splats((signed short)(_a))
1709
 
1710
 
1711
/* vec_splat_s32 (vector splat signed word)
1712
 * =============
1713
 */
1714
#define vec_splat_s32(_a)       spu_splats((signed int)(_a))
1715
 
1716
 
1717
/* vec_splat_u8 (vector splat unsigned byte)
1718
 * ============
1719
 */
1720
#define vec_splat_u8(_a)        spu_splats((unsigned char)(_a))
1721
 
1722
 
1723
/* vec_splat_u16 (vector splat unsigned half-word)
1724
 * =============
1725
 */
1726
#define vec_splat_u16(_a)       spu_splats((unsigned short)(_a))
1727
 
1728
 
1729
/* vec_splat_u32 (vector splat unsigned word)
1730
 * =============
1731
 */
1732
#define vec_splat_u32(_a)       spu_splats((unsigned int)(_a))
1733
 
1734
 
1735
/* vec_sr (vector shift right)
1736
 * ======
1737
 */
1738
static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739
{
1740
  vec_ushort8 hi, lo;
1741
 
1742
  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743
  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744
 
1745
  return ((vec_uchar16)(spu_or(hi, lo)));
1746
}
1747
 
1748
static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749
{
1750
  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751
}
1752
 
1753
static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754
{
1755
  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756
}
1757
 
1758
static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759
{
1760
  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761
}
1762
 
1763
static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764
{
1765
  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766
}
1767
 
1768
static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769
{
1770
  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771
}
1772
 
1773
 
1774
/* vec_sra (vector shift right algebraic)
1775
 * =======
1776
 */
1777
static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778
{
1779
  vec_short8 hi, lo;
1780
 
1781
  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782
  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783
 
1784
  return ((vec_char16)(spu_or(hi, lo)));
1785
}
1786
 
1787
static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788
{
1789
  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790
}
1791
 
1792
static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793
{
1794
  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795
}
1796
 
1797
static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798
{
1799
  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800
}
1801
 
1802
static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803
{
1804
  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805
}
1806
 
1807
static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808
{
1809
  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810
}
1811
 
1812
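
/* Illustrative sketch (not part of the mapping): vec_sr() is a logical shift
 * (zero fill) while vec_sra() is arithmetic (sign fill), matching the
 * spu_rlmask() versus spu_rlmaska() forms used above.  The helper name is
 * hypothetical.
 */
static inline void example_vec_sr_vs_sra(vec_int4 *logical, vec_int4 *arithmetic)
{
  vec_int4  a     = spu_splats((signed int)-8);
  vec_uint4 shift = spu_splats((unsigned int)1);

  *logical    = vec_sr(a, shift);   /* -8 >> 1 with zero fill: 0x7FFFFFFC */
  *arithmetic = vec_sra(a, shift);  /* -8 >> 1 with sign fill: -4         */
}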
 
1813
/* vec_srl (vector shift right long)
1814
 * =======
1815
 */
1816
#define vec_srl(_a, _b)         spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1817
 
1818
 
1819
/* vec_sro (vector shift right by octet)
1820
 * =======
1821
 */
1822
#define vec_sro(_a, _b)         spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823
 
1824
/* vec_st (vector store indexed)
1825
 * ======
1826
 */
1827
static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828
{
1829
  *((vec_uchar16 *)(c+b)) = a;
1830
}
1831
 
1832
static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833
{
1834
  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835
}
1836
 
1837
static inline void vec_st(vec_char16 a, int b, signed char *c)
1838
{
1839
  *((vec_char16 *)(c+b)) = a;
1840
}
1841
 
1842
static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843
{
1844
  *((vec_char16 *)((signed char *)(c)+b)) = a;
1845
}
1846
 
1847
static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848
{
1849
  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850
}
1851
 
1852
static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853
{
1854
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855
}
1856
 
1857
static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858
{
1859
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860
}
1861
 
1862
static inline void vec_st(vec_short8 a, int b, signed short *c)
1863
{
1864
  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865
}
1866
 
1867
static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868
{
1869
  *((vec_short8 *)((signed char *)(c)+b)) = a;
1870
}
1871
 
1872
static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873
{
1874
  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875
}
1876
 
1877
static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878
{
1879
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880
}
1881
 
1882
static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883
{
1884
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885
}
1886
 
1887
static inline void vec_st(vec_int4 a, int b, signed int *c)
1888
{
1889
  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890
}
1891
 
1892
static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893
{
1894
  *((vec_int4 *)((signed char *)(c)+b)) = a;
1895
}
1896
 
1897
static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898
{
1899
  *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900
}
1901
 
1902
static inline void vec_st(vec_float4 a, int b, float *c)
1903
{
1904
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905
}
1906
 
1907
static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908
{
1909
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910
}
1911
 
1912
 
1913
/* vec_ste (vector store element indexed)
1914
 * =======
1915
 */
1916
static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917
{
1918
  unsigned char *ptr;
1919
 
1920
  ptr = c + b;
1921
  *ptr = spu_extract(a, (int)(ptr) & 15);
1922
}
1923
 
1924
static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925
{
1926
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927
}
1928
 
1929
static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930
{
1931
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932
}
1933
 
1934
static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935
{
1936
  unsigned short *ptr;
1937
 
1938
  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939
  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940
}
1941
 
1942
static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943
{
1944
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945
}
1946
 
1947
static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948
{
1949
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950
}
1951
 
1952
static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953
{
1954
  unsigned int *ptr;
1955
 
1956
  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957
  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958
}
1959
 
1960
static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961
{
1962
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963
}
1964
 
1965
static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966
{
1967
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968
}
1969
 
1970
static inline void vec_ste(vec_float4 a, int b, float *c)
1971
{
1972
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973
}
1974
 
1975
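
/* Illustrative sketch (not part of the mapping): vec_ste() rounds the
 * effective address down to the element's natural alignment and stores the
 * element of a whose index corresponds to that address, as the pointer
 * arithmetic above shows.  The helper name and buffer are hypothetical.
 */
static inline void example_vec_ste_word(vec_uint4 a, unsigned int *buf)
{
  /* With buf 16-byte aligned, this writes element 1 of a to buf[1]. */
  vec_ste(a, 4, buf);
}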
 
1976
/* vec_stl (vector store indexed LRU)
1977
 * =======
1978
 */
1979
#define vec_stl(_a, _b, _c)             vec_st(_a, _b, _c)
1980
 
1981
 
1982
/* vec_sub (vector subtract)
1983
 * =======
1984
 */
1985
static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986
{
1987
  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988
                                spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989
                                spu_splats((unsigned short)0xFF00))));
1990
}
1991
 
1992
static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993
{
1994
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995
}
1996
 
1997
static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998
{
1999
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000
}
2001
 
2002
static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003
{
2004
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005
}
2006
 
2007
static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008
{
2009
  return (spu_sub(a, b));
2010
}
2011
 
2012
static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013
{
2014
  return (spu_sub(a, b));
2015
}
2016
 
2017
static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018
{
2019
  return (spu_sub((vec_short8)(a), b));
2020
}
2021
 
2022
static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023
{
2024
  return (spu_sub(a, (vec_short8)(b)));
2025
}
2026
 
2027
static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028
{
2029
  return (spu_sub(a, b));
2030
}
2031
 
2032
static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033
{
2034
  return (spu_sub(a, b));
2035
}
2036
 
2037
static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038
{
2039
  return (spu_sub((vec_int4)(a), b));
2040
}
2041
 
2042
static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043
{
2044
  return (spu_sub(a, (vec_int4)(b)));
2045
}
2046
 
2047
static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048
{
2049
  return (spu_sub(a, b));
2050
}
2051
 
2052
 
2053
/* vec_subc (vector subtract carryout)
2054
 * ========
2055
 */
2056
#define vec_subc(_a, _b)        spu_genb(_a, _b)
2057
 
2058
 
2059
/* vec_subs (vector subtract saturate)
2060
 * ========
2061
 */
2062
static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063
{
2064
  vec_ushort8 s1, s2;
2065
  vec_uchar16 s, d;
2066
 
2067
  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
2070
                                                        8, 24, 10, 26, 12, 28, 14, 30})));
2071
  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2072
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
2073
  return (spu_andc(d, s));
2074
}
2075
 
2076
static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077
{
2078
  vec_ushort8 s1, s2;
2079
  vec_uchar16 s, d;
2080
 
2081
  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
2084
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
2085
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087
 
2088
  return ((vec_char16)(d));
2089
}
2090
 
2091
static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092
{
2093
  return (vec_subs((vec_char16)(a), b));
2094
}
2095
 
2096
static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097
{
2098
  return (vec_subs(a, (vec_char16)(b)));
2099
}
2100
 
2101
static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102
{
2103
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104
}
2105
 
2106
static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107
{
2108
  vec_short8 s;
2109
  vec_short8 d;
2110
 
2111
  s = spu_sub(a, b);
2112
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114
 
2115
  return (d);
2116
}
2117
 
2118
static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119
{
2120
  return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121
}
2122
 
2123
static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124
{
2125
  return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126
}
2127
 
2128
static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129
{
2130
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131
}
2132
 
2133
static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134
{
2135
  vec_int4 s;
2136
  vec_int4 d;
2137
 
2138
  s = spu_sub(a, b);
2139
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141
 
2142
  return (d);
2143
}
2144
 
2145
static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146
{
2147
  return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148
}
2149
 
2150
static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151
{
2152
  return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153
}
2154
 
2155
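
/* Illustrative sketch (not part of the mapping): the unsigned forms above
 * clamp at zero by and-ing out lanes where b > a, and the signed forms
 * select 0x7FFF.../0x8000... on overflow.  The helper name is hypothetical.
 */
static inline vec_ushort8 example_vec_subs_clamps_at_zero(void)
{
  vec_ushort8 a = spu_splats((unsigned short)3);
  vec_ushort8 b = spu_splats((unsigned short)5);

  /* 3 - 5 would wrap; the saturated result is 0 in every element. */
  return (vec_subs(a, b));
}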
 
2156
/* vec_sum4s (vector sum across partial (1/4) saturated)
2157
 * =========
2158
 */
2159
static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160
{
2161
  vec_uint4 a01_23, a0123;
2162
 
2163
  a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164
                               spu_and((vec_ushort8)(a), 0xFF)));
2165
  a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166
  return (vec_adds(a0123, b));
2167
}
2168
 
2169
static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170
{
2171
  vec_int4 a01_23, a0123;
2172
 
2173
  a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174
                              spu_extend(a)));
2175
  a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176
  return (vec_adds(a0123, b));
2177
}
2178
 
2179
static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180
{
2181
  vec_int4 a0123;
2182
 
2183
  a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184
  return (vec_adds(a0123, b));
2185
}
2186
 
2187
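
/* Illustrative sketch (not part of the mapping): vec_sum4s() adds the four
 * bytes (or two halfwords) that share a word slot to the corresponding word
 * of b, as the pairwise spu_add/spu_extend steps above implement.  The
 * helper name is hypothetical.
 */
static inline vec_uint4 example_vec_sum4s_bytes(void)
{
  vec_uchar16 ones = spu_splats((unsigned char)1);
  vec_uint4   acc  = spu_splats((unsigned int)10);

  /* Each word accumulates 4 * 1 on top of 10, giving 14 in every element. */
  return (vec_sum4s(ones, acc));
}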
 
2188
/* vec_sum2s (vector sum across partial (1/2) saturated)
2189
 * =========
2190
 */
2191
static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192
{
2193
  vec_int4 c, d;
2194
  vec_int4 sign1, sign2, sign3;
2195
  vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196
 
2197
  sign1 = spu_rlmaska(a, -31);
2198
  sign2 = spu_rlmaska(b, -31);
2199
 
2200
  c = spu_rlqwbyte(a, -4);
2201
  sign3 = spu_rlqwbyte(sign1, -4);
2202
 
2203
  carry = spu_genc(a, b);
2204
  sum_l = spu_add(a, b);
2205
  sum_h = spu_addx(sign1, sign2, carry);
2206
 
2207
  carry = spu_genc(sum_l, c);
2208
  sum_l = spu_add(sum_l, c);
2209
  sum_h = spu_addx(sum_h, sign3, carry);
2210
 
2211
  sign1 = spu_rlmaska(sum_l, -31);
2212
  sign2 = spu_rlmaska(sum_h, -31);
2213
 
2214
  sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215
 
2216
  sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217
 
2218
  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219
 
2220
  return (d);
2221
}
2222
 
2223
 
2224
/* vec_sums (vector sum saturated)
2225
 * ========
2226
 */
2227
static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228
{
2229
  vec_int4 a0, a1, a2, c0, c1, c2, d;
2230
  vec_int4 sign_a, sign_b, sign_l, sign_h;
2231
  vec_int4 sum_l, sum_h, sat, sat_val;
2232
 
2233
  sign_a = spu_rlmaska(a, -31);
2234
  sign_b = spu_rlmaska(b, -31);
2235
 
2236
  a0 = spu_rlqwbyte(a, -12);
2237
  a1 = spu_rlqwbyte(a, -8);
2238
  a2 = spu_rlqwbyte(a, -4);
2239
 
2240
  sum_l = spu_add(a, b);
2241
  sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242
 
2243
  c2 = spu_genc(sum_l, a2);
2244
  sum_l = spu_add(sum_l, a2);
2245
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246
 
2247
  c1 = spu_genc(sum_l, a1);
2248
  sum_l = spu_add(sum_l, a1);
2249
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250
 
2251
  c0 = spu_genc(sum_l, a0);
2252
  sum_l = spu_add(sum_l, a0);
2253
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254
 
2255
  sign_l = spu_rlmaska(sum_l, -31);
2256
  sign_h = spu_rlmaska(sum_h, -31);
2257
 
2258
  sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259
 
2260
  sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261
 
2262
  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263
 
2264
  return (d);
2265
}
2266
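
/* Illustrative sketch (not part of the mapping): vec_sums() accumulates all
 * four words of a plus word 3 of b into word 3 of the result, keeping a
 * 64-bit intermediate (sum_h:sum_l above) so the final value saturates
 * correctly.  The helper name is hypothetical.
 */
static inline vec_int4 example_vec_sums(void)
{
  vec_int4 a = ((vec_int4){1, 2, 3, 4});
  vec_int4 b = ((vec_int4){0, 0, 0, 10});

  /* Result is {0, 0, 0, 1 + 2 + 3 + 4 + 10} == {0, 0, 0, 20}. */
  return (vec_sums(a, b));
}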
 
2267
 
2268
/* vec_trunc (vector truncate)
2269
 * =========
2270
 */
2271
static inline vec_float4 vec_trunc(vec_float4 a)
2272
{
2273
  vec_int4 exp;
2274
  vec_uint4 mask;
2275
 
2276
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280
  return (spu_andc(a, (vec_float4)(mask)));
2281
}
2282
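
/* Illustrative sketch (not part of the mapping): vec_trunc() rounds toward
 * zero by masking off the fraction bits that lie below the binary point, so
 * magnitudes below 1.0 collapse to zero.  The helper name is hypothetical.
 */
static inline vec_float4 example_vec_trunc(void)
{
  vec_float4 a = spu_splats(-2.75f);

  /* Truncation toward zero yields -2.0f in every element. */
  return (vec_trunc(a));
}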
 
2283
/* vec_unpackh (vector unpack high element)
2284
 * ===========
2285
 */
2286
static inline vec_short8 vec_unpackh(vec_char16 a)
2287
{
2288
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289
                                                      4, 4, 5, 5, 6, 6, 7, 7}))));
2290
}
2291
 
2292
static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293
{
2294
  return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295
}
2296
 
2297
static inline vec_int4 vec_unpackh(vec_short8 a)
2298
{
2299
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300
                                                      0, 0, 4, 5, 0, 0, 6, 7}))));
2301
}
2302
 
2303
#ifdef SUPPORT_UNPACK_PIXEL
2304
/* Due to type conflicts, unpacking of pixel types and boolean shorts
2305
 * can not simultaneously be supported. By default, the boolean short is
2306
 * supported.
2307
 */
2308
static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309
{
2310
  vec_ushort8 p1, p2;
2311
 
2312
  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313
                   spu_and((vec_ushort8)(a.p), 0x1F),
2314
                   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
2315
                                   4, 128, 128, 21,  6, 128, 128, 23}));
2316
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317
                   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318
                   ((vec_uchar16){ 128,  17, 1, 128, 128,  19, 3, 128,
2319
                                   128,  21, 5, 128, 128,  23, 7, 128}));
2320
  return ((vec_uint4)(spu_or(p1, p2)));
2321
}
2322
 
2323
#else
2324
 
2325
static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326
{
2327
  return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328
}
2329
#endif
2330
 
2331
 
2332
 
2333
 
2334
 
2335
/* vec_unpackl (vector unpack low element)
2336
 * ===========
2337
 */
2338
static inline vec_short8 vec_unpackl(vec_char16 a)
2339
{
2340
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341
                                                      12, 12, 13, 13, 14, 14, 15, 15}))));
2342
}
2343
 
2344
static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345
{
2346
  return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347
}
2348
 
2349
 
2350
static inline vec_int4 vec_unpackl(vec_short8 a)
2351
{
2352
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353
                                                      0, 0,12,13, 0, 0, 14, 15}))));
2354
}
2355
 
2356
 
2357
#ifdef SUPPORT_UNPACK_PIXEL
2358
/* Due to type conflicts, unpacking of pixel types and boolean shorts
2359
 * can not simultaneously be supported. By default, the boolean short is
2360
 * supported.
2361
 */
2362
static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363
{
2364
  vec_ushort8 p1, p2;
2365
 
2366
  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a), -7)),
2367
                   spu_and((vec_ushort8)(a), 0x1F),
2368
                   ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
2369
                                  12, 128, 128, 29,  14, 128, 128, 31}));
2370
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a), -5), 0x1F),
2371
                   spu_and(spu_rlmask((vec_ushort8)(a), -10), 0x1F),
2372
                   ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
2373
                                   128, 29, 13, 128, 128, 31, 15, 128}));
2374
  return ((vec_uint4)(spu_or(p1, p2)));
2375
}
2376
 
2377
#else
2378
 
2379
static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380
{
2381
  return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2382
 
2383
}
2384
#endif
2385
 
2386
 
2387
 
2388
/* vec_xor (vector logical xor)
2389
 * ======
2390
 */
2391
static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392
{
2393
  return (spu_xor(a, b));
2394
}
2395
 
2396
static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397
{
2398
  return (spu_xor(a, b));
2399
}
2400
 
2401
static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402
{
2403
  return (spu_xor((vec_char16)(a), b));
2404
}
2405
 
2406
static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407
{
2408
  return (spu_xor(a, (vec_char16)(b)));
2409
}
2410
 
2411
static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412
{
2413
  return (spu_xor(a, b));
2414
}
2415
 
2416
static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417
{
2418
  return (spu_xor(a, b));
2419
}
2420
 
2421
static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422
{
2423
  return (spu_xor((vec_short8)(a), b));
2424
}
2425
 
2426
static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427
{
2428
  return (spu_xor(a, (vec_short8)(b)));
2429
}
2430
 
2431
static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432
{
2433
  return (spu_xor(a, b));
2434
}
2435
 
2436
static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437
{
2438
  return (spu_xor(a, b));
2439
}
2440
 
2441
static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442
{
2443
  return (spu_xor((vec_int4)(a), b));
2444
}
2445
 
2446
static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447
{
2448
  return (spu_xor(a, (vec_int4)(b)));
2449
}
2450
 
2451
static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452
{
2453
  return (spu_xor(a, b));
2454
}
2455
 
2456
static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457
{
2458
  return (spu_xor((vec_float4)(a),b));
2459
}
2460
 
2461
static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462
{
2463
  return (spu_xor(a, (vec_float4)(b)));
2464
}
2465
 
2466
/************************************************************************
2467
 *                        PREDICATES
2468
 ************************************************************************/
2469
 
2470
/* vec_all_eq (all elements equal)
2471
 * ==========
2472
 */
2473
static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474
{
2475
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476
}
2477
 
2478
static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479
{
2480
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481
}
2482
 
2483
static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484
{
2485
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486
}
2487
 
2488
static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489
{
2490
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491
}
2492
 
2493
static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494
{
2495
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496
}
2497
 
2498
static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499
{
2500
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501
}
2502
 
2503
static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504
{
2505
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506
}
2507
 
2508
static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509
{
2510
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511
}
2512
 
2513
static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514
{
2515
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516
}
2517
 
2518
static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519
{
2520
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521
}
2522
 
2523
static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524
{
2525
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526
}
2527
 
2528
static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529
{
2530
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531
}
2532
 
2533
static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534
{
2535
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536
}
2537
 
2538
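
/* Illustrative sketch (not part of the mapping): each predicate gathers the
 * per-element compare results into one bit mask, so "all" means the mask is
 * 0xFFFF (16 bytes), 0xFF (8 halfwords) or 0xF (4 words), and "any" means it
 * is non-zero.  The helper name is hypothetical.
 */
static inline int example_vec_all_eq_words(void)
{
  vec_uint4 a = spu_splats((unsigned int)7);
  vec_uint4 b = spu_splats((unsigned int)7);

  /* All four words compare equal, so the gathered mask is 0xF and the
   * predicate returns nonzero.
   */
  return (vec_all_eq(a, b));
}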
 
2539
/* vec_all_ge (all elements greater than or equal)
2540
 * ==========
2541
 */
2542
static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543
{
2544
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545
}
2546
 
2547
static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548
{
2549
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550
}
2551
 
2552
static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553
{
2554
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555
}
2556
 
2557
static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558
{
2559
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560
}
2561
 
2562
static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563
{
2564
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565
}
2566
 
2567
static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568
{
2569
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570
}
2571
 
2572
static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573
{
2574
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575
}
2576
 
2577
static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578
{
2579
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580
}
2581
 
2582
static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583
{
2584
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585
}
2586
 
2587
static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588
{
2589
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590
}
2591
 
2592
static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593
{
2594
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595
}
2596
 
2597
static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598
{
2599
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600
}
2601
 
2602
static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603
{
2604
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605
}
2606
 
2607
 
2608
/* vec_all_gt (all elements greater than)
2609
 * ==========
2610
 */
2611
static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612
{
2613
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614
}
2615
 
2616
static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617
{
2618
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619
}
2620
 
2621
static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622
{
2623
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624
}
2625
 
2626
static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627
{
2628
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629
}
2630
 
2631
static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632
{
2633
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634
}
2635
 
2636
static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637
{
2638
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639
}
2640
 
2641
static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642
{
2643
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644
}
2645
 
2646
static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647
{
2648
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649
}
2650
 
2651
static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652
{
2653
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654
}
2655
 
2656
static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657
{
2658
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659
}
2660
 
2661
static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662
{
2663
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664
}
2665
 
2666
static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667
{
2668
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669
}
2670
 
2671
static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672
{
2673
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674
}
2675
 
2676
 
2677
/* vec_all_in (all elements in bounds)
2678
 * ==========
2679
 */
2680
static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681
{
2682
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683
}
2684
 
2685
 
2686
/* vec_all_le (all elements less than or equal)
2687
 * ==========
2688
 */
2689
static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690
{
2691
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692
}
2693
 
2694
static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695
{
2696
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697
}
2698
 
2699
static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700
{
2701
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702
}
2703
 
2704
static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705
{
2706
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707
}
2708
 
2709
static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710
{
2711
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712
}
2713
 
2714
static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715
{
2716
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717
}
2718
 
2719
static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720
{
2721
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722
}
2723
 
2724
static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725
{
2726
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727
}
2728
 
2729
static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730
{
2731
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732
}
2733
 
2734
static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735
{
2736
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737
}
2738
 
2739
static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740
{
2741
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742
}
2743
 
2744
static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745
{
2746
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747
}
2748
 
2749
static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750
{
2751
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752
}
2753
 
2754
 
2755
/* vec_all_lt (all elements less than)
2756
 * ==========
2757
 */
2758
static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759
{
2760
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761
}
2762
 
2763
static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764
{
2765
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766
}
2767
 
2768
static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769
{
2770
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771
}
2772
 
2773
static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774
{
2775
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776
}
2777
 
2778
static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779
{
2780
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781
}
2782
 
2783
static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784
{
2785
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786
}
2787
 
2788
static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789
{
2790
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791
}
2792
 
2793
static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794
{
2795
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796
}
2797
 
2798
static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799
{
2800
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801
}
2802
 
2803
static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804
{
2805
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806
}
2807
 
2808
static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809
{
2810
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811
}
2812
 
2813
static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814
{
2815
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816
}
2817
 
2818
static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819
{
2820
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821
}
2822
 
2823
 
2824
/* vec_all_nan (all elements not a number)
2825
 * ===========
2826
 */
2827
static inline int vec_all_nan(vec_float4 a)
2828
{
2829
  vec_uint4 exp, man;
2830
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831
 
2832
  exp = spu_and((vec_uint4)(a), exp_mask);
2833
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835
                                                spu_cmpeq(man, 0))), 0) == 0xF));
2836
}
2837
 
2838
#define vec_all_nan(_a)         (0)
2839
 
2840
 
2841
/* vec_all_ne (all elements not equal)
2842
 * ==========
2843
 */
2844
static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845
{
2846
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847
}
2848
 
2849
static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850
{
2851
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852
}
2853
 
2854
static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855
{
2856
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857
}
2858
 
2859
static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860
{
2861
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862
}
2863
 
2864
static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865
{
2866
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867
}
2868
 
2869
static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870
{
2871
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872
}
2873
 
2874
static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875
{
2876
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877
}
2878
 
2879
static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880
{
2881
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882
}
2883
 
2884
static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885
{
2886
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887
}
2888
 
2889
static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890
{
2891
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892
}
2893
 
2894
static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895
{
2896
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897
}
2898
 
2899
static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900
{
2901
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902
}
2903
 
2904
static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905
{
2906
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907
}
2908
 
2909
 
2910
/* vec_all_nge (all elements not greater than or equal)
2911
 * ===========
2912
 */
2913
static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914
{
2915
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916
}
2917
 
2918
 
2919
/* vec_all_ngt (all elements not greater than)
2920
 * ===========
2921
 */
2922
static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923
{
2924
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925
}
2926
 
2927
 
2928
/* vec_all_nle (all elements not less than or equal)
2929
 * ===========
2930
 */
2931
static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932
{
2933
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934
}
2935
 
2936
 
2937
/* vec_all_nlt (all elements not less than)
2938
 * ===========
2939
 */
2940
static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941
{
2942
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943
}
2944
 
2945
 
2946
/* vec_all_numeric (all elements numeric)
2947
 * ===========
2948
 */
2949
static inline int vec_all_numeric(vec_float4 a)
2950
{
2951
  vec_uint4 exp;
2952
 
2953
  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955
}
2956
 
2957
 
2958
 
2959
/* vec_any_eq (any elements equal)
2960
 * ==========
2961
 */
2962
static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963
{
2964
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965
}
2966
 
2967
static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968
{
2969
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970
}
2971
 
2972
static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973
{
2974
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975
}
2976
 
2977
static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978
{
2979
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980
}
2981
 
2982
static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983
{
2984
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985
}
2986
 
2987
static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988
{
2989
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990
}
2991
 
2992
static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993
{
2994
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995
}
2996
 
2997
static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998
{
2999
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000
}
3001
 
3002
static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003
{
3004
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005
}
3006
 
3007
static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008
{
3009
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010
}
3011
 
3012
static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013
{
3014
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015
}
3016
 
3017
static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018
{
3019
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020
}
3021
 
3022
static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023
{
3024
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025
}
3026
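
/* Note: for word-sized elements the "any" forms above use spu_orx() on the
 * compare result's sign bits instead of spu_gather(); both reduce to "is any
 * element's compare result nonzero", the orx form simply avoids the gather.
 */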
 
3027
/* vec_any_ge (any elements greater than or equal)
3028
 * ==========
3029
 */
3030
static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
3031
{
3032
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3033
}
3034
 
3035
static inline int vec_any_ge(vec_char16 a, vec_char16 b)
3036
{
3037
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
3038
}
3039
 
3040
static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
3041
{
3042
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
3043
}
3044
 
3045
static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
3046
{
3047
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
3048
}
3049
 
3050
static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
3051
{
3052
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3053
}
3054
 
3055
static inline int vec_any_ge(vec_short8 a, vec_short8 b)
3056
{
3057
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
3058
}
3059
 
3060
static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
3061
{
3062
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
3063
}
3064
 
3065
static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
3066
{
3067
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
3068
}
3069
 
3070
static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
3071
{
3072
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3073
}
3074
 
3075
static inline int vec_any_ge(vec_int4 a, vec_int4 b)
3076
{
3077
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3078
}
3079
 
3080
static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
3081
{
3082
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
3083
}
3084
 
3085
static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
3086
{
3087
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
3088
}
3089
 
3090
static inline int vec_any_ge(vec_float4 a, vec_float4 b)
3091
{
3092
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3093
}
3094
 
3095
 
3096
/* vec_any_gt (any elements greater than)
3097
 * ==========
3098
 */
3099
static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
3100
{
3101
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3102
}
3103
 
3104
static inline int vec_any_gt(vec_char16 a, vec_char16 b)
3105
{
3106
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3107
}
3108
 
3109
static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
3110
{
3111
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
3112
}
3113
 
3114
static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
3115
{
3116
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
3117
}
3118
 
3119
static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
3120
{
3121
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3122
}
3123
 
3124
static inline int vec_any_gt(vec_short8 a, vec_short8 b)
3125
{
3126
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3127
}
3128
 
3129
static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
3130
{
3131
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
3132
}
3133
 
3134
static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
3135
{
3136
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
3137
}
3138
 
3139
 
3140
static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
3141
{
3142
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3143
}
3144
 
3145
static inline int vec_any_gt(vec_int4 a, vec_int4 b)
3146
{
3147
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3148
}
3149
 
3150
static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
3151
{
3152
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
3153
}
3154
 
3155
static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
3156
{
3157
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
3158
}
3159
 
3160
static inline int vec_any_gt(vec_float4 a, vec_float4 b)
3161
{
3162
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
3163
}
3164
 
3165
/* vec_any_le (any elements less than or equal)
3166
 * ==========
3167
 */
3168
static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
3169
{
3170
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3171
}
3172
 
3173
static inline int vec_any_le(vec_char16 a, vec_char16 b)
3174
{
3175
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
3176
}
3177
 
3178
static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
3179
{
3180
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
3181
}
3182
 
3183
static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
3184
{
3185
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
3186
}
3187
 
3188
static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
3189
{
3190
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3191
}
3192
 
3193
static inline int vec_any_le(vec_short8 a, vec_short8 b)
3194
{
3195
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
3196
}
3197
 
3198
static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
3199
{
3200
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
3201
}
3202
 
3203
static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
3204
{
3205
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
3206
}
3207
 
3208
static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
3209
{
3210
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3211
}
3212
 
3213
static inline int vec_any_le(vec_int4 a, vec_int4 b)
3214
{
3215
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3216
}
3217
 
3218
static inline int vec_any_le(vec_bint4 a, vec_int4 b)
3219
{
3220
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
3221
}
3222
 
3223
static inline int vec_any_le(vec_int4 a, vec_bint4 b)
3224
{
3225
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
3226
}
3227
 
3228
static inline int vec_any_le(vec_float4 a, vec_float4 b)
3229
{
3230
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3231
}
3232
 
3233
 
3234
/* vec_any_lt (any elements less than)
3235
 * ==========
3236
 */
3237
static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
3238
{
3239
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3240
}
3241
 
3242
static inline int vec_any_lt(vec_char16 a, vec_char16 b)
3243
{
3244
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3245
}
3246
 
3247
static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
3248
{
3249
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
3250
}
3251
 
3252
static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
3253
{
3254
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
3255
}
3256
 
3257
static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
3258
{
3259
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3260
}
3261
 
3262
static inline int vec_any_lt(vec_short8 a, vec_short8 b)
3263
{
3264
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
3265
}
3266
 
3267
static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
3268
{
3269
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
3270
}
3271
 
3272
static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
3273
{
3274
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
3275
}
3276
 
3277
static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
3278
{
3279
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3280
}
3281
 
3282
static inline int vec_any_lt(vec_int4 a, vec_int4 b)
3283
{
3284
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3285
}
3286
 
3287
static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
3288
{
3289
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
3290
}
3291
 
3292
static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
3293
{
3294
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
3295
}
3296
 
3297
static inline int vec_any_lt(vec_float4 a, vec_float4 b)
3298
{
3299
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3300
}
3301
 
3302
/* vec_any_nan (any elements not a number)
3303
 * ===========
3304
 */
3305
static inline int vec_any_nan(vec_float4 a)
3306
{
3307
  vec_uint4 exp, man;
3308
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
3309
 
3310
  exp = spu_and((vec_uint4)(a), exp_mask);
3311
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
3312
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
3313
                                                spu_cmpeq(man, 0))), 0) != 0));
3314
}
3315
 
3316
 
3317
/* vec_any_ne (any elements not equal)
3318
 * ==========
3319
 */
3320
static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
3321
{
3322
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3323
}
3324
 
3325
static inline int vec_any_ne(vec_char16 a, vec_char16 b)
3326
{
3327
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
3328
}
3329
 
3330
static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
3331
{
3332
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
3333
}
3334
 
3335
static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
3336
{
3337
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
3338
}
3339
 
3340
static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
3341
{
3342
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3343
}
3344
 
3345
static inline int vec_any_ne(vec_short8 a, vec_short8 b)
3346
{
3347
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
3348
}
3349
 
3350
static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
3351
{
3352
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
3353
}
3354
 
3355
static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
3356
{
3357
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
3358
}
3359
 
3360
static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
3361
{
3362
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3363
}
3364
 
3365
static inline int vec_any_ne(vec_int4 a, vec_int4 b)
3366
{
3367
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3368
}
3369
 
3370
static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
3371
{
3372
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
3373
}
3374
 
3375
static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
3376
{
3377
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
3378
}
3379
 
3380
static inline int vec_any_ne(vec_float4 a, vec_float4 b)
3381
{
3382
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
3383
}
3384
 
3385
 
3386
/* vec_any_nge (any elements not greater than or equal)
3387
 * ===========
3388
 */
3389
static inline int vec_any_nge(vec_float4 a, vec_float4 b)
3390
{
3391
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
3392
}
3393
 
3394
/* vec_any_ngt (any elements not greater than)
3395
 * ===========
3396
 */
3397
static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
3398
{
3399
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
3400
}
3401
 
3402
 
3403
/* vec_any_nle (any elements not less than or equal)
3404
 * ===========
3405
 */
3406
static inline int vec_any_nle(vec_float4 a, vec_float4 b)
3407
{
3408
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
3409
}
3410
 
3411
 
3412
/* vec_any_nlt (any elements not less than)
3413
 * ===========
3414
 */
3415
static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
3416
{
3417
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
3418
}
3419
 
3420
 
3421
/* vec_any_numeric (any elements numeric)
3422
 * ===============
3423
 */
3424
static inline int vec_any_numeric(vec_float4 a)
3425
{
3426
  vec_uint4 exp;
3427
 
3428
  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
3429
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
3430
}
3431
 
3432
 
3433
/* vec_any_out (any elements out of bounds)
3434
 * ===========
3435
 */
3436
static inline int vec_any_out(vec_float4 a, vec_float4 b)
3437
{
3438
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
3439
}
3440
 
3441
 
3442
/* CBE Language Extension Intrinsics
3443
 */
3444
 
3445
/* vec_extract (extract element from vector)
3446
 * ===========
3447
 */
3448
#define vec_extract(_a, _element)       spu_extract(_a, _element)
3449
 
3450
 
3451
/* vec_insert (insert scalar into specified vector element)
3452
 * ==========
3453
 */
3454
#define vec_insert(_a, _b, _element)    spu_insert(_a, _b, _element)
3455
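
/* Illustrative sketch (not part of the mapping): vec_extract() and
 * vec_insert() pass straight through to the SPU element intrinsics.  The
 * helper name is hypothetical.
 */
static inline vec_float4 example_vec_insert_extract(vec_float4 v)
{
  float first = vec_extract(v, 0);

  /* Copy element 0 into element 3, leaving the rest of v unchanged. */
  return (vec_insert(first, v, 3));
}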
 
3456
/* vec_lvlx (load vector left indexed)
3457
 * ========
3458
 */
3459
static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
3460
{
3461
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3462
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3463
}
3464
 
3465
static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
3466
{
3467
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
3468
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3469
}
3470
 
3471
static inline vec_char16 vec_lvlx(int a, signed char *b)
3472
{
3473
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3474
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3475
}
3476
 
3477
static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
3478
{
3479
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
3480
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3481
}
3482
 
3483
static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
3484
{
3485
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3486
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3487
}
3488
 
3489
static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
3490
{
3491
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
3492
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3493
}
3494
 
3495
static inline vec_short8 vec_lvlx(int a, signed short *b)
3496
{
3497
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3498
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3499
}
3500
 
3501
static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
3502
{
3503
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
3504
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3505
}
3506
 
3507
static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
3508
{
3509
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3510
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3511
}
3512
 
3513
static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
3514
{
3515
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
3516
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3517
}
3518
 
3519
static inline vec_int4 vec_lvlx(int a, signed int *b)
3520
{
3521
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3522
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3523
}
3524
 
3525
static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
3526
{
3527
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
3528
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3529
}
3530
 
3531
static inline vec_float4 vec_lvlx(int a, float *b)
3532
{
3533
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3534
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3535
}
3536
 
3537
static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
3538
{
3539
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
3540
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
3541
}

/* vec_lvlxl (load vector left indexed last)
 * =========
 */
#define vec_lvlxl(_a, _b)       vec_lvlx(_a, _b)


/* vec_lvrx (load vector right indexed)
 * ========
 */
static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}
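
/* Usage note (added commentary, not part of the original mapping): vec_lvrx
 * returns the k bytes that precede the address within its quadword, right
 * justified and zero filled.  Combined with vec_lvlx this gives the usual
 * VMX idiom for a full 16-byte load from an unaligned address; a minimal
 * sketch with an illustrative buffer:
 *
 *   unsigned char buf[48] __attribute__ ((aligned (16)));
 *   vec_uchar16 v = spu_or(vec_lvlx(5, buf), vec_lvrx(5 + 16, buf));
 *   // v holds buf[5..20] whatever the 16-byte alignment of buf + 5.
 */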
3636
 
3637
 
3638
 
3639
/* vec_lvrxl (load vector right indexed last)
3640
 * =========
3641
 */
3642
#define vec_lvrxl(_a, _b)       vec_lvrx(_a, _b)
3643
 
3644
 
3645
/* vec_promote (promote scalar to a vector)
3646
 * ===========
3647
 */
3648
#define vec_promote(_a, _element)       spu_promote(_a, _element)
3649
 
3650
 
3651
/* vec_splats (splat scalar to a vector)
3652
 * ==========
3653
 */
3654
#define vec_splats(_a)  spu_splats(_a)
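
/* Usage note (added commentary): vec_splats replicates a scalar into every
 * element of the matching vector type, whereas vec_promote places the scalar
 * into the requested element and leaves the remaining elements undefined.
 *
 *   vec_float4 four = vec_splats(4.0f);    // { 4.0f, 4.0f, 4.0f, 4.0f }
 *   vec_int4   one  = vec_promote(1, 0);   // element 0 is 1, the rest undefined
 */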

/* vec_stvlx (store vector left indexed)
 * =========
 */
static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}
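
/* Usage note (added commentary, not part of the original mapping): each
 * vec_stvlx overload is a read-modify-write of one quadword: the leftmost
 * 16-k bytes of the source vector are stored from the target address to the
 * end of its quadword (k being the address offset within the quadword),
 * while the bytes in front of the address are preserved by the spu_sel merge.
 *
 *   unsigned char buf[32] __attribute__ ((aligned (16)));
 *   vec_stvlx(spu_splats((unsigned char)0xAA), 5, buf);
 *   // writes buf[5..15]; buf[0..4] keep their previous contents.
 */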

/* vec_stvlxl (store vector left indexed last)
 * ==========
 */
#define vec_stvlxl(_a, _b, _c)  vec_stvlx(_a, _b, _c)


/* vec_stvrx (store vector right indexed)
 * =========
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}
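
/* Usage note (added commentary, not part of the original mapping): vec_stvrx
 * stores the rightmost k bytes of the source vector into the k bytes that
 * precede the target address within its quadword.  Paired with vec_stvlx it
 * reproduces the VMX idiom for a 16-byte store to an unaligned address; both
 * halves are read-modify-write, so neighbouring bytes are preserved.  The
 * buffer name below is purely illustrative.
 *
 *   unsigned char buf[48] __attribute__ ((aligned (16)));
 *   vec_uchar16 v = spu_splats((unsigned char)0x5A);
 *   vec_stvlx(v, 5, buf);        // bytes 0..10 of v  -> buf[5..15]
 *   vec_stvrx(v, 5 + 16, buf);   // bytes 11..15 of v -> buf[16..20]
 */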

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)  vec_stvrx(_a, _b, _c)


#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */
