/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#ifndef __SSE__
# error "SSE instruction set not enabled"
#else

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
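
/* Illustrative note (not part of the original header): _MM_SHUFFLE packs the
   four 2-bit element indices into one selector byte.  For example,
   _MM_SHUFFLE (3, 2, 1, 0) == 0xE4 keeps every element in its position,
   while _MM_SHUFFLE (0, 1, 2, 3) == 0x1B reverses the element order, which
   is how _mm_loadr_ps below uses it.  */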

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}
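
/* Illustrative example (not part of the original header; the name
   __xmm_example_clamp_low is hypothetical): the _ss operations above act
   only on element 0, so they can be chained to finish a scalar computation
   while elements 1-3 of the first operand pass through untouched.  */
static __inline __m128
__xmm_example_clamp_low (__m128 __v, __m128 __lo, __m128 __hi)
{
  /* Clamp element 0 of __v to [element 0 of __lo, element 0 of __hi];
     elements 1-3 of the result come from __v.  */
  return _mm_min_ss (_mm_max_ss (__v, __lo), __hi);
}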

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpltss ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpless ((__v4sf) __B,
                                                                (__v4sf)
                                                                __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnltss ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
                                        (__v4sf)
                                        __builtin_ia32_cmpnless ((__v4sf) __B,
                                                                 (__v4sf)
                                                                 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
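
/* Illustrative example (not part of the original header; the name
   __xmm_example_select is hypothetical): the all-ones/all-zeros masks
   produced by the packed comparisons combine with the bit-wise operations
   above to select between two vectors without branching.  */
static __inline __m128
__xmm_example_select (__m128 __x, __m128 __y, __m128 __a, __m128 __b)
{
  /* For each element: (x < y) ? a : b.  */
  __m128 __mask = _mm_cmplt_ps (__x, __y);
  return _mm_or_ps (_mm_and_ps (__mask, __a), _mm_andnot_ps (__mask, __b));
}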

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __hisi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __losi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)                                      \
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),                 \
                                   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
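
/* Illustrative example (not part of the original header; the name
   __xmm_example_splat0 is hypothetical): _MM_SHUFFLE supplies the selector
   for _mm_shuffle_ps.  Selecting element 0 four times broadcasts the low
   element of a vector.  */
static __inline __m128
__xmm_example_splat0 (__m128 __v)
{
  return _mm_shuffle_ps (__v, __v, _MM_SHUFFLE (0, 0, 0, 0));
}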

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
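
/* Illustrative example (not part of the original header; the name
   __xmm_example_any_lt is hypothetical): combining a packed comparison with
   _mm_movemask_ps tests whether a relation holds in any element.  */
static __inline int
__xmm_example_any_lt (__m128 __x, __m128 __y)
{
  /* Each bit of the mask is the sign bit of one comparison result.  */
  return _mm_movemask_ps (_mm_cmplt_ps (__x, __y)) != 0;
}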

/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
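
/* Illustrative example (not part of the original header; the name
   __xmm_example_setup_mxcsr is hypothetical): the helpers above rewrite
   fields of the MXCSR defined near the top of this file.  The rounding mode
   affects conversions such as _mm_cvtss_si32, and masked exceptions are
   recorded in the sticky exception state instead of trapping.  */
static __inline void
__xmm_example_setup_mxcsr (void)
{
  _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);  /* truncate on convert */
  _MM_SET_EXCEPTION_MASK (_MM_MASK_MASK);         /* mask all exceptions */
  _MM_SET_EXCEPTION_STATE (0);                    /* clear sticky flags */
}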

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}
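
/* Illustrative example (not part of the original header; the name
   __xmm_example_add_low is hypothetical): _mm_setr_ps takes its arguments in
   memory order (the first argument becomes element 0), whereas _mm_set_ps
   takes them in the reverse order.  Only element 0 participates in
   _mm_add_ss.  */
static __inline float
__xmm_example_add_low (void)
{
  __m128 __a = _mm_setr_ps (1.0f, 2.0f, 3.0f, 4.0f);     /* element 0 is 1.0f */
  __m128 __b = _mm_setr_ps (10.0f, 20.0f, 30.0f, 40.0f);
  /* Elements 1-3 of the sum come from __a; element 0 is 1.0f + 10.0f.  */
  return _mm_cvtss_f32 (_mm_add_ss (__a, __b));           /* 11.0f */
}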

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}
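
/* Illustrative example (not part of the original header; the name
   __xmm_example_add4 is hypothetical): the unaligned load/store forms work
   for any address; when the pointers are known to be 16-byte aligned,
   _mm_load_ps/_mm_store_ps are the cheaper choice.  */
static __inline void
__xmm_example_add4 (float *__dst, const float *__a, const float *__b)
{
  /* Add four consecutive floats from __a and __b and store them to __dst.  */
  _mm_storeu_ps (__dst, _mm_add_ps (_mm_loadu_ps (__a), _mm_loadu_ps (__b)));
}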

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)  \
  ((int) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)                                \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),     \
                                        (int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif
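
/* Illustrative example (not part of the original header; the name
   __xmm_example_prefetch_ahead is hypothetical): prefetching a cache line a
   fixed distance ahead of the current position can hide memory latency in a
   streaming loop.  */
static __inline void
__xmm_example_prefetch_ahead (const float *__p)
{
  /* Request the data 64 floats (256 bytes) ahead into all cache levels.  */
  _mm_prefetch (__p + 64, _MM_HINT_T0);
}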

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
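
/* Illustrative example (not part of the original header; the name
   __xmm_example_stream_fill is hypothetical): non-temporal stores bypass the
   cache, and the fence orders them before later stores become visible.
   __dst is assumed to be 16-byte aligned and __n a multiple of 4.  */
static __inline void
__xmm_example_stream_fill (float *__dst, float __value, int __n)
{
  int __i;
  __m128 __v = _mm_set1_ps (__value);
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_ps (__dst + __i, __v);
  _mm_sfence ();
}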

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __asm__ __volatile__ ("rep; nop" : : );
}

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
do {                                                                    \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);                   \
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);                   \
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);                   \
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);                   \
  (row0) = __builtin_ia32_movlhps (__t0, __t1);                         \
  (row1) = __builtin_ia32_movhlps (__t1, __t0);                         \
  (row2) = __builtin_ia32_movlhps (__t2, __t3);                         \
  (row3) = __builtin_ia32_movhlps (__t3, __t2);                         \
} while (0)
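
/* Illustrative example (not part of the original header; the name
   __xmm_example_transpose4x4 is hypothetical): transpose a 4x4 matrix stored
   row-major in 16 consecutive floats.  __m is assumed to be 16-byte
   aligned.  */
static __inline void
__xmm_example_transpose4x4 (float *__m)
{
  __m128 __row0 = _mm_load_ps (__m + 0);
  __m128 __row1 = _mm_load_ps (__m + 4);
  __m128 __row2 = _mm_load_ps (__m + 8);
  __m128 __row3 = _mm_load_ps (__m + 12);
  _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
  _mm_store_ps (__m + 0, __row0);
  _mm_store_ps (__m + 4, __row1);
  _mm_store_ps (__m + 8, __row2);
  _mm_store_ps (__m + 12, __row3);
}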

/* For backward source compatibility.  */
#ifdef __SSE2__
# include <emmintrin.h>
#endif

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */
