OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [gcc-4.5.1/] [gcc/] [config/] [i386/] [avxintrin.h] - Blame information for rev 300

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 282 jeremybenn
/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
2
 
3
   This file is part of GCC.
4
 
5
   GCC is free software; you can redistribute it and/or modify
6
   it under the terms of the GNU General Public License as published by
7
   the Free Software Foundation; either version 3, or (at your option)
8
   any later version.
9
 
10
   GCC is distributed in the hope that it will be useful,
11
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
   GNU General Public License for more details.
14
 
15
   Under Section 7 of GPL version 3, you are granted additional
16
   permissions described in the GCC Runtime Library Exception, version
17
   3.1, as published by the Free Software Foundation.
18
 
19
   You should have received a copy of the GNU General Public License and
20
   a copy of the GCC Runtime Library Exception along with this program;
21
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22
   <http://www.gnu.org/licenses/>.  */
23
 
24
/* Implemented from the specification included in the Intel C++ Compiler
25
   User Guide and Reference, version 11.0.  */
26
 
27
#ifndef _IMMINTRIN_H_INCLUDED
28
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
29
#endif
30
 
31
/* Internal data types for implementing the intrinsics.  */
32
/* 32-byte vectors used only for the casts in front of the builtins,
   one per element type: __v4df (double), __v8sf (float),
   __v4di (long long), __v8si (int), __v16hi (short), __v32qi (char).
   Unlike the __m256* types these do not carry __may_alias__.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
33
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
34
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
35
typedef int __v8si __attribute__ ((__vector_size__ (32)));
36
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
37
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
38
 
39
/* The Intel API is flexible enough that we must allow aliasing with other
40
   vector types, and their scalar components.  */
41
/* User-visible 32-byte vector types: __m256 (float elements),
   __m256i (long long elements) and __m256d (double elements).  Each
   is declared __may_alias__ so objects of these types may be accessed
   through pointers to other types.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
42
                                     __may_alias__));
43
typedef long long __m256i __attribute__ ((__vector_size__ (32),
44
                                          __may_alias__));
45
typedef double __m256d __attribute__ ((__vector_size__ (32),
46
                                       __may_alias__));
47
 
48
/* Compare predicates for scalar and packed compare intrinsics.  */
49
 
50
/* Equal (ordered, non-signaling)  */
51
#define _CMP_EQ_OQ      0x00
52
/* Less-than (ordered, signaling)  */
53
#define _CMP_LT_OS      0x01
54
/* Less-than-or-equal (ordered, signaling)  */
55
#define _CMP_LE_OS      0x02
56
/* Unordered (non-signaling)  */
57
#define _CMP_UNORD_Q    0x03
58
/* Not-equal (unordered, non-signaling)  */
59
#define _CMP_NEQ_UQ     0x04
60
/* Not-less-than (unordered, signaling)  */
61
#define _CMP_NLT_US     0x05
62
/* Not-less-than-or-equal (unordered, signaling)  */
63
#define _CMP_NLE_US     0x06
64
/* Ordered (non-signaling)  */
65
#define _CMP_ORD_Q      0x07
66
/* Equal (unordered, non-signaling)  */
67
#define _CMP_EQ_UQ      0x08
68
/* Not-greater-than-or-equal (unordered, signaling)  */
69
#define _CMP_NGE_US     0x09
70
/* Not-greater-than (unordered, signaling)  */
71
#define _CMP_NGT_US     0x0a
72
/* False (ordered, non-signaling)  */
73
#define _CMP_FALSE_OQ   0x0b
74
/* Not-equal (ordered, non-signaling)  */
75
#define _CMP_NEQ_OQ     0x0c
76
/* Greater-than-or-equal (ordered, signaling)  */
77
#define _CMP_GE_OS      0x0d
78
/* Greater-than (ordered, signaling)  */
79
#define _CMP_GT_OS      0x0e
80
/* True (unordered, non-signaling)  */
81
#define _CMP_TRUE_UQ    0x0f
82
/* Equal (ordered, signaling)  */
83
#define _CMP_EQ_OS      0x10
84
/* Less-than (ordered, non-signaling)  */
85
#define _CMP_LT_OQ      0x11
86
/* Less-than-or-equal (ordered, non-signaling)  */
87
#define _CMP_LE_OQ      0x12
88
/* Unordered (signaling)  */
89
#define _CMP_UNORD_S    0x13
90
/* Not-equal (unordered, signaling)  */
91
#define _CMP_NEQ_US     0x14
92
/* Not-less-than (unordered, non-signaling)  */
93
#define _CMP_NLT_UQ     0x15
94
/* Not-less-than-or-equal (unordered, non-signaling)  */
95
#define _CMP_NLE_UQ     0x16
96
/* Ordered (signaling)  */
97
#define _CMP_ORD_S      0x17
98
/* Equal (unordered, signaling)  */
99
#define _CMP_EQ_US      0x18
100
/* Not-greater-than-or-equal (unordered, non-signaling)  */
101
#define _CMP_NGE_UQ     0x19
102
/* Not-greater-than (unordered, non-signaling)  */
103
#define _CMP_NGT_UQ     0x1a
104
/* False (ordered, signaling)  */
105
#define _CMP_FALSE_OS   0x1b
106
/* Not-equal (ordered, signaling)  */
107
#define _CMP_NEQ_OS     0x1c
108
/* Greater-than-or-equal (ordered, non-signaling)  */
109
#define _CMP_GE_OQ      0x1d
110
/* Greater-than (ordered, non-signaling)  */
111
#define _CMP_GT_OQ      0x1e
112
/* True (unordered, signaling)  */
113
#define _CMP_TRUE_US    0x1f
114
 
115
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116
_mm256_add_pd (__m256d __A, __m256d __B)
117
{
118
  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
119
}
120
 
121
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
122
_mm256_add_ps (__m256 __A, __m256 __B)
123
{
124
  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
125
}
126
 
127
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128
_mm256_addsub_pd (__m256d __A, __m256d __B)
129
{
130
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
131
}
132
 
133
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134
_mm256_addsub_ps (__m256 __A, __m256 __B)
135
{
136
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
137
}
138
 
139
 
140
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141
_mm256_and_pd (__m256d __A, __m256d __B)
142
{
143
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
144
}
145
 
146
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147
_mm256_and_ps (__m256 __A, __m256 __B)
148
{
149
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
150
}
151
 
152
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153
_mm256_andnot_pd (__m256d __A, __m256d __B)
154
{
155
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
156
}
157
 
158
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
159
_mm256_andnot_ps (__m256 __A, __m256 __B)
160
{
161
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
162
}
163
 
164
/* Double/single precision floating point blend instructions - select
165
   data from 2 sources using constant/variable mask.  */
166
 
167
/* Blend with an integer mask argument.  With __OPTIMIZE__ defined the
   intrinsics are inline functions; otherwise they are macros that
   expand to the same builtin call.  NOTE(review): presumably the
   builtin needs the mask as a compile-time constant, which only
   survives an inline function when optimizing -- confirm against the
   GCC builtin documentation.  */
#ifdef __OPTIMIZE__
/* Passes __X and __Y as __v4df, and __M unchanged, to
   __builtin_ia32_blendpd256.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  const __v4df __first = (__v4df) __X;
  const __v4df __second = (__v4df) __Y;

  return (__m256d) __builtin_ia32_blendpd256 (__first, __second, __M);
}

/* Passes __X and __Y as __v8sf, and __M unchanged, to
   __builtin_ia32_blendps256.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  const __v8sf __first = (__v8sf) __X;
  const __v8sf __second = (__v8sf) __Y;

  return (__m256) __builtin_ia32_blendps256 (__first, __second, __M);
}
#else
/* Macro form: X and Y are converted to __m256d and then __v4df, M is
   cast to int.  */
#define _mm256_blend_pd(X, Y, M)                                        \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),           \
                                        (__v4df)(__m256d)(Y), (int)(M)))

/* Macro form: X and Y are converted to __m256 and then __v8sf, M is
   cast to int.  */
#define _mm256_blend_ps(X, Y, M)                                        \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),             \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif
192
 
193
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
195
{
196
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
197
                                               (__v4df)__Y,
198
                                               (__v4df)__M);
199
}
200
 
201
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
203
{
204
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
205
                                              (__v8sf)__Y,
206
                                              (__v8sf)__M);
207
}
208
 
209
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
210
_mm256_div_pd (__m256d __A, __m256d __B)
211
{
212
  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
213
}
214
 
215
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216
_mm256_div_ps (__m256 __A, __m256 __B)
217
{
218
  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
219
}
220
 
221
/* Dot product instructions with mask-defined summing and zeroing parts
222
   of result.  */
223
 
224
/* Dot product with an integer mask argument.  Inline function when
   __OPTIMIZE__ is defined, macro otherwise; both expand to
   __builtin_ia32_dpps256.  */
#ifdef __OPTIMIZE__
/* Passes __X and __Y as __v8sf, and __M unchanged, to
   __builtin_ia32_dpps256.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  const __v8sf __first = (__v8sf) __X;
  const __v8sf __second = (__v8sf) __Y;

  return (__m256) __builtin_ia32_dpps256 (__first, __second, __M);
}
#else
/* Macro form: X and Y are converted to __m256 and then __v8sf, M is
   cast to int.  */
#define _mm256_dp_ps(X, Y, M)                                           \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),                \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
237
 
238
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
239
_mm256_hadd_pd (__m256d __X, __m256d __Y)
240
{
241
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
242
}
243
 
244
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245
_mm256_hadd_ps (__m256 __X, __m256 __Y)
246
{
247
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
248
}
249
 
250
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
251
_mm256_hsub_pd (__m256d __X, __m256d __Y)
252
{
253
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
254
}
255
 
256
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257
_mm256_hsub_ps (__m256 __X, __m256 __Y)
258
{
259
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
260
}
261
 
262
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263
_mm256_max_pd (__m256d __A, __m256d __B)
264
{
265
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
266
}
267
 
268
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
269
_mm256_max_ps (__m256 __A, __m256 __B)
270
{
271
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
272
}
273
 
274
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
275
_mm256_min_pd (__m256d __A, __m256d __B)
276
{
277
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
278
}
279
 
280
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281
_mm256_min_ps (__m256 __A, __m256 __B)
282
{
283
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
284
}
285
 
286
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287
_mm256_mul_pd (__m256d __A, __m256d __B)
288
{
289
  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
290
}
291
 
292
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
293
_mm256_mul_ps (__m256 __A, __m256 __B)
294
{
295
  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
296
}
297
 
298
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
299
_mm256_or_pd (__m256d __A, __m256d __B)
300
{
301
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
302
}
303
 
304
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
305
_mm256_or_ps (__m256 __A, __m256 __B)
306
{
307
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
308
}
309
 
310
/* Shuffles with an integer selector argument.  Inline functions when
   __OPTIMIZE__ is defined, macros otherwise; both forms expand to the
   same builtin calls.  */
#ifdef __OPTIMIZE__
/* Passes __A and __B as __v4df, and __mask unchanged, to
   __builtin_ia32_shufpd256.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  const __v4df __lhs = (__v4df) __A;
  const __v4df __rhs = (__v4df) __B;

  return (__m256d) __builtin_ia32_shufpd256 (__lhs, __rhs, __mask);
}

/* Passes __A and __B as __v8sf, and __mask unchanged, to
   __builtin_ia32_shufps256.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  const __v8sf __lhs = (__v8sf) __A;
  const __v8sf __rhs = (__v8sf) __B;

  return (__m256) __builtin_ia32_shufps256 (__lhs, __rhs, __mask);
}
#else
/* Macro form: A and B are converted to __m256d and then __v4df, N is
   cast to int.  */
#define _mm256_shuffle_pd(A, B, N)                                      \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),             \
                                      (__v4df)(__m256d)(B), (int)(N)))

/* Macro form: A and B are converted to __m256 and then __v8sf, N is
   cast to int.  */
#define _mm256_shuffle_ps(A, B, N)                                      \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),              \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif
333
 
334
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335
_mm256_sub_pd (__m256d __A, __m256d __B)
336
{
337
  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
338
}
339
 
340
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341
_mm256_sub_ps (__m256 __A, __m256 __B)
342
{
343
  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
344
}
345
 
346
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347
_mm256_xor_pd (__m256d __A, __m256d __B)
348
{
349
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
350
}
351
 
352
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
353
_mm256_xor_ps (__m256 __A, __m256 __B)
354
{
355
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
356
}
357
 
358
/* Compare intrinsics taking an integer predicate argument __P (the
   _CMP_* constants defined in this header are the intended values).
   Inline functions when __OPTIMIZE__ is defined, macros otherwise;
   both forms expand to the same builtin calls.  */
#ifdef __OPTIMIZE__
/* 128-bit double form: __builtin_ia32_cmppd on __v2df operands.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  const __v2df __first = (__v2df) __X;
  const __v2df __second = (__v2df) __Y;

  return (__m128d) __builtin_ia32_cmppd (__first, __second, __P);
}

/* 128-bit float form: __builtin_ia32_cmpps on __v4sf operands.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  const __v4sf __first = (__v4sf) __X;
  const __v4sf __second = (__v4sf) __Y;

  return (__m128) __builtin_ia32_cmpps (__first, __second, __P);
}

/* 256-bit double form: __builtin_ia32_cmppd256 on __v4df operands.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  const __v4df __first = (__v4df) __X;
  const __v4df __second = (__v4df) __Y;

  return (__m256d) __builtin_ia32_cmppd256 (__first, __second, __P);
}

/* 256-bit float form: __builtin_ia32_cmpps256 on __v8sf operands.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  const __v8sf __first = (__v8sf) __X;
  const __v8sf __second = (__v8sf) __Y;

  return (__m256) __builtin_ia32_cmpps256 (__first, __second, __P);
}

/* Scalar double form: __builtin_ia32_cmpsd on __v2df operands.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  const __v2df __first = (__v2df) __X;
  const __v2df __second = (__v2df) __Y;

  return (__m128d) __builtin_ia32_cmpsd (__first, __second, __P);
}

/* Scalar float form: __builtin_ia32_cmpss on __v4sf operands.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  const __v4sf __first = (__v4sf) __X;
  const __v4sf __second = (__v4sf) __Y;

  return (__m128) __builtin_ia32_cmpss (__first, __second, __P);
}
#else
/* Macro forms of the functions above; P is cast to int and passed as
   the builtin's last argument.  */
#define _mm_cmp_pd(X, Y, P)                                             \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),                \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P)                                             \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),                  \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P)                                          \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),             \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P)                                          \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),               \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P)                                             \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),                \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P)                                             \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),                  \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif
421
 
422
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423
_mm256_cvtepi32_pd (__m128i __A)
424
{
425
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
426
}
427
 
428
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
429
_mm256_cvtepi32_ps (__m256i __A)
430
{
431
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
432
}
433
 
434
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435
_mm256_cvtpd_ps (__m256d __A)
436
{
437
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
438
}
439
 
440
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441
_mm256_cvtps_epi32 (__m256 __A)
442
{
443
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
444
}
445
 
446
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447
_mm256_cvtps_pd (__m128 __A)
448
{
449
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
450
}
451
 
452
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453
_mm256_cvttpd_epi32 (__m256d __A)
454
{
455
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
456
}
457
 
458
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
459
_mm256_cvtpd_epi32 (__m256d __A)
460
{
461
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
462
}
463
 
464
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465
_mm256_cvttps_epi32 (__m256 __A)
466
{
467
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
468
}
469
 
470
/* Extraction intrinsics taking an integer index argument.  Inline
   functions when __OPTIMIZE__ is defined, macros otherwise.  The
   element extractors first pull out one 128-bit half with
   _mm256_extractf128_si256 and then use the 128-bit _mm_extract_*
   intrinsic on it.  */
#ifdef __OPTIMIZE__
/* Passes __X as __v4df, and __N unchanged, to
   __builtin_ia32_vextractf128_pd256.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  const __v4df __src = (__v4df) __X;

  return (__m128d) __builtin_ia32_vextractf128_pd256 (__src, __N);
}

/* Passes __X as __v8sf, and __N unchanged, to
   __builtin_ia32_vextractf128_ps256.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  const __v8sf __src = (__v8sf) __X;

  return (__m128) __builtin_ia32_vextractf128_ps256 (__src, __N);
}

/* Passes __X as __v8si, and __N unchanged, to
   __builtin_ia32_vextractf128_si256.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  const __v8si __src = (__v8si) __X;

  return (__m128i) __builtin_ia32_vextractf128_si256 (__src, __N);
}

/* Half __N >> 2, then 32-bit element __N % 4 of that half.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  return _mm_extract_epi32 (_mm256_extractf128_si256 (__X, __N >> 2),
                            __N % 4);
}

/* Half __N >> 3, then 16-bit element __N % 8 of that half.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  return _mm_extract_epi16 (_mm256_extractf128_si256 (__X, __N >> 3),
                            __N % 8);
}

/* Half __N >> 4, then 8-bit element __N % 16 of that half.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  return _mm_extract_epi8 (_mm256_extractf128_si256 (__X, __N >> 4),
                           __N % 16);
}

#ifdef __x86_64__
/* Half __N >> 1, then 64-bit element __N % 2 of that half.  Only
   provided when __x86_64__ is defined.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  return _mm_extract_epi64 (_mm256_extractf128_si256 (__X, __N >> 1),
                            __N % 2);
}
#endif
#else
/* Macro forms of the 128-bit half extractors; N is cast to int.  */
#define _mm256_extractf128_pd(X, N)                                     \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),   \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N)                                     \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),     \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N)                                  \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),   \
                                                (int)(N)))

/* Macro forms of the element extractors.  Each is a GNU statement
   expression whose value is the final _mm_extract_* call; N appears
   twice in the expansion.  */
#define _mm256_extract_epi32(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);           \
      _mm_extract_epi32 (__Y, (N) % 4);                                 \
    }))

#define _mm256_extract_epi16(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);           \
      _mm_extract_epi16 (__Y, (N) % 8);                                 \
    }))

#define _mm256_extract_epi8(X, N)                                       \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);           \
      _mm_extract_epi8 (__Y, (N) % 16);                                 \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N)                                      \
  (__extension__                                                        \
   ({                                                                   \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);           \
      _mm_extract_epi64 (__Y, (N) % 2);                                 \
    }))
#endif
#endif
561
 
562
/* Issues __builtin_ia32_vzeroall; takes no arguments and returns
   nothing.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
  return;
}
567
 
568
/* Issues __builtin_ia32_vzeroupper; takes no arguments and returns
   nothing.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
  return;
}
573
 
574
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
575
_mm_permutevar_pd (__m128d __A, __m128i __C)
576
{
577
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
578
                                                (__v2di)__C);
579
}
580
 
581
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
582
_mm256_permutevar_pd (__m256d __A, __m256i __C)
583
{
584
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
585
                                                   (__v4di)__C);
586
}
587
 
588
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589
_mm_permutevar_ps (__m128 __A, __m128i __C)
590
{
591
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
592
                                               (__v4si)__C);
593
}
594
 
595
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
596
_mm256_permutevar_ps (__m256 __A, __m256i __C)
597
{
598
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
599
                                                  (__v8si)__C);
600
}
601
 
602
/* Permutes with an integer control argument.  Inline functions when
   __OPTIMIZE__ is defined, macros otherwise; both forms expand to the
   same builtin calls.  */
#ifdef __OPTIMIZE__
/* Passes __X as __v2df, and __C unchanged, to
   __builtin_ia32_vpermilpd.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  const __v2df __src = (__v2df) __X;

  return (__m128d) __builtin_ia32_vpermilpd (__src, __C);
}

/* Passes __X as __v4df, and __C unchanged, to
   __builtin_ia32_vpermilpd256.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  const __v4df __src = (__v4df) __X;

  return (__m256d) __builtin_ia32_vpermilpd256 (__src, __C);
}

/* Passes __X as __v4sf, and __C unchanged, to
   __builtin_ia32_vpermilps.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  const __v4sf __src = (__v4sf) __X;

  return (__m128) __builtin_ia32_vpermilps (__src, __C);
}

/* Passes __X as __v8sf, and __C unchanged, to
   __builtin_ia32_vpermilps256.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  const __v8sf __src = (__v8sf) __X;

  return (__m256) __builtin_ia32_vpermilps256 (__src, __C);
}
#else
/* Macro forms of the functions above; C is cast to int.  */
#define _mm_permute_pd(X, C)                                            \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C)                                         \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C)                                            \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C)                                         \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
639
 
640
/* Two-source 128-bit permutes with an integer control argument.
   Inline functions when __OPTIMIZE__ is defined, macros otherwise;
   both forms expand to the same builtin calls.  */
#ifdef __OPTIMIZE__
/* Passes __X and __Y as __v4df, and __C unchanged, to
   __builtin_ia32_vperm2f128_pd256.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  const __v4df __first = (__v4df) __X;
  const __v4df __second = (__v4df) __Y;

  return (__m256d) __builtin_ia32_vperm2f128_pd256 (__first, __second, __C);
}

/* Passes __X and __Y as __v8sf, and __C unchanged, to
   __builtin_ia32_vperm2f128_ps256.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  const __v8sf __first = (__v8sf) __X;
  const __v8sf __second = (__v8sf) __Y;

  return (__m256) __builtin_ia32_vperm2f128_ps256 (__first, __second, __C);
}

/* Passes __X and __Y as __v8si, and __C unchanged, to
   __builtin_ia32_vperm2f128_si256.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  const __v8si __first = (__v8si) __X;
  const __v8si __second = (__v8si) __Y;

  return (__m256i) __builtin_ia32_vperm2f128_si256 (__first, __second, __C);
}
#else
/* Macro forms of the functions above; C is cast to int.  */
#define _mm256_permute2f128_pd(X, Y, C)                                 \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),     \
                                              (__v4df)(__m256d)(Y),     \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C)                                 \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),       \
                                             (__v8sf)(__m256)(Y),       \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C)                              \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),     \
                                              (__v8si)(__m256i)(Y),     \
                                              (int)(C)))
#endif
680
 
681
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682
_mm_broadcast_ss (float const *__X)
683
{
684
  return (__m128) __builtin_ia32_vbroadcastss (__X);
685
}
686
 
687
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688
_mm256_broadcast_sd (double const *__X)
689
{
690
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
691
}
692
 
693
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694
_mm256_broadcast_ss (float const *__X)
695
{
696
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
697
}
698
 
699
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
700
_mm256_broadcast_pd (__m128d const *__X)
701
{
702
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
703
}
704
 
705
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
706
_mm256_broadcast_ps (__m128 const *__X)
707
{
708
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
709
}
710
 
711
#ifdef __OPTIMIZE__
712
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
714
{
715
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
716
                                                     (__v2df)__Y,
717
                                                     __O);
718
}
719
 
720
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
721
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
722
{
723
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
724
                                                    (__v4sf)__Y,
725
                                                    __O);
726
}
727
 
728
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
729
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
730
{
731
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
732
                                                     (__v4si)__Y,
733
                                                     __O);
734
}
735
 
736
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
738
{
739
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
740
  __Y = _mm_insert_epi16 (__Y, __D, __N % 4);
741
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
742
}
743
 
744
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
745
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
746
{
747
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
748
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
749
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
750
}
751
 
752
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
754
{
755
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
756
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
757
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
758
}
759
 
760
#ifdef __x86_64__
761
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
762
_mm256_insert_epi64 (__m256i __X, int __D, int const __N)
763
{
764
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
765
  __Y = _mm_insert_epi16 (__Y, __D, __N % 2);
766
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
767
}
768
#endif
769
#else
770
/* Macro form of _mm256_insertf128_pd: expands directly to
   __builtin_ia32_vinsertf128_pd256 with each argument cast to the
   type the builtin is given in the inline form.  */
#define _mm256_insertf128_pd(X, Y, O)					\
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),	\
					       (__v2df)(__m128d)(Y),	\
					       (int)(O)))
774
 
775
/* Macro form of _mm256_insertf128_ps: expands directly to
   __builtin_ia32_vinsertf128_ps256 with each argument cast to the
   type the builtin is given in the inline form.  */
#define _mm256_insertf128_ps(X, Y, O)					\
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),	\
					      (__v4sf)(__m128)(Y),	\
					      (int)(O)))
779
 
780
/* Macro form of _mm256_insertf128_si256: expands directly to
   __builtin_ia32_vinsertf128_si256 with each argument cast to the
   type the builtin is given in the inline form.  */
#define _mm256_insertf128_si256(X, Y, O)				\
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),	\
					       (__v4si)(__m128i)(Y),	\
					       (int)(O)))
784
 
785
/* Macro form of _mm256_insert_epi32: yields a copy of X with 32-bit
   element N replaced by D.  X is captured once in __Src so that an
   argument with side effects is evaluated a single time, as it would
   be with the inline-function form.  N is still expanded several
   times; it is used as the selector operand of the underlying
   extract/insert intrinsics.  */
#define _mm256_insert_epi32(X, D, N)					\
  (__extension__							\
   ({									\
      __m256i __Src = (__m256i)(X);					\
      __m128i __Y = _mm256_extractf128_si256 (__Src, (N) >> 2);	\
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);			\
      _mm256_insertf128_si256 (__Src, __Y, (N) >> 2);			\
    }))
792
 
793
/* Macro form of _mm256_insert_epi16: yields a copy of X with 16-bit
   element N replaced by D.  X is captured once in __Src so that an
   argument with side effects is evaluated a single time, as it would
   be with the inline-function form.  N is still expanded several
   times; it is used as the selector operand of the underlying
   extract/insert intrinsics.  */
#define _mm256_insert_epi16(X, D, N)					\
  (__extension__							\
   ({									\
      __m256i __Src = (__m256i)(X);					\
      __m128i __Y = _mm256_extractf128_si256 (__Src, (N) >> 3);	\
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);			\
      _mm256_insertf128_si256 (__Src, __Y, (N) >> 3);			\
    }))
800
 
801
/* Macro form of _mm256_insert_epi8: yields a copy of X with 8-bit
   element N replaced by D.  X is captured once in __Src so that an
   argument with side effects is evaluated a single time, as it would
   be with the inline-function form.  N is still expanded several
   times; it is used as the selector operand of the underlying
   extract/insert intrinsics.  */
#define _mm256_insert_epi8(X, D, N)					\
  (__extension__							\
   ({									\
      __m256i __Src = (__m256i)(X);					\
      __m128i __Y = _mm256_extractf128_si256 (__Src, (N) >> 4);	\
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);			\
      _mm256_insertf128_si256 (__Src, __Y, (N) >> 4);			\
    }))
808
 
809
#ifdef __x86_64__
810
/* Macro form of _mm256_insert_epi64: yields a copy of X with 64-bit
   element N replaced by D.  X is captured once in __Src so that an
   argument with side effects is evaluated a single time, as it would
   be with the inline-function form.  N is still expanded several
   times; it is used as the selector operand of the underlying
   extract/insert intrinsics.  */
#define _mm256_insert_epi64(X, D, N)					\
  (__extension__							\
   ({									\
      __m256i __Src = (__m256i)(X);					\
      __m128i __Y = _mm256_extractf128_si256 (__Src, (N) >> 1);	\
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);			\
      _mm256_insertf128_si256 (__Src, __Y, (N) >> 1);			\
    }))
817
#endif
818
#endif
819
 
820
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821
_mm256_load_pd (double const *__P)
822
{
823
  return *(__m256d *)__P;
824
}
825
 
826
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
827
_mm256_store_pd (double *__P, __m256d __A)
828
{
829
  *(__m256d *)__P = __A;
830
}
831
 
832
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
833
_mm256_load_ps (float const *__P)
834
{
835
  return *(__m256 *)__P;
836
}
837
 
838
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839
_mm256_store_ps (float *__P, __m256 __A)
840
{
841
  *(__m256 *)__P = __A;
842
}
843
 
844
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845
_mm256_loadu_pd (double const *__P)
846
{
847
  return (__m256d) __builtin_ia32_loadupd256 (__P);
848
}
849
 
850
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851
_mm256_storeu_pd (double *__P, __m256d __A)
852
{
853
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
854
}
855
 
856
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857
_mm256_loadu_ps (float const *__P)
858
{
859
  return (__m256) __builtin_ia32_loadups256 (__P);
860
}
861
 
862
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863
_mm256_storeu_ps (float *__P, __m256 __A)
864
{
865
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
866
}
867
 
868
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869
_mm256_load_si256 (__m256i const *__P)
870
{
871
  return *__P;
872
}
873
 
874
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875
_mm256_store_si256 (__m256i *__P, __m256i __A)
876
{
877
  *__P = __A;
878
}
879
 
880
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
881
_mm256_loadu_si256 (__m256i const *__P)
882
{
883
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
884
}
885
 
886
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
887
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
888
{
889
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
890
}
891
 
892
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
893
_mm_maskload_pd (double const *__P, __m128d __M)
894
{
895
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
896
                                              (__v2df)__M);
897
}
898
 
899
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
900
_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A)
901
{
902
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A);
903
}
904
 
905
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906
_mm256_maskload_pd (double const *__P, __m256d __M)
907
{
908
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
909
                                                 (__v4df)__M);
910
}
911
 
912
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
913
_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A)
914
{
915
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A);
916
}
917
 
918
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
919
_mm_maskload_ps (float const *__P, __m128 __M)
920
{
921
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
922
                                             (__v4sf)__M);
923
}
924
 
925
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
926
_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A)
927
{
928
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A);
929
}
930
 
931
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932
_mm256_maskload_ps (float const *__P, __m256 __M)
933
{
934
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
935
                                                (__v8sf)__M);
936
}
937
 
938
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939
_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A)
940
{
941
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A);
942
}
943
 
944
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945
_mm256_movehdup_ps (__m256 __X)
946
{
947
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
948
}
949
 
950
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
951
_mm256_moveldup_ps (__m256 __X)
952
{
953
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
954
}
955
 
956
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
957
_mm256_movedup_pd (__m256d __X)
958
{
959
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
960
}
961
 
962
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963
_mm256_lddqu_si256 (__m256i const *__P)
964
{
965
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
966
}
967
 
968
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969
_mm256_stream_si256 (__m256i *__A, __m256i __B)
970
{
971
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
972
}
973
 
974
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975
_mm256_stream_pd (double *__A, __m256d __B)
976
{
977
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
978
}
979
 
980
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981
_mm256_stream_ps (float *__P, __m256 __A)
982
{
983
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
984
}
985
 
986
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
987
_mm256_rcp_ps (__m256 __A)
988
{
989
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
990
}
991
 
992
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
993
_mm256_rsqrt_ps (__m256 __A)
994
{
995
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
996
}
997
 
998
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999
_mm256_sqrt_pd (__m256d __A)
1000
{
1001
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
1002
}
1003
 
1004
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1005
_mm256_sqrt_ps (__m256 __A)
1006
{
1007
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
1008
}
1009
 
1010
#ifdef __OPTIMIZE__
1011
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1012
_mm256_round_pd (__m256d __V, const int __M)
1013
{
1014
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
1015
}
1016
 
1017
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1018
_mm256_round_ps (__m256 __V, const int __M)
1019
{
1020
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
1021
}
1022
#else
1023
/* Macro form of _mm256_round_pd: expands directly to
   __builtin_ia32_roundpd256.  */
#define _mm256_round_pd(V, M)						\
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V),		\
					(int)(M)))
1025
 
1026
/* Macro form of _mm256_round_ps: expands directly to
   __builtin_ia32_roundps256.  */
#define _mm256_round_ps(V, M)						\
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V),		\
				       (int)(M)))
1028
#endif
1029
 
1030
/* Convenience macros: _mm256_round_pd/_mm256_round_ps with the mode
   fixed to _MM_FROUND_CEIL or _MM_FROUND_FLOOR.  */
#define _mm256_ceil_pd(V)	_mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V)	_mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)	_mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V)	_mm256_round_ps ((V), _MM_FROUND_FLOOR)
1034
 
1035
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
1037
{
1038
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
1039
}
1040
 
1041
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
1043
{
1044
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
1045
}
1046
 
1047
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
1049
{
1050
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
1051
}
1052
 
1053
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
1055
{
1056
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
1057
}
1058
 
1059
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060
_mm_testz_pd (__m128d __M, __m128d __V)
1061
{
1062
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
1063
}
1064
 
1065
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066
_mm_testc_pd (__m128d __M, __m128d __V)
1067
{
1068
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
1069
}
1070
 
1071
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072
_mm_testnzc_pd (__m128d __M, __m128d __V)
1073
{
1074
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
1075
}
1076
 
1077
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1078
_mm_testz_ps (__m128 __M, __m128 __V)
1079
{
1080
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
1081
}
1082
 
1083
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1084
_mm_testc_ps (__m128 __M, __m128 __V)
1085
{
1086
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
1087
}
1088
 
1089
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090
_mm_testnzc_ps (__m128 __M, __m128 __V)
1091
{
1092
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
1093
}
1094
 
1095
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096
_mm256_testz_pd (__m256d __M, __m256d __V)
1097
{
1098
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
1099
}
1100
 
1101
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102
_mm256_testc_pd (__m256d __M, __m256d __V)
1103
{
1104
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
1105
}
1106
 
1107
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108
_mm256_testnzc_pd (__m256d __M, __m256d __V)
1109
{
1110
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
1111
}
1112
 
1113
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114
_mm256_testz_ps (__m256 __M, __m256 __V)
1115
{
1116
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
1117
}
1118
 
1119
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120
_mm256_testc_ps (__m256 __M, __m256 __V)
1121
{
1122
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
1123
}
1124
 
1125
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126
_mm256_testnzc_ps (__m256 __M, __m256 __V)
1127
{
1128
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
1129
}
1130
 
1131
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1132
_mm256_testz_si256 (__m256i __M, __m256i __V)
1133
{
1134
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
1135
}
1136
 
1137
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1138
_mm256_testc_si256 (__m256i __M, __m256i __V)
1139
{
1140
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
1141
}
1142
 
1143
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1144
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
1145
{
1146
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
1147
}
1148
 
1149
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150
_mm256_movemask_pd (__m256d __A)
1151
{
1152
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
1153
}
1154
 
1155
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156
_mm256_movemask_ps (__m256 __A)
1157
{
1158
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
1159
}
1160
 
1161
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162
_mm256_setzero_pd (void)
1163
{
1164
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
1165
}
1166
 
1167
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168
_mm256_setzero_ps (void)
1169
{
1170
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
1171
                                 0.0, 0.0, 0.0, 0.0 };
1172
}
1173
 
1174
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175
_mm256_setzero_si256 (void)
1176
{
1177
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
1178
}
1179
 
1180
/* Create the vector [A B C D].  */
1181
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1182
_mm256_set_pd (double __A, double __B, double __C, double __D)
1183
{
1184
  return __extension__ (__m256d){ __D, __C, __B, __A };
1185
}
1186
 
1187
/* Create the vector [A B C D E F G H].  */
1188
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1189
_mm256_set_ps (float __A, float __B, float __C, float __D,
1190
               float __E, float __F, float __G, float __H)
1191
{
1192
  return __extension__ (__m256){ __H, __G, __F, __E,
1193
                                 __D, __C, __B, __A };
1194
}
1195
 
1196
/* Create the vector [A B C D E F G H].  */
1197
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1198
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
1199
                  int __E, int __F, int __G, int __H)
1200
{
1201
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
1202
                                          __D, __C, __B, __A };
1203
}
1204
 
1205
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1206
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
1207
                  short __q11, short __q10, short __q09, short __q08,
1208
                  short __q07, short __q06, short __q05, short __q04,
1209
                  short __q03, short __q02, short __q01, short __q00)
1210
{
1211
  return __extension__ (__m256i)(__v16hi){
1212
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1213
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
1214
  };
1215
}
1216
 
1217
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1218
_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
1219
                  char __q27, char __q26, char __q25, char __q24,
1220
                  char __q23, char __q22, char __q21, char __q20,
1221
                  char __q19, char __q18, char __q17, char __q16,
1222
                  char __q15, char __q14, char __q13, char __q12,
1223
                  char __q11, char __q10, char __q09, char __q08,
1224
                  char __q07, char __q06, char __q05, char __q04,
1225
                  char __q03, char __q02, char __q01, char __q00)
1226
{
1227
  return __extension__ (__m256i)(__v32qi){
1228
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
1229
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
1230
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
1231
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
1232
  };
1233
}
1234
 
1235
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1236
_mm256_set_epi64x (long long __A, long long __B, long long __C,
1237
                   long long __D)
1238
{
1239
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
1240
}
1241
 
1242
/* Create a vector with all elements equal to A.  */
1243
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1244
_mm256_set1_pd (double __A)
1245
{
1246
  return __extension__ (__m256d){ __A, __A, __A, __A };
1247
}
1248
 
1249
/* Create a vector with all elements equal to A.  */
1250
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251
_mm256_set1_ps (float __A)
1252
{
1253
  return __extension__ (__m256){ __A, __A, __A, __A,
1254
                                 __A, __A, __A, __A };
1255
}
1256
 
1257
/* Create a vector with all elements equal to A.  */
1258
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259
_mm256_set1_epi32 (int __A)
1260
{
1261
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
1262
                                          __A, __A, __A, __A };
1263
}
1264
 
1265
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1266
_mm256_set1_epi16 (short __A)
1267
{
1268
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
1269
                           __A, __A, __A, __A, __A, __A, __A, __A);
1270
}
1271
 
1272
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1273
_mm256_set1_epi8 (char __A)
1274
{
1275
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
1276
                          __A, __A, __A, __A, __A, __A, __A, __A,
1277
                          __A, __A, __A, __A, __A, __A, __A, __A,
1278
                          __A, __A, __A, __A, __A, __A, __A, __A);
1279
}
1280
 
1281
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1282
_mm256_set1_epi64x (long long __A)
1283
{
1284
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
1285
}
1286
 
1287
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */
1289
 
1290
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1291
_mm256_setr_pd (double __A, double __B, double __C, double __D)
1292
{
1293
  return _mm256_set_pd (__D, __C, __B, __A);
1294
}
1295
 
1296
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297
_mm256_setr_ps (float __A, float __B, float __C, float __D,
1298
                float __E, float __F, float __G, float __H)
1299
{
1300
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
1301
}
1302
 
1303
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
1305
                   int __E, int __F, int __G, int __H)
1306
{
1307
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
1308
}
1309
 
1310
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
1312
                   short __q11, short __q10, short __q09, short __q08,
1313
                   short __q07, short __q06, short __q05, short __q04,
1314
                   short __q03, short __q02, short __q01, short __q00)
1315
{
1316
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
1317
                           __q04, __q05, __q06, __q07,
1318
                           __q08, __q09, __q10, __q11,
1319
                           __q12, __q13, __q14, __q15);
1320
}
1321
 
1322
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323
_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
1324
                   char __q27, char __q26, char __q25, char __q24,
1325
                   char __q23, char __q22, char __q21, char __q20,
1326
                   char __q19, char __q18, char __q17, char __q16,
1327
                   char __q15, char __q14, char __q13, char __q12,
1328
                   char __q11, char __q10, char __q09, char __q08,
1329
                   char __q07, char __q06, char __q05, char __q04,
1330
                   char __q03, char __q02, char __q01, char __q00)
1331
{
1332
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
1333
                          __q04, __q05, __q06, __q07,
1334
                          __q08, __q09, __q10, __q11,
1335
                          __q12, __q13, __q14, __q15,
1336
                          __q16, __q17, __q18, __q19,
1337
                          __q20, __q21, __q22, __q23,
1338
                          __q24, __q25, __q26, __q27,
1339
                          __q28, __q29, __q30, __q31);
1340
}
1341
 
1342
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1343
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
1344
                    long long __D)
1345
{
1346
  return _mm256_set_epi64x (__D, __C, __B, __A);
1347
}
1348
 
1349
/* Casts between various SP, DP, INT vector types.  Note that these do no
1350
   conversion of values, they just change the type.  */
1351
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352
_mm256_castpd_ps (__m256d __A)
1353
{
1354
  return (__m256) __A;
1355
}
1356
 
1357
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1358
_mm256_castpd_si256 (__m256d __A)
1359
{
1360
  return (__m256i) __A;
1361
}
1362
 
1363
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1364
_mm256_castps_pd (__m256 __A)
1365
{
1366
  return (__m256d) __A;
1367
}
1368
 
1369
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370
_mm256_castps_si256(__m256 __A)
1371
{
1372
  return (__m256i) __A;
1373
}
1374
 
1375
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1376
_mm256_castsi256_ps (__m256i __A)
1377
{
1378
  return (__m256) __A;
1379
}
1380
 
1381
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382
_mm256_castsi256_pd (__m256i __A)
1383
{
1384
  return (__m256d) __A;
1385
}
1386
 
1387
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388
_mm256_castpd256_pd128 (__m256d __A)
1389
{
1390
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
1391
}
1392
 
1393
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394
_mm256_castps256_ps128 (__m256 __A)
1395
{
1396
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
1397
}
1398
 
1399
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1400
_mm256_castsi256_si128 (__m256i __A)
1401
{
1402
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
1403
}
1404
 
1405
/* When a cast is done from a 128-bit to a 256-bit type, the low 128
   bits of the 256-bit result contain the source parameter value and
   the upper 128 bits of the result are undefined.  These intrinsics
   shouldn't generate any extra moves.  */
1409
 
1410
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411
_mm256_castpd128_pd256 (__m128d __A)
1412
{
1413
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
1414
}
1415
 
1416
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1417
_mm256_castps128_ps256 (__m128 __A)
1418
{
1419
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
1420
}
1421
 
1422
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1423
_mm256_castsi128_si256 (__m128i __A)
1424
{
1425
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
1426
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.