OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [tags/] [gnu-dev/] [fsf-gcc-snapshot-1-mar-12/] [or1k-gcc/] [gcc/] [config/] [i386/] [avx2intrin.h] - Blame information for rev 783

Details | Compare with Previous | View Log

Line No. Rev Author Line
/* Copyright (C) 2011
   Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
24
 
25
/* This header is an internal part of <immintrin.h>; reject direct
   inclusion so users always get the umbrella header's type setup.  */
#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
                                              (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline form cannot guarantee __M is an
   immediate, so use a macro instead.  */
#define _mm256_mpsadbw_epu8(X, Y, M)                                    \
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),          \
                                        (__v32qi)(__m256i)(Y), (int)(M)))
#endif

extern __inline __m256i
47
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
48
_mm256_abs_epi8 (__m256i __A)
49
{
50
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
51
}
52
 
53
extern __inline __m256i
54
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
55
_mm256_abs_epi16 (__m256i __A)
56
{
57
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
58
}
59
 
60
extern __inline __m256i
61
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
62
_mm256_abs_epi32 (__m256i __A)
63
{
64
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
65
}
66
 
67
extern __inline __m256i
68
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
69
_mm256_packs_epi32 (__m256i __A, __m256i __B)
70
{
71
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
72
}
73
 
74
extern __inline __m256i
75
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
76
_mm256_packs_epi16 (__m256i __A, __m256i __B)
77
{
78
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
79
}
80
 
81
extern __inline __m256i
82
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
83
_mm256_packus_epi32 (__m256i __A, __m256i __B)
84
{
85
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
86
}
87
 
88
extern __inline __m256i
89
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
90
_mm256_packus_epi16 (__m256i __A, __m256i __B)
91
{
92
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
93
}
94
 
95
extern __inline __m256i
96
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
97
_mm256_add_epi8 (__m256i __A, __m256i __B)
98
{
99
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
100
}
101
 
102
extern __inline __m256i
103
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
104
_mm256_add_epi16 (__m256i __A, __m256i __B)
105
{
106
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
107
}
108
 
109
extern __inline __m256i
110
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
111
_mm256_add_epi32 (__m256i __A, __m256i __B)
112
{
113
  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
114
}
115
 
116
extern __inline __m256i
117
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
118
_mm256_add_epi64 (__m256i __A, __m256i __B)
119
{
120
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
121
}
122
 
123
extern __inline __m256i
124
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
125
_mm256_adds_epi8 (__m256i __A, __m256i __B)
126
{
127
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
128
}
129
 
130
extern __inline __m256i
131
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
132
_mm256_adds_epi16 (__m256i __A, __m256i __B)
133
{
134
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
135
}
136
 
137
extern __inline __m256i
138
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
139
_mm256_adds_epu8 (__m256i __A, __m256i __B)
140
{
141
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
142
}
143
 
144
extern __inline __m256i
145
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
146
_mm256_adds_epu16 (__m256i __A, __m256i __B)
147
{
148
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
149
}
150
 
151
/* vpalignr: concatenate and byte-shift right by __N; the builtin takes
   a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
                                              (__v4di)__B,
                                              __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)                                \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),      \
                                        (__v4di)(__m256i)(B),      \
                                        (int)(N) * 8))
#endif

extern __inline __m256i
170
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
171
_mm256_and_si256 (__m256i __A, __m256i __B)
172
{
173
  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
174
}
175
 
176
extern __inline __m256i
177
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
178
_mm256_andnot_si256 (__m256i __A, __m256i __B)
179
{
180
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
181
}
182
 
183
extern __inline __m256i
184
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
185
_mm256_avg_epu8 (__m256i __A, __m256i __B)
186
{
187
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
188
}
189
 
190
extern __inline __m256i
191
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
192
_mm256_avg_epu16 (__m256i __A, __m256i __B)
193
{
194
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
195
}
196
 
197
extern __inline __m256i
198
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
199
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
200
{
201
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
202
                                               (__v32qi)__Y,
203
                                               (__v32qi)__M);
204
}
205
 
206
/* vpblendw: per-word select controlled by immediate mask __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
                                              (__v16hi)__Y,
                                               __M);
}
#else
/* Macro form keeps __M an immediate when not optimizing.  */
#define _mm256_blend_epi16(X, Y, M)                                     \
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),          \
                                        (__v16hi)(__m256i)(Y), (int)(M)))
#endif

extern __inline __m256i
222
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
223
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
224
{
225
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
226
}
227
 
228
extern __inline __m256i
229
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
230
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
231
{
232
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
233
}
234
 
235
extern __inline __m256i
236
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
237
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
238
{
239
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
240
}
241
 
242
extern __inline __m256i
243
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
244
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
245
{
246
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
247
}
248
 
249
extern __inline __m256i
250
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
251
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
252
{
253
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
254
                                             (__v32qi)__B);
255
}
256
 
257
extern __inline __m256i
258
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
259
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
260
{
261
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
262
                                             (__v16hi)__B);
263
}
264
 
265
extern __inline __m256i
266
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
267
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
268
{
269
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
270
                                             (__v8si)__B);
271
}
272
 
273
extern __inline __m256i
274
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
275
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
276
{
277
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
278
}
279
 
280
extern __inline __m256i
281
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
282
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
283
{
284
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
285
                                             (__v16hi)__Y);
286
}
287
 
288
extern __inline __m256i
289
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
290
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
291
{
292
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
293
}
294
 
295
extern __inline __m256i
296
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
297
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
298
{
299
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
300
                                              (__v16hi)__Y);
301
}
302
 
303
extern __inline __m256i
304
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
305
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
306
{
307
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
308
                                             (__v16hi)__Y);
309
}
310
 
311
extern __inline __m256i
312
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
313
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
314
{
315
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
316
}
317
 
318
extern __inline __m256i
319
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
320
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
321
{
322
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
323
                                              (__v16hi)__Y);
324
}
325
 
326
extern __inline __m256i
327
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
328
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
329
{
330
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
331
                                                (__v32qi)__Y);
332
}
333
 
334
extern __inline __m256i
335
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
336
_mm256_madd_epi16 (__m256i __A, __m256i __B)
337
{
338
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
339
                                             (__v16hi)__B);
340
}
341
 
342
extern __inline __m256i
343
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
344
_mm256_max_epi8 (__m256i __A, __m256i __B)
345
{
346
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
347
}
348
 
349
extern __inline __m256i
350
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
351
_mm256_max_epi16 (__m256i __A, __m256i __B)
352
{
353
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
354
}
355
 
356
extern __inline __m256i
357
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
358
_mm256_max_epi32 (__m256i __A, __m256i __B)
359
{
360
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
361
}
362
 
363
extern __inline __m256i
364
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
365
_mm256_max_epu8 (__m256i __A, __m256i __B)
366
{
367
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
368
}
369
 
370
extern __inline __m256i
371
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
372
_mm256_max_epu16 (__m256i __A, __m256i __B)
373
{
374
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
375
}
376
 
377
extern __inline __m256i
378
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
379
_mm256_max_epu32 (__m256i __A, __m256i __B)
380
{
381
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
382
}
383
 
384
extern __inline __m256i
385
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
386
_mm256_min_epi8 (__m256i __A, __m256i __B)
387
{
388
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
389
}
390
 
391
extern __inline __m256i
392
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
393
_mm256_min_epi16 (__m256i __A, __m256i __B)
394
{
395
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
396
}
397
 
398
extern __inline __m256i
399
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
400
_mm256_min_epi32 (__m256i __A, __m256i __B)
401
{
402
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
403
}
404
 
405
extern __inline __m256i
406
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
407
_mm256_min_epu8 (__m256i __A, __m256i __B)
408
{
409
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
410
}
411
 
412
extern __inline __m256i
413
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
414
_mm256_min_epu16 (__m256i __A, __m256i __B)
415
{
416
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
417
}
418
 
419
extern __inline __m256i
420
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
421
_mm256_min_epu32 (__m256i __A, __m256i __B)
422
{
423
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
424
}
425
 
426
extern __inline int
427
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
428
_mm256_movemask_epi8 (__m256i __A)
429
{
430
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
431
}
432
 
433
extern __inline __m256i
434
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
435
_mm256_cvtepi8_epi16 (__m128i __X)
436
{
437
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
438
}
439
 
440
extern __inline __m256i
441
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
442
_mm256_cvtepi8_epi32 (__m128i __X)
443
{
444
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
445
}
446
 
447
extern __inline __m256i
448
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
449
_mm256_cvtepi8_epi64 (__m128i __X)
450
{
451
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
452
}
453
 
454
extern __inline __m256i
455
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
456
_mm256_cvtepi16_epi32 (__m128i __X)
457
{
458
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
459
}
460
 
461
extern __inline __m256i
462
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
463
_mm256_cvtepi16_epi64 (__m128i __X)
464
{
465
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
466
}
467
 
468
extern __inline __m256i
469
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
470
_mm256_cvtepi32_epi64 (__m128i __X)
471
{
472
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
473
}
474
 
475
extern __inline __m256i
476
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
477
_mm256_cvtepu8_epi16 (__m128i __X)
478
{
479
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
480
}
481
 
482
extern __inline __m256i
483
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
484
_mm256_cvtepu8_epi32 (__m128i __X)
485
{
486
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
487
}
488
 
489
extern __inline __m256i
490
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
491
_mm256_cvtepu8_epi64 (__m128i __X)
492
{
493
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
494
}
495
 
496
extern __inline __m256i
497
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
498
_mm256_cvtepu16_epi32 (__m128i __X)
499
{
500
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
501
}
502
 
503
extern __inline __m256i
504
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
505
_mm256_cvtepu16_epi64 (__m128i __X)
506
{
507
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
508
}
509
 
510
extern __inline __m256i
511
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
512
_mm256_cvtepu32_epi64 (__m128i __X)
513
{
514
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
515
}
516
 
517
extern __inline __m256i
518
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
519
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
520
{
521
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
522
}
523
 
524
extern __inline __m256i
525
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
526
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
527
{
528
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
529
                                               (__v16hi)__Y);
530
}
531
 
532
extern __inline __m256i
533
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
534
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
535
{
536
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
537
}
538
 
539
extern __inline __m256i
540
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
541
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
542
{
543
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
544
}
545
 
546
extern __inline __m256i
547
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
548
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
549
{
550
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
551
}
552
 
553
extern __inline __m256i
554
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
555
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
556
{
557
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
558
}
559
 
560
extern __inline __m256i
561
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
562
_mm256_mul_epu32 (__m256i __A, __m256i __B)
563
{
564
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
565
}
566
 
567
extern __inline __m256i
568
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
569
_mm256_or_si256 (__m256i __A, __m256i __B)
570
{
571
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
572
}
573
 
574
extern __inline __m256i
575
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
576
_mm256_sad_epu8 (__m256i __A, __m256i __B)
577
{
578
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
579
}
580
 
581
extern __inline __m256i
582
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
583
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
584
{
585
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
586
                                             (__v32qi)__Y);
587
}
588
 
589
/* Immediate-controlled shuffles; macro forms keep the mask an
   immediate when not optimizing.  */
#ifdef __OPTIMIZE__
/* vpshufd: shuffle dwords by immediate mask.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

/* vpshufhw: shuffle high words of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

/* vpshuflw: shuffle low words of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif

extern __inline __m256i
620
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
621
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
622
{
623
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
624
}
625
 
626
extern __inline __m256i
627
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
628
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
629
{
630
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
631
}
632
 
633
extern __inline __m256i
634
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
635
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
636
{
637
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
638
}
639
 
640
/* vpslldq: byte-shift each 128-bit lane left by __N bytes; the
   builtin takes a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

extern __inline __m256i
653
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
654
_mm256_slli_epi16 (__m256i __A, int __B)
655
{
656
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
657
}
658
 
659
extern __inline __m256i
660
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
661
_mm256_sll_epi16 (__m256i __A, __m128i __B)
662
{
663
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
664
}
665
 
666
extern __inline __m256i
667
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
668
_mm256_slli_epi32 (__m256i __A, int __B)
669
{
670
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
671
}
672
 
673
extern __inline __m256i
674
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
675
_mm256_sll_epi32 (__m256i __A, __m128i __B)
676
{
677
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
678
}
679
 
680
extern __inline __m256i
681
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
682
_mm256_slli_epi64 (__m256i __A, int __B)
683
{
684
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
685
}
686
 
687
extern __inline __m256i
688
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
689
_mm256_sll_epi64 (__m256i __A, __m128i __B)
690
{
691
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
692
}
693
 
694
extern __inline __m256i
695
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
696
_mm256_srai_epi16 (__m256i __A, int __B)
697
{
698
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
699
}
700
 
701
extern __inline __m256i
702
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
703
_mm256_sra_epi16 (__m256i __A, __m128i __B)
704
{
705
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
706
}
707
 
708
extern __inline __m256i
709
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
710
_mm256_srai_epi32 (__m256i __A, int __B)
711
{
712
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
713
}
714
 
715
extern __inline __m256i
716
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
717
_mm256_sra_epi32 (__m256i __A, __m128i __B)
718
{
719
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
720
}
721
 
722
/* vpsrldq: byte-shift each 128-bit lane right by __N bytes; the
   builtin takes a bit count, hence the * 8.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

extern __inline __m256i
735
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
736
_mm256_srli_epi16 (__m256i __A, int __B)
737
{
738
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
739
}
740
 
741
extern __inline __m256i
742
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
743
_mm256_srl_epi16 (__m256i __A, __m128i __B)
744
{
745
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
746
}
747
 
748
extern __inline __m256i
749
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
750
_mm256_srli_epi32 (__m256i __A, int __B)
751
{
752
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
753
}
754
 
755
extern __inline __m256i
756
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
757
_mm256_srl_epi32 (__m256i __A, __m128i __B)
758
{
759
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
760
}
761
 
762
extern __inline __m256i
763
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
764
_mm256_srli_epi64 (__m256i __A, int __B)
765
{
766
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
767
}
768
 
769
extern __inline __m256i
770
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
771
_mm256_srl_epi64 (__m256i __A, __m128i __B)
772
{
773
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
774
}
775
 
776
extern __inline __m256i
777
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
778
_mm256_sub_epi8 (__m256i __A, __m256i __B)
779
{
780
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
781
}
782
 
783
extern __inline __m256i
784
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
785
_mm256_sub_epi16 (__m256i __A, __m256i __B)
786
{
787
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
788
}
789
 
790
extern __inline __m256i
791
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
792
_mm256_sub_epi32 (__m256i __A, __m256i __B)
793
{
794
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
795
}
796
 
797
extern __inline __m256i
798
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
799
_mm256_sub_epi64 (__m256i __A, __m256i __B)
800
{
801
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
802
}
803
 
804
extern __inline __m256i
805
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
806
_mm256_subs_epi8 (__m256i __A, __m256i __B)
807
{
808
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
809
}
810
 
811
extern __inline __m256i
812
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
813
_mm256_subs_epi16 (__m256i __A, __m256i __B)
814
{
815
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
816
}
817
 
818
extern __inline __m256i
819
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
820
_mm256_subs_epu8 (__m256i __A, __m256i __B)
821
{
822
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
823
}
824
 
825
extern __inline __m256i
826
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
827
_mm256_subs_epu16 (__m256i __A, __m256i __B)
828
{
829
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
830
}
831
 
832
extern __inline __m256i
833
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
834
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
835
{
836
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
837
}
838
 
839
extern __inline __m256i
840
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
841
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
842
{
843
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
844
}
845
 
846
extern __inline __m256i
847
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
848
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
849
{
850
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
851
}
852
 
853
extern __inline __m256i
854
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
855
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
856
{
857
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
858
}
859
 
860
extern __inline __m256i
861
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
862
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
863
{
864
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
865
}
866
 
867
extern __inline __m256i
868
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
869
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
870
{
871
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
872
}
873
 
874
extern __inline __m256i
875
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
876
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
877
{
878
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
879
}
880
 
881
extern __inline __m256i
882
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
883
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
884
{
885
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
886
}
887
 
888
extern __inline __m256i
889
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
890
_mm256_xor_si256 (__m256i __A, __m256i __B)
891
{
892
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
893
}
894
 
895
extern __inline __m256i
896
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
897
_mm256_stream_load_si256 (__m256i const *__X)
898
{
899
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
900
}
901
 
902
extern __inline __m128
903
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
904
_mm_broadcastss_ps (__m128 __X)
905
{
906
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
907
}
908
 
909
extern __inline __m256
910
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
911
_mm256_broadcastss_ps (__m128 __X)
912
{
913
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
914
}
915
 
916
extern __inline __m256d
917
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
918
_mm256_broadcastsd_pd (__m128d __X)
919
{
920
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
921
}
922
 
923
extern __inline __m256i
924
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
925
_mm_broadcastsi128_si256 (__m128i __X)
926
{
927
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
928
}
929
 
930
/* Blend 32-bit elements of __X and __Y under control of the immediate
   __M: a set bit selects the corresponding element from __Y.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
                                              (__v4si)__Y,
                                              __M);
}
#else
#define _mm_blend_epi32(X, Y, M)                                        \
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),           \
                                        (__v4si)(__m128i)(Y), (int)(M)))
#endif

/* 256-bit variant of _mm_blend_epi32.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
                                              (__v8si)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)                                     \
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),           \
                                        (__v8si)(__m256i)(Y), (int)(M)))
#endif

extern __inline __m256i
961
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
962
_mm256_broadcastb_epi8 (__m128i __X)
963
{
964
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
965
}
966
 
967
extern __inline __m256i
968
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
969
_mm256_broadcastw_epi16 (__m128i __X)
970
{
971
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
972
}
973
 
974
extern __inline __m256i
975
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
976
_mm256_broadcastd_epi32 (__m128i __X)
977
{
978
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
979
}
980
 
981
extern __inline __m256i
982
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
983
_mm256_broadcastq_epi64 (__m128i __X)
984
{
985
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
986
}
987
 
988
extern __inline __m128i
989
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
990
_mm_broadcastb_epi8 (__m128i __X)
991
{
992
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
993
}
994
 
995
extern __inline __m128i
996
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
997
_mm_broadcastw_epi16 (__m128i __X)
998
{
999
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1000
}
1001
 
1002
extern __inline __m128i
1003
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1004
_mm_broadcastd_epi32 (__m128i __X)
1005
{
1006
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1007
}
1008
 
1009
extern __inline __m128i
1010
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1011
_mm_broadcastq_epi64 (__m128i __X)
1012
{
1013
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1014
}
1015
 
1016
extern __inline __m256i
1017
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1018
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1019
{
1020
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
1021
}
1022
 
1023
#ifdef __OPTIMIZE__
1024
extern __inline __m256d
1025
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1026
_mm256_permute4x64_pd (__m256d __X, const int __M)
1027
{
1028
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
1029
}
1030
#else
1031
#define _mm256_permute4x64_pd(X, M)                            \
1032
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
1033
#endif
1034
 
1035
extern __inline __m256
1036
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1037
_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y)
1038
{
1039
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y);
1040
}
1041
 
1042
#ifdef __OPTIMIZE__
1043
extern __inline __m256i
1044
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1045
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
1046
{
1047
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
1048
}
1049
#else
1050
#define _mm256_permute4x64_epi64(X, M)                         \
1051
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
1052
#endif
1053
 
1054
 
1055
/* Select 128-bit lanes from __X and __Y under control of the
   immediate __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)                              \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Extract 128-bit lane __M (0 or 1) of __X.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)                          \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Insert __Y into 128-bit lane __M (0 or 1) of __X.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)                         \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif

extern __inline __m256i
1094
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1095
_mm256_maskload_epi32 (int const *__X, __m256i __M )
1096
{
1097
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1098
                                                (__v8si)__M);
1099
}
1100
 
1101
extern __inline __m256i
1102
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1103
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
1104
{
1105
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1106
                                                (__v4di)__M);
1107
}
1108
 
1109
extern __inline __m128i
1110
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1111
_mm_maskload_epi32 (int const *__X, __m128i __M )
1112
{
1113
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1114
                                             (__v4si)__M);
1115
}
1116
 
1117
extern __inline __m128i
1118
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1119
_mm_maskload_epi64 (long long const *__X, __m128i __M )
1120
{
1121
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1122
                                             (__v2di)__M);
1123
}
1124
 
1125
extern __inline void
1126
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1127
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1128
{
1129
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1130
}
1131
 
1132
extern __inline void
1133
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1134
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1135
{
1136
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1137
}
1138
 
1139
extern __inline void
1140
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1141
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1142
{
1143
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1144
}
1145
 
1146
extern __inline void
1147
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1148
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1149
{
1150
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1151
}
1152
 
1153
extern __inline __m256i
1154
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1155
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1156
{
1157
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1158
}
1159
 
1160
extern __inline __m128i
1161
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1162
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
1163
{
1164
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1165
}
1166
 
1167
extern __inline __m256i
1168
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1169
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1170
{
1171
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1172
}
1173
 
1174
extern __inline __m128i
1175
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1176
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
1177
{
1178
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1179
}
1180
 
1181
extern __inline __m256i
1182
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1183
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
1184
{
1185
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1186
}
1187
 
1188
extern __inline __m128i
1189
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1190
_mm_srav_epi32 (__m128i __X, __m128i __Y)
1191
{
1192
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1193
}
1194
 
1195
extern __inline __m256i
1196
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1197
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1198
{
1199
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1200
}
1201
 
1202
extern __inline __m128i
1203
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1204
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
1205
{
1206
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1207
}
1208
 
1209
extern __inline __m256i
1210
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1211
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1212
{
1213
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1214
}
1215
 
1216
extern __inline __m128i
1217
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1218
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
1219
{
1220
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1221
}
1222
 
1223
#ifdef __OPTIMIZE__
/* Double-precision gathers: load elements from base + index[i]*scale.
   The "mask" variants load only elements whose mask sign bit is set
   and take the remaining elements from src; the plain variants use an
   all-ones mask (gather every element) and a zero src.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gathersiv2df (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
                       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
                                                base,
                                                (__v4si)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (src,
                                                base,
                                                (__v4si)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
                          __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
                                                base,
                                                (__v4si)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
                                                base,
                                                (__v2di)index,
                                                mask,
                                                scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
                       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
                                                base,
                                                (__v2di)index,
                                                (__v2df)mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
                                                base,
                                                (__v4di)index,
                                                mask,
                                                scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
                          __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
                                                base,
                                                (__v4di)index,
                                                (__v4df)mask,
                                                scale);
}

extern __inline __m128
1329
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1330
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
1331
{
1332
  __v4sf src = _mm_setzero_ps ();
1333
  __v4sf mask = _mm_cmpeq_ps (src, src);
1334
 
1335
  return (__m128) __builtin_ia32_gathersiv4sf (src,
1336
                                               base,
1337
                                               (__v4si)index,
1338
                                               mask,
1339
                                               scale);
1340
}
1341
 
1342
extern __inline __m128
1343
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1344
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
1345
                       __m128 mask, const int scale)
1346
{
1347
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
1348
                                               base,
1349
                                               (__v4si)index,
1350
                                               (__v4sf)mask,
1351
                                               scale);
1352
}
1353
 
1354
extern __inline __m256
1355
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1356
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
1357
{
1358
  __v8sf src = _mm256_setzero_ps ();
1359
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
1360
 
1361
  return (__m256) __builtin_ia32_gathersiv8sf (src,
1362
                                               base,
1363
                                               (__v8si)index,
1364
                                               mask,
1365
                                               scale);
1366
}
1367
 
1368
extern __inline __m256
1369
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1370
_mm256_mask_i32gather_ps (__m256 src, float const *base,
1371
                          __m256i index, __m256 mask, const int scale)
1372
{
1373
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
1374
                                               base,
1375
                                               (__v8si)index,
1376
                                               (__v8sf)mask,
1377
                                               scale);
1378
}
1379
 
1380
extern __inline __m128
1381
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1382
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
1383
{
1384
  __v4sf src = _mm_setzero_ps ();
1385
  __v4sf mask = _mm_cmpeq_ps (src, src);
1386
 
1387
  return (__m128) __builtin_ia32_gatherdiv4sf (src,
1388
                                               base,
1389
                                               (__v2di)index,
1390
                                               mask,
1391
                                               scale);
1392
}
1393
 
1394
extern __inline __m128
1395
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1396
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
1397
                       __m128 mask, const int scale)
1398
{
1399
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
1400
                                                base,
1401
                                                (__v2di)index,
1402
                                                (__v4sf)mask,
1403
                                                scale);
1404
}
1405
 
1406
extern __inline __m128
1407
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1408
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
1409
{
1410
  __v4sf src = _mm_setzero_ps ();
1411
  __v4sf mask = _mm_cmpeq_ps (src, src);
1412
 
1413
  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
1414
                                                  base,
1415
                                                  (__v4di)index,
1416
                                                  mask,
1417
                                                  scale);
1418
}
1419
 
1420
extern __inline __m128
1421
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1422
_mm256_mask_i64gather_ps (__m128 src, float const *base,
1423
                          __m256i index, __m128 mask, const int scale)
1424
{
1425
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
1426
                                                  base,
1427
                                                  (__v4di)index,
1428
                                                  (__v4sf)mask,
1429
                                                  scale);
1430
}
1431
 
1432
extern __inline __m128i
1433
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1434
_mm_i32gather_epi64 (long long int const *base,
1435
                     __m128i index, const int scale)
1436
{
1437
  __v2di src = __extension__ (__v2di){ 0, 0 };
1438
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1439
 
1440
  return (__m128i) __builtin_ia32_gathersiv2di (src,
1441
                                                base,
1442
                                                (__v4si)index,
1443
                                                mask,
1444
                                                scale);
1445
}
1446
 
1447
extern __inline __m128i
1448
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1449
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
1450
                          __m128i index, __m128i mask, const int scale)
1451
{
1452
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
1453
                                                base,
1454
                                                (__v4si)index,
1455
                                                (__v2di)mask,
1456
                                                scale);
1457
}
1458
 
1459
extern __inline __m256i
1460
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1461
_mm256_i32gather_epi64 (long long int const *base,
1462
                        __m128i index, const int scale)
1463
{
1464
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1465
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1466
 
1467
  return (__m256i) __builtin_ia32_gathersiv4di (src,
1468
                                                base,
1469
                                                (__v4si)index,
1470
                                                mask,
1471
                                                scale);
1472
}
1473
 
1474
extern __inline __m256i
1475
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1476
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
1477
                             __m128i index, __m256i mask, const int scale)
1478
{
1479
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
1480
                                                base,
1481
                                                (__v4si)index,
1482
                                                (__v4di)mask,
1483
                                                scale);
1484
}
1485
 
1486
extern __inline __m128i
1487
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1488
_mm_i64gather_epi64 (long long int const *base,
1489
                     __m128i index, const int scale)
1490
{
1491
  __v2di src = __extension__ (__v2di){ 0, 0 };
1492
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
1493
 
1494
  return (__m128i) __builtin_ia32_gatherdiv2di (src,
1495
                                                base,
1496
                                                (__v2di)index,
1497
                                                mask,
1498
                                                scale);
1499
}
1500
 
1501
extern __inline __m128i
1502
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1503
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
1504
                          __m128i mask, const int scale)
1505
{
1506
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
1507
                                                base,
1508
                                                (__v2di)index,
1509
                                                (__v2di)mask,
1510
                                                scale);
1511
}
1512
 
1513
extern __inline __m256i
1514
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1515
_mm256_i64gather_epi64 (long long int const *base,
1516
                        __m256i index, const int scale)
1517
{
1518
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
1519
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1520
 
1521
  return (__m256i) __builtin_ia32_gatherdiv4di (src,
1522
                                                base,
1523
                                                (__v4di)index,
1524
                                                mask,
1525
                                                scale);
1526
}
1527
 
1528
extern __inline __m256i
1529
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1530
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
1531
                             __m256i index, __m256i mask, const int scale)
1532
{
1533
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
1534
                                                base,
1535
                                                (__v4di)index,
1536
                                                (__v4di)mask,
1537
                                                scale);
1538
}
1539
 
1540
extern __inline __m128i
1541
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1542
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
1543
{
1544
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1545
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1546
 
1547
  return (__m128i) __builtin_ia32_gathersiv4si (src,
1548
                                               base,
1549
                                               (__v4si)index,
1550
                                               mask,
1551
                                               scale);
1552
}
1553
 
1554
extern __inline __m128i
1555
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1556
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
1557
                          __m128i mask, const int scale)
1558
{
1559
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
1560
                                                base,
1561
                                                (__v4si)index,
1562
                                                (__v4si)mask,
1563
                                                scale);
1564
}
1565
 
1566
extern __inline __m256i
1567
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1568
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
1569
{
1570
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1571
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1572
 
1573
  return (__m256i) __builtin_ia32_gathersiv8si (src,
1574
                                                base,
1575
                                                (__v8si)index,
1576
                                                mask,
1577
                                                scale);
1578
}
1579
 
1580
extern __inline __m256i
1581
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1582
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
1583
                             __m256i index, __m256i mask, const int scale)
1584
{
1585
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
1586
                                                base,
1587
                                                (__v8si)index,
1588
                                                (__v8si)mask,
1589
                                                scale);
1590
}
1591
 
1592
extern __inline __m128i
1593
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1594
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
1595
{
1596
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1597
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1598
 
1599
  return (__m128i) __builtin_ia32_gatherdiv4si (src,
1600
                                                base,
1601
                                                (__v2di)index,
1602
                                                mask,
1603
                                                scale);
1604
}
1605
 
1606
extern __inline __m128i
1607
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1608
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
1609
                          __m128i mask, const int scale)
1610
{
1611
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
1612
                                                base,
1613
                                                (__v2di)index,
1614
                                                (__v4si)mask,
1615
                                                scale);
1616
}
1617
 
1618
extern __inline __m128i
1619
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1620
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
1621
{
1622
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
1623
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1624
 
1625
  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
1626
                                                  base,
1627
                                                  (__v4di)index,
1628
                                                  mask,
1629
                                                  scale);
1630
}
1631
 
1632
extern __inline __m128i
1633
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
1634
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
1635
                             __m256i index, __m128i mask, const int scale)
1636
{
1637
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
1638
                                                   base,
1639
                                                   (__v4di)index,
1640
                                                   (__v4si)mask,
1641
                                                   scale);
1642
}
1643
#else /* __OPTIMIZE__ */
1644
/* Macro forms of the i32 double-precision gathers, used when the
   immediates cannot be constant-folded (no __OPTIMIZE__).  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE)                            \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),    \
                                         (double const *)BASE,          \
                                         (__v4si)(__m128i)INDEX,        \
                                         (__v2df)_mm_set1_pd(           \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)     \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,   \
                                         (double const *)BASE,   \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v2df)(__m128d)MASK,  \
                                         (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE)                         \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
                                         (double const *)BASE,          \
                                         (__v4si)(__m128i)INDEX,        \
                                         (__v4df)_mm256_set1_pd(        \
                                           (double)(long long int) -1), \
                                         (int)SCALE)

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)  \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,   \
                                         (double const *)BASE,   \
                                         (__v4si)(__m128i)INDEX, \
                                         (__v4df)(__m256d)MASK,  \
                                         (int)SCALE)

#define _mm_i64gather_pd(BASE, INDEX, SCALE)                            \
1675
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),    \
1676
                                         (double const *)BASE,          \
1677
                                         (__v2di)(__m128i)INDEX,        \
1678
                                         (__v2df)_mm_set1_pd(           \
1679
                                           (double)(long long int) -1), \
1680
                                         (int)SCALE)
1681
 
1682
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)     \
1683
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,   \
1684
                                         (double const *)BASE,   \
1685
                                         (__v2di)(__m128i)INDEX, \
1686
                                         (__v2df)(__m128d)MASK,  \
1687
                                         (int)SCALE)
1688
 
1689
#define _mm256_i64gather_pd(BASE, INDEX, SCALE)                         \
1690
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
1691
                                         (double const *)BASE,          \
1692
                                         (__v4di)(__m256i)INDEX,        \
1693
                                         (__v4df)_mm256_set1_pd(        \
1694
                                           (double)(long long int) -1), \
1695
                                         (int)SCALE)
1696
 
1697
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)  \
1698
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,   \
1699
                                         (double const *)BASE,   \
1700
                                         (__v4di)(__m256i)INDEX, \
1701
                                         (__v4df)(__m256d)MASK,  \
1702
                                         (int)SCALE)
1703
 
1704
/* Macro fallbacks for the single-precision gather intrinsics
   (!__OPTIMIZE__ builds).  Bug fixes versus the previous revision:
   - _mm_mask_i32gather_ps cast SRC and MASK through __m128d,
   - _mm256_mask_i32gather_ps cast MASK through __m256d,
   - _mm_i64gather_ps zeroed its float destination with _mm_setzero_pd,
   - _mm_mask_i64gather_ps cast MASK through __m128d.
   These are float gathers, so the single-precision types
   (__m128 / __m256 / _mm_setzero_ps) are used, matching upstream GCC.  */

/* Gather float elements from BASE addressed by the 32-bit indices in
   INDEX, each index scaled by SCALE; all-ones mask, zeroed source.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE)                            \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),     \
                                        (float const *)BASE,            \
                                        (__v4si)(__m128i)INDEX,         \
                                        _mm_set1_ps ((float)(int) -1),  \
                                        (int)SCALE)

/* Masked variant: SRC supplies the pass-through lanes, MASK the
   per-element selector.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)     \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,     \
                                        (float const *)BASE,     \
                                        (__v4si)(__m128i)INDEX,  \
                                        (__v4sf)(__m128)MASK,    \
                                        (int)SCALE)

#define _mm256_i32gather_ps(BASE, INDEX, SCALE)                        \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
                                        (float const *)BASE,           \
                                        (__v8si)(__m256i)INDEX,        \
                                        (__v8sf)_mm256_set1_ps (       \
                                          (float)(int) -1),            \
                                        (int)SCALE)

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,    \
                                        (float const *)BASE,    \
                                        (__v8si)(__m256i)INDEX, \
                                        (__v8sf)(__m256)MASK,   \
                                        (int)SCALE)

/* 64-bit indices; only the low 2 result lanes are gathered, the upper
   lanes of the 128-bit destination come from the zeroed source.  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE)                            \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),     \
                                        (float const *)BASE,            \
                                        (__v2di)(__m128i)INDEX,         \
                                        (__v4sf)_mm_set1_ps (           \
                                          (float)(int) -1),             \
                                        (int)SCALE)

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)     \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,     \
                                        (float const *)BASE,     \
                                        (__v2di)(__m128i)INDEX,  \
                                        (__v4sf)(__m128)MASK,    \
                                        (int)SCALE)

/* 256-bit index vector, 128-bit float result.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)                         \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),  \
                                           (float const *)BASE,         \
                                           (__v4di)(__m256i)INDEX,      \
                                           (__v4sf)_mm_set1_ps(         \
                                             (float)(int) -1),          \
                                           (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)    \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,    \
                                           (float const *)BASE,    \
                                           (__v4di)(__m256i)INDEX, \
                                           (__v4sf)(__m128)MASK,   \
                                           (int)SCALE)
/* Macro fallbacks for the 64-bit-integer gather intrinsics
   (!__OPTIMIZE__ builds).  */

/* Gather 64-bit integer elements from BASE addressed by the 32-bit
   indices in INDEX, each index scaled by SCALE; all-ones mask,
   zeroed source.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE)                 \
  (__m128i) __builtin_ia32_gathersiv2di (                       \
      (__v2di) _mm_setzero_si128 (),                            \
      (long long const *)BASE,                                  \
      (__v4si)(__m128i)INDEX,                                   \
      (__v2di)_mm_set1_epi64x (-1),                             \
      (int)SCALE)

/* Masked variant: SRC supplies the pass-through lanes, MASK the
   per-element selector.  */
#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv2di (                       \
      (__v2di)(__m128i)SRC,                                     \
      (long long const *)BASE,                                  \
      (__v4si)(__m128i)INDEX,                                   \
      (__v2di)(__m128i)MASK,                                    \
      (int)SCALE)

/* 256-bit destination, 32-bit indices.  */
#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)              \
  (__m256i) __builtin_ia32_gathersiv4di (                       \
      (__v4di) _mm256_setzero_si256 (),                         \
      (long long const *)BASE,                                  \
      (__v4si)(__m128i)INDEX,                                   \
      (__v4di)_mm256_set1_epi64x (-1),                          \
      (int)SCALE)

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di (                       \
      (__v4di)(__m256i)SRC,                                     \
      (long long const *)BASE,                                  \
      (__v4si)(__m128i)INDEX,                                   \
      (__v4di)(__m256i)MASK,                                    \
      (int)SCALE)

/* 64-bit indices, 128-bit destination.  */
#define _mm_i64gather_epi64(BASE, INDEX, SCALE)                 \
  (__m128i) __builtin_ia32_gatherdiv2di (                       \
      (__v2di) _mm_setzero_si128 (),                            \
      (long long const *)BASE,                                  \
      (__v2di)(__m128i)INDEX,                                   \
      (__v2di)_mm_set1_epi64x (-1),                             \
      (int)SCALE)

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di (                       \
      (__v2di)(__m128i)SRC,                                     \
      (long long const *)BASE,                                  \
      (__v2di)(__m128i)INDEX,                                   \
      (__v2di)(__m128i)MASK,                                    \
      (int)SCALE)

/* 64-bit indices, 256-bit destination.  */
#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)              \
  (__m256i) __builtin_ia32_gatherdiv4di (                       \
      (__v4di) _mm256_setzero_si256 (),                         \
      (long long const *)BASE,                                  \
      (__v4di)(__m256i)INDEX,                                   \
      (__v4di)_mm256_set1_epi64x (-1),                          \
      (int)SCALE)

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di (                       \
      (__v4di)(__m256i)SRC,                                     \
      (long long const *)BASE,                                  \
      (__v4di)(__m256i)INDEX,                                   \
      (__v4di)(__m256i)MASK,                                    \
      (int)SCALE)
/* Macro fallbacks for the 32-bit-integer gather intrinsics
   (!__OPTIMIZE__ builds).  */

/* Gather 32-bit integer elements from BASE addressed by the 32-bit
   indices in INDEX, each index scaled by SCALE; all-ones mask,
   zeroed source.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)                 \
  (__m128i) __builtin_ia32_gathersiv4si (                       \
      (__v4si) _mm_setzero_si128 (),                            \
      (int const *)BASE,                                        \
      (__v4si)(__m128i)INDEX,                                   \
      (__v4si)_mm_set1_epi32 (-1),                              \
      (int)SCALE)

/* Masked variant: SRC supplies the pass-through lanes, MASK the
   per-element selector.  */
#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si (                       \
      (__v4si)(__m128i)SRC,                                     \
      (int const *)BASE,                                        \
      (__v4si)(__m128i)INDEX,                                   \
      (__v4si)(__m128i)MASK,                                    \
      (int)SCALE)

/* 256-bit destination, 32-bit indices.  */
#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)              \
  (__m256i) __builtin_ia32_gathersiv8si (                       \
      (__v8si) _mm256_setzero_si256 (),                         \
      (int const *)BASE,                                        \
      (__v8si)(__m256i)INDEX,                                   \
      (__v8si)_mm256_set1_epi32 (-1),                           \
      (int)SCALE)

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si (                       \
      (__v8si)(__m256i)SRC,                                     \
      (int const *)BASE,                                        \
      (__v8si)(__m256i)INDEX,                                   \
      (__v8si)(__m256i)MASK,                                    \
      (int)SCALE)

/* 64-bit indices; only the low result lanes are gathered, the rest of
   the 128-bit destination comes from the source operand.  */
#define _mm_i64gather_epi32(BASE, INDEX, SCALE)                 \
  (__m128i) __builtin_ia32_gatherdiv4si (                       \
      (__v4si) _mm_setzero_si128 (),                            \
      (int const *)BASE,                                        \
      (__v2di)(__m128i)INDEX,                                   \
      (__v4si)_mm_set1_epi32 (-1),                              \
      (int)SCALE)

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si (                       \
      (__v4si)(__m128i)SRC,                                     \
      (int const *)BASE,                                        \
      (__v2di)(__m128i)INDEX,                                   \
      (__v4si)(__m128i)MASK,                                    \
      (int)SCALE)

/* 256-bit index vector, 128-bit integer result.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)              \
  (__m128i) __builtin_ia32_gatherdiv4si256 (                    \
      (__v4si) _mm_setzero_si128 (),                            \
      (int const *)BASE,                                        \
      (__v4di)(__m256i)INDEX,                                   \
      (__v4si)_mm_set1_epi32 (-1),                              \
      (int)SCALE)

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 (                    \
      (__v4si)(__m128i)SRC,                                     \
      (int const *)BASE,                                        \
      (__v4di)(__m256i)INDEX,                                   \
      (__v4si)(__m128i)MASK,                                    \
      (int)SCALE)
#endif  /* __OPTIMIZE__ */

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.