/* { dg-do compile } */
/* { dg-require-effective-target ilp32 } */
/* { dg-require-effective-target sse2 } */
/* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
/* { dg-skip-if "no stdint" { vxworks_kernel } } */
6 |
|
|
|
7 |
|
|
#include <emmintrin.h>
|
8 |
|
|
#include <stdint.h>
|
9 |
|
|
|
10 |
|
|
/* size_t spelled via the compiler's predefined macro rather than a header.  */
typedef __SIZE_TYPE__ size_t;

/* 16-byte GCC vector types: vFloat holds four floats, vDouble two doubles.
   bar() passes them directly to the SSE/SSE2 intrinsics.  */
typedef float vFloat __attribute__ ((__vector_size__ (16)));
typedef double vDouble __attribute__ ((__vector_size__ (16)));
|
13 |
|
|
/* Buffer descriptor handed to bar().
   Only `data` and `w` are read by code in this file; `h` and `bytes` are
   carried along but never referenced here (presumably height and byte
   size of the buffer -- confirm against the code this test was reduced
   from).  */
typedef struct buf
{
  void *data;       /* element storage; bar() accesses it as float *  */
  unsigned long h;  /* unused in this file  */
  unsigned long w;  /* bar() uses dest->w as its element-count bound  */
  size_t bytes;     /* unused in this file  */
} buf;
|
20 |
|
|
|
21 |
|
|
/* Generic job header; embedded as the first member of struct fj so that
   foo() can convert a job * into an fj *.

   NOTE(review): `next` and `func` refer to `struct Job` (capital J), which
   is a different, never-completed type from `struct job`.  It compiles
   because only pointers to it are used.  This looks like an artifact of
   test-case reduction; it is left untouched so the test keeps exercising
   the same code.  */
typedef struct job
{
  struct Job *next;               /* link to another job (incomplete type)  */
  void *info;                     /* opaque payload; unused in this file  */
  long (*func)(struct Job *job);  /* job callback; never called in this file  */
  long error;                     /* unused in this file  */
} job;
|
28 |
|
|
|
29 |
|
|
typedef struct fj
|
30 |
|
|
{
|
31 |
|
|
job hd;
|
32 |
|
|
buf src;
|
33 |
|
|
buf dest;
|
34 |
|
|
float g;
|
35 |
|
|
unsigned int flags;
|
36 |
|
|
} fj;
|
37 |
|
|
|
38 |
|
|
/* 256-entry lookup tables read by bar().  They have static storage and no
   initializer, so every element is zero.  */
static const double r[256];
static const double t[256];
|
39 |
|
|
|
40 |
|
|
long bar (const buf *src, const buf *dest, float g, unsigned int flags)
|
41 |
|
|
{
|
42 |
|
|
float *d0 = (float*) src->data;
|
43 |
|
|
float *d1 = (float*) dest->data;
|
44 |
|
|
uintptr_t w = dest->w;
|
45 |
|
|
uintptr_t idx;
|
46 |
|
|
vFloat p0;
|
47 |
|
|
static const vFloat m0;
|
48 |
|
|
static const vDouble p[3], m, b;
|
49 |
|
|
float *sr = d0;
|
50 |
|
|
float *dr = d1;
|
51 |
|
|
for( idx = 0; idx + 8 <= w; idx += 8 )
|
52 |
|
|
{
|
53 |
|
|
vFloat f0 = _mm_loadu_ps (sr);
|
54 |
|
|
vFloat f1 = _mm_loadu_ps (sr + 4);
|
55 |
|
|
sr += 8;
|
56 |
|
|
vFloat fa0 = _mm_andnot_ps (m0, f0);
|
57 |
|
|
vFloat fa1 = _mm_andnot_ps (m0, f1);
|
58 |
|
|
vDouble v0 = _mm_cvtps_pd (fa0);
|
59 |
|
|
vDouble v1 = _mm_cvtps_pd (_mm_movehl_ps (fa0, fa0));
|
60 |
|
|
vDouble v2 = _mm_cvtps_pd (fa1);
|
61 |
|
|
vDouble v3 = _mm_cvtps_pd (_mm_movehl_ps (fa1, fa1));
|
62 |
|
|
vDouble vi0, vi1, vi2, vi3;
|
63 |
|
|
__m128i b0, b1, b2, b3;
|
64 |
|
|
b0 = _mm_packs_epi32 (_mm_packs_epi32 (b0, b1), _mm_packs_epi32 (b2, b3));
|
65 |
|
|
b1 = _mm_srli_epi64 (b0, 32);
|
66 |
|
|
unsigned int i0 = _mm_cvtsi128_si32 (b0);
|
67 |
|
|
unsigned int i2 = _mm_cvtsi128_si32 (b1);
|
68 |
|
|
v0 -= _mm_loadh_pd (_mm_load_sd (r + (i0 & 0xff)), r + (i0 >> 16));
|
69 |
|
|
v1 -= _mm_loadh_pd (_mm_load_sd (r + (i2 & 0xff)), r + (i2 >> 16));
|
70 |
|
|
b0 = _mm_unpackhi_epi64 (b0, b0);
|
71 |
|
|
b1 = _mm_unpackhi_epi64 (b1, b1);
|
72 |
|
|
unsigned int i4 = _mm_cvtsi128_si32 (b0);
|
73 |
|
|
unsigned int i6 = _mm_cvtsi128_si32 (b1);
|
74 |
|
|
v2 -= _mm_loadh_pd (_mm_load_sd (r + (i4 & 0xff)), r + (i4 >> 16));
|
75 |
|
|
v3 -= _mm_loadh_pd (_mm_load_sd (r + (i6 & 0xff)), r + (i6 >> 16));
|
76 |
|
|
v0 = p[0] + (p[1] + p[2] * v0) * v0;
|
77 |
|
|
v1 = p[0] + (p[1] + p[2] * v1) * v1;
|
78 |
|
|
v2 = p[0] + (p[1] + p[2] * v2) * v2;
|
79 |
|
|
v3 = p[0] + (p[1] + p[2] * v3) * v3;
|
80 |
|
|
vi0 = (vDouble) _mm_slli_epi64 ((__m128i)((vi0 + b) + m), 52);
|
81 |
|
|
vi1 = (vDouble) _mm_slli_epi64 ((__m128i)((vi1 + b) + m), 52);
|
82 |
|
|
vi2 = (vDouble) _mm_slli_epi64 ((__m128i)((vi2 + b) + m), 52);
|
83 |
|
|
vi3 = (vDouble) _mm_slli_epi64 ((__m128i)((vi3 + b) + m), 52);
|
84 |
|
|
vi0 *= _mm_loadh_pd (_mm_load_sd (t + (i0 & 0xff)), t + (i0 >> 16));
|
85 |
|
|
vi1 *= _mm_loadh_pd (_mm_load_sd (t + (i2 & 0xff)), t + (i2 >> 16));
|
86 |
|
|
vi2 *= _mm_loadh_pd (_mm_load_sd (t + (i4 & 0xff)), t + (i4 >> 16));
|
87 |
|
|
vi3 *= _mm_loadh_pd (_mm_load_sd (t + (i6 & 0xff)), t + (i6 >> 16));
|
88 |
|
|
v0 *= vi0;
|
89 |
|
|
v1 *= vi1;
|
90 |
|
|
v2 *= vi2;
|
91 |
|
|
v3 *= vi3;
|
92 |
|
|
vFloat r0 = _mm_movelh_ps (_mm_cvtpd_ps( v0 ), _mm_cvtpd_ps (v1));
|
93 |
|
|
vFloat r1 = _mm_movelh_ps (_mm_cvtpd_ps( v2 ), _mm_cvtpd_ps (v3));
|
94 |
|
|
vFloat z0 = _mm_cmpeq_ps (f0, _mm_setzero_ps());
|
95 |
|
|
vFloat z1 = _mm_cmpeq_ps (f1, _mm_setzero_ps());
|
96 |
|
|
r0 = _mm_andnot_ps (z0, r0);
|
97 |
|
|
r1 = _mm_andnot_ps (z1, r1);
|
98 |
|
|
z0 = _mm_and_ps (z0, p0);
|
99 |
|
|
z1 = _mm_and_ps (z1, p0);
|
100 |
|
|
r0 = _mm_or_ps (r0, z0);
|
101 |
|
|
r1 = _mm_or_ps (r1, z1);
|
102 |
|
|
_mm_storeu_ps (dr, r0);
|
103 |
|
|
_mm_storeu_ps (dr + 4, r1);
|
104 |
|
|
dr += 8;
|
105 |
|
|
}
|
106 |
|
|
return 0;
|
107 |
|
|
}
|
108 |
|
|
|
109 |
|
|
/* Job entry point: unpacks the enclosing fj and forwards its fields to
   bar(), returning bar()'s result.

   j must point at the `hd` member of an fj (its first member), because
   the pointer is cast directly to fj *.  */
long foo (job *j )
{
  fj *jd = (fj*) j;
  return bar (&jd->src, &jd->dest, jd->g, jd->flags);
}
|
114 |
|
|
|
115 |
|
|
/* { dg-final { scan-rtl-dump-not "deleted 1 dead insns" "csa" } } */
/* { dg-final { cleanup-rtl-dump "csa" } } */