1 |
786 |
skrzyp |
// This file is part of the uSTL library, an STL implementation.
|
2 |
|
|
//
|
3 |
|
|
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
|
4 |
|
|
// This file is free software, distributed under the MIT License.
|
5 |
|
|
//
|
6 |
|
|
/// \file simd.h
/// \brief SIMD-type algorithms, with hardware acceleration, if available.
///
/// All algorithms are container-based because iterator syntax is just too
/// damn verbose and because the specializations need to be able to tell
/// how many elements are in the container in order to choose the proper SIMD
/// instruction set (e.g. 4 floats select SSE, while 2 floats select 3dNow!).
/// Specializations are only for the tuple template because the container
/// must be of a fixed and compile-time-known size for the compiler to be
/// able to choose the specialization.
|
16 |
|
|
|
17 |
|
|
#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
|
18 |
|
|
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
|
19 |
|
|
|
20 |
|
|
#include "ulimits.h"
|
21 |
|
|
#if HAVE_MATH_H
|
22 |
|
|
#include <math.h>
|
23 |
|
|
#endif
|
24 |
|
|
|
25 |
|
|
namespace ustl {
|
26 |
|
|
namespace simd {
|
27 |
|
|
|
28 |
|
|
//----------------------------------------------------------------------
|
29 |
|
|
// Generic algorithms
|
30 |
|
|
//----------------------------------------------------------------------
|
31 |
|
|
|
32 |
|
|
/// Invokes \p op on every element of \p op1, front to back.
/// Whatever \p op returns is discarded; any effect on the container comes
/// from \p op modifying the element it is handed.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    typename Ctr::iterator cur = op1.begin();
    while (cur != op1.end()) {
        op (*cur);
        ++cur;
    }
}
|
39 |
|
|
|
40 |
|
|
/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
|
41 |
|
|
template <typename Ctr, typename BinaryOperation>
|
42 |
|
|
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
|
43 |
|
|
{
|
44 |
|
|
assert (op2.size() <= op1.size());
|
45 |
|
|
typename Ctr::const_iterator i1 (op1.begin());
|
46 |
|
|
typename Ctr::iterator i2 (op2.begin());
|
47 |
|
|
for (; i2 != op2.end(); ++i1, ++i2)
|
48 |
|
|
*i2 = op (*i2, *i1);
|
49 |
|
|
}
|
50 |
|
|
|
51 |
|
|
/// Copies \p op1 into \p result.
/// Declared ahead of its definition because the three-operand packop below
/// calls it, and the definition only appears later in this file.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result);

/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
///
/// Implemented as a copy of \p op1 into \p result followed by the
/// two-operand packop, so that result[i] = op (op1[i], op2[i]). Going through
/// passign and the two-operand packop keeps any specializations of those two
/// functions in play for the three-operand form as well.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    passign (op1, result);
    // The operation must be forwarded; without it the call resolves to the
    // unary overload and the requested operation is never applied.
    packop (op2, result, op);
}
|
59 |
|
|
|
60 |
|
|
/// Copies \p op1 into \p result.
|
61 |
|
|
template <typename Ctr>
|
62 |
|
|
inline void passign (const Ctr& op1, Ctr& result)
|
63 |
|
|
{
|
64 |
|
|
assert (op1.size() <= result.size());
|
65 |
|
|
typename Ctr::iterator d (result.begin());
|
66 |
|
|
foreach (typename Ctr::const_iterator, s, op1)
|
67 |
|
|
*d++ = *s;
|
68 |
|
|
}
|
69 |
|
|
|
70 |
|
|
/// Fills \p result with result.size() consecutive elements read through
/// \p op1. The caller has to guarantee that many elements are readable
/// starting at \p op1; nothing here checks it.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    for (typename Ctr::iterator dst = result.begin(); dst != result.end(); ++dst, ++op1)
        *dst = *op1;
}
|
77 |
|
|
|
78 |
|
|
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
|
79 |
|
|
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
|
80 |
|
|
{
|
81 |
|
|
assert (op1.size() <= op2.size());
|
82 |
|
|
typename Ctr1::const_iterator i1 (op1.begin());
|
83 |
|
|
typename Ctr2::iterator i2 (op2.begin());
|
84 |
|
|
for (; i1 != op1.end(); ++i1, ++i2)
|
85 |
|
|
*i2 = f (*i1);
|
86 |
|
|
}
|
87 |
|
|
|
88 |
|
|
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
|
89 |
|
|
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
|
90 |
|
|
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
|
91 |
|
|
STD_BINARY_FUNCTOR (fpshl, T, (a << b))
|
92 |
|
|
STD_BINARY_FUNCTOR (fpshr, T, (a >> b))
|
93 |
|
|
STD_BINARY_FUNCTOR (fpmin, T, (min (a, b)))
|
94 |
|
|
STD_BINARY_FUNCTOR (fpmax, T, (max (a, b)))
|
95 |
|
|
STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2))
|
96 |
|
|
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
|
97 |
|
|
#if HAVE_MATH_H
// Math functors. Those that call into <math.h> evaluate reset_mmx() first
// (via the comma operator) and then cast the library result.
// NOTE(review): reset_mmx() is declared outside this file; presumably it
// makes the FPU usable again after MMX code has run - confirm.
STD_UNARY_FUNCTOR (fpreciprocal,    T,  (1 / a))
STD_UNARY_FUNCTOR (fpsqrt,          T,  (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt,     T,  (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin,            T,  (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos,            T,  (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan,            T,  (reset_mmx(), T (tan (a))))
// Rounding conversion: uses rintf when the platform has it, rint otherwise.
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
// double -> int32_t always rounds with rint, whatever HAVE_RINTF says.
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const
{
    reset_mmx();
    return int32_t (rint (a));
}
#endif
|
111 |
|
|
// Floating-point averages are the plain midpoint (a + b) / 2; the generic
// fpavg expression adds 1 before halving.
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const
{
    return (a + b) / 2;
}
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const
{
    return (a + b) / 2;
}
|
113 |
|
|
|
114 |
|
|
// Generates `name (ctr)`: an in-place element-wise operation that runs the
// unary packop with a default-constructed operation<Ctr::value_type>.
#define SIMD_PACKEDOP1(name, operation)                         \
template <typename Ctr>                                         \
inline void name (Ctr& op1)                                     \
{                                                               \
    packop (op1, operation<typename Ctr::value_type>());        \
}
|
121 |
|
|
// Generates `name (op1, op2)`: runs the two-operand packop with a
// default-constructed operation<Ctr::value_type>; the result lands in op2.
#define SIMD_PACKEDOP2(name, operation)                         \
template <typename Ctr>                                         \
inline void name (const Ctr& op1, Ctr& op2)                     \
{                                                               \
    packop (op1, op2, operation<typename Ctr::value_type>());   \
}
|
128 |
|
|
// Generates `name (op1, op2, result)`: runs the three-operand packop with a
// default-constructed operation<Ctr::value_type>.
#define SIMD_PACKEDOP3(name, operation)                                 \
template <typename Ctr>                                                 \
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)          \
{                                                                       \
    packop (op1, op2, result, operation<typename Ctr::value_type>());   \
}
|
135 |
|
|
// Generates `name (value)`: the scalar counterpart of a packed operation,
// returning operation<T> applied to the single argument.
#define SIMD_SINGLEOP1(name, operation)         \
template <typename T>                           \
inline T name (T op)                            \
{                                               \
    return operation<T>() (op);                 \
}
|
142 |
|
|
// Generates `name (op1, op2)`: converts op1 into op2 through pconvert, using
// operation<source element type, destination element type>.
#define SIMD_CONVERTOP(name, operation)                                         \
template <typename Ctr1, typename Ctr2>                                         \
inline void name (const Ctr1& op1, Ctr2& op2)                                   \
{                                                                               \
    pconvert (op1, op2,                                                         \
              operation<typename Ctr1::value_type, typename Ctr2::value_type>());\
}
|
150 |
|
|
|
151 |
|
|
// Two-operand packed operations: name (op1, op2) stores op (op2[i], op1[i])
// back into op2.
SIMD_PACKEDOP2 (padd,   plus)
SIMD_PACKEDOP2 (psub,   minus)
SIMD_PACKEDOP2 (pmul,   multiplies)
SIMD_PACKEDOP2 (pdiv,   divides)
SIMD_PACKEDOP2 (pand,   bitwise_and)
SIMD_PACKEDOP2 (por,    bitwise_or)
SIMD_PACKEDOP2 (pxor,   bitwise_xor)
SIMD_PACKEDOP2 (pshl,   fpshl)
SIMD_PACKEDOP2 (pshr,   fpshr)
// padds was the only operation with a three-operand form but no two-operand
// one; it is provided here so both families expose the same set of names.
SIMD_PACKEDOP2 (padds,  fpadds)
SIMD_PACKEDOP2 (psubs,  fpsubs)
SIMD_PACKEDOP2 (pmin,   fpmin)
SIMD_PACKEDOP2 (pmax,   fpmax)
SIMD_PACKEDOP2 (pavg,   fpavg)
|
164 |
|
|
|
165 |
|
|
// Three-operand packed operations: name (op1, op2, result) leaves both
// inputs untouched and writes the combined elements to result.
SIMD_PACKEDOP3 (padd,   plus)
SIMD_PACKEDOP3 (psub,   minus)
SIMD_PACKEDOP3 (pmul,   multiplies)
SIMD_PACKEDOP3 (pdiv,   divides)
SIMD_PACKEDOP3 (pand,   bitwise_and)
SIMD_PACKEDOP3 (por,    bitwise_or)
SIMD_PACKEDOP3 (pxor,   bitwise_xor)
SIMD_PACKEDOP3 (pshl,   fpshl)
SIMD_PACKEDOP3 (pshr,   fpshr)
SIMD_PACKEDOP3 (padds,  fpadds)
SIMD_PACKEDOP3 (psubs,  fpsubs)
SIMD_PACKEDOP3 (pmin,   fpmin)
SIMD_PACKEDOP3 (pmax,   fpmax)
SIMD_PACKEDOP3 (pavg,   fpavg)
|
179 |
|
|
|
180 |
|
|
#if HAVE_MATH_H
|
181 |
|
|
// In-place packed math over every element of a container.
SIMD_PACKEDOP1 (precip,         fpreciprocal)
SIMD_PACKEDOP1 (psqrt,          fpsqrt)
SIMD_PACKEDOP1 (precipsqrt,     fprecipsqrt)
SIMD_PACKEDOP1 (psin,           fsin)
SIMD_PACKEDOP1 (pcos,           fcos)
SIMD_PACKEDOP1 (ptan,           ftan)

// Scalar counterparts taking and returning a single value.
SIMD_SINGLEOP1 (srecip,         fpreciprocal)
SIMD_SINGLEOP1 (ssqrt,          fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt,     fprecipsqrt)
SIMD_SINGLEOP1 (ssin,           fsin)
SIMD_SINGLEOP1 (scos,           fcos)
SIMD_SINGLEOP1 (stan,           ftan)

// Packed rounding conversion between containers of different element types.
SIMD_CONVERTOP (pround,         fround)
|
196 |
|
|
|
197 |
|
|
/// Rounds the single value \p op to int32_t using fround<T,int32_t>.
template <typename T>
inline int32_t sround (T op)
{
    return fround<T,int32_t>() (op);
}
|
198 |
|
|
#endif
|
199 |
|
|
|
200 |
|
|
// The generator macros are implementation details of this header; remove
// all five so none of them leaks into files that include it.
#undef SIMD_CONVERTOP
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
|
204 |
|
|
|
205 |
|
|
//----------------------------------------------------------------------
|
206 |
|
|
// Vector types to cast tuple data to
|
207 |
|
|
//----------------------------------------------------------------------
|
208 |
|
|
|
209 |
|
|
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
|
210 |
|
|
#define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))
|
211 |
|
|
#else
|
212 |
|
|
#define VECTOR_ATTRIBUTE(mode,vs)
|
213 |
|
|
#endif
|
214 |
|
|
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
|
215 |
|
|
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
|
216 |
|
|
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
|
217 |
|
|
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
|
218 |
|
|
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
|
219 |
|
|
#if HAVE_INT64_T
|
220 |
|
|
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
|
221 |
|
|
#endif
|
222 |
|
|
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
|
223 |
|
|
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
|
224 |
|
|
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
|
225 |
|
|
#undef VECTOR_ATTRIBUTE
|
226 |
|
|
|
227 |
|
|
// Inline-asm memory operand shorthands for element n of the input (oin) or
// output (oout) argument: R* are read ("m") operands, W* are write ("=m")
// operands.
#define SIMDA_RI(n)     "m"(oin[n])
#define SIMDA_RO(n)     "m"(oout[n])
#define SIMDA_WI(n)     "=m"(oin[n])
#define SIMDA_WO(n)     "=m"(oout[n])
|
231 |
|
|
|
232 |
|
|
//----------------------------------------------------------------------
|
233 |
|
|
// Hardware accelerated specializations
|
234 |
|
|
//----------------------------------------------------------------------
|
235 |
|
|
|
236 |
|
|
// Signature macros for explicit specializations on tuple<n,type>. Each one
// expands to the specialization header only; the function body follows the
// macro invocation. The parameters are always named oin and oout, which is
// what the asm operand lists in the sections below refer to.

// Two-operand packop for optype<type>.
#define SIMD_PKOP2_SPEC(n, type, optype)        \
template <>                                     \
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)

// passign between two tuples.
#define SIMD_PASSIGN_SPEC(n, type)              \
template <>                                     \
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)

// ipassign from an iterator into a tuple.
#define SIMD_IPASSIGN_SPEC(n, type)             \
template <>                                     \
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)

// pconvert from tuple<n,type1> to tuple<n,type2> for optype<type1,type2>.
#define SIMD_CONVERT_SPEC(n, type1, type2, optype)      \
template <>                                             \
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
|
248 |
|
|
|
249 |
|
|
#if CPU_HAS_MMX
|
250 |
|
|
// Operand/clobber lists for the MMX asm blocks. They are placed after the
// first ':' of the asm statement, so the output section is empty and both
// tuples are passed as "m" inputs; stores to oout are covered by the
// "memory" clobber.
//   STD: %0 = oout[0], %1 = oin[0]
//   DBL: %0 = oout[0], %1 = oout[2], %2 = oin[0], %3 = oin[2]
#define STD_MMX_ARGS    : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS    : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"

// packop specialization over one 8-byte register:
// load oout, apply `instruction` with oin as the source, store back to oout.
#define MMX_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movq %0, %%mm0\n\t"                           \
         #instruction " %1, %%mm0\n\t"                  \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// Same as MMX_PKOP2_SPEC but over two registers, one per half of the tuple.
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)   \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movq %0, %%mm0\n\t"                           \
         "movq %1, %%mm1\n\t"                           \
         #instruction " %2, %%mm0\n\t"                  \
         #instruction " %3, %%mm1\n\t"                  \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// passign specialization: one 8-byte move from oin to oout.
#define MMX_PASSIGN_SPEC(n,type)                        \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movq %1, %%mm0\n\t"                           \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// passign specialization: two 8-byte moves, one per half of the tuple.
#define MMX_DBL_PASSIGN_SPEC(n,type)                    \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movq %2, %%mm0\n\t"                           \
         "movq %3, %%mm1\n\t"                           \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// ipassign specialization: one 8-byte move from the iterator to oout.
#define MMX_IPASSIGN_SPEC(n,type)                       \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movq %1, %%mm0\n\t"                           \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// ipassign specialization: two 8-byte moves from the iterator to oout.
#define MMX_DBL_IPASSIGN_SPEC(n,type)                   \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movq %2, %%mm0\n\t"                           \
         "movq %3, %%mm1\n\t"                           \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}
|
270 |
|
|
|
271 |
|
|
// ---- 8 x uint8_t ----
MMX_PASSIGN_SPEC (8, uint8_t)
MMX_PKOP2_SPEC (8, uint8_t, plus,         paddb)
MMX_PKOP2_SPEC (8, uint8_t, minus,        psubb)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_and,  pand)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_or,   por)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_xor,  pxor)
MMX_PKOP2_SPEC (8, uint8_t, fpadds,       paddusb)
MMX_PKOP2_SPEC (8, uint8_t, fpsubs,       psubusb)

// ---- 8 x int8_t ----
MMX_PASSIGN_SPEC (8, int8_t)
MMX_PKOP2_SPEC (8, int8_t, plus,          paddb)
MMX_PKOP2_SPEC (8, int8_t, minus,         psubb)
MMX_PKOP2_SPEC (8, int8_t, bitwise_and,   pand)
MMX_PKOP2_SPEC (8, int8_t, bitwise_or,    por)
MMX_PKOP2_SPEC (8, int8_t, bitwise_xor,   pxor)
MMX_PKOP2_SPEC (8, int8_t, fpadds,        paddsb)
MMX_PKOP2_SPEC (8, int8_t, fpsubs,        psubsb)

// ---- 4 x uint16_t ----
MMX_PASSIGN_SPEC (4, uint16_t)
MMX_PKOP2_SPEC (4, uint16_t, plus,        paddw)
MMX_PKOP2_SPEC (4, uint16_t, minus,       psubw)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_and, pand)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_or,  por)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_xor, pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
// The shift specializations stay disabled here and in the sections below for that reason.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC (4, uint16_t, fpadds,      paddusw)
MMX_PKOP2_SPEC (4, uint16_t, fpsubs,      psubusw)

// ---- 4 x int16_t ----
MMX_PASSIGN_SPEC (4, int16_t)
MMX_PKOP2_SPEC (4, int16_t, plus,         paddw)
MMX_PKOP2_SPEC (4, int16_t, minus,        psubw)
MMX_PKOP2_SPEC (4, int16_t, bitwise_and,  pand)
MMX_PKOP2_SPEC (4, int16_t, bitwise_or,   por)
MMX_PKOP2_SPEC (4, int16_t, bitwise_xor,  pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC (4, int16_t, fpadds,       paddsw)
MMX_PKOP2_SPEC (4, int16_t, fpsubs,       psubsw)

// ---- 2 x uint32_t ----
MMX_PASSIGN_SPEC (2, uint32_t)
MMX_PKOP2_SPEC (2, uint32_t, plus,        paddd)
MMX_PKOP2_SPEC (2, uint32_t, minus,       psubd)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_and, pand)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_or,  por)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_xor, pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// ---- 2 x int32_t ----
MMX_PASSIGN_SPEC (2, int32_t)
MMX_PKOP2_SPEC (2, int32_t, plus,         paddd)
MMX_PKOP2_SPEC (2, int32_t, minus,        psubd)
MMX_PKOP2_SPEC (2, int32_t, bitwise_and,  pand)
MMX_PKOP2_SPEC (2, int32_t, bitwise_or,   por)
MMX_PKOP2_SPEC (2, int32_t, bitwise_xor,  pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)

// ---- 4 x uint32_t, processed as two register halves ----
MMX_DBL_PKOP2_SPEC (4, uint32_t, plus,        paddd)
MMX_DBL_PKOP2_SPEC (4, uint32_t, minus,       psubd)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_and, pand)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_or,  por)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_xor, pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// ---- 4 x int32_t, processed as two register halves ----
MMX_DBL_PKOP2_SPEC (4, int32_t, plus,         paddd)
MMX_DBL_PKOP2_SPEC (4, int32_t, minus,        psubd)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_and,  pand)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_or,   por)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_xor,  pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
|
345 |
|
|
|
346 |
|
|
// Average / min / max specializations, compiled only when SSE or 3DNow! is
// available.
// NOTE(review): pavgb and pavgw are documented as unsigned averages, yet they
// are also bound to the int8_t and int16_t tuples here - verify the results
// for negative elements against the generic fpavg.
#if CPU_HAS_SSE || CPU_HAS_3DNOW
MMX_PKOP2_SPEC (8, uint8_t,  fpavg, pavgb)
MMX_PKOP2_SPEC (8, int8_t,   fpavg, pavgb)
MMX_PKOP2_SPEC (4, uint16_t, fpavg, pavgw)
MMX_PKOP2_SPEC (4, int16_t,  fpavg, pavgw)
MMX_PKOP2_SPEC (8, uint8_t,  fpmin, pminub)
MMX_PKOP2_SPEC (8, uint8_t,  fpmax, pmaxub)
MMX_PKOP2_SPEC (4, int16_t,  fpmax, pmaxsw)
MMX_PKOP2_SPEC (4, int16_t,  fpmin, pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
|
356 |
|
|
|
357 |
|
|
// 3DNow! float specializations: two floats per register.
#if CPU_HAS_3DNOW
MMX_PASSIGN_SPEC (2, float)
MMX_PKOP2_SPEC (2, float, plus,       pfadd)
MMX_PKOP2_SPEC (2, float, minus,      pfsub)
MMX_PKOP2_SPEC (2, float, multiplies, pfmul)
MMX_PKOP2_SPEC (2, float, fpmin,      pfmin)
MMX_PKOP2_SPEC (2, float, fpmax,      pfmax)
// Four-float tuples are handled as two halves only when CPU_HAS_SSE is not
// defined; the SSE section further down covers them otherwise.
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC (4, float, plus,       pfadd)
MMX_DBL_PKOP2_SPEC (4, float, minus,      pfsub)
MMX_DBL_PKOP2_SPEC (4, float, multiplies, pfmul)
MMX_DBL_PKOP2_SPEC (4, float, fpmin,      pfmin)
MMX_DBL_PKOP2_SPEC (4, float, fpmax,      pfmax)
#endif
#endif // CPU_HAS_3DNOW
|
372 |
|
|
|
373 |
|
|
// Iterator-source assignment for the 8-byte tuples.
MMX_IPASSIGN_SPEC (8, uint8_t)
MMX_IPASSIGN_SPEC (4, uint16_t)
MMX_IPASSIGN_SPEC (2, uint32_t)
MMX_IPASSIGN_SPEC (2, float)

// 16-byte tuples are assigned as two 8-byte halves only when CPU_HAS_SSE is
// not defined; the SSE section provides its own versions otherwise.
#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC (4, float)
MMX_DBL_PASSIGN_SPEC (4, uint32_t)
MMX_DBL_PASSIGN_SPEC (4, int32_t)
MMX_DBL_IPASSIGN_SPEC (4, float)
MMX_DBL_IPASSIGN_SPEC (4, uint32_t)
MMX_DBL_IPASSIGN_SPEC (4, int32_t)
#endif
|
386 |
|
|
|
387 |
|
|
// Drop the MMX helper macros once every specialization has been emitted,
// including the double-width variants, which have no further use in this
// file. DBL_MMX_ARGS is deliberately kept: the SSE conversion specializations
// further down still expand it.
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
|
391 |
|
|
#endif // CPU_HAS_MMX
|
392 |
|
|
|
393 |
|
|
#if CPU_HAS_SSE
|
394 |
|
|
// Operand/clobber list for the SSE assignment specializations, placed after
// the first ':' of the asm statement: %0 = oout[0], %1 = oin[0], with xmm0
// as the only scratch register.
#define STD_SSE_ARGS    : "m"(oout[0]), "m"(oin[0]) : "xmm0", "memory"

// packop specialization over one 16-byte register. Both operands are loaded
// with unaligned moves before `instruction` combines them, and the result is
// stored back to oout. The asm therefore writes xmm1 as well as xmm0, so it
// carries its own clobber list naming both registers instead of reusing
// STD_SSE_ARGS, which only declares xmm0.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movups %0, %%xmm0\n\t"                        \
         "movups %1, %%xmm1\n\t"                        \
         #instruction " %%xmm1, %%xmm0\n\t"             \
         "movups %%xmm0, %0"                            \
         : : "m"(oout[0]), "m"(oin[0])                  \
         : "xmm0", "xmm1", "memory");                   \
}

// passign specialization: one unaligned 16-byte move from oin to oout.
#define SSE_PASSIGN_SPEC(n,type)                        \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movups %1, %%xmm0\n\t"                        \
         "movups %%xmm0, %0"                            \
         : STD_SSE_ARGS);                               \
}

// ipassign specialization: one unaligned 16-byte move from the iterator.
#define SSE_IPASSIGN_SPEC(n,type)                       \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movups %1, %%xmm0\n\t"                        \
         "movups %%xmm0, %0"                            \
         : STD_SSE_ARGS);                               \
}
|
404 |
|
|
// 16-byte assignment for every 4 x 32-bit tuple type.
SSE_PASSIGN_SPEC (4, float)
SSE_PASSIGN_SPEC (4, int32_t)
SSE_PASSIGN_SPEC (4, uint32_t)

// Packed single-precision operations on 4 x float.
SSE_PKOP2_SPEC (4, float, plus,        addps)
SSE_PKOP2_SPEC (4, float, minus,       subps)
SSE_PKOP2_SPEC (4, float, multiplies,  mulps)
SSE_PKOP2_SPEC (4, float, divides,     divps)
SSE_PKOP2_SPEC (4, float, bitwise_and, andps)
SSE_PKOP2_SPEC (4, float, bitwise_or,  orps)
SSE_PKOP2_SPEC (4, float, bitwise_xor, xorps)
SSE_PKOP2_SPEC (4, float, fpmax,       maxps)
SSE_PKOP2_SPEC (4, float, fpmin,       minps)
|
416 |
|
|
|
417 |
|
|
// 4 x float -> 4 x int32_t for fround: each cvtps2pi converts one pair of
// floats (oin[0..1], then oin[2..3]) into an MMX register, and the two
// registers are stored to oout[0..1] and oout[2..3].
// Operand numbering comes from DBL_MMX_ARGS, which is defined in the MMX
// section of this file.
SIMD_CONVERT_SPEC(4,float,int32_t,fround)
{
    asm ("cvtps2pi %2, %%mm0\n\t"
         "cvtps2pi %3, %%mm1\n\t"
         "movq %%mm0, %0\n\t"
         "movq %%mm1, %1"
         : DBL_MMX_ARGS);
    reset_mmx();
}
|
425 |
|
|
// 4 x int32_t -> 4 x float for fround. Operands: %0 = oout[0], %1 = oin[0],
// %2 = oin[2]. The upper pair is converted first, shufps $0x4E swaps the
// register halves to move it into the high half, then the lower pair is
// converted into the low half and all four floats are stored at once.
SIMD_CONVERT_SPEC(4,int32_t,float,fround)
{
    asm ("cvtpi2ps %2, %%xmm0\n\t"
         "shufps $0x4E,%%xmm0,%%xmm0\n\t"
         "cvtpi2ps %1, %%xmm0\n\t"
         "movups %%xmm0, %0"
         : : "m"(oout[0]), "m"(oin[0]), "m"(oin[2])
         : "xmm0", "memory");
}
|
432 |
|
|
// Scalar float -> int32_t rounding through cvtss2si.
// The result variable is a plain local: the `register` storage class is no
// longer valid C++ as of C++17, and the "=r" constraint already places the
// value in a general-purpose register.
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
    int32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
|
439 |
|
|
// Scalar float -> uint32_t rounding through cvtss2si.
// The result variable is a plain local: the `register` storage class is no
// longer valid C++ as of C++17, and the "=r" constraint already places the
// value in a general-purpose register.
// NOTE(review): cvtss2si is a signed conversion; inputs above the int32_t
// range presumably do not round to the matching uint32_t value - confirm.
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
    uint32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
|
446 |
|
|
|
447 |
|
|
// Iterator-source 16-byte assignment for every 4 x 32-bit tuple type.
SSE_IPASSIGN_SPEC (4, float)
SSE_IPASSIGN_SPEC (4, int32_t)
SSE_IPASSIGN_SPEC (4, uint32_t)
|
450 |
|
|
|
451 |
|
|
// The SSE helper macros are not needed past this point.
#undef STD_SSE_ARGS
#undef SSE_PKOP2_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_IPASSIGN_SPEC
|
455 |
|
|
#endif // CPU_HAS_SSE
|
456 |
|
|
|
457 |
|
|
// Remove the remaining helper macros so they do not leak out of this header.
#undef SIMDA_RI
#undef SIMDA_RO
#undef SIMDA_WI
#undef SIMDA_WO
// SIMD_PACKEDOP_SPEC is never defined in this file; the line is kept because
// undefining an unknown name is harmless. The macros actually defined for
// the specializations are the four SIMD_*_SPEC names below.
#undef SIMD_PACKEDOP_SPEC
#undef SIMD_PKOP2_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_CONVERT_SPEC
|
462 |
|
|
|
463 |
|
|
} // namespace simd
|
464 |
|
|
} // namespace ustl
|
465 |
|
|
|
466 |
|
|
#endif
|