OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [simd.h] - Blame information for rev 786

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 786 skrzyp
// This file is part of the uSTL library, an STL implementation.
2
//
3
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
4
// This file is free software, distributed under the MIT License.
5
//
6
/// \file simd.h
7
/// \brief SIMD-type algorithms, with hardware acceleration, if available.
8
///
9
/// All algorithms are container-based because iterator syntax is just too
10
/// damn verbose and because the specializations need to be able to tell
11
/// how many elements are in the container in order to choose proper SIMD
12
/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
13
/// Specializations are only for the tuple template because the container
14
/// must be of a fixed and compile-time-known size for the compiler to be
15
/// able to choose the specialization.
16
 
17
#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
18
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
19
 
20
#include "ulimits.h"
21
#if HAVE_MATH_H
22
    #include <math.h>
23
#endif
24
 
25
namespace ustl {
26
namespace simd {
27
 
28
//----------------------------------------------------------------------
29
// Generic algorithms
30
//----------------------------------------------------------------------
31
 
32
/// Applies \p op to each element in \p op1, storing the result back in place.
///
/// \param op1 container whose elements are transformed.
/// \param op  unary functor called as op (element); the value it returns
///            replaces the element.
///
/// The unary functors generated in this header are plain value expressions
/// (for example 1 / a), so their result must be assigned back to the element;
/// merely calling the functor would leave \p op1 untouched.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    for (typename Ctr::iterator i (op1.begin()); i != op1.end(); ++i)
        *i = op (*i);
}
39
 
40
/// Folds \p op1 into \p op2 element by element: every op2[i] is replaced
/// with op (op2[i], op1[i]).
///
/// \param op1 read-only operand; must hold at least op2.size() elements.
/// \param op2 left operand and destination.
/// \param op  binary functor, called with the \p op2 element first.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
    assert (op2.size() <= op1.size());
    typename Ctr::const_iterator src (op1.begin());
    typename Ctr::iterator dst (op2.begin());
    while (dst != op2.end()) {
        *dst = op (*dst, *src);
        ++src;
        ++dst;
    }
}
50
 
51
/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
///
/// \p op1 is first copied into \p result with passign; the two-container
/// packop then folds \p op2 into it, so result[i] = op (op1[i], op2[i]).
///
/// \param op1    left operand.
/// \param op2    right operand; must hold at least op1.size() elements.
/// \param result destination; must hold at least op1.size() elements. The
///               two-container packop additionally asserts that \p result
///               is no larger than \p op2.
/// \param op     binary functor to apply.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    passign (op1, result);
    // The functor must be forwarded: without it the call resolves to the
    // unary packop overload and \p op is never applied.
    packop (op2, result, op);
}
59
 
60
/// Copies all elements of \p op1 to the beginning of \p result.
///
/// \param op1    source container.
/// \param result destination; must hold at least op1.size() elements.
///               Elements past op1.size() are left as they were.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
    assert (op1.size() <= result.size());
    typename Ctr::iterator out (result.begin());
    for (typename Ctr::const_iterator in (op1.begin()); in != op1.end(); ++in, ++out)
        *out = *in;
}
69
 
70
/// Fills \p result with result.size() consecutive elements read from \p op1.
///
/// \param op1    iterator to the first source element; the caller must make
///               sure result.size() elements are readable from it.
/// \param result destination container; every element is overwritten.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    for (typename Ctr::iterator out (result.begin()); out != result.end(); ++out, ++op1)
        *out = *op1;
}
77
 
78
/// Converts each element of \p op1 with \p f and stores it at the same
/// position in \p op2.
///
/// \param op1 source container.
/// \param op2 destination; must hold at least op1.size() elements. Elements
///            past op1.size() are left as they were.
/// \param f   conversion functor, called as f (source element).
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
    assert (op1.size() <= op2.size());
    typename Ctr2::iterator out (op2.begin());
    typename Ctr1::const_iterator in (op1.begin());
    while (in != op1.end()) {
        *out = f (*in);
        ++in;
        ++out;
    }
}
87
 
88
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
89
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
90
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
91
STD_BINARY_FUNCTOR (fpshl,  T, (a << b))
92
STD_BINARY_FUNCTOR (fpshr,  T, (a >> b))
93
STD_BINARY_FUNCTOR (fpmin,  T, (min (a, b)))
94
STD_BINARY_FUNCTOR (fpmax,  T, (max (a, b)))
95
STD_BINARY_FUNCTOR (fpavg,  T, ((a + b + 1) / 2))
96
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
97
#if HAVE_MATH_H
98
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
99
STD_UNARY_FUNCTOR (fpsqrt,      T, (reset_mmx(), T (sqrt (a))))
100
STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
101
STD_UNARY_FUNCTOR (fsin,        T, (reset_mmx(), T (sin (a))))
102
STD_UNARY_FUNCTOR (fcos,        T, (reset_mmx(), T (cos (a))))
103
STD_UNARY_FUNCTOR (ftan,        T, (reset_mmx(), T (tan (a))))
104
#if HAVE_RINTF
105
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
106
#else
107
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
108
#endif
109
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
110
#endif
111
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
112
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
113
 
114
// Generators for the public p*/s* entry points. In every case the functor
// template "operation" is instantiated on the container's value_type (or on
// T for the scalar form) and a default-constructed instance is passed on.

/// name (op1): runs operation over op1 through the unary packop.
#define SIMD_PACKEDOP1(name, operation)                         \
template <typename Ctr>                                         \
inline void name (Ctr& op1)                                     \
{                                                               \
    packop (op1, operation<typename Ctr::value_type>());        \
}

/// name (op1, op2): runs operation through the two-container packop.
#define SIMD_PACKEDOP2(name, operation)                         \
template <typename Ctr>                                         \
inline void name (const Ctr& op1, Ctr& op2)                     \
{                                                               \
    packop (op1, op2, operation<typename Ctr::value_type>());   \
}

/// name (op1, op2, result): runs operation through the three-container packop.
#define SIMD_PACKEDOP3(name, operation)                                 \
template <typename Ctr>                                                 \
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)          \
{                                                                       \
    packop (op1, op2, result, operation<typename Ctr::value_type>());   \
}

/// name (value): applies operation<T> to one scalar and returns the result.
#define SIMD_SINGLEOP1(name, operation)         \
template <typename T>                           \
inline T name (T op)                            \
{                                               \
    return (operation<T>() (op));               \
}

/// name (op1, op2): converts op1 into op2 with operation<from, to> via pconvert.
#define SIMD_CONVERTOP(name, operation)                         \
template <typename Ctr1, typename Ctr2>                         \
inline void name (const Ctr1& op1, Ctr2& op2)                   \
{                                                               \
    pconvert (op1, op2, operation<typename Ctr1::value_type,    \
                                  typename Ctr2::value_type>()); \
}
150
 
151
// Two-operand forms: name (op1, op2) stores operation (op2[i], op1[i]) in
// op2[i]. The list mirrors the three-operand list; padds is included here so
// that saturating addition is available in both forms, like psubs.
SIMD_PACKEDOP2 (padd, plus)
SIMD_PACKEDOP2 (psub, minus)
SIMD_PACKEDOP2 (pmul, multiplies)
SIMD_PACKEDOP2 (pdiv, divides)
SIMD_PACKEDOP2 (pand, bitwise_and)
SIMD_PACKEDOP2 (por, bitwise_or)
SIMD_PACKEDOP2 (pxor, bitwise_xor)
SIMD_PACKEDOP2 (pshl, fpshl)
SIMD_PACKEDOP2 (pshr, fpshr)
SIMD_PACKEDOP2 (padds, fpadds)
SIMD_PACKEDOP2 (psubs, fpsubs)
SIMD_PACKEDOP2 (pmin, fpmin)
SIMD_PACKEDOP2 (pmax, fpmax)
SIMD_PACKEDOP2 (pavg, fpavg)
164
 
165
// Three-operand forms: name (op1, op2, result) leaves both inputs untouched
// and writes to a separate result container.

// Arithmetic
SIMD_PACKEDOP3 (padd,  plus)
SIMD_PACKEDOP3 (psub,  minus)
SIMD_PACKEDOP3 (pmul,  multiplies)
SIMD_PACKEDOP3 (pdiv,  divides)
// Bitwise
SIMD_PACKEDOP3 (pand,  bitwise_and)
SIMD_PACKEDOP3 (por,   bitwise_or)
SIMD_PACKEDOP3 (pxor,  bitwise_xor)
// Shifts
SIMD_PACKEDOP3 (pshl,  fpshl)
SIMD_PACKEDOP3 (pshr,  fpshr)
// Saturating arithmetic
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
// Minimum, maximum, average
SIMD_PACKEDOP3 (pmin,  fpmin)
SIMD_PACKEDOP3 (pmax,  fpmax)
SIMD_PACKEDOP3 (pavg,  fpavg)
179
 
180
#if HAVE_MATH_H
// Container forms of the math functors (dispatched through the unary packop).
SIMD_PACKEDOP1 (precip,     fpreciprocal)
SIMD_PACKEDOP1 (psqrt,      fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin,       fsin)
SIMD_PACKEDOP1 (pcos,       fcos)
SIMD_PACKEDOP1 (ptan,       ftan)

// Scalar forms: take one value and return the computed result.
SIMD_SINGLEOP1 (srecip,     fpreciprocal)
SIMD_SINGLEOP1 (ssqrt,      fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin,       fsin)
SIMD_SINGLEOP1 (scos,       fcos)
SIMD_SINGLEOP1 (stan,       ftan)

// Rounding conversion between two containers.
SIMD_CONVERTOP (pround, fround)

/// Rounds the single value \p op to int32_t with the fround functor.
template <typename T>
inline int32_t sround (T op)
{
    return (fround<T,int32_t>() (op));
}
#endif
199
 
200
// The generator macros are implementation details of this header; remove all
// five of them so none leaks into files that include it.
#undef SIMD_CONVERTOP
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
204
 
205
//----------------------------------------------------------------------
206
// Vector types to cast tuple data to
207
//----------------------------------------------------------------------
208
 
209
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
210
#define VECTOR_ATTRIBUTE(mode,vs)       __attribute__((vector_size(vs)))
211
#else
212
#define VECTOR_ATTRIBUTE(mode,vs)
213
#endif
214
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
215
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
216
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
217
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
218
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
219
#if HAVE_INT64_T
220
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
221
#endif
222
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
223
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
224
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
225
#undef VECTOR_ATTRIBUTE
226
 
227
// Inline-asm operand shorthands for element n of the oin / oout arguments of
// the specializations below. R/W selects the "m" (read) or "=m" (write)
// constraint; I/O selects oin or oout.
#define SIMDA_RI(n)     "m"(oin[n])
#define SIMDA_RO(n)     "m"(oout[n])
#define SIMDA_WI(n)     "=m"(oin[n])
#define SIMDA_WO(n)     "=m"(oout[n])

//----------------------------------------------------------------------
// Hardware accelerated specializations
//----------------------------------------------------------------------

// Each macro below expands to the header of an explicit specialization for
// tuple<n,type>; the code using it supplies the body. The parameters are
// always named oin (source) and oout (destination).

/// Header of the two-container packop specialization for functor optype<type>.
#define SIMD_PKOP2_SPEC(n, type, optype)                \
template <>                                             \
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)

/// Header of the passign specialization.
#define SIMD_PASSIGN_SPEC(n, type)                      \
template <>                                             \
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)

/// Header of the ipassign specialization; oin is an iterator here.
#define SIMD_IPASSIGN_SPEC(n, type)                     \
template <>                                             \
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)

/// Header of the pconvert specialization from type1 to type2 elements.
#define SIMD_CONVERT_SPEC(n, type1, type2, optype)      \
template <>                                             \
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
248
 
249
#if CPU_HAS_MMX
// Operand lists for the MMX bodies. Each list starts with ':' so the asm
// output section stays empty; every operand is passed as an "m" input and
// the "memory" clobber covers the stores.
//   STD: %0 = oout[0], %1 = oin[0]                       (one 8-byte block)
//   DBL: %0 = oout[0], %1 = oout[2], %2 = oin[0], %3 = oin[2]  (two blocks)
#define STD_MMX_ARGS    : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS    : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"

// oout = oout <instruction> oin on one 8-byte block, then reset_mmx().
#define MMX_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movq %0, %%mm0\n\t"                           \
         #instruction " %1, %%mm0\n\t"                  \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// Same operation on two consecutive 8-byte blocks, using mm0 and mm1.
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)   \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movq %0, %%mm0\n\t"                           \
         "movq %1, %%mm1\n\t"                           \
         #instruction " %2, %%mm0\n\t"                  \
         #instruction " %3, %%mm1\n\t"                  \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// Copies one 8-byte block from oin to oout.
#define MMX_PASSIGN_SPEC(n,type)                        \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movq %1, %%mm0\n\t"                           \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// Copies two consecutive 8-byte blocks from oin to oout.
#define MMX_DBL_PASSIGN_SPEC(n,type)                    \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movq %2, %%mm0\n\t"                           \
         "movq %3, %%mm1\n\t"                           \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}

// Iterator-source variants of the two copies above.
#define MMX_IPASSIGN_SPEC(n,type)                       \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movq %1, %%mm0\n\t"                           \
         "movq %%mm0, %0"                               \
         : STD_MMX_ARGS);                               \
    reset_mmx();                                        \
}

#define MMX_DBL_IPASSIGN_SPEC(n,type)                   \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movq %2, %%mm0\n\t"                           \
         "movq %3, %%mm1\n\t"                           \
         "movq %%mm0, %0\n\t"                           \
         "movq %%mm1, %1"                               \
         : DBL_MMX_ARGS);                               \
    reset_mmx();                                        \
}
270
 
271
// MMX specializations for integer tuples. Arguments are
// (element count, element type[, functor, MMX instruction]).

// --- 8 x uint8_t (unsigned saturation: paddusb / psubusb) ---
MMX_PASSIGN_SPEC (8, uint8_t)
MMX_PKOP2_SPEC (8, uint8_t, plus, paddb)
MMX_PKOP2_SPEC (8, uint8_t, minus, psubb)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_and, pand)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_or, por)
MMX_PKOP2_SPEC (8, uint8_t, bitwise_xor, pxor)
MMX_PKOP2_SPEC (8, uint8_t, fpadds, paddusb)
MMX_PKOP2_SPEC (8, uint8_t, fpsubs, psubusb)

// --- 8 x int8_t (signed saturation: paddsb / psubsb) ---
MMX_PASSIGN_SPEC (8, int8_t)
MMX_PKOP2_SPEC (8, int8_t, plus, paddb)
MMX_PKOP2_SPEC (8, int8_t, minus, psubb)
MMX_PKOP2_SPEC (8, int8_t, bitwise_and, pand)
MMX_PKOP2_SPEC (8, int8_t, bitwise_or, por)
MMX_PKOP2_SPEC (8, int8_t, bitwise_xor, pxor)
MMX_PKOP2_SPEC (8, int8_t, fpadds, paddsb)
MMX_PKOP2_SPEC (8, int8_t, fpsubs, psubsb)

// --- 4 x uint16_t ---
MMX_PASSIGN_SPEC (4, uint16_t)
MMX_PKOP2_SPEC (4, uint16_t, plus, paddw)
MMX_PKOP2_SPEC (4, uint16_t, minus, psubw)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_and, pand)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_or, por)
MMX_PKOP2_SPEC (4, uint16_t, bitwise_xor, pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
/// All shift specializations in this section stay disabled for that reason.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC (4, uint16_t, fpadds, paddusw)
MMX_PKOP2_SPEC (4, uint16_t, fpsubs, psubusw)

// --- 4 x int16_t ---
MMX_PASSIGN_SPEC (4, int16_t)
MMX_PKOP2_SPEC (4, int16_t, plus, paddw)
MMX_PKOP2_SPEC (4, int16_t, minus, psubw)
MMX_PKOP2_SPEC (4, int16_t, bitwise_and, pand)
MMX_PKOP2_SPEC (4, int16_t, bitwise_or, por)
MMX_PKOP2_SPEC (4, int16_t, bitwise_xor, pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC (4, int16_t, fpadds, paddsw)
MMX_PKOP2_SPEC (4, int16_t, fpsubs, psubsw)

// --- 2 x uint32_t ---
MMX_PASSIGN_SPEC (2, uint32_t)
MMX_PKOP2_SPEC (2, uint32_t, plus, paddd)
MMX_PKOP2_SPEC (2, uint32_t, minus, psubd)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_and, pand)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_or, por)
MMX_PKOP2_SPEC (2, uint32_t, bitwise_xor, pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// --- 2 x int32_t ---
MMX_PASSIGN_SPEC (2, int32_t)
MMX_PKOP2_SPEC (2, int32_t, plus, paddd)
MMX_PKOP2_SPEC (2, int32_t, minus, psubd)
MMX_PKOP2_SPEC (2, int32_t, bitwise_and, pand)
MMX_PKOP2_SPEC (2, int32_t, bitwise_or, por)
MMX_PKOP2_SPEC (2, int32_t, bitwise_xor, pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)

// --- 4 x uint32_t, processed as two 8-byte halves ---
MMX_DBL_PKOP2_SPEC (4, uint32_t, plus, paddd)
MMX_DBL_PKOP2_SPEC (4, uint32_t, minus, psubd)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_and, pand)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_or, por)
MMX_DBL_PKOP2_SPEC (4, uint32_t, bitwise_xor, pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// --- 4 x int32_t, processed as two 8-byte halves ---
MMX_DBL_PKOP2_SPEC (4, int32_t, plus, paddd)
MMX_DBL_PKOP2_SPEC (4, int32_t, minus, psubd)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_and, pand)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_or, por)
MMX_DBL_PKOP2_SPEC (4, int32_t, bitwise_xor, pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
345
 
346
#if CPU_HAS_SSE || CPU_HAS_3DNOW
// Packed average, minimum and maximum.
//
// pavgb and pavgw compute an unsigned rounded average, so they are bound to
// the unsigned element types only. Applied to int8_t / int16_t they give a
// wrong result whenever the operands differ in sign (for -1 and 1 the bytes
// 0xFF and 0x01 average to 0x80, i.e. -128). Signed tuples therefore use the
// generic fpavg implementation.
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
// Unsigned byte and signed word minimum/maximum.
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
356
 
357
#if CPU_HAS_3DNOW
// 3DNow! specializations for two packed floats.
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
// Four floats are handled as two halves only when the SSE section will not
// provide them. The test uses the macro's value, like every other
// CPU_HAS_SSE check in this file, so a definition of 0 selects this fallback.
#if !CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
372
 
373
// ipassign through a single 8-byte MMX move.
MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)

// 16-byte tuples are copied as two 8-byte halves unless the SSE section
// provides them. The test uses the macro's value, like every other
// CPU_HAS_SSE check in this file, so a definition of 0 selects this fallback.
#if !CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif
386
 
387
// Remove every MMX generator macro, including the MMX_DBL_* variants.
// DBL_MMX_ARGS is deliberately kept: the SSE conversion code that follows
// still uses it.
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
#endif // CPU_HAS_MMX
392
 
393
#if CPU_HAS_SSE
// Operand list for the SSE copy bodies: %0 = oout[0], %1 = oin[0]. The list
// starts with ':' so the asm output section stays empty; the "memory"
// clobber covers the store.
#define STD_SSE_ARGS    : "m"(oout[0]), "m"(oin[0]) : "xmm0", "memory"

// oout = oout <instruction> oin on four packed floats. Both operands are
// loaded with unaligned moves, so the body uses xmm1 as well as xmm0 and
// must list both as clobbered; otherwise the compiler may keep a live value
// in xmm1 across the asm.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)                          \
{                                                       \
    asm ("movups %0, %%xmm0\n\t"                        \
         "movups %1, %%xmm1\n\t"                        \
         #instruction " %%xmm1, %%xmm0\n\t"             \
         "movups %%xmm0, %0"                            \
         : : "m"(oout[0]), "m"(oin[0])                  \
         : "xmm0", "xmm1", "memory");                   \
}

// Copies 16 bytes from oin to oout through xmm0.
#define SSE_PASSIGN_SPEC(n,type)                        \
SIMD_PASSIGN_SPEC(n,type)                               \
{                                                       \
    asm ("movups %1, %%xmm0\n\t"                        \
         "movups %%xmm0, %0"                            \
         : STD_SSE_ARGS);                               \
}

// Iterator-source variant of the copy above.
#define SSE_IPASSIGN_SPEC(n,type)                       \
SIMD_IPASSIGN_SPEC(n,type)                              \
{                                                       \
    asm ("movups %1, %%xmm0\n\t"                        \
         "movups %%xmm0, %0"                            \
         : STD_SSE_ARGS);                               \
}

SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)
416
 
417
SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
418
    asm ("cvtps2pi %2, %%mm0\n\t"
419
         "cvtps2pi %3, %%mm1\n\t"
420
         "movq %%mm0, %0\n\t"
421
         "movq %%mm1, %1"
422
         : DBL_MMX_ARGS);
423
    reset_mmx();
424
}
425
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
426
    asm ("cvtpi2ps %2, %%xmm0\n\t"
427
         "shufps $0x4E,%%xmm0,%%xmm0\n\t"
428
         "cvtpi2ps %1, %%xmm0\n\t"
429
         "movups %%xmm0, %0"
430
         :: "m"(oout[0]), "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
431
}
432
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
433
    register int32_t rv;
434
    asm ("movss %1, %%xmm0\n\t"
435
         "cvtss2si %%xmm0, %0"
436
         : "=r"(rv) : "m"(a) : "xmm0" );
437
    return (rv);
438
}
439
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
440
    register uint32_t rv;
441
    asm ("movss %1, %%xmm0\n\t"
442
         "cvtss2si %%xmm0, %0"
443
         : "=r"(rv) : "m"(a) : "xmm0" );
444
    return (rv);
445
}
446
 
447
// ipassign for 16-byte tuples through a single unaligned SSE move.
SSE_IPASSIGN_SPEC (4, float)
SSE_IPASSIGN_SPEC (4, int32_t)
SSE_IPASSIGN_SPEC (4, uint32_t)

// The SSE generator macros are not needed past this point.
#undef STD_SSE_ARGS
#undef SSE_PKOP2_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_IPASSIGN_SPEC
#endif // CPU_HAS_SSE
456
 
457
// Remove the remaining helper macros so none of them leaks out of this header.
#undef SIMDA_RI
#undef SIMDA_RO
#undef SIMDA_WI
#undef SIMDA_WO
// SIMD_PACKEDOP_SPEC is not defined anywhere in this file; the line is kept
// because undefining an unknown name is harmless. The macros that actually
// generate the specialization headers are the four below.
#undef SIMD_PACKEDOP_SPEC
#undef SIMD_PKOP2_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_CONVERT_SPEC
// Defined in the MMX section and last used by the SSE conversion code.
#undef DBL_MMX_ARGS
462
 
463
} // namespace simd
464
} // namespace ustl
465
 
466
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.