URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [simd.h] - Blame information for rev 786

Details | Compare with Previous | View Log


// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
/// \file simd.h
/// \brief SIMD-type algorithms, with hardware acceleration, if available.
///
/// All algorithms are container-based because iterator syntax is just too
/// damn verbose and because the specializations need to be able to tell
/// how many elements are in the container in order to choose proper SIMD
/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
/// Specializations are only for the tuple template because the container
/// must be of a fixed and compile-time-known size for the compiler to be
/// able to choose the specialization.
 
#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
 
#include "ulimits.h"
#if HAVE_MATH_H
    #include <math.h>
#endif
 
namespace ustl {
namespace simd {
 
//----------------------------------------------------------------------
// Generic algorithms
//----------------------------------------------------------------------
 
/// Applies \p op to each element in \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    foreach (typename Ctr::iterator, i, op1)
        op (*i);
}
 
/// In-place binary operation: for every element of \p op2 computes
/// \p op2[i] = \p op (\p op2[i], \p op1[i]).
/// \p op1 must supply at least as many elements as \p op2 (checked by assert).
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
    assert (op2.size() <= op1.size());
    typename Ctr::const_iterator src (op1.begin());
    typename Ctr::iterator dst (op2.begin());
    // The destination drives the loop; the source is advanced in lock-step.
    while (dst != op2.end()) {
        *dst = op (*dst, *src);
        ++src;
        ++dst;
    }
}
 
/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result,
/// i.e. \p result[i] = \p op (\p op1[i], \p op2[i]).
/// \p op1 must not be larger than \p op2 or \p result (checked by assert).
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    // Seed result with op1, then combine op2 into it in place.
    passign (op1, result);
    // op has to be forwarded here: a two-argument call would select the unary
    // packop overload and the requested operation would never be applied.
    packop (op2, result, op);
}
 
/// Copies \p op1 into \p result.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
    assert (op1.size() <= result.size());
    typename Ctr::iterator d (result.begin());
    foreach (typename Ctr::const_iterator, s, op1)
        *d++ = *s;
}
 
/// Copies \p result.size() elements from \p op1 to \p result.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    foreach (typename Ctr::iterator, d, result)
        *d = *op1++;
}
 
/// Converts each element of \p op1 with \p f and stores the results in the
/// leading elements of \p op2: \p op2[i] = \p f (\p op1[i]).
/// \p op2 must hold at least as many elements as \p op1 (checked by assert);
/// elements of \p op2 past \p op1.size() are left untouched.
template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
    assert (op1.size() <= op2.size());
    typename Ctr1::const_iterator src (op1.begin());
    typename Ctr2::iterator dst (op2.begin());
    // The source drives the loop; the destination is advanced in lock-step.
    while (src != op1.end()) {
        *dst = f (*src);
        ++src;
        ++dst;
    }
}
 
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
// The STD_*_FUNCTOR macros are defined outside this file; the expressions below
// refer to the functor operands as a (and b for the binary forms), as the
// explicit specializations further down show.
// fpadds: yields numeric_limits<T>::max() when b > max - a, otherwise a + b.
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
// fpsubs: yields numeric_limits<T>::min() when a < min + b, otherwise a - b.
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
STD_BINARY_FUNCTOR (fpshl,  T, (a << b))
STD_BINARY_FUNCTOR (fpshr,  T, (a >> b))
STD_BINARY_FUNCTOR (fpmin,  T, (min (a, b)))
STD_BINARY_FUNCTOR (fpmax,  T, (max (a, b)))
// fpavg: adds 1 before halving; float and double are specialized below without the +1.
STD_BINARY_FUNCTOR (fpavg,  T, ((a + b + 1) / 2))
// fcast: plain conversion of a to the destination type D.
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
#if HAVE_MATH_H
// The functors that call into <math.h> invoke reset_mmx() first. reset_mmx() is
// defined elsewhere; presumably it leaves MMX state so x87 code can run -- TODO confirm.
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
STD_UNARY_FUNCTOR (fpsqrt,      T, (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin,        T, (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos,        T, (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan,        T, (reset_mmx(), T (tan (a))))
// fround: rounds with rintf() when available, otherwise with rint(), then converts to D.
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
// double -> int32_t always goes through rint(), regardless of HAVE_RINTF.
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
#endif
// Floating point averages are computed without the +1 used by the generic fpavg.
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
 
// Generator macros for the public p*/s* wrapper functions. Each wrapper
// instantiates the named functor template for the container's value_type
// (or for T in the scalar case) and hands it to packop/pconvert.

// SIMD_PACKEDOP1: void name (Ctr& op1) -- unary operation over op1 via packop.
#define SIMD_PACKEDOP1(name, operation)         \
template <typename Ctr>                         \
inline void name (Ctr& op1)                     \
{                                               \
    typedef typename Ctr::value_type value_t;   \
    packop (op1, operation<value_t>());         \
}
// SIMD_PACKEDOP2: void name (const Ctr& op1, Ctr& op2) -- binary operation stored in op2.
#define SIMD_PACKEDOP2(name, operation)         \
template <typename Ctr>                         \
inline void name (const Ctr& op1, Ctr& op2)     \
{                                               \
    typedef typename Ctr::value_type value_t;   \
    packop (op1, op2, operation<value_t>());    \
}
// SIMD_PACKEDOP3: void name (op1, op2, result) -- binary operation stored in result.
#define SIMD_PACKEDOP3(name, operation)                 \
template <typename Ctr>                                 \
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)  \
{                                                       \
    typedef typename Ctr::value_type value_t;           \
    packop (op1, op2, result, operation<value_t>());    \
}
// SIMD_SINGLEOP1: T name (T op) -- applies the functor to one scalar and returns its result.
#define SIMD_SINGLEOP1(name, operation)         \
template <typename T>                           \
inline T name (T op)                            \
{                                               \
    operation<T> obj;                           \
    return (obj(op));                           \
}
// SIMD_CONVERTOP: void name (const Ctr1& op1, Ctr2& op2) -- element conversion via pconvert,
// with the functor instantiated on both containers' value types.
#define SIMD_CONVERTOP(name, operation)         \
template <typename Ctr1, typename Ctr2>         \
inline void name (const Ctr1& op1, Ctr2& op2)   \
{                                               \
    typedef typename Ctr1::value_type value1_t; \
    typedef typename Ctr2::value_type value2_t; \
    pconvert (op1, op2, operation<value1_t, value2_t>());\
}
 
// In-place packed operations: name (op1, op2) stores op (op2[i], op1[i]) in op2.
// padds is provided here as well so that the in-place set matches the
// three-operand set, which already offers saturating addition.
SIMD_PACKEDOP2 (padd, plus)
SIMD_PACKEDOP2 (psub, minus)
SIMD_PACKEDOP2 (pmul, multiplies)
SIMD_PACKEDOP2 (pdiv, divides)
SIMD_PACKEDOP2 (pand, bitwise_and)
SIMD_PACKEDOP2 (por, bitwise_or)
SIMD_PACKEDOP2 (pxor, bitwise_xor)
SIMD_PACKEDOP2 (pshl, fpshl)
SIMD_PACKEDOP2 (pshr, fpshr)
SIMD_PACKEDOP2 (padds, fpadds)
SIMD_PACKEDOP2 (psubs, fpsubs)
SIMD_PACKEDOP2 (pmin, fpmin)
SIMD_PACKEDOP2 (pmax, fpmax)
SIMD_PACKEDOP2 (pavg, fpavg)
 
// Three-operand packed operations: name (op1, op2, result) forwards to the
// four-argument packop with the listed functor instantiated for Ctr::value_type.
SIMD_PACKEDOP3 (padd, plus)
SIMD_PACKEDOP3 (psub, minus)
SIMD_PACKEDOP3 (pmul, multiplies)
SIMD_PACKEDOP3 (pdiv, divides)
SIMD_PACKEDOP3 (pand, bitwise_and)
SIMD_PACKEDOP3 (por, bitwise_or)
SIMD_PACKEDOP3 (pxor, bitwise_xor)
SIMD_PACKEDOP3 (pshl, fpshl)
SIMD_PACKEDOP3 (pshr, fpshr)
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
SIMD_PACKEDOP3 (pmin, fpmin)
SIMD_PACKEDOP3 (pmax, fpmax)
SIMD_PACKEDOP3 (pavg, fpavg)
 
// Wrappers around the <math.h>-based functors; only available with HAVE_MATH_H.
#if HAVE_MATH_H
// Packed forms: void name (Ctr& op1), routed through the unary packop.
SIMD_PACKEDOP1 (precip, fpreciprocal)
SIMD_PACKEDOP1 (psqrt, fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin, fsin)
SIMD_PACKEDOP1 (pcos, fcos)
SIMD_PACKEDOP1 (ptan, ftan)

// Scalar forms: T name (T op), returning the functor's result.
SIMD_SINGLEOP1 (srecip, fpreciprocal)
SIMD_SINGLEOP1 (ssqrt, fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin, fsin)
SIMD_SINGLEOP1 (scos, fcos)
SIMD_SINGLEOP1 (stan, ftan)

// pround (op1, op2): element-wise fround from Ctr1::value_type to Ctr2::value_type.
SIMD_CONVERTOP (pround, fround)

// sround (op): rounds one scalar of type T to int32_t using fround<T,int32_t>.
template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
#endif
 
// The wrapper generator macros are internal to this header; drop all of them,
// including SIMD_CONVERTOP, so none leaks into code that includes this file.
#undef SIMD_CONVERTOP
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
 
//----------------------------------------------------------------------
// Vector types to cast tuple data to
//----------------------------------------------------------------------
 
// VECTOR_ATTRIBUTE(mode,vs) expands to the GCC vector_size(vs) attribute when
// HAVE_VECTOR_EXTENSIONS is set and the compiler is GCC 4 or newer; otherwise it
// expands to nothing and the typedefs below are plain aliases of the element
// type. The mode argument is not used by either expansion.
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
#define VECTOR_ATTRIBUTE(mode,vs)       __attribute__((vector_size(vs)))
#else
#define VECTOR_ATTRIBUTE(mode,vs)
#endif
// Naming: v<count><element code>_t, with the second macro argument giving the vector size in bytes.
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
#if HAVE_INT64_T
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
#endif
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
#undef VECTOR_ATTRIBUTE
 
// Inline-asm operand helpers for element n of the oin / oout parameters:
// R* produce a readable memory operand ("m"), W* a writable one ("=m").
// They are not referenced anywhere between here and their #undef further down.
#define SIMDA_RI(n)             "m"(oin[n])
#define SIMDA_RO(n)             "m"(oout[n])
#define SIMDA_WI(n)             "=m"(oin[n])
#define SIMDA_WO(n)             "=m"(oout[n])
 
//----------------------------------------------------------------------
// Hardware accelerated specializations
//----------------------------------------------------------------------
 
// Signature generators for explicit specializations on tuple<n,type>. Each one
// expands to the specialization header only; the body follows at the use site.
// All of them name the parameters oin (source) and oout (destination), which
// the asm operand macros below rely on.

// packop (oin, oout, optype<type>) -- in-place binary operation.
#define SIMD_PKOP2_SPEC(n, type, optype)        \
template <>                                     \
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
// passign (oin, oout) -- tuple to tuple copy.
#define SIMD_PASSIGN_SPEC(n, type)              \
template <>                                     \
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
// ipassign (oin, oout) -- copy from an iterator into a tuple.
#define SIMD_IPASSIGN_SPEC(n, type)             \
template <>                                     \
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
// pconvert (oin, oout, optype<type1,type2>) -- element type conversion.
#define SIMD_CONVERT_SPEC(n, type1, type2, optype)      \
template <>                                     \
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
 
#if CPU_HAS_MMX
// Operand lists shared by the MMX asm statements. Each begins with ':' and is
// pasted after another ':' at the use site, so the asm output list is empty and
// every tuple element is passed as an "m" input operand; the stores into oout
// are covered by the "memory" clobber.
//   STD_MMX_ARGS: %0 = oout[0], %1 = oin[0]; clobbers mm0 and st.
//   DBL_MMX_ARGS: %0 = oout[0], %1 = oout[2], %2 = oin[0], %3 = oin[2];
//                 clobbers mm0, mm1, st and st(1).
#define STD_MMX_ARGS    : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS    : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
// Every body ends with reset_mmx(), which is defined elsewhere; presumably it
// leaves MMX state so that x87 code can follow -- TODO confirm.
// MMX_PKOP2_SPEC: load oout into mm0, apply instruction with oin, store mm0 back to oout.
#define MMX_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)          \
{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// MMX_DBL_PKOP2_SPEC: same operation done on two 8-byte halves using mm0 and mm1.
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)   \
SIMD_PKOP2_SPEC(n,type,optype)          \
{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// MMX_PASSIGN_SPEC: 8-byte copy from oin to oout through mm0.
#define MMX_PASSIGN_SPEC(n,type)        \
SIMD_PASSIGN_SPEC(n,type)               \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// MMX_DBL_PASSIGN_SPEC: two 8-byte copies through mm0 and mm1.
#define MMX_DBL_PASSIGN_SPEC(n,type)    \
SIMD_PASSIGN_SPEC(n,type)               \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// MMX_IPASSIGN_SPEC / MMX_DBL_IPASSIGN_SPEC: same copies with an iterator as the source.
#define MMX_IPASSIGN_SPEC(n,type)       \
SIMD_IPASSIGN_SPEC(n,type)              \
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_IPASSIGN_SPEC(n,type)   \
SIMD_IPASSIGN_SPEC(n,type)              \
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
 
// MMX specializations, grouped by tuple type. The last macro argument is the
// MMX instruction emitted for that operation.

// 8 x uint8_t (fpadds/fpsubs use the unsigned saturating instructions)
MMX_PASSIGN_SPEC(8,uint8_t)
MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)

// 8 x int8_t (fpadds/fpsubs use the signed saturating instructions)
MMX_PASSIGN_SPEC(8,int8_t)
MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)

// 4 x uint16_t
MMX_PASSIGN_SPEC(4,uint16_t)
MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)

// 4 x int16_t
MMX_PASSIGN_SPEC(4,int16_t)
MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)

// 2 x uint32_t (no saturating forms are specialized for 32-bit elements)
MMX_PASSIGN_SPEC(2,uint32_t)
MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// 2 x int32_t
MMX_PASSIGN_SPEC(2,int32_t)
MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)

// 4 x uint32_t, processed as two 8-byte halves
MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// 4 x int32_t, processed as two 8-byte halves
MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
 
// Integer average / min / max instructions from the extended MMX set.
#if CPU_HAS_SSE || CPU_HAS_3DNOW
// pavgb and pavgw average *unsigned* packed integers, so they are only used
// for the unsigned element types. Applying them to int8_t / int16_t data gives
// wrong results when the operands differ in sign; those tuples use the generic
// fpavg functor instead.
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
// pminub/pmaxub compare unsigned bytes, pminsw/pmaxsw compare signed words.
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
 
// 3DNow! packed float operations on 2-float tuples.
#if CPU_HAS_3DNOW
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
// 4-float tuples are handled as two halves here only when CPU_HAS_SSE is not
// defined; the SSE section provides its own 4-float specializations.
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
 
// Iterator-source copies for the 8-byte tuple types.
MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)

// 16-byte tuples are copied as two halves only when CPU_HAS_SSE is not defined;
// the SSE section supplies its own copies for these types.
#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif

// Drop every MMX helper macro, including the MMX_DBL_* variants, so that none
// leaks out of this header.
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
// DBL_MMX_ARGS stays defined here: the SSE float -> int32_t conversion below uses it.
#endif // CPU_HAS_MMX
 
#if CPU_HAS_SSE
// Operand list shared by the SSE asm statements: %0 = oout[0], %1 = oin[0],
// both passed as "m" inputs (the list is pasted after a ':' so the asm output
// list is empty); stores into oout are covered by the "memory" clobber.
// xmm1 is listed as clobbered because SSE_PKOP2_SPEC loads its second operand
// into it; leaving it out would let the compiler keep a live value there.
#define STD_SSE_ARGS    : "m"(oout[0]), "m"(oin[0]) : "xmm0", "xmm1", "memory"
// SSE_PKOP2_SPEC: xmm0 = oout, xmm1 = oin, apply instruction, store xmm0 to oout.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)       \
SIMD_PKOP2_SPEC(n,type,optype)          \
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
// SSE_PASSIGN_SPEC / SSE_IPASSIGN_SPEC: unaligned 16-byte copy through xmm0.
#define SSE_PASSIGN_SPEC(n,type)                        \
SIMD_PASSIGN_SPEC(n,type)               \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_IPASSIGN_SPEC(n,type)       \
SIMD_IPASSIGN_SPEC(n,type)              \
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)
 
// 4 x float -> 4 x int32_t: each cvtps2pi converts one pair of floats into an
// MMX register (oin[0..1] -> mm0, oin[2..3] -> mm1), which is then stored to
// oout[0..1] and oout[2..3]. Operand numbering comes from DBL_MMX_ARGS, which
// is defined in the CPU_HAS_MMX section.
SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
    asm ("cvtps2pi %2, %%mm0\n\t"
         "cvtps2pi %3, %%mm1\n\t"
         "movq %%mm0, %0\n\t"
         "movq %%mm1, %1"
         : DBL_MMX_ARGS);
    reset_mmx();
}
// 4 x int32_t -> 4 x float. Operands: %0 = oout[0], %1 = oin[0], %2 = oin[2].
// cvtpi2ps only writes the low two lanes of xmm0, so the upper pair (oin[2..3])
// is converted first, moved to the high half with shufps $0x4E (swaps the two
// 64-bit halves), and then the lower pair (oin[0..1]) is converted into the low half.
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
    asm ("cvtpi2ps %2, %%xmm0\n\t"
         "shufps $0x4E,%%xmm0,%%xmm0\n\t"
         "cvtpi2ps %1, %%xmm0\n\t"
         "movups %%xmm0, %0"
         :: "m"(oout[0]), "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
}
/// Scalar float -> int32_t conversion using the SSE cvtss2si instruction.
/// \p a is loaded into xmm0 and the converted integer is returned.
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
    // No 'register' storage class: it is only a hint, and is ill-formed from
    // C++17 on. The "=r" constraint already places rv in a register.
    int32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
/// Scalar float -> uint32_t conversion using the SSE cvtss2si instruction.
/// NOTE(review): cvtss2si with a 32-bit destination is a signed conversion, so
/// inputs above the int32_t range are presumably not converted to the matching
/// unsigned value -- confirm against callers.
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
    // No 'register' storage class: it is only a hint, and is ill-formed from
    // C++17 on. The "=r" constraint already places rv in a register.
    uint32_t rv;
    asm ("movss %1, %%xmm0\n\t"
         "cvtss2si %%xmm0, %0"
         : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
 
// Iterator-source 16-byte copies for the 4-element tuple types.
SSE_IPASSIGN_SPEC(4,float)
SSE_IPASSIGN_SPEC(4,int32_t)
SSE_IPASSIGN_SPEC(4,uint32_t)

// The SSE helper macros are local to this section.
#undef SSE_IPASSIGN_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_PKOP2_SPEC
#undef STD_SSE_ARGS
#endif // CPU_HAS_SSE
 
// Final cleanup of the helper macros defined by this header.
#undef SIMDA_RI
#undef SIMDA_RO
#undef SIMDA_WI
#undef SIMDA_WO
// SIMD_PACKEDOP_SPEC is never defined in this file; the specialization
// signature macros actually in use are the four SIMD_*_SPEC names below.
#undef SIMD_PACKEDOP_SPEC
#undef SIMD_CONVERT_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_PKOP2_SPEC
// Last user of DBL_MMX_ARGS was the SSE float -> int32_t conversion above.
#undef DBL_MMX_ARGS
 
} // namespace simd
} // namespace ustl
 
#endif

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [simd.h] - Blame information for rev 786

Line No.	Rev	Author	Line
1	786	skrzyp	`// This file is part of the uSTL library, an STL implementation.`
2			`//`
3			`// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>`
4			`// This file is free software, distributed under the MIT License.`
5			`//`
6			`/// \file simd.h`
7			`/// \brief SIMD-type algorithms, with hardware acceleration, if available.`
8			`///`
9			`/// All algorithms are container-based because iterator syntax is just too`
10			`/// damn verbose and because the specializations need to be able to tell`
11			`/// how many elements are in the container in order to choose proper SIMD`
12			`/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)`
13			`/// Specializations are only for the tuple template because the container`
14			`/// must be of a fixed and compile-time-known size for the compiler to be`
15			`/// able to choose the specialization.`
16
17			`#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9`
18			`#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9`
19
20			`#include "ulimits.h"`
21			`#if HAVE_MATH_H`
22			`#include <math.h>`
23			`#endif`
24
25			`namespace ustl {`
26			`namespace simd {`
27
28			`//----------------------------------------------------------------------`
29			`// Generic algorithms`
30			`//----------------------------------------------------------------------`
31
32			`/// Applies \p op to each element in \p op1.`
33			`template <typename Ctr, typename UnaryOperation>`
34			`inline void packop (Ctr& op1, UnaryOperation op)`
35			`{`
36			`foreach (typename Ctr::iterator, i, op1)`
37			`op (*i);`
38			`}`
39
40			`/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.`
41			`template <typename Ctr, typename BinaryOperation>`
42			`inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)`
43			`{`
44			`assert (op2.size() <= op1.size());`
45			`typename Ctr::const_iterator i1 (op1.begin());`
46			`typename Ctr::iterator i2 (op2.begin());`
47			`for (; i2 != op2.end(); ++i1, ++i2)`
48			`*i2 = op (*i2, *i1);`
49			`}`
50
51			`/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.`
52			`template <typename Ctr, typename BinaryOperation>`
53			`inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)`
54			`{`
55			`assert (op1.size() <= op2.size() && op1.size() <= result.size());`
56			`passign (op1, result);`
57			`packop (op2, result);`
58			`}`
59
60			`/// Copies \p op1 into \p result.`
61			`template <typename Ctr>`
62			`inline void passign (const Ctr& op1, Ctr& result)`
63			`{`
64			`assert (op1.size() <= result.size());`
65			`typename Ctr::iterator d (result.begin());`
66			`foreach (typename Ctr::const_iterator, s, op1)`
67			`*d++ = *s;`
68			`}`
69
70			`/// Copies \p result.size() elements from \p op1 to \p result.`
71			`template <typename Ctr>`
72			`inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)`
73			`{`
74			`foreach (typename Ctr::iterator, d, result)`
75			`*d = *op1++;`
76			`}`
77
78			`template <typename Ctr1, typename Ctr2, typename ConvertFunction>`
79			`inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)`
80			`{`
81			`assert (op1.size() <= op2.size());`
82			`typename Ctr1::const_iterator i1 (op1.begin());`
83			`typename Ctr2::iterator i2 (op2.begin());`
84			`for (; i1 != op1.end(); ++i1, ++i2)`
85			`*i2 = f (*i1);`
86			`}`
87
88			`// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.`
89			`STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))`
90			`STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))`
91			`STD_BINARY_FUNCTOR (fpshl, T, (a << b))`
92			`STD_BINARY_FUNCTOR (fpshr, T, (a >> b))`
93			`STD_BINARY_FUNCTOR (fpmin, T, (min (a, b)))`
94			`STD_BINARY_FUNCTOR (fpmax, T, (max (a, b)))`
95			`STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2))`
96			`STD_CONVERSION_FUNCTOR (fcast, (D(a)))`
97			`#if HAVE_MATH_H`
98			`STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))`
99			`STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a))))`
100			`STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a))))`
101			`STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a))))`
102			`STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a))))`
103			`STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a))))`
104			`#if HAVE_RINTF`
105			`STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))`
106			`#else`
107			`STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))`
108			`#endif`
109			`template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }`
110			`#endif`
111			`template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }`
112			`template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }`
113
114			`#define SIMD_PACKEDOP1(name, operation) \`
115			`template <typename Ctr> \`
116			`inline void name (Ctr& op1) \`
117			`{ \`
118			`typedef typename Ctr::value_type value_t; \`
119			`packop (op1, operation<value_t>()); \`
120			`}`
121			`#define SIMD_PACKEDOP2(name, operation) \`
122			`template <typename Ctr> \`
123			`inline void name (const Ctr& op1, Ctr& op2) \`
124			`{ \`
125			`typedef typename Ctr::value_type value_t; \`
126			`packop (op1, op2, operation<value_t>()); \`
127			`}`
128			`#define SIMD_PACKEDOP3(name, operation) \`
129			`template <typename Ctr> \`
130			`inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \`
131			`{ \`
132			`typedef typename Ctr::value_type value_t; \`
133			`packop (op1, op2, result, operation<value_t>()); \`
134			`}`
135			`#define SIMD_SINGLEOP1(name, operation) \`
136			`template <typename T> \`
137			`inline T name (T op) \`
138			`{ \`
139			`operation<T> obj; \`
140			`return (obj(op)); \`
141			`}`
142			`#define SIMD_CONVERTOP(name, operation) \`
143			`template <typename Ctr1, typename Ctr2> \`
144			`inline void name (const Ctr1& op1, Ctr2& op2) \`
145			`{ \`
146			`typedef typename Ctr1::value_type value1_t; \`
147			`typedef typename Ctr2::value_type value2_t; \`
148			`pconvert (op1, op2, operation<value1_t, value2_t>());\`
149			`}`
150
151			`SIMD_PACKEDOP2 (padd, plus)`
152			`SIMD_PACKEDOP2 (psub, minus)`
153			`SIMD_PACKEDOP2 (pmul, multiplies)`
154			`SIMD_PACKEDOP2 (pdiv, divides)`
155			`SIMD_PACKEDOP2 (pand, bitwise_and)`
156			`SIMD_PACKEDOP2 (por, bitwise_or)`
157			`SIMD_PACKEDOP2 (pxor, bitwise_xor)`
158			`SIMD_PACKEDOP2 (pshl, fpshl)`
159			`SIMD_PACKEDOP2 (pshr, fpshr)`
160			`SIMD_PACKEDOP2 (psubs, fpsubs)`
161			`SIMD_PACKEDOP2 (pmin, fpmin)`
162			`SIMD_PACKEDOP2 (pmax, fpmax)`
163			`SIMD_PACKEDOP2 (pavg, fpavg)`
164
165			`SIMD_PACKEDOP3 (padd, plus)`
166			`SIMD_PACKEDOP3 (psub, minus)`
167			`SIMD_PACKEDOP3 (pmul, multiplies)`
168			`SIMD_PACKEDOP3 (pdiv, divides)`
169			`SIMD_PACKEDOP3 (pand, bitwise_and)`
170			`SIMD_PACKEDOP3 (por, bitwise_or)`
171			`SIMD_PACKEDOP3 (pxor, bitwise_xor)`
172			`SIMD_PACKEDOP3 (pshl, fpshl)`
173			`SIMD_PACKEDOP3 (pshr, fpshr)`
174			`SIMD_PACKEDOP3 (padds, fpadds)`
175			`SIMD_PACKEDOP3 (psubs, fpsubs)`
176			`SIMD_PACKEDOP3 (pmin, fpmin)`
177			`SIMD_PACKEDOP3 (pmax, fpmax)`
178			`SIMD_PACKEDOP3 (pavg, fpavg)`
179
180			`#if HAVE_MATH_H`
181			`SIMD_PACKEDOP1 (precip, fpreciprocal)`
182			`SIMD_PACKEDOP1 (psqrt, fpsqrt)`
183			`SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)`
184			`SIMD_PACKEDOP1 (psin, fsin)`
185			`SIMD_PACKEDOP1 (pcos, fcos)`
186			`SIMD_PACKEDOP1 (ptan, ftan)`
187
188			`SIMD_SINGLEOP1 (srecip, fpreciprocal)`
189			`SIMD_SINGLEOP1 (ssqrt, fpsqrt)`
190			`SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)`
191			`SIMD_SINGLEOP1 (ssin, fsin)`
192			`SIMD_SINGLEOP1 (scos, fcos)`
193			`SIMD_SINGLEOP1 (stan, ftan)`
194
195			`SIMD_CONVERTOP (pround, fround)`
196
197			`template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }`
198			`#endif`
199
200			`#undef SIMD_SINGLEOP1`
201			`#undef SIMD_PACKEDOP3`
202			`#undef SIMD_PACKEDOP2`
203			`#undef SIMD_PACKEDOP1`
204
205			`//----------------------------------------------------------------------`
206			`// Vector types to cast tuple data to`
207			`//----------------------------------------------------------------------`
208
209			`#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4`
210			`#define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs)))`
211			`#else`
212			`#define VECTOR_ATTRIBUTE(mode,vs)`
213			`#endif`
214			`typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);`
215			`typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);`
216			`typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);`
217			`typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);`
218			`typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);`
219			`#if HAVE_INT64_T`
220			`typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);`
221			`#endif`
222			`typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);`
223			`typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);`
224			`typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);`
225			`#undef VECTOR_ATTRIBUTE`
226
227			`#define SIMDA_RI(n) "m"(oin[n])`
228			`#define SIMDA_RO(n) "m"(oout[n])`
229			`#define SIMDA_WI(n) "=m"(oin[n])`
230			`#define SIMDA_WO(n) "=m"(oout[n])`
231
232			`//----------------------------------------------------------------------`
233			`// Hardware accelerated specializations`
234			`//----------------------------------------------------------------------`
235
236			`#define SIMD_PKOP2_SPEC(n, type, optype) \`
237			`template <> \`
238			`inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)`
239			`#define SIMD_PASSIGN_SPEC(n, type) \`
240			`template <> \`
241			`inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)`
242			`#define SIMD_IPASSIGN_SPEC(n, type) \`
243			`template <> \`
244			`inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)`
245			`#define SIMD_CONVERT_SPEC(n, type1, type2, optype) \`
246			`template <> \`
247			`inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)`
248
249			`#if CPU_HAS_MMX`
250			`#define STD_MMX_ARGS : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory"`
251			`#define DBL_MMX_ARGS : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"`
252			`#define MMX_PKOP2_SPEC(n,type,optype,instruction) \`
253			`SIMD_PKOP2_SPEC(n,type,optype) \`
254			`{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }`
255			`#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \`
256			`SIMD_PKOP2_SPEC(n,type,optype) \`
257			`{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }`
258			`#define MMX_PASSIGN_SPEC(n,type) \`
259			`SIMD_PASSIGN_SPEC(n,type) \`
260			`{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }`
261			`#define MMX_DBL_PASSIGN_SPEC(n,type) \`
262			`SIMD_PASSIGN_SPEC(n,type) \`
263			`{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }`
264			`#define MMX_IPASSIGN_SPEC(n,type) \`
265			`SIMD_IPASSIGN_SPEC(n,type) \`
266			`{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }`
267			`#define MMX_DBL_IPASSIGN_SPEC(n,type) \`
268			`SIMD_IPASSIGN_SPEC(n,type) \`
269			`{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }`
270
271			`MMX_PASSIGN_SPEC(8,uint8_t)`
272			`MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)`
273			`MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)`
274			`MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)`
275			`MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)`
276			`MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)`
277			`MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)`
278			`MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)`
279
280			`MMX_PASSIGN_SPEC(8,int8_t)`
281			`MMX_PKOP2_SPEC(8,int8_t,plus,paddb)`
282			`MMX_PKOP2_SPEC(8,int8_t,minus,psubb)`
283			`MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)`
284			`MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)`
285			`MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)`
286			`MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)`
287			`MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)`
288
289			`MMX_PASSIGN_SPEC(4,uint16_t)`
290			`MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)`
291			`MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)`
292			`MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)`
293			`MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)`
294			`MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)`
295			`/// \todo psllw does not work like other operations, it uses the first element for shift count.`
296			`//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)`
297			`//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)`
298			`MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)`
299			`MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)`
300
301			`MMX_PASSIGN_SPEC(4,int16_t)`
302			`MMX_PKOP2_SPEC(4,int16_t,plus,paddw)`
303			`MMX_PKOP2_SPEC(4,int16_t,minus,psubw)`
304			`MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)`
305			`MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)`
306			`MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)`
307			`//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)`
308			`//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)`
309			`MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)`
310			`MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)`
311
312			`MMX_PASSIGN_SPEC(2,uint32_t)`
313			`MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)`
314			`MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)`
315			`MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)`
316			`MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)`
317			`MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)`
318			`//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)`
319			`//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)`
320
// MMX rows for 2 x int32_t operands; identical instruction choice to the
// uint32_t group, since paddd/psubd and the bitwise operations do not depend on
// signedness. The shift rows are disabled like the 16-bit ones.
321			`MMX_PASSIGN_SPEC(2,int32_t)`
322			`MMX_PKOP2_SPEC(2,int32_t,plus,paddd)`
323			`MMX_PKOP2_SPEC(2,int32_t,minus,psubd)`
324			`MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)`
325			`MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)`
326			`MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)`
327			`//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)`
328			`//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)`
329
// Rows for 4 x uint32_t operands via MMX_DBL_PKOP2_SPEC (defined above this
// excerpt). Presumably the "DBL" variant handles the four elements as two
// 64-bit halves -- confirm against the macro definition.
// NOTE(review): the disabled shift rows pass 2 as the element count while every
// active row in this group passes 4; adjust them if they are ever re-enabled.
330			`MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)`
331			`MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)`
332			`MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)`
333			`MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)`
334			`MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)`
335			`//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)`
336			`//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)`
337
// Rows for 4 x int32_t operands via MMX_DBL_PKOP2_SPEC; same instruction choice
// as the 4 x uint32_t group.
// NOTE(review): the disabled shift rows pass 2 as the element count while every
// active row in this group passes 4; adjust them if they are ever re-enabled.
338			`MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)`
339			`MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)`
340			`MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)`
341			`MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)`
342			`MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)`
343			`//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)`
344			`//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)`
345
// Packed average / minimum / maximum rows. They are only compiled when the
// build reports SSE or 3DNow! support, because pavgb/pavgw/pminub/pmaxub/
// pminsw/pmaxsw are not part of the base MMX instruction set.
// The guard uses a plain `||`; the markdown-escaped form is not a valid
// preprocessor expression.
// NOTE(review): pavgb and pavgw compute an *unsigned* rounded average, so the
// int8_t and int16_t fpavg rows look wrong for negative elements -- compare
// against the generic fpavg before relying on them.
346			`#if CPU_HAS_SSE || CPU_HAS_3DNOW`
347			`MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)`
348			`MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)`
349			`MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)`
350			`MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)`
351			`MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)`
352			`MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)`
353			`MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)`
354			`MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)`
355			`#endif // CPU_HAS_SSE || CPU_HAS_3DNOW`
356
// 3DNow! rows for float operands: pfadd/pfsub/pfmul/pfmin/pfmax work on two
// packed floats held in an MMX register.
// The 4 x float rows (MMX_DBL_PKOP2_SPEC) are only emitted when SSE is not
// available; with SSE the 4 x float packops come from the SSE rows further down.
// The inner guard tests the *value* of CPU_HAS_SSE, matching the
// `#if CPU_HAS_SSE` tests used elsewhere in this file, so a configuration that
// defines CPU_HAS_SSE as 0 still gets the 3DNow! 4 x float rows.
357			`#if CPU_HAS_3DNOW`
358			`MMX_PASSIGN_SPEC(2,float)`
359			`MMX_PKOP2_SPEC(2,float,plus,pfadd)`
360			`MMX_PKOP2_SPEC(2,float,minus,pfsub)`
361			`MMX_PKOP2_SPEC(2,float,multiplies,pfmul)`
362			`MMX_PKOP2_SPEC(2,float,fpmin,pfmin)`
363			`MMX_PKOP2_SPEC(2,float,fpmax,pfmax)`
364			`#if !CPU_HAS_SSE`
365			`MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)`
366			`MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)`
367			`MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)`
368			`MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)`
369			`MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)`
370			`#endif // !CPU_HAS_SSE`
371			`#endif // CPU_HAS_3DNOW`
372
// Assignment rows. MMX_IPASSIGN_SPEC, MMX_DBL_PASSIGN_SPEC and
// MMX_DBL_IPASSIGN_SPEC are defined above this excerpt; presumably they mirror
// the SSE_PASSIGN_SPEC / SSE_IPASSIGN_SPEC macros further down, using MMX
// registers for the copy -- confirm against their definitions.
// The 4-element rows are only emitted when SSE is not available, because the
// SSE section provides 4 x float / int32_t / uint32_t assignments itself.
// The guard tests the *value* of CPU_HAS_SSE, matching the `#if CPU_HAS_SSE`
// tests used elsewhere in this file, so CPU_HAS_SSE defined as 0 behaves like
// "no SSE".
373			`MMX_IPASSIGN_SPEC(8,uint8_t)`
374			`MMX_IPASSIGN_SPEC(4,uint16_t)`
375			`MMX_IPASSIGN_SPEC(2,uint32_t)`
376			`MMX_IPASSIGN_SPEC(2,float)`
377
378			`#if !CPU_HAS_SSE`
379			`MMX_DBL_PASSIGN_SPEC(4,float)`
380			`MMX_DBL_PASSIGN_SPEC(4,uint32_t)`
381			`MMX_DBL_PASSIGN_SPEC(4,int32_t)`
382			`MMX_DBL_IPASSIGN_SPEC(4,float)`
383			`MMX_DBL_IPASSIGN_SPEC(4,uint32_t)`
384			`MMX_DBL_IPASSIGN_SPEC(4,int32_t)`
385			`#endif // !CPU_HAS_SSE`
386
// Remove the MMX helper macros so they do not leak out of this header.
// The MMX_DBL_* helpers are used by the rows above and are not needed past this
// point, so they are removed together with the single-register ones.
// DBL_MMX_ARGS is intentionally left defined here: the SSE float -> int32_t
// conversion below still uses it.
387			`#undef MMX_IPASSIGN_SPEC`
388			`#undef MMX_PASSIGN_SPEC`
389			`#undef MMX_PKOP2_SPEC`
			`#undef MMX_DBL_IPASSIGN_SPEC`
			`#undef MMX_DBL_PASSIGN_SPEC`
			`#undef MMX_DBL_PKOP2_SPEC`
390			`#undef STD_MMX_ARGS`
391			`#endif // CPU_HAS_MMX`
392
// SSE helper macros (removed again at the end of the SSE section).
//
// STD_SSE_ARGS supplies the asm operand lists: no outputs, inputs %0 = oout[0]
// and %1 = oin[0], clobbers xmm0 and "memory". The destination is written
// through %0 even though it is listed as an input; the "memory" clobber is what
// tells the compiler that memory changed.
//
// SSE_PKOP2_SPEC(n,type,optype,instruction): body for the specialization
// declared by SIMD_PKOP2_SPEC (defined above this excerpt). Loads oout into
// xmm0 and oin into xmm1 with unaligned moves, applies
// `instruction xmm1, xmm0`, and stores xmm0 back to oout. xmm1 is appended to
// the clobber list because the asm overwrites it; without that the compiler may
// keep a live value in xmm1 across the statement.
//
// SSE_PASSIGN_SPEC / SSE_IPASSIGN_SPEC(n,type): bodies for the specializations
// declared by SIMD_PASSIGN_SPEC / SIMD_IPASSIGN_SPEC; both copy 16 bytes from
// oin to oout through xmm0 with unaligned moves.
393			`#if CPU_HAS_SSE`
394			`#define STD_SSE_ARGS : "m"(oout[0]), "m"(oin[0]) : "xmm0", "memory"`
395			`#define SSE_PKOP2_SPEC(n,type,optype,instruction) \`
396			`SIMD_PKOP2_SPEC(n,type,optype) \`
397			`{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS, "xmm1");}`
398			`#define SSE_PASSIGN_SPEC(n,type) \`
399			`SIMD_PASSIGN_SPEC(n,type) \`
400			`{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}`
401			`#define SSE_IPASSIGN_SPEC(n,type) \`
402			`SIMD_IPASSIGN_SPEC(n,type) \`
403			`{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}`
// SSE rows for 4-element operands, built from the SSE_* macros defined just
// above. Assignment is provided for float, int32_t and uint32_t (a 16-byte copy
// either way); arithmetic rows exist for float only.
// andps/orps/xorps operate on the raw bit patterns of the floats.
404			`SSE_PASSIGN_SPEC(4,float)`
405			`SSE_PASSIGN_SPEC(4,int32_t)`
406			`SSE_PASSIGN_SPEC(4,uint32_t)`
407			`SSE_PKOP2_SPEC(4,float,plus,addps)`
408			`SSE_PKOP2_SPEC(4,float,minus,subps)`
409			`SSE_PKOP2_SPEC(4,float,multiplies,mulps)`
410			`SSE_PKOP2_SPEC(4,float,divides,divps)`
411			`SSE_PKOP2_SPEC(4,float,bitwise_and,andps)`
412			`SSE_PKOP2_SPEC(4,float,bitwise_or,orps)`
413			`SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)`
414			`SSE_PKOP2_SPEC(4,float,fpmax,maxps)`
415			`SSE_PKOP2_SPEC(4,float,fpmin,minps)`
416
// Conversion of 4 floats to 4 int32_t values with rounding (fround).
// SIMD_CONVERT_SPEC and DBL_MMX_ARGS are defined above this excerpt;
// presumably %0/%1 name the two output halves and %2/%3 the two input halves --
// confirm against DBL_MMX_ARGS.
// Each cvtps2pi converts two packed floats to two 32-bit integers in an MMX
// register, rounding according to the current MXCSR rounding mode; the two
// movq instructions store the results.
// reset_mmx() is called afterwards because the asm used MMX registers
// (presumably it issues emms/femms -- defined outside this excerpt).
417			`SIMD_CONVERT_SPEC(4,float,int32_t,fround) {`
418			`asm ("cvtps2pi %2, %%mm0\n\t"`
419			`"cvtps2pi %3, %%mm1\n\t"`
420			`"movq %%mm0, %0\n\t"`
421			`"movq %%mm1, %1"`
422			`: DBL_MMX_ARGS);`
423			`reset_mmx();`
424			`}`
// Conversion of 4 int32_t values to 4 floats.
// Operands: %0 = oout[0], %1 = oin[0], %2 = oin[2]; all three are listed as
// inputs and the final store to %0 is covered by the "memory" clobber.
// Sequence:
//   cvtpi2ps %2  -> converts oin[2..3] into the low two lanes of xmm0
//   shufps $0x4E -> swaps the low and high 64-bit halves, moving that pair to
//                   the upper two lanes
//   cvtpi2ps %1  -> converts oin[0..1] into the low two lanes (upper lanes kept)
//   movups       -> stores all four floats to oout
// Only memory operands and xmm0 are used, so no reset_mmx() call follows.
425			`SIMD_CONVERT_SPEC(4,int32_t,float,fround) {`
426			`asm ("cvtpi2ps %2, %%xmm0\n\t"`
427			`"shufps $0x4E,%%xmm0,%%xmm0\n\t"`
428			`"cvtpi2ps %1, %%xmm0\n\t"`
429			`"movups %%xmm0, %0"`
430			`:: "m"(oout[0]), "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");`
431			`}`
/// Rounds a single float to int32_t using SSE.
/// The value is loaded into xmm0 with movss and converted with cvtss2si, which
/// rounds according to the current MXCSR rounding mode rather than truncating.
/// \param a	value to convert.
/// \return	the converted 32-bit integer.
// The result variable is a plain local: the 'register' storage class specifier
// is deprecated since C++11 and removed in C++17, and the "=r" constraint
// already places the result in a general-purpose register.
432			`template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {`
433			`int32_t rv;`
434			`asm ("movss %1, %%xmm0\n\t"`
435			`"cvtss2si %%xmm0, %0"`
436			`: "=r"(rv) : "m"(a) : "xmm0" );`
437			`return (rv);`
438			`}`
/// Rounds a single float to uint32_t using SSE.
/// The value is loaded into xmm0 with movss and converted with cvtss2si, which
/// rounds according to the current MXCSR rounding mode rather than truncating.
/// \param a	value to convert.
/// \return	the converted value, reinterpreted as an unsigned 32-bit integer.
// NOTE(review): cvtss2si with a 32-bit destination is a *signed* conversion, so
// inputs of 2^31 and above presumably produce the 0x80000000 "integer
// indefinite" value instead of the expected result -- confirm before relying on
// the upper half of the uint32_t range.
// The result variable is a plain local: the 'register' storage class specifier
// is deprecated since C++11 and removed in C++17, and the "=r" constraint
// already places the result in a general-purpose register.
439			`template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {`
440			`uint32_t rv;`
441			`asm ("movss %1, %%xmm0\n\t"`
442			`"cvtss2si %%xmm0, %0"`
443			`: "=r"(rv) : "m"(a) : "xmm0" );`
444			`return (rv);`
445			`}`
446
// Iterator-source assignment rows for 4-element float / int32_t / uint32_t
// operands, using the 16-byte unaligned copy from SSE_IPASSIGN_SPEC.
447			`SSE_IPASSIGN_SPEC(4,float)`
448			`SSE_IPASSIGN_SPEC(4,int32_t)`
449			`SSE_IPASSIGN_SPEC(4,uint32_t)`
450
// Remove the SSE helper macros defined at the top of this section so they do
// not leak out of the header, then close the CPU_HAS_SSE section.
451			`#undef SSE_IPASSIGN_SPEC`
452			`#undef SSE_PASSIGN_SPEC`
453			`#undef SSE_PKOP2_SPEC`
454			`#undef STD_SSE_ARGS`
455			`#endif // CPU_HAS_SSE`
456
457			`#undef SIMDA_RI`
458			`#undef SIMDA_RO`
459			`#undef SIMDA_WI`
460			`#undef SIMDA_WO`
461			`#undef SIMD_PACKEDOP_SPEC`
462
463			`} // namespace simd`
464			`} // namespace ustl`
465
466			`#endif`