URL
https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc
[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [simd.h] - Rev 786
Compare with Previous | Blame | View Log
// This file is part of the uSTL library, an STL implementation. // // Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net> // This file is free software, distributed under the MIT License. // /// \file simd.h /// \brief SIMD-type algorithms, with hardware acceleration, if available. /// /// All algorithms are container-based because iterator syntax is just too /// damn verbose and because the specializations need to be able to tell /// how many elements are in the container in order to choose proper SIMD /// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!) /// Specializations are only for the tuple template because the container /// must be of a fixed and compile-time-known size for the compiler to be /// able to choose the specialization. #ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9 #define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9 #include "ulimits.h" #if HAVE_MATH_H #include <math.h> #endif namespace ustl { namespace simd { //---------------------------------------------------------------------- // Generic algorithms //---------------------------------------------------------------------- /// Applies \p op to each element in \p op1. template <typename Ctr, typename UnaryOperation> inline void packop (Ctr& op1, UnaryOperation op) { foreach (typename Ctr::iterator, i, op1) op (*i); } /// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2. template <typename Ctr, typename BinaryOperation> inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op) { assert (op2.size() <= op1.size()); typename Ctr::const_iterator i1 (op1.begin()); typename Ctr::iterator i2 (op2.begin()); for (; i2 != op2.end(); ++i1, ++i2) *i2 = op (*i2, *i1); } /// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result. template <typename Ctr, typename BinaryOperation> inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op) { assert (op1.size() <= op2.size() && op1.size() <= result.size()); passign (op1, result); packop (op2, result); } /// Copies \p op1 into \p result. template <typename Ctr> inline void passign (const Ctr& op1, Ctr& result) { assert (op1.size() <= result.size()); typename Ctr::iterator d (result.begin()); foreach (typename Ctr::const_iterator, s, op1) *d++ = *s; } /// Copies \p result.size() elements from \p op1 to \p result. template <typename Ctr> inline void ipassign (typename Ctr::const_iterator op1, Ctr& result) { foreach (typename Ctr::iterator, d, result) *d = *op1++; } template <typename Ctr1, typename Ctr2, typename ConvertFunction> inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f) { assert (op1.size() <= op2.size()); typename Ctr1::const_iterator i1 (op1.begin()); typename Ctr2::iterator i2 (op2.begin()); for (; i1 != op1.end(); ++i1, ++i2) *i2 = f (*i1); } // Functionoids for SIMD operations, like saturation arithmetic, shifts, etc. STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b)) STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b)) STD_BINARY_FUNCTOR (fpshl, T, (a << b)) STD_BINARY_FUNCTOR (fpshr, T, (a >> b)) STD_BINARY_FUNCTOR (fpmin, T, (min (a, b))) STD_BINARY_FUNCTOR (fpmax, T, (max (a, b))) STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2)) STD_CONVERSION_FUNCTOR (fcast, (D(a))) #if HAVE_MATH_H STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a)) STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a)))) STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a)))) STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a)))) STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a)))) STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a)))) #if HAVE_RINTF STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a)))) #else STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a)))) #endif template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); } #endif template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); } template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); } #define SIMD_PACKEDOP1(name, operation) \ template <typename Ctr> \ inline void name (Ctr& op1) \ { \ typedef typename Ctr::value_type value_t; \ packop (op1, operation<value_t>()); \ } #define SIMD_PACKEDOP2(name, operation) \ template <typename Ctr> \ inline void name (const Ctr& op1, Ctr& op2) \ { \ typedef typename Ctr::value_type value_t; \ packop (op1, op2, operation<value_t>()); \ } #define SIMD_PACKEDOP3(name, operation) \ template <typename Ctr> \ inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \ { \ typedef typename Ctr::value_type value_t; \ packop (op1, op2, result, operation<value_t>()); \ } #define SIMD_SINGLEOP1(name, operation) \ template <typename T> \ inline T name (T op) \ { \ operation<T> obj; \ return (obj(op)); \ } #define SIMD_CONVERTOP(name, operation) \ template <typename Ctr1, typename Ctr2> \ inline void name (const Ctr1& op1, Ctr2& op2) \ { \ typedef typename Ctr1::value_type value1_t; \ typedef typename Ctr2::value_type value2_t; \ pconvert (op1, op2, operation<value1_t, value2_t>());\ } SIMD_PACKEDOP2 (padd, plus) SIMD_PACKEDOP2 (psub, minus) SIMD_PACKEDOP2 (pmul, multiplies) SIMD_PACKEDOP2 (pdiv, divides) SIMD_PACKEDOP2 (pand, bitwise_and) SIMD_PACKEDOP2 (por, bitwise_or) SIMD_PACKEDOP2 (pxor, bitwise_xor) SIMD_PACKEDOP2 (pshl, fpshl) SIMD_PACKEDOP2 (pshr, fpshr) SIMD_PACKEDOP2 (psubs, fpsubs) SIMD_PACKEDOP2 (pmin, fpmin) SIMD_PACKEDOP2 (pmax, fpmax) SIMD_PACKEDOP2 (pavg, fpavg) SIMD_PACKEDOP3 (padd, plus) SIMD_PACKEDOP3 (psub, minus) SIMD_PACKEDOP3 (pmul, multiplies) SIMD_PACKEDOP3 (pdiv, divides) SIMD_PACKEDOP3 (pand, bitwise_and) SIMD_PACKEDOP3 (por, bitwise_or) SIMD_PACKEDOP3 (pxor, bitwise_xor) SIMD_PACKEDOP3 (pshl, fpshl) SIMD_PACKEDOP3 (pshr, fpshr) SIMD_PACKEDOP3 (padds, fpadds) SIMD_PACKEDOP3 (psubs, fpsubs) SIMD_PACKEDOP3 (pmin, fpmin) SIMD_PACKEDOP3 (pmax, fpmax) SIMD_PACKEDOP3 (pavg, fpavg) #if HAVE_MATH_H SIMD_PACKEDOP1 (precip, fpreciprocal) SIMD_PACKEDOP1 (psqrt, fpsqrt) SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt) SIMD_PACKEDOP1 (psin, fsin) SIMD_PACKEDOP1 (pcos, fcos) SIMD_PACKEDOP1 (ptan, ftan) SIMD_SINGLEOP1 (srecip, fpreciprocal) SIMD_SINGLEOP1 (ssqrt, fpsqrt) SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt) SIMD_SINGLEOP1 (ssin, fsin) SIMD_SINGLEOP1 (scos, fcos) SIMD_SINGLEOP1 (stan, ftan) SIMD_CONVERTOP (pround, fround) template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); } #endif #undef SIMD_SINGLEOP1 #undef SIMD_PACKEDOP3 #undef SIMD_PACKEDOP2 #undef SIMD_PACKEDOP1 //---------------------------------------------------------------------- // Vector types to cast tuple data to //---------------------------------------------------------------------- #if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4 #define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs))) #else #define VECTOR_ATTRIBUTE(mode,vs) #endif typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8); typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8); typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16); typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8); typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16); #if HAVE_INT64_T typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8); #endif typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8); typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16); typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16); #undef VECTOR_ATTRIBUTE #define SIMDA_RI(n) "m"(oin[n]) #define SIMDA_RO(n) "m"(oout[n]) #define SIMDA_WI(n) "=m"(oin[n]) #define SIMDA_WO(n) "=m"(oout[n]) //---------------------------------------------------------------------- // Hardware accelerated specializations //---------------------------------------------------------------------- #define SIMD_PKOP2_SPEC(n, type, optype) \ template <> \ inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>) #define SIMD_PASSIGN_SPEC(n, type) \ template <> \ inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout) #define SIMD_IPASSIGN_SPEC(n, type) \ template <> \ inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout) #define SIMD_CONVERT_SPEC(n, type1, type2, optype) \ template <> \ inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>) #if CPU_HAS_MMX #define STD_MMX_ARGS : "m"(oout[0]), "m"(oin[0]) : "mm0", "st", "memory" #define DBL_MMX_ARGS : "m"(oout[0]), "m"(oout[2]), "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory" #define MMX_PKOP2_SPEC(n,type,optype,instruction) \ SIMD_PKOP2_SPEC(n,type,optype) \ { asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } #define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \ SIMD_PKOP2_SPEC(n,type,optype) \ { asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } #define MMX_PASSIGN_SPEC(n,type) \ SIMD_PASSIGN_SPEC(n,type) \ { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } #define MMX_DBL_PASSIGN_SPEC(n,type) \ SIMD_PASSIGN_SPEC(n,type) \ { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } #define MMX_IPASSIGN_SPEC(n,type) \ SIMD_IPASSIGN_SPEC(n,type) \ { asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } #define MMX_DBL_IPASSIGN_SPEC(n,type) \ SIMD_IPASSIGN_SPEC(n,type) \ { asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } MMX_PASSIGN_SPEC(8,uint8_t) MMX_PKOP2_SPEC(8,uint8_t,plus,paddb) MMX_PKOP2_SPEC(8,uint8_t,minus,psubb) MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand) MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por) MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor) MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb) MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb) MMX_PASSIGN_SPEC(8,int8_t) MMX_PKOP2_SPEC(8,int8_t,plus,paddb) MMX_PKOP2_SPEC(8,int8_t,minus,psubb) MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand) MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por) MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor) MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb) MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb) MMX_PASSIGN_SPEC(4,uint16_t) MMX_PKOP2_SPEC(4,uint16_t,plus,paddw) MMX_PKOP2_SPEC(4,uint16_t,minus,psubw) MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand) MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por) MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor) /// \todo psllw does not work like other operations, it uses the first element for shift count. //MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw) //MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw) MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw) MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw) MMX_PASSIGN_SPEC(4,int16_t) MMX_PKOP2_SPEC(4,int16_t,plus,paddw) MMX_PKOP2_SPEC(4,int16_t,minus,psubw) MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand) MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por) MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor) //MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw) //MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw) MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw) MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw) MMX_PASSIGN_SPEC(2,uint32_t) MMX_PKOP2_SPEC(2,uint32_t,plus,paddd) MMX_PKOP2_SPEC(2,uint32_t,minus,psubd) MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand) MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por) MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor) //MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld) //MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld) MMX_PASSIGN_SPEC(2,int32_t) MMX_PKOP2_SPEC(2,int32_t,plus,paddd) MMX_PKOP2_SPEC(2,int32_t,minus,psubd) MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand) MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por) MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor) //MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld) //MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld) MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd) MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd) MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand) MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por) MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor) //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld) //MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld) MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd) MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd) MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand) MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por) MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor) //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld) //MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld) #if CPU_HAS_SSE || CPU_HAS_3DNOW MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb) MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb) MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw) MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw) MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub) MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub) MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw) MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw) #endif // CPU_HAS_SSE || CPU_HAS_3DNOW #if CPU_HAS_3DNOW MMX_PASSIGN_SPEC(2,float) MMX_PKOP2_SPEC(2,float,plus,pfadd) MMX_PKOP2_SPEC(2,float,minus,pfsub) MMX_PKOP2_SPEC(2,float,multiplies,pfmul) MMX_PKOP2_SPEC(2,float,fpmin,pfmin) MMX_PKOP2_SPEC(2,float,fpmax,pfmax) #ifndef CPU_HAS_SSE MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd) MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub) MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul) MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin) MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax) #endif #endif // CPU_HAS_3DNOW MMX_IPASSIGN_SPEC(8,uint8_t) MMX_IPASSIGN_SPEC(4,uint16_t) MMX_IPASSIGN_SPEC(2,uint32_t) MMX_IPASSIGN_SPEC(2,float) #ifndef CPU_HAS_SSE MMX_DBL_PASSIGN_SPEC(4,float) MMX_DBL_PASSIGN_SPEC(4,uint32_t) MMX_DBL_PASSIGN_SPEC(4,int32_t) MMX_DBL_IPASSIGN_SPEC(4,float) MMX_DBL_IPASSIGN_SPEC(4,uint32_t) MMX_DBL_IPASSIGN_SPEC(4,int32_t) #endif #undef MMX_IPASSIGN_SPEC #undef MMX_PASSIGN_SPEC #undef MMX_PKOP2_SPEC #undef STD_MMX_ARGS #endif // CPU_HAS_MMX #if CPU_HAS_SSE #define STD_SSE_ARGS : "m"(oout[0]), "m"(oin[0]) : "xmm0", "memory" #define SSE_PKOP2_SPEC(n,type,optype,instruction) \ SIMD_PKOP2_SPEC(n,type,optype) \ { asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} #define SSE_PASSIGN_SPEC(n,type) \ SIMD_PASSIGN_SPEC(n,type) \ { asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} #define SSE_IPASSIGN_SPEC(n,type) \ SIMD_IPASSIGN_SPEC(n,type) \ { asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} SSE_PASSIGN_SPEC(4,float) SSE_PASSIGN_SPEC(4,int32_t) SSE_PASSIGN_SPEC(4,uint32_t) SSE_PKOP2_SPEC(4,float,plus,addps) SSE_PKOP2_SPEC(4,float,minus,subps) SSE_PKOP2_SPEC(4,float,multiplies,mulps) SSE_PKOP2_SPEC(4,float,divides,divps) SSE_PKOP2_SPEC(4,float,bitwise_and,andps) SSE_PKOP2_SPEC(4,float,bitwise_or,orps) SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps) SSE_PKOP2_SPEC(4,float,fpmax,maxps) SSE_PKOP2_SPEC(4,float,fpmin,minps) SIMD_CONVERT_SPEC(4,float,int32_t,fround) { asm ("cvtps2pi %2, %%mm0\n\t" "cvtps2pi %3, %%mm1\n\t" "movq %%mm0, %0\n\t" "movq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } SIMD_CONVERT_SPEC(4,int32_t,float,fround) { asm ("cvtpi2ps %2, %%xmm0\n\t" "shufps $0x4E,%%xmm0,%%xmm0\n\t" "cvtpi2ps %1, %%xmm0\n\t" "movups %%xmm0, %0" :: "m"(oout[0]), "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory"); } template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const { register int32_t rv; asm ("movss %1, %%xmm0\n\t" "cvtss2si %%xmm0, %0" : "=r"(rv) : "m"(a) : "xmm0" ); return (rv); } template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const { register uint32_t rv; asm ("movss %1, %%xmm0\n\t" "cvtss2si %%xmm0, %0" : "=r"(rv) : "m"(a) : "xmm0" ); return (rv); } SSE_IPASSIGN_SPEC(4,float) SSE_IPASSIGN_SPEC(4,int32_t) SSE_IPASSIGN_SPEC(4,uint32_t) #undef SSE_IPASSIGN_SPEC #undef SSE_PASSIGN_SPEC #undef SSE_PKOP2_SPEC #undef STD_SSE_ARGS #endif // CPU_HAS_SSE #undef SIMDA_RI #undef SIMDA_RO #undef SIMDA_WI #undef SIMDA_WO #undef SIMD_PACKEDOP_SPEC } // namespace simd } // namespace ustl #endif