OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [ulaalgo.h] - Blame information for rev 786

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 786 skrzyp
// This file is part of the uSTL library, an STL implementation.
2
//
3
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
4
// This file is free software, distributed under the MIT License.
5
 
6
#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
7
#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
8
 
9
#include "umatrix.h"
10
#include "simd.h"
11
 
12
namespace ustl {
13
 
14
/// \brief Creates an identity matrix in \p m
15
/// \ingroup NumericAlgorithms
16
template <size_t NX, size_t NY, typename T>
17
void load_identity (matrix<NX,NY,T>& m)
18
{
19
    fill_n (m.begin(), NX * NY, 0);
20
    for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1)
21
        *i = 1;
22
}
23
 
24
/// \brief Multiplies two matrices
25
/// \ingroup NumericAlgorithms
26
template <size_t NX, size_t NY, typename T>
27
matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2)
28
{
29
    matrix<NY,NY,T> mr;
30
    for (uoff_t ry = 0; ry < NY; ++ ry) {
31
        for (uoff_t rx = 0; rx < NY; ++ rx) {
32
            T dpv (0);
33
            for (uoff_t x = 0; x < NX; ++ x)
34
                dpv += m1[ry][x] * m2[x][rx];
35
            mr[ry][rx] = dpv;
36
        }
37
    }
38
    return (mr);
39
}
40
 
41
/// \brief Transforms vector \p t with matrix \p m
42
/// \ingroup NumericAlgorithms
43
template <size_t NX, size_t NY, typename T>
44
tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m)
45
{
46
    tuple<NX,T> tr;
47
    for (uoff_t x = 0; x < NX; ++ x) {
48
        T dpv (0);
49
        for (uoff_t y = 0; y < NY; ++ y)
50
            dpv += t[y] * m[y][x];
51
        tr[x] = dpv;
52
    }
53
    return (tr);
54
}
55
 
56
/// \brief Transposes (exchanges rows and columns) matrix \p m.
57
/// \ingroup NumericAlgorithms
58
template <size_t N, typename T>
59
void transpose (matrix<N,N,T>& m)
60
{
61
    for (uoff_t x = 0; x < N; ++ x)
62
        for (uoff_t y = x; y < N; ++ y)
63
            swap (m[x][y], m[y][x]);
64
}
65
 
66
#if WANT_UNROLLED_COPY
67
 
68
#if CPU_HAS_SSE
69
 
70
#if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
71
/// SSE specialization of load_identity for 4x4 float matrices.
/// Broadcasts 1.0f into an xmm register and shuffles the set lane
/// across the four rows instead of doing 16 scalar stores.
template <>
inline void load_identity (matrix<4,4,float>& m)
{
    // %4 is 1.0f in an xmm register (the "x" constraint, gcc/linux only);
    // xmm1 is a scratch copy that shufps rotates so the 1 lands in the
    // lane matching each row. movups is used for the row stores because
    // the matrix storage is not guaranteed 16-byte aligned.
    asm (
        "movaps %4, %%xmm1              \n\t"   // 1 0 0 0
        "movups %4, %0                  \n\t"   // 1 0 0 0
        "shufps $0xB1,%%xmm1,%%xmm1     \n\t"   // 0 1 0 0
        "movups %%xmm1, %1              \n\t"   // 0 1 0 0
        "shufps $0x4F,%4,%%xmm1         \n\t"   // 0 0 1 0
        "shufps $0x1B,%4,%4             \n\t"   // 0 0 0 1
        "movups %%xmm1, %2              \n\t"   // 0 0 1 0
        "movups %4, %3"                         // 0 0 0 1
        : "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0])
        : "x"(1.0f)
        : "xmm1", "memory"
    );
    asm ("":::"memory");    // compiler barrier: keep the stores ordered before later reads of m
}
89
#endif
90
 
91
/// Loads the 16 floats at \p m (a 4x4 row-major matrix) into SSE
/// registers xmm4-xmm7, one row per register, for a subsequent
/// _sse_transform_to_vector call. movups tolerates unaligned \p m.
/// NOTE(review): relies on xmm4-7 surviving between this asm block and
/// the caller's next asm block — valid only because the callers are
/// inline and nothing intervenes; verify with the surrounding code.
inline void _sse_load_matrix (const float* m)
{
    asm (
        "movups %0, %%xmm4      \n\t"   // xmm4 = row 0: m[0..3]
        "movups %1, %%xmm5      \n\t"   // xmm5 = row 1: m[4..7]
        "movups %2, %%xmm6      \n\t"   // xmm6 = row 2: m[8..11]
        "movups %3, %%xmm7"             // xmm7 = row 3: m[12..15]
        : : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12])
        : "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}
102
 
103
/// Multiplies the vector held in xmm0 by the matrix rows held in
/// xmm4-xmm7 (loaded beforehand by _sse_load_matrix) and stores the
/// four-float product at \p result: result = sum over i of t[i]*row[i].
inline void _sse_transform_to_vector (float* result)
{
    asm (
        "movaps %%xmm0, %%xmm1          \n\t" // xmm1 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm2          \n\t" // xmm2 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm3          \n\t" // xmm3 = t[0 1 2 3]
        "shufps $0x00, %%xmm0, %%xmm0   \n\t" // xmm0 = t[0 0 0 0]
        "shufps $0x66, %%xmm1, %%xmm1   \n\t" // xmm1 = t[1 1 1 1]
        // NOTE(review): $0x66 selects lanes (2,1,2,1), not the broadcast
        // t[1 1 1 1] the comment above claims; $0x55 would broadcast t[1].
        // Confirm against upstream uSTL before relying on this path.
        "shufps $0xAA, %%xmm2, %%xmm2   \n\t" // xmm2 = t[2 2 2 2]
        "shufps $0xFF, %%xmm3, %%xmm3   \n\t" // xmm3 = t[3 3 3 3]
        "mulps  %%xmm4, %%xmm0          \n\t" // xmm0 = t[0 0 0 0] * m[0 1 2 3]
        "mulps  %%xmm5, %%xmm1          \n\t" // xmm1 = t[1 1 1 1] * m[0 1 2 3]
        "addps  %%xmm1, %%xmm0          \n\t" // xmm0 = xmm0 + xmm1
        "mulps  %%xmm6, %%xmm2          \n\t" // xmm2 = t[2 2 2 2] * m[0 1 2 3]
        "mulps  %%xmm7, %%xmm3          \n\t" // xmm3 = t[3 3 3 3] * m[0 1 2 3]
        "addps  %%xmm3, %%xmm2          \n\t" // xmm2 = xmm2 + xmm3
        "addps  %%xmm2, %%xmm0          \n\t" // xmm0 = result
        "movups %%xmm0, %0"
        : "=m"(result[0]), "=m"(result[1]), "=m"(result[2]), "=m"(result[3]) :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}
125
 
126
/// SSE specialization: transforms 4-float vector \p t by 4x4 float
/// matrix \p m. The matrix is staged into xmm4-7, the vector into xmm0,
/// and _sse_transform_to_vector performs the multiply-accumulate.
template <>
inline tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    _sse_load_matrix (m.begin());
    // All four elements of t are listed as inputs so the compiler knows
    // the full 16-byte read, though only %0 appears in the template.
    asm ("movups %0, %%xmm0" : : "m"(t[0]), "m"(t[1]), "m"(t[2]), "m"(t[3]) : "xmm0", "memory");
    _sse_transform_to_vector (result.begin());
    return (result);
}
135
 
136
/// SSE specialization: multiplies two 4x4 float matrices. m2 is loaded
/// once into xmm4-7; each row of m1 is then transformed as a vector,
/// producing the corresponding row of the result.
template <>
inline matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
{
    matrix<4,4,float> result;
    _sse_load_matrix (m2.begin());
    for (uoff_t r = 0; r < 4; ++ r) {
        // Load row r of m1 into xmm0; all four elements listed so the
        // compiler sees the full 16-byte read.
        asm ("movups %0, %%xmm0" : : "m"(m1[r][0]), "m"(m1[r][1]), "m"(m1[r][2]), "m"(m1[r][3]) : "xmm0", "memory");
        _sse_transform_to_vector (result[r]);
    }
    return (result);
}
147
 
148
#elif CPU_HAS_3DNOW
149
 
150
/// Specialization for 4-component vector transform, the slow part of 3D graphics.
151
template <>
152
static tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
153
{
154
    tuple<4,float> result;
155
    // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
156
    // If you are writing a 3D engine, you may want to copy it instead of calling it
157
    // because of the femms instruction at the end, which takes 2 cycles.
158
    asm (
159
        "movq      %2, %%mm0            \n\t"   //            y | x
160
        "movq      %3, %%mm1            \n\t"   //            w | z
161
        "movq      %%mm0, %%mm2         \n\t"   //            y | x
162
        "movq      %4, %%mm3            \n\t"   //      m[0][1] | m[0][0]
163
        "punpckldq  %%mm0, %%mm0        \n\t"   //            x | x
164
        "movq      %6, %%mm4            \n\t"   //      m[1][1] | m[1][0]
165
        "pfmul     %%mm0, %%mm3         \n\t"   //    x*m[0][1] | x*m[0][0]
166
        "punpckhdq  %%mm2, %%mm2        \n\t"   //            y | y
167
        "pfmul     %%mm2, %%mm4         \n\t"   //    y*m[1][1] | y*m[1][0]
168
        "movq      %5, %%mm5            \n\t"   //      m[0][3] | m[0][2]
169
        "movq      %7, %%mm7            \n\t"   //      m[1][3] | m[1][2]
170
        "movq      %%mm1, %%mm6         \n\t"   //            w | z
171
        "pfmul     %%mm0, %%mm5         \n\t"   //    x*m[0][3] | v0>x*m[0][2]
172
        "movq      %8, %%mm0            \n\t"   //      m[2][1] | m[2][0]
173
        "punpckldq  %%mm1, %%mm1        \n\t"   //            z | z
174
        "pfmul     %%mm2, %%mm7         \n\t"   //    y*m[1][3] | y*m[1][2]
175
        "movq      %9, %%mm2            \n\t"   //      m[2][3] | m[2][2]
176
        "pfmul     %%mm1, %%mm0         \n\t"   //    z*m[2][1] | z*m[2][0]
177
        "pfadd     %%mm4, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
178
        "movq      %10, %%mm4           \n\t"   //      m[3][1] | m[3][0]
179
        "pfmul     %%mm1, %%mm2         \n\t"   //    z*m[2][3] | z*m[2][2]
180
        "pfadd     %%mm7, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
181
        "movq      %11, %%mm1           \n\t"   //      m[3][3] | m[3][2]
182
        "punpckhdq  %%mm6, %%mm6        \n\t"   //            w | w
183
        "pfadd     %%mm0, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
184
        "pfmul     %%mm6, %%mm4         \n\t"   //    w*m[3][1] | w*m[3][0]
185
        "pfmul     %%mm6, %%mm1         \n\t"   //    w*m[3][3] | w*m[3][2]
186
        "pfadd     %%mm2, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
187
        "pfadd     %%mm4, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
188
        "movq      %%mm3, %0            \n\t"   // store result->y | result->x
189
        "pfadd     %%mm1, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
190
        "movq      %%mm5, %1"                   // store result->w | result->z
191
        : "=m"(result[0]), "=m"(result[2])
192
        : "m"(t[0]), "m"(t[2]),
193
          "m"(m[0][0]), "m"(m[0][2]),
194
          "m"(m[1][0]), "m"(m[1][2]),
195
          "m"(m[2][0]), "m"(m[2][2]),
196
          "m"(m[3][0]), "m"(m[3][2])
197
        : ALL_MMX_REGS_CHANGELIST, "memory"
198
    );
199
    asm ("":::"memory");
200
    simd::reset_mmx();
201
    return (result);
202
}
203
 
204
#else   // If no processor extensions, just unroll the multiplication
205
 
206
/// Specialization for 4-component vector transform, the slow part of 3D graphics.
207
template <>
208
static tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
209
{
210
    tuple<4,float> tr;
211
    for (uoff_t i = 0; i < 4; ++ i)
212
        tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i];
213
    return (tr);
214
}
215
 
216
#endif  // CPU_HAS_3DNOW
217
#endif  // WANT_UNROLLED_COPY
218
 
219
} // namespace ustl
220
 
221
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.