// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.

#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1

#include "umatrix.h"
#include "simd.h"

namespace ustl {

/// \brief Creates an identity matrix in \p m
/// \ingroup NumericAlgorithms
template <size_t NX, size_t NY, typename T>
void load_identity (matrix<NX,NY,T>& m)
{
    fill_n (m.begin(), NX * NY, 0);
    for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1)
	*i = 1;
}
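
// Example: after
//	matrix<4,4,float> m;
//	load_identity (m);
// m holds ones on the diagonal and zeros elsewhere; the iterator
// stride of NX+1 steps from one diagonal element to the next.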

/// \brief Multiplies two matrices
/// \ingroup NumericAlgorithms
template <size_t NX, size_t NY, typename T>
matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2)
{
    matrix<NY,NY,T> mr;
    for (uoff_t ry = 0; ry < NY; ++ ry) {
	for (uoff_t rx = 0; rx < NY; ++ rx) {
	    T dpv (0);
	    for (uoff_t x = 0; x < NX; ++ x)
		dpv += m1[ry][x] * m2[x][rx];
	    mr[ry][rx] = dpv;
	}
    }
    return (mr);
}
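
// Dimension note: m1 is NY rows by NX columns and m2 is NX rows by NY
// columns, so the product is the square NYxNY matrix whose element
// [ry][rx] is the dot product of row ry of m1 with column rx of m2.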

/// \brief Transforms vector \p t with matrix \p m
/// \ingroup NumericAlgorithms
template <size_t NX, size_t NY, typename T>
tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m)
{
    tuple<NX,T> tr;
    for (uoff_t x = 0; x < NX; ++ x) {
	T dpv (0);
	for (uoff_t y = 0; y < NY; ++ y)
	    dpv += t[y] * m[y][x];
	tr[x] = dpv;
    }
    return (tr);
}
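
// This is the row-vector convention common in 3D graphics: tr = t * m,
// i.e. tr[x] is the dot product of t with column x of m.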

/// \brief Transposes (exchanges rows and columns) matrix \p m.
/// \ingroup NumericAlgorithms
template <size_t N, typename T>
void transpose (matrix<N,N,T>& m)
{
    for (uoff_t x = 0; x < N; ++ x)
	for (uoff_t y = x; y < N; ++ y)
	    swap (m[x][y], m[y][x]);
}
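
// Only the upper triangle is walked (y starts at x), so each
// off-diagonal pair is swapped exactly once; the diagonal swaps
// with itself, which is harmless.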

#if WANT_UNROLLED_COPY

#if CPU_HAS_SSE

#if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
template <>
inline void load_identity (matrix<4,4,float>& m)
{
    asm (
	"movaps %4, %%xmm1		\n\t"	// 1 0 0 0
	"movups %4, %0			\n\t"	// 1 0 0 0
	"shufps $0xB1,%%xmm1,%%xmm1	\n\t"	// 0 1 0 0
	"movups %%xmm1, %1		\n\t"	// 0 1 0 0
	"shufps $0x4F,%4,%%xmm1		\n\t"	// 0 0 1 0
	"shufps $0x1B,%4,%4		\n\t"	// 0 0 0 1
	"movups %%xmm1, %2		\n\t"	// 0 0 1 0
	"movups %4, %3"				// 0 0 0 1
	: "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0])
	: "x"(1.0f)
	: "xmm1", "memory"
    );
    asm ("":::"memory");
}
#endif

inline void _sse_load_matrix (const float* m)
{
    asm (
	"movups %0, %%xmm4	\n\t"	// xmm4 = m[0][0..3] (row 0)
	"movups %1, %%xmm5	\n\t"	// xmm5 = m[1][0..3] (row 1)
	"movups %2, %%xmm6	\n\t"	// xmm6 = m[2][0..3] (row 2)
	"movups %3, %%xmm7"		// xmm7 = m[3][0..3] (row 3)
	: : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12])
	: "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}
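
// Register protocol for the SSE helpers: _sse_load_matrix leaves the
// four matrix rows in xmm4-xmm7, the caller loads the input vector
// into xmm0, and _sse_transform_to_vector below broadcasts each vector
// component, multiplies it by the matching row, and sums the products.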

inline void _sse_transform_to_vector (float* result)
{
    asm (
	"movaps %%xmm0, %%xmm1		\n\t"	// xmm1 = t[0 1 2 3]
	"movaps %%xmm0, %%xmm2		\n\t"	// xmm2 = t[0 1 2 3]
	"movaps %%xmm0, %%xmm3		\n\t"	// xmm3 = t[0 1 2 3]
	"shufps $0x00, %%xmm0, %%xmm0	\n\t"	// xmm0 = t[0 0 0 0]
	"shufps $0x55, %%xmm1, %%xmm1	\n\t"	// xmm1 = t[1 1 1 1]
	"shufps $0xAA, %%xmm2, %%xmm2	\n\t"	// xmm2 = t[2 2 2 2]
	"shufps $0xFF, %%xmm3, %%xmm3	\n\t"	// xmm3 = t[3 3 3 3]
	"mulps %%xmm4, %%xmm0		\n\t"	// xmm0 = t[0 0 0 0] * m row 0
	"mulps %%xmm5, %%xmm1		\n\t"	// xmm1 = t[1 1 1 1] * m row 1
	"addps %%xmm1, %%xmm0		\n\t"	// xmm0 = xmm0 + xmm1
	"mulps %%xmm6, %%xmm2		\n\t"	// xmm2 = t[2 2 2 2] * m row 2
	"mulps %%xmm7, %%xmm3		\n\t"	// xmm3 = t[3 3 3 3] * m row 3
	"addps %%xmm3, %%xmm2		\n\t"	// xmm2 = xmm2 + xmm3
	"addps %%xmm2, %%xmm0		\n\t"	// xmm0 = result
	"movups %%xmm0, %0"
	: "=m"(result[0]), "=m"(result[1]), "=m"(result[2]), "=m"(result[3]) :
	: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}

template <>
inline tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    _sse_load_matrix (m.begin());
    asm ("movups %0, %%xmm0" : : "m"(t[0]), "m"(t[1]), "m"(t[2]), "m"(t[3]) : "xmm0", "memory");
    _sse_transform_to_vector (result.begin());
    return (result);
}

template <>
inline matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
{
    matrix<4,4,float> result;
    _sse_load_matrix (m2.begin());
    for (uoff_t r = 0; r < 4; ++ r) {
	asm ("movups %0, %%xmm0" : : "m"(m1[r][0]), "m"(m1[r][1]), "m"(m1[r][2]), "m"(m1[r][3]) : "xmm0", "memory");
	_sse_transform_to_vector (result[r]);
    }
    return (result);
}
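
// The matrix product reuses the same helpers: with m2's rows held in
// xmm4-xmm7, each row of m1 is transformed in turn to produce the
// corresponding row of the result.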

#elif CPU_HAS_3DNOW

/// Specialization for 4-component vector transform, the slow part of 3D graphics.
template <>
inline tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    // This is taken from the "AMD Athlon Code Optimization Guide". 18 cycles!
    // If you are writing a 3D engine, you may want to copy it instead of calling it
    // because of the femms instruction at the end, which takes 2 cycles.
    asm (
	"movq %2, %%mm0		\n\t"	//           y | x
	"movq %3, %%mm1		\n\t"	//           w | z
	"movq %%mm0, %%mm2	\n\t"	//           y | x
	"movq %4, %%mm3		\n\t"	//     m[0][1] | m[0][0]
	"punpckldq %%mm0, %%mm0	\n\t"	//           x | x
	"movq %6, %%mm4		\n\t"	//     m[1][1] | m[1][0]
	"pfmul %%mm0, %%mm3	\n\t"	//   x*m[0][1] | x*m[0][0]
	"punpckhdq %%mm2, %%mm2	\n\t"	//           y | y
	"pfmul %%mm2, %%mm4	\n\t"	//   y*m[1][1] | y*m[1][0]
	"movq %5, %%mm5		\n\t"	//     m[0][3] | m[0][2]
	"movq %7, %%mm7		\n\t"	//     m[1][3] | m[1][2]
	"movq %%mm1, %%mm6	\n\t"	//           w | z
	"pfmul %%mm0, %%mm5	\n\t"	//   x*m[0][3] | x*m[0][2]
	"movq %8, %%mm0		\n\t"	//     m[2][1] | m[2][0]
	"punpckldq %%mm1, %%mm1	\n\t"	//           z | z
	"pfmul %%mm2, %%mm7	\n\t"	//   y*m[1][3] | y*m[1][2]
	"movq %9, %%mm2		\n\t"	//     m[2][3] | m[2][2]
	"pfmul %%mm1, %%mm0	\n\t"	//   z*m[2][1] | z*m[2][0]
	"pfadd %%mm4, %%mm3	\n\t"	// x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
	"movq %10, %%mm4	\n\t"	//     m[3][1] | m[3][0]
	"pfmul %%mm1, %%mm2	\n\t"	//   z*m[2][3] | z*m[2][2]
	"pfadd %%mm7, %%mm5	\n\t"	// x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
	"movq %11, %%mm1	\n\t"	//     m[3][3] | m[3][2]
	"punpckhdq %%mm6, %%mm6	\n\t"	//           w | w
	"pfadd %%mm0, %%mm3	\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
	"pfmul %%mm6, %%mm4	\n\t"	//   w*m[3][1] | w*m[3][0]
	"pfmul %%mm6, %%mm1	\n\t"	//   w*m[3][3] | w*m[3][2]
	"pfadd %%mm2, %%mm5	\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
	"pfadd %%mm4, %%mm3	\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
	"movq %%mm3, %0		\n\t"	// store result->y | result->x
	"pfadd %%mm1, %%mm5	\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
	"movq %%mm5, %1"		// store result->w | result->z
	: "=m"(result[0]), "=m"(result[2])
	: "m"(t[0]), "m"(t[2]),
	  "m"(m[0][0]), "m"(m[0][2]),
	  "m"(m[1][0]), "m"(m[1][2]),
	  "m"(m[2][0]), "m"(m[2][2]),
	  "m"(m[3][0]), "m"(m[3][2])
	: ALL_MMX_REGS_CHANGELIST, "memory"
    );
    asm ("":::"memory");
    simd::reset_mmx();
    return (result);
}
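
// reset_mmx() issues the femms mentioned above, restoring the FPU
// state so ordinary floating-point code can safely follow.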

#else	// If no processor extensions, just unroll the multiplication

/// Specialization for 4-component vector transform, the slow part of 3D graphics.
template <>
inline tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> tr;
    for (uoff_t i = 0; i < 4; ++ i)
	tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i];
    return (tr);
}

#endif	// CPU_HAS_3DNOW
#endif	// WANT_UNROLLED_COPY

} // namespace ustl

#endif