OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [ecos-3.0/] [packages/] [language/] [cxx/] [ustl/] [current/] [include/] [ustl/] [ulaalgo.h] - Blame information for rev 786

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 786 skrzyp
// This file is part of the uSTL library, an STL implementation.
2
//
3
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
4
// This file is free software, distributed under the MIT License.
5
 
6
#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
7
#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
8
 
9
#include "umatrix.h"
10
#include "simd.h"
11
 
12
namespace ustl {
13
 
14
/// \brief Creates an identity matrix in \p m
15
/// \ingroup NumericAlgorithms
16
template <size_t NX, size_t NY, typename T>
17
void load_identity (matrix<NX,NY,T>& m)
18
{
19
    fill_n (m.begin(), NX * NY, 0);
20
    for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1)
21
        *i = 1;
22
}
23
 
24
/// \brief Multiplies two matrices
25
/// \ingroup NumericAlgorithms
26
template <size_t NX, size_t NY, typename T>
27
matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2)
28
{
29
    matrix<NY,NY,T> mr;
30
    for (uoff_t ry = 0; ry < NY; ++ ry) {
31
        for (uoff_t rx = 0; rx < NY; ++ rx) {
32
            T dpv (0);
33
            for (uoff_t x = 0; x < NX; ++ x)
34
                dpv += m1[ry][x] * m2[x][rx];
35
            mr[ry][rx] = dpv;
36
        }
37
    }
38
    return (mr);
39
}
40
 
41
/// \brief Transforms vector \p t with matrix \p m
42
/// \ingroup NumericAlgorithms
43
template <size_t NX, size_t NY, typename T>
44
tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m)
45
{
46
    tuple<NX,T> tr;
47
    for (uoff_t x = 0; x < NX; ++ x) {
48
        T dpv (0);
49
        for (uoff_t y = 0; y < NY; ++ y)
50
            dpv += t[y] * m[y][x];
51
        tr[x] = dpv;
52
    }
53
    return (tr);
54
}
55
 
56
/// \brief Transposes (exchanges rows and columns) matrix \p m.
57
/// \ingroup NumericAlgorithms
58
template <size_t N, typename T>
59
void transpose (matrix<N,N,T>& m)
60
{
61
    for (uoff_t x = 0; x < N; ++ x)
62
        for (uoff_t y = x; y < N; ++ y)
63
            swap (m[x][y], m[y][x]);
64
}
65
 
66
#if WANT_UNROLLED_COPY
67
 
68
#if CPU_HAS_SSE
69
 
70
#if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
71
/// SSE specialization of load_identity for 4x4 float matrices.
/// Broadcasts 1.0f into an xmm register and shuffles the set lane
/// across the four rows instead of doing 16 scalar stores.
template <>
inline void load_identity (matrix<4,4,float>& m)
{
    // %4 is 1.0f in an xmm register (the "x" constraint, gcc/linux only);
    // xmm1 is a scratch copy that shufps rotates so the 1 lands in the
    // lane matching each row. movups is used for the row stores because
    // the matrix storage is not guaranteed 16-byte aligned.
    asm (
        "movaps %4, %%xmm1              \n\t"   // 1 0 0 0
        "movups %4, %0                  \n\t"   // 1 0 0 0
        "shufps $0xB1,%%xmm1,%%xmm1     \n\t"   // 0 1 0 0
        "movups %%xmm1, %1              \n\t"   // 0 1 0 0
        "shufps $0x4F,%4,%%xmm1         \n\t"   // 0 0 1 0
        "shufps $0x1B,%4,%4             \n\t"   // 0 0 0 1
        "movups %%xmm1, %2              \n\t"   // 0 0 1 0
        "movups %4, %3"                         // 0 0 0 1
        : "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0])
        : "x"(1.0f)
        : "xmm1", "memory"
    );
    asm ("":::"memory");    // compiler barrier: keep the stores ordered before later reads of m
}
89
#endif
90
 
91
/// Loads the 16 floats at \p m (a 4x4 row-major matrix) into SSE
/// registers xmm4-xmm7, one row per register, for a subsequent
/// _sse_transform_to_vector call. movups tolerates unaligned \p m.
/// NOTE(review): relies on xmm4-7 surviving between this asm block and
/// the caller's next asm block — valid only because the callers are
/// inline and nothing intervenes; verify with the surrounding code.
inline void _sse_load_matrix (const float* m)
{
    asm (
        "movups %0, %%xmm4      \n\t"   // xmm4 = row 0: m[0..3]
        "movups %1, %%xmm5      \n\t"   // xmm5 = row 1: m[4..7]
        "movups %2, %%xmm6      \n\t"   // xmm6 = row 2: m[8..11]
        "movups %3, %%xmm7"             // xmm7 = row 3: m[12..15]
        : : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12])
        : "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}
102
 
103
/// Multiplies the vector held in xmm0 by the matrix rows held in
/// xmm4-xmm7 (loaded beforehand by _sse_load_matrix) and stores the
/// four-float product at \p result: result = sum over i of t[i]*row[i].
inline void _sse_transform_to_vector (float* result)
{
    asm (
        "movaps %%xmm0, %%xmm1          \n\t" // xmm1 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm2          \n\t" // xmm2 = t[0 1 2 3]
        "movaps %%xmm0, %%xmm3          \n\t" // xmm3 = t[0 1 2 3]
        "shufps $0x00, %%xmm0, %%xmm0   \n\t" // xmm0 = t[0 0 0 0]
        "shufps $0x66, %%xmm1, %%xmm1   \n\t" // xmm1 = t[1 1 1 1]
        // NOTE(review): $0x66 selects lanes (2,1,2,1), not the broadcast
        // t[1 1 1 1] the comment above claims; $0x55 would broadcast t[1].
        // Confirm against upstream uSTL before relying on this path.
        "shufps $0xAA, %%xmm2, %%xmm2   \n\t" // xmm2 = t[2 2 2 2]
        "shufps $0xFF, %%xmm3, %%xmm3   \n\t" // xmm3 = t[3 3 3 3]
        "mulps  %%xmm4, %%xmm0          \n\t" // xmm0 = t[0 0 0 0] * m[0 1 2 3]
        "mulps  %%xmm5, %%xmm1          \n\t" // xmm1 = t[1 1 1 1] * m[0 1 2 3]
        "addps  %%xmm1, %%xmm0          \n\t" // xmm0 = xmm0 + xmm1
        "mulps  %%xmm6, %%xmm2          \n\t" // xmm2 = t[2 2 2 2] * m[0 1 2 3]
        "mulps  %%xmm7, %%xmm3          \n\t" // xmm3 = t[3 3 3 3] * m[0 1 2 3]
        "addps  %%xmm3, %%xmm2          \n\t" // xmm2 = xmm2 + xmm3
        "addps  %%xmm2, %%xmm0          \n\t" // xmm0 = result
        "movups %%xmm0, %0"
        : "=m"(result[0]), "=m"(result[1]), "=m"(result[2]), "=m"(result[3]) :
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
    );
}
125
 
126
/// SSE specialization: transforms 4-float vector \p t by 4x4 float
/// matrix \p m. The matrix is staged into xmm4-7, the vector into xmm0,
/// and _sse_transform_to_vector performs the multiply-accumulate.
template <>
inline tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    _sse_load_matrix (m.begin());
    // All four elements of t are listed as inputs so the compiler knows
    // the full 16-byte read, though only %0 appears in the template.
    asm ("movups %0, %%xmm0" : : "m"(t[0]), "m"(t[1]), "m"(t[2]), "m"(t[3]) : "xmm0", "memory");
    _sse_transform_to_vector (result.begin());
    return (result);
}
135
 
136
/// SSE specialization: multiplies two 4x4 float matrices. m2 is loaded
/// once into xmm4-7; each row of m1 is then transformed as a vector,
/// producing the corresponding row of the result.
template <>
inline matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
{
    matrix<4,4,float> result;
    _sse_load_matrix (m2.begin());
    for (uoff_t r = 0; r < 4; ++ r) {
        // Load row r of m1 into xmm0; all four elements listed so the
        // compiler sees the full 16-byte read.
        asm ("movups %0, %%xmm0" : : "m"(m1[r][0]), "m"(m1[r][1]), "m"(m1[r][2]), "m"(m1[r][3]) : "xmm0", "memory");
        _sse_transform_to_vector (result[r]);
    }
    return (result);
}
147
 
148
#elif CPU_HAS_3DNOW
149
 
150
/// Specialization for 4-component vector transform, the slow part of 3D graphics.
151
template <>
152
static tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
153
{
154
    tuple<4,float> result;
155
    // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
156
    // If you are writing a 3D engine, you may want to copy it instead of calling it
157
    // because of the femms instruction at the end, which takes 2 cycles.
158
    asm (
159
        "movq      %2, %%mm0            \n\t"   //            y | x
160
        "movq      %3, %%mm1            \n\t"   //            w | z
161
        "movq      %%mm0, %%mm2         \n\t"   //            y | x
162
        "movq      %4, %%mm3            \n\t"   //      m[0][1] | m[0][0]
163
        "punpckldq  %%mm0, %%mm0        \n\t"   //            x | x
164
        "movq      %6, %%mm4            \n\t"   //      m[1][1] | m[1][0]
165
        "pfmul     %%mm0, %%mm3         \n\t"   //    x*m[0][1] | x*m[0][0]
166
        "punpckhdq  %%mm2, %%mm2        \n\t"   //            y | y
167
        "pfmul     %%mm2, %%mm4         \n\t"   //    y*m[1][1] | y*m[1][0]
168
        "movq      %5, %%mm5            \n\t"   //      m[0][3] | m[0][2]
169
        "movq      %7, %%mm7            \n\t"   //      m[1][3] | m[1][2]
170
        "movq      %%mm1, %%mm6         \n\t"   //            w | z
171
        "pfmul     %%mm0, %%mm5         \n\t"   //    x*m[0][3] | v0>x*m[0][2]
172
        "movq      %8, %%mm0            \n\t"   //      m[2][1] | m[2][0]
173
        "punpckldq  %%mm1, %%mm1        \n\t"   //            z | z
174
        "pfmul     %%mm2, %%mm7         \n\t"   //    y*m[1][3] | y*m[1][2]
175
        "movq      %9, %%mm2            \n\t"   //      m[2][3] | m[2][2]
176
        "pfmul     %%mm1, %%mm0         \n\t"   //    z*m[2][1] | z*m[2][0]
177
        "pfadd     %%mm4, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
178
        "movq      %10, %%mm4           \n\t"   //      m[3][1] | m[3][0]
179
        "pfmul     %%mm1, %%mm2         \n\t"   //    z*m[2][3] | z*m[2][2]
180
        "pfadd     %%mm7, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
181
        "movq      %11, %%mm1           \n\t"   //      m[3][3] | m[3][2]
182
        "punpckhdq  %%mm6, %%mm6        \n\t"   //            w | w
183
        "pfadd     %%mm0, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
184
        "pfmul     %%mm6, %%mm4         \n\t"   //    w*m[3][1] | w*m[3][0]
185
        "pfmul     %%mm6, %%mm1         \n\t"   //    w*m[3][3] | w*m[3][2]
186
        "pfadd     %%mm2, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
187
        "pfadd     %%mm4, %%mm3         \n\t"   // x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
188
        "movq      %%mm3, %0            \n\t"   // store result->y | result->x
189
        "pfadd     %%mm1, %%mm5         \n\t"   // x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
190
        "movq      %%mm5, %1"                   // store result->w | result->z
191
        : "=m"(result[0]), "=m"(result[2])
192
        : "m"(t[0]), "m"(t[2]),
193
          "m"(m[0][0]), "m"(m[0][2]),
194
          "m"(m[1][0]), "m"(m[1][2]),
195
          "m"(m[2][0]), "m"(m[2][2]),
196
          "m"(m[3][0]), "m"(m[3][2])
197
        : ALL_MMX_REGS_CHANGELIST, "memory"
198
    );
199
    asm ("":::"memory");
200
    simd::reset_mmx();
201
    return (result);
202
}
203
 
204
#else   // If no processor extensions, just unroll the multiplication
205
 
206
/// Specialization for 4-component vector transform, the slow part of 3D graphics.
207
template <>
208
static tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
209
{
210
    tuple<4,float> tr;
211
    for (uoff_t i = 0; i < 4; ++ i)
212
        tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i];
213
    return (tr);
214
}
215
 
216
#endif  // CPU_HAS_3DNOW
217
#endif  // WANT_UNROLLED_COPY
218
 
219
} // namespace ustl
220
 
221
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.