// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.

#ifndef NDEBUG	// Optimized code here. asserts slow it down, and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"

namespace ustl {

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> static inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }
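// Note: this loop is only the portable fallback; on i386/x86_64 the specializations
// below replace it with a single "rep stos" instruction. Illustrative use (hypothetical
// buffer buf): uint16_t* p = buf; stosv (p, 4, uint16_t(7)); writes four 7s and leaves
// p advanced past them.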

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

#if __GNUC__ >= 3
static inline void movsb_dir_up (void) INLINE;
static inline void movsb_dir_down (void) INLINE;
static inline void movsb (const void*& src, size_t nBytes, void*& dest) INLINE;
static inline void movsd (const void*& src, size_t nWords, void*& dest) INLINE;
#endif

static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }

static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
        : "=&S"(src), "=&D"(dest), "=&c"(nBytes)
        : "0"(src), "1"(dest), "2"(nBytes)
        : "memory");
}

static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
        : "=&S"(src), "=&D"(dest), "=&c"(nWords)
        : "0"(src), "1"(dest), "2"(nWords)
        : "memory");
}
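// The "S", "D", and "c" constraints pin src, dest, and the count to ESI, EDI, and ECX,
// which "rep movs" both reads and updates; taking the pointers by reference therefore
// returns them already advanced past the copied range. movsb_dir_up/movsb_dir_down set
// the direction flag (cld/std) that decides whether the copy walks forward or backward.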

template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
    { asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
    { asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
    { asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }

#if CPU_HAS_MMX
#define MMX_ALIGN 16U	// Data must be aligned on this grain
#define MMX_BS 32U	// Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) INLINE;
static inline void simd_block_store (uint8_t* dest) INLINE;
static inline void simd_block_cleanup (void) INLINE;

static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
#if CPU_HAS_SSE
    asm (
        "movaps\t%2, %%xmm0 \n\t"
        "movaps\t%3, %%xmm1 \n\t"
        "movntps\t%%xmm0, %0 \n\t"
        "movntps\t%%xmm1, %1"
        : "=m"(cdest[0]), "=m"(cdest[16])
        : "m"(csrc[0]), "m"(csrc[16])
        : "xmm0", "xmm1", "memory");
#else
    asm (
        "movq %4, %%mm0 \n\t"
        "movq %5, %%mm1 \n\t"
        "movq %6, %%mm2 \n\t"
        "movq %7, %%mm3 \n\t"
        "movq %%mm0, %0 \n\t"
        "movq %%mm1, %1 \n\t"
        "movq %%mm2, %2 \n\t"
        "movq %%mm3, %3"
        : "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
        : "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
        : "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory");
#endif
}
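// movntps (and movntq below) are non-temporal stores: they write the 32-byte block
// straight to memory without pulling the destination into the cache, which is why
// simd_block_cleanup() must issue an sfence before the copied data is relied upon.
// Each variant moves the block as two 16-byte (SSE) or four 8-byte (MMX) chunks.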

static inline void simd_block_store (uint8_t* dest)
{
#if CPU_HAS_SSE
    asm volatile (
        "movntq %%mm0, %0\n\t"
        "movntq %%mm0, %1\n\t"
        "movntq %%mm0, %2\n\t"
        "movntq %%mm0, %3"
        : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
        :: "memory");
#else
    asm volatile (
        "movq %%mm0, %0 \n\t"
        "movq %%mm0, %1 \n\t"
        "movq %%mm0, %2 \n\t"
        "movq %%mm0, %3"
        : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
        :: "memory");
#endif
}

static inline void simd_block_cleanup (void)
{
#if !CPU_HAS_SSE
    simd::reset_mmx();
#endif
    asm volatile ("sfence");
}

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest) throw()
{
    movsb_dir_up();
    size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
        const size_t nMiddleBlocks = nBytes / MMX_BS;
        for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
            prefetch (advance (src, 512), 0, 0);
            simd_block_copy (src, dest);
            src = advance (src, MMX_BS);
            dest = advance (dest, MMX_BS);
        }
        simd_block_cleanup();
        nBytes %= MMX_BS;
    }
    movsb (src, nBytes, dest);
}
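// Sketch of the flow (illustrative call with hypothetical buffers): copy_n_fast (in, 1000, out)
// copies with movsb until src reaches a 16-byte (MMX_ALIGN) boundary, then, if dest is also
// 16-byte aligned, streams 32-byte (MMX_BS) blocks with simd_block_copy while prefetching
// 512 bytes ahead, and finishes the unaligned tail with movsb.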
#endif // CPU_HAS_MMX

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result) throw()
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    if (uintptr_t(result) % 4 == 3) {
        const size_t nMiddleBlocks = nBytes / 4;
        last = advance (last, -3);
        result = advance (result, -3);
        movsd (last, nMiddleBlocks, result);
        nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();
}
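// As in std::copy_backward, 'result' is the end of the destination range: the bytes of
// [first, last) are copied highest-address first, so the copy is safe when the destination
// overlaps the source from above. rotate_fast() below relies on exactly that property.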
#endif // __i386__

//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

#if CPU_HAS_MMX
template <typename T> static inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
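// build_block() broadcasts the fill value into all 8 bytes of %mm0 (e.g. a uint8_t v
// becomes v repeated eight times), so that simd_block_store() can then splat that
// register over 32 bytes of destination per call.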

static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (const uint8_t* destEnd = dest + count * MMX_BS; dest < destEnd; dest += MMX_BS)
        simd_block_store (dest);
    simd_block_cleanup();
    simd::reset_mmx();
}

template <typename T>
static inline void fill_n_fast (T* dest, size_t count, T v)
{
    // Number of elements before dest reaches an MMX_ALIGN boundary
    // (the difference must be taken before dividing by sizeof(T)).
    size_t nHead = (Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);
    uint8_t* bdest = (uint8_t*) dest;
    simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);	// Elements (not bytes) left over after the full SIMD blocks.
    dest = (T*) bdest;
    stosv (dest, count, v);
}
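// Illustrative call (hypothetical buffer buf): fill_n16_fast (buf, n, 0xABCD) writes single
// elements with "rep stosw" until buf is 16-byte aligned, streams 32-byte blocks of the
// value replicated in %mm0, then writes the few remaining elements the same way.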

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw()
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw() { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw() { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw() { stosv (dest, count, v); }
#endif // CPU_HAS_MMX

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last) throw()
{
#if HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin)
        return;
    void* buf = alloca (hmin);
    if (buf) {
        if (half2 < half1) {
            copy_n_fast (middle, half2, buf);
            copy_backward_fast (first, middle, last);
            copy_n_fast (buf, half2, first);
        } else {
            copy_n_fast (first, half1, buf);
            copy_n_fast (middle, half2, first);
            copy_n_fast (buf, half1, advance (first, half2));
        }
    } else
#else
    if (first == middle || middle == last)
        return;
#endif
    {
        char* f = (char*) first;
        char* m = (char*) middle;
        char* l = (char*) last;
        reverse (f, m);
        reverse (m, l);
        while (f != m && m != l)
            iter_swap (f++, --l);
        reverse (f, (f == m ? l : m));
    }
}
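// Example: rotating the six bytes "abcdef" with middle = first + 2 yields "cdefab".
// The alloca path buffers the shorter half and shifts the longer half with a single
// overlapping copy; the fallback block reverses each half and then swaps elements
// across the seam, which is equivalent to the classic three-reversal rotate.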

#if __GNUC__ < 4
/// Returns the number of 1s in \p v in binary.
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555);	// Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return ((((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
}
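// This is the standard SWAR bit count: 2-bit sums, then 4-bit sums, then per-byte sums,
// and finally a multiply that accumulates the byte counts into the top byte. For example,
// popcount (uint32_t(0x0F0F0F0F)) returns 16 and popcount (uint32_t(7)) returns 3.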

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);	// Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif // HAVE_INT64_T
#endif // __GNUC__ < 4

//----------------------------------------------------------------------
// Miscellaneous instantiated stuff from headers which don't have enough
// to warrant creation of a separate file.cc
//----------------------------------------------------------------------

// Used in uspecial to print printable characters
const char _FmtPrtChr[2][8] = { "'%c'", "%d" };

} // namespace ustl