openrisc/trunk/rtos/ecos-3.0/packages/language/cxx/ustl/current/src/ualgobase.cpp (rev 786)

// This file is part of the uSTL library, an STL implementation.
//
// Copyright (c) 2005-2009 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.

#ifndef NDEBUG  // Optimized code here. asserts slow it down, and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"

namespace ustl {

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> static inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

#if __GNUC__ >= 3
static inline void movsb_dir_up (void) INLINE;
static inline void movsb_dir_down (void) INLINE;
static inline void movsb (const void*& src, size_t nBytes, void*& dest) INLINE;
static inline void movsd (const void*& src, size_t nWords, void*& dest) INLINE;
#endif

static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }

static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
        : "=&S"(src), "=&D"(dest), "=&c"(nBytes)
        : "0"(src), "1"(dest), "2"(nBytes)
        : "memory");
}

static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
        : "=&S"(src), "=&D"(dest), "=&c"(nWords)
        : "0"(src), "1"(dest), "2"(nWords)
        : "memory");
}

template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
{ asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
{ asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
{ asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }

#if CPU_HAS_MMX
#define MMX_ALIGN       16U     // Data must be aligned on this grain
#define MMX_BS          32U     // Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) INLINE;
static inline void simd_block_store (uint8_t* dest) INLINE;
static inline void simd_block_cleanup (void) INLINE;

static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
    #if CPU_HAS_SSE
    asm (
        "movaps\t%2, %%xmm0     \n\t"
        "movaps\t%3, %%xmm1     \n\t"
        "movntps\t%%xmm0, %0    \n\t"
        "movntps\t%%xmm1, %1"
        : "=m"(cdest[0]), "=m"(cdest[16])
        : "m"(csrc[0]), "m"(csrc[16])
        : "xmm0", "xmm1", "memory");
    #else
    asm (
        "movq   %4, %%mm0       \n\t"
        "movq   %5, %%mm1       \n\t"
        "movq   %6, %%mm2       \n\t"
        "movq   %7, %%mm3       \n\t"
        "movq   %%mm0, %0       \n\t"
        "movq   %%mm1, %1       \n\t"
        "movq   %%mm2, %2       \n\t"
        "movq   %%mm3, %3"
        : "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
        : "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
        : "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory");
    #endif
}
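// simd_block_copy (above) moves one 32-byte block: with SSE it uses two
// 16-byte movaps loads (which is why callers align to MMX_ALIGN) and movntps
// non-temporal stores that bypass the cache; without SSE it falls back to
// four 8-byte MMX movq transfers.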

static inline void simd_block_store (uint8_t* dest)
{
    #if CPU_HAS_SSE
    asm volatile (
        "movntq %%mm0, %0\n\t"
        "movntq %%mm0, %1\n\t"
        "movntq %%mm0, %2\n\t"
        "movntq %%mm0, %3"
        : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
        :: "memory");
    #else
    asm volatile (
        "movq %%mm0, %0 \n\t"
        "movq %%mm0, %1 \n\t"
        "movq %%mm0, %2 \n\t"
        "movq %%mm0, %3"
        : "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
        :: "memory");
    #endif
}

static inline void simd_block_cleanup (void)
{
    #if !CPU_HAS_SSE
        simd::reset_mmx();
    #endif
    asm volatile ("sfence");
}
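// simd_block_cleanup (above) ends a non-temporal block loop: on plain MMX it
// clears the MMX state with simd::reset_mmx() so the x87 FPU is usable again,
// and sfence makes the weakly-ordered movnt stores visible before any
// ordinary stores that follow.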

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest) throw()
{
    movsb_dir_up();
    size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
        const size_t nMiddleBlocks = nBytes / MMX_BS;
        for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
            prefetch (advance (src, 512), 0, 0);
            simd_block_copy (src, dest);
            src = advance (src, MMX_BS);
            dest = advance (dest, MMX_BS);
        }
        simd_block_cleanup();
        nBytes %= MMX_BS;
    }
    movsb (src, nBytes, dest);
}
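// copy_n_fast (above) works in three phases: a byte-wise head that advances
// src to the next MMX_ALIGN boundary, a loop of 32-byte non-temporal block
// copies with a 512-byte read-ahead prefetch (taken only when dest is also
// aligned), and a byte-wise tail for whatever remains.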
#endif // CPU_HAS_MMX

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result) throw()
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    if (uintptr_t(result) % 4 == 3) {
        const size_t nMiddleBlocks = nBytes / 4;
        last = advance (last, -3);
        result = advance (result, -3);
        movsd (last, nMiddleBlocks, result);
        nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();
}
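// copy_backward_fast (above) copies downward with the direction flag set
// (std): a byte-wise head brings the source down to a 4-byte boundary,
// rep movsd then moves whole 32-bit words when the destination alignment
// allows it, a byte-wise tail finishes the job, and cld restores the normal
// direction before returning.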
#endif // __i386__

//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

#if CPU_HAS_MMX
template <typename T> static inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
        "movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
        : : "g"(uint32_t(v)) : "mm0");
}
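// The build_block specializations (above) broadcast the fill value across the
// 8-byte %mm0 register: a byte is widened with punpcklbw and splatted with
// pshufw, a 16-bit value is splatted with pshufw directly, and a 32-bit value
// is duplicated with punpckldq. simd_block_store then writes that register
// four times per 32-byte block.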

static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (const uint8_t* destEnd = dest + count * MMX_BS; dest < destEnd; dest += MMX_BS)
        simd_block_store (dest);
    simd_block_cleanup();
    simd::reset_mmx();
}

template <typename T>
static inline void fill_n_fast (T* dest, size_t count, T v)
{
    size_t nHead = (Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);  // head elements up to the alignment boundary
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);
    uint8_t* bdest = (uint8_t*) dest;
    simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);        // elements left over after the block fill
    dest = (T*) bdest;
    stosv (dest, count, v);
}
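// fill_n_fast (above) mirrors copy_n_fast: rep stos fills element by element
// up to the MMX_ALIGN boundary, build_block plus simd_block_fill_loop store
// the replicated value in 32-byte non-temporal blocks, and a final rep stos
// writes the leftover elements.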

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw()
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw() { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw() { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw() { stosv (dest, count, v); }
#endif // CPU_HAS_MMX

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last) throw()
{
#if HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin)
        return;
    void* buf = alloca (hmin);
    if (buf) {
        if (half2 < half1) {
            copy_n_fast (middle, half2, buf);
            copy_backward_fast (first, middle, last);
            copy_n_fast (buf, half2, first);
        } else {
            copy_n_fast (first, half1, buf);
            copy_n_fast (middle, half2, first);
            copy_n_fast (buf, half1, advance (first, half2));
        }
    } else
#else
    if (first == middle || middle == last)
        return;
#endif
    {
        char* f = (char*) first;
        char* m = (char*) middle;
        char* l = (char*) last;
        reverse (f, m);
        reverse (m, l);
        while (f != m && m != l)
            iter_swap (f++, --l);
        reverse (f, (f == m ? l : m));
    }
}
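// rotate_fast (above) exchanges the two subranges either through a temporary
// alloca buffer (copy the shorter half out, shift the longer half, copy the
// buffer back) or, when alloca is unavailable, with a reversal-based rotation.
// For example, rotating "ab|cdef" about the '|' yields "cdefab".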

#if __GNUC__ < 4
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555); // Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return (((x + (x >> 4) & 0x0F0F0F0F) * 0x01010101) >> 24);
}
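// Each step of popcount (above) widens the per-field counts: 2-bit pair
// counts, then 4-bit nibble counts, then byte counts that the 0x01010101
// multiply sums into the top byte. For example, v = 0xF0F0F0F0 gives
// w = 0xA0A0A0A0, x = 0x40404040, (x + (x >> 4)) & 0x0F0F0F0F = 0x04040404,
// and the multiply/shift returns 16.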

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);               // Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif  // HAVE_INT64_T
#endif  // __GNUC__ < 4

//----------------------------------------------------------------------
// Miscellaneous instantiated stuff from headers which don't have enough
// to warrant creation of a separate .cc file
//----------------------------------------------------------------------

// Used in uspecial to print printable characters
const char _FmtPrtChr[2][8]={"'%c'","%d"};

} // namespace ustl
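
// Usage sketch (illustrative; not part of the original file): on an x86 build
// the primitives defined above can be called directly, assuming the matching
// declarations from the uSTL headers are in scope:
//
//     uint32_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, b[8];
//     ustl::copy_n_fast (a, sizeof(a), b);    // raw copy; the size is in bytes
//     ustl::fill_n32_fast (b, 8, 0u);         // store 0 into eight uint32_t
//     ustl::rotate_fast (a, a + 3, a + 8);    // a becomes { 4, 5, 6, 7, 8, 1, 2, 3 }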
