/*
  (C) Copyright 2008
  International Business Machines Corporation,
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of the copyright holders nor the names of their
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
  IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
  OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <spu_intrinsics.h>
#include "vec_literal.h"
#include <string.h>

/*
 * Supply an inline _strncpy for strcpy/cat and strncpy/cat. Relies on
 * checklen and lastzero code being optimized out when they are constant
 * zero values.
 */
static inline void * _strncpy(char * __restrict__ dest, const char *
                              __restrict__ src, size_t maxlen, int
                              checklen, int lastzero)
{
  int adjust, offset, soffset, doffset, shift;
  vec_uchar16 *vsrc, *vdest;
  vec_uchar16 sdata1, sdata2, sdata, shuffle;
  vec_uchar16 mask1, maskzero, cmp0;
  vec_uint4 nonzeroes, gathered_cmp, vtmp, vtmp2;
  vec_uint4 curlen; /* assumes size_t is 4 bytes */
  const vec_uint4 val31 = { 31, 31, 31, 31 };
  const vec_uint4 val_0123 = { 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F };
  const vec_uchar16 all_ones = { 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff,
                                 0xff,0xff,0xff,0xff, 0xff,0xff,0xff,0xff };

  vsrc = (vec_uchar16 *) src;
  vdest = (vec_uchar16 *) dest;
  soffset = (int) src & 15;
  doffset = (int) dest & 15;

  if (checklen) {
    /*
     * Set curlen so it is the number of bytes we would copy if starting
     * from vdest & ~0xf.
     *
     * curlen could probably be replaced by comparing vdest plus some
     * offset to dest + maxlen, that would help mainly in the while loop
     * but would lose only one instruction (the curlen -= 16).
     */
    curlen = spu_splats((unsigned int) (maxlen + doffset));
  }

  /*
   * Setup a shuffle pattern to align the source string with the
   * alignment of the destination string.
   */
  vtmp = spu_cmpgt(spu_promote(doffset, 0), spu_promote(soffset, 0));
  adjust = spu_extract(vtmp, 0);
  offset = soffset - doffset;
  offset += adjust & 16;
  shuffle = spu_splats((unsigned char) offset);
  shuffle = (vec_uchar16) spu_add((vec_uint4) shuffle, val_0123);

  vsrc += adjust;
  sdata1 = *vsrc++;
  sdata2 = *vsrc++;
  sdata = spu_shuffle(sdata1, sdata2, shuffle);

  /*
   * mask out leading bytes
   */
  mask1 = spu_rlmaskqwbyte(all_ones, -doffset);

  cmp0 = spu_and(mask1, spu_cmpeq(sdata, 0));
  nonzeroes = spu_cntlz(spu_gather(cmp0));
  /*
   * First element of nonzeroes - 15 is the number of leading non-zero
   * bytes plus 1 for the zero byte.
   */
  if (checklen) {
    vtmp = spu_add(curlen, 15);
    vtmp2 = spu_cmpgt(nonzeroes, vtmp);
    nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
  }

  vtmp = spu_cmpgt(nonzeroes, val31);
  /*
   * Note: using immediate (constant 31) vs a vector value (val31) does
   * not give different results, and we have to have a vector val31 for
   * the spu_sel below, so use val31 everywhere.
   */
  vtmp = spu_sel(nonzeroes, val31, vtmp);
  /*
   * So vtmp is now min(nonzeroes, 31), the number of bytes + 16 that we
   * want to copy from the first 16 bytes of the source.
   */
  if (checklen) {
    curlen = spu_sub(vtmp, curlen);
    curlen = spu_sub(15, curlen);
  }

  /*
   * We want a right shift 0xff with fill by ones of (vtmp - 15) bytes, but
   * that doesn't exist so use spu_slqwbyte and vtmp all ones left by
   * (31 - vtmp). Note: this can also use spu_rlqwbytebc with spu_rlqw.
   */
  shift = spu_extract(spu_sub(val31, vtmp), 0);
  maskzero = spu_slqwbyte(all_ones, shift);
  maskzero = spu_and(mask1, maskzero);
  *vdest = spu_sel(*vdest, sdata, maskzero);

  vtmp = spu_cmpgt(nonzeroes, val31);
  if (checklen) {
    vtmp2 = spu_cmpgt(curlen, 0);
    vtmp = spu_and(vtmp, vtmp2);
  }
  if (spu_extract(vtmp, 0)) {
    sdata1 = sdata2;
    sdata2 = *vsrc++;
    sdata = spu_shuffle(sdata1, sdata2, shuffle);
    cmp0 = spu_cmpeq(sdata, 0);
    gathered_cmp = spu_gather(cmp0);
    /*
     * Copy 16 bytes at a time.
     */
    while ((spu_extract(gathered_cmp, 0) == 0) &&
           (!checklen || (spu_extract(curlen, 0) > 15))) {
      if (checklen)
        curlen = spu_add(curlen, -16);
      *++vdest = sdata;
      sdata1 = sdata2;
      sdata2 = *vsrc++;
      sdata = spu_shuffle(sdata1, sdata2, shuffle);
      cmp0 = spu_cmpeq(sdata, 0);
      gathered_cmp = spu_gather(cmp0);
    }
    /*
     * Copy 0 to 15 trailing bytes, either up to the smaller of curlen or
     * the number of non-zero bytes.
     */
    nonzeroes = spu_cntlz(gathered_cmp);
    if (checklen) {
      vtmp = spu_add(curlen, 15);
      vtmp2 = spu_cmpgt(nonzeroes, vtmp);
      nonzeroes = spu_sel(nonzeroes, vtmp, vtmp2);
      curlen = spu_sub(nonzeroes, curlen);
      curlen = spu_sub(15, curlen);
    }
    shift = spu_extract(spu_sub(val31, nonzeroes), 0);
    maskzero = spu_slqwbyte(all_ones, shift);
    ++vdest;
    *vdest = spu_sel(*vdest, sdata, maskzero);
  }

  if (checklen && lastzero) {
    /*
     * For strncat.
     */
    dest[maxlen - spu_extract(curlen, 0)] = '\0';
  }

  /* Pad null bytes if the length of the "src" is less than "n" (strncpy). */
  if (checklen && !lastzero && spu_extract(curlen, 0))
    memset(dest + maxlen - spu_extract(curlen, 0), 0, spu_extract(curlen, 0));

  return (dest);
}
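
/*
 * Illustrative sketch only, kept out of the build: the public entry points
 * named in the header comment (strcpy/cat and strncpy/cat) are expected to
 * live in their own source files and wrap _strncpy roughly as below, with
 * checklen and lastzero passed as compile-time constants so the unused
 * length-checking and terminator code is optimized away. The exact wrapper
 * signatures and the length bookkeeping for the cat variants are assumptions
 * for illustration, not part of this file.
 */
#if 0
char * strcpy(char * __restrict__ dest, const char * __restrict__ src)
{
  /* checklen = 0: maxlen is ignored, copy up to the terminating zero. */
  return (char *) _strncpy(dest, src, 0, 0, 0);
}

char * strncpy(char * __restrict__ dest, const char * __restrict__ src,
               size_t n)
{
  /* checklen = 1, lastzero = 0: stop after n bytes and zero-pad the rest. */
  return (char *) _strncpy(dest, src, n, 1, 0);
}

/*
 * The cat variants would apply the same call starting at the current end of
 * dest, e.g. _strncpy(dest + strlen(dest), src, ..., 1, 1) for strncat,
 * where lastzero = 1 forces the terminating '\0' to be written.
 */
#endif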