1 |
148 |
jeremybenn |
/*
  (C) Copyright 2001,2006,
  International Business Machines Corporation,
  Sony Computer Entertainment, Incorporated,
  Toshiba Corporation,

  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
    * Neither the names of the copyright holders nor the names of their
  contributors may be used to endorse or promote products derived from this
  software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  POSSIBILITY OF SUCH DAMAGE.
*/
|
33 |
|
|
#include <spu_intrinsics.h>
|
34 |
|
|
#include <stddef.h>
|
35 |
|
|
#include "vec_literal.h"
|
36 |
|
|
|
37 |
|
|
/* Copy n bytes from memory area src to memory area dest.
|
38 |
|
|
* Copying is performed as if the n characters pointed to
|
39 |
|
|
* by src are first copied into a temporary array that does
|
40 |
|
|
* not overlap the src and dest arrays. Then the n characters
|
41 |
|
|
* of the temporary array are copied into the destination
|
42 |
|
|
* array. The memmove subroutine returns a pointer to dest.
|
43 |
|
|
*/
|
44 |
|
|
|
45 |
|
|
void * memmove(void * __restrict__ dest, const void * __restrict__ src, size_t n)
|
46 |
|
|
{
|
47 |
|
|
int adjust, delta;
|
48 |
|
|
unsigned int soffset1, soffset2, doffset1, doffset2;
|
49 |
|
|
vec_uchar16 *vSrc, *vDst;
|
50 |
|
|
vec_uchar16 sdata1, sdata2, sdata, ddata, shuffle;
|
51 |
|
|
vec_uchar16 mask, mask1, mask2, mask3, one = spu_splats((unsigned char)-1);
|
52 |
|
|
|
53 |
|
|
soffset1 = (unsigned int)(src) & 15;
|
54 |
|
|
doffset1 = (unsigned int)(dest) & 15;
|
55 |
|
|
doffset2 = ((unsigned int)(dest) + n) & 15;
|
56 |
|
|
|
57 |
|
|
/* Construct a series of masks used to data insert. The masks
|
58 |
|
|
* contains 0 bit when the destination word is unchanged, 1 when it
|
59 |
|
|
* must be replaced by source bits.
|
60 |
|
|
*
|
61 |
|
|
* mask1 = mask for leading unchanged bytes
|
62 |
|
|
* mask2 = mask for trailing unchange bytes
|
63 |
|
|
* mask3 = mask indicating the more than one qword is being changed.
|
64 |
|
|
*/
|
65 |
|
|
mask = one;
|
66 |
|
|
mask1 = spu_rlmaskqwbyte(mask, -doffset1);
|
67 |
|
|
mask2 = spu_slqwbyte(mask, 16-doffset2);
|
68 |
|
|
mask3 = (vec_uchar16)spu_cmpgt(spu_splats((unsigned int)(doffset1 + n)), 15);
|
69 |
|
|
|
70 |
|
|
vDst = (vec_uchar16 *)(dest);
|
71 |
|
|
|
72 |
|
|
delta = (int)soffset1 - (int)doffset1;
|
73 |
|
|
|
74 |
|
|
/* The follow check only works if the SPU addresses are not
|
75 |
|
|
* wrapped. No provisions have been made to correct for this
|
76 |
|
|
* limitation.
|
77 |
|
|
*/
|
78 |
|
|
if (((unsigned int)dest - (unsigned int)src) >= (unsigned int)n) {
|
79 |
|
|
/* Forward copy. Perform a memcpy.
|
80 |
|
|
*
|
81 |
|
|
* Handle any leading destination partial quadwords as
|
82 |
|
|
* well a very short copy (ie, such that the n characters
|
83 |
|
|
* all reside in a single (destination) quadword.
|
84 |
|
|
*/
|
85 |
|
|
vSrc = (vec_uchar16 *)(src);
|
86 |
|
|
vDst = (vec_uchar16 *)(dest);
|
87 |
|
|
|
88 |
|
|
/* Handle any leading destination partial quadwords as
|
89 |
|
|
* well a very short copy (ie, such that the n characters
|
90 |
|
|
* all reside in a single (destination) quadword.
|
91 |
|
|
*/
|
92 |
|
|
soffset1 = (unsigned int)(src) & 15;
|
93 |
|
|
doffset1 = (unsigned int)(dest) & 15;
|
94 |
|
|
doffset2 = ((unsigned int)(dest) + n) & 15;
|
95 |
|
|
|
96 |
|
|
/* Compute a shuffle pattern used to align the source string
|
97 |
|
|
* with the alignment of the destination string.
|
98 |
|
|
*/
|
99 |
|
|
|
100 |
|
|
adjust = (int)spu_extract(spu_cmpgt(spu_promote(doffset1, 0), spu_promote(soffset1, 0)), 0);
|
101 |
|
|
delta = (int)soffset1 - (int)doffset1;
|
102 |
|
|
delta += adjust & 16;
|
103 |
|
|
|
104 |
|
|
shuffle = (vec_uchar16)spu_add((vec_uint4)spu_splats((unsigned char)delta),
|
105 |
|
|
VEC_LITERAL(vec_uint4, 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F));
|
106 |
|
|
|
107 |
|
|
vSrc += adjust;
|
108 |
|
|
|
109 |
|
|
sdata1 = *vSrc++;
|
110 |
|
|
sdata2 = *vSrc++;
|
111 |
|
|
|
112 |
|
|
ddata = *vDst;
|
113 |
|
|
sdata = spu_shuffle(sdata1, sdata2, shuffle);
|
114 |
|
|
|
115 |
|
|
/* Construct a series of masks used to data insert. The masks
|
116 |
|
|
* contain 0 when the destination word is unchanged, 1 when it
|
117 |
|
|
* must be replaced by source bytes.
|
118 |
|
|
*
|
119 |
|
|
* mask1 = mask for leading unchanged bytes
|
120 |
|
|
* mask2 = mask for trailing unchange bytes
|
121 |
|
|
* mask3 = mask indicating the more than one qword is being changed.
|
122 |
|
|
*/
|
123 |
|
|
mask = one;
|
124 |
|
|
mask1 = spu_rlmaskqwbyte(mask, -doffset1);
|
125 |
|
|
mask2 = spu_slqwbyte(mask, 16-doffset2);
|
126 |
|
|
mask3 = (vec_uchar16)spu_cmpgt(spu_splats((unsigned int)(doffset1 + n)), 15);
|
127 |
|
|
|
128 |
|
|
*vDst++ = spu_sel(ddata, sdata, spu_and(mask1, spu_or(mask2, mask3)));
|
129 |
|
|
|
130 |
|
|
n += doffset1;
|
131 |
|
|
|
132 |
|
|
/* Handle complete destination quadwords
|
133 |
|
|
*/
|
134 |
|
|
while (n > 31) {
|
135 |
|
|
sdata1 = sdata2;
|
136 |
|
|
sdata2 = *vSrc++;
|
137 |
|
|
*vDst++ = spu_shuffle(sdata1, sdata2, shuffle);
|
138 |
|
|
n -= 16;
|
139 |
|
|
}
|
140 |
|
|
|
141 |
|
|
/* Handle any trailing partial (destination) quadwords
|
142 |
|
|
*/
|
143 |
|
|
mask = spu_and((vec_uchar16)spu_cmpgt(spu_splats((unsigned int)n), 16), mask2);
|
144 |
|
|
*vDst = spu_sel(*vDst, spu_shuffle(sdata2, *vSrc, shuffle), mask);
|
145 |
|
|
|
146 |
|
|
} else {
|
147 |
|
|
/* Backward copy.
|
148 |
|
|
*
|
149 |
|
|
* Handle any leading destination partial quadwords as
|
150 |
|
|
* well a very short copy (ie, such that the n characters
|
151 |
|
|
* all reside in a single (destination) quadword.
|
152 |
|
|
*/
|
153 |
|
|
vSrc = (vec_uchar16 *)((unsigned int)src + n-1);
|
154 |
|
|
vDst = (vec_uchar16 *)((unsigned int)dest + n-1);
|
155 |
|
|
|
156 |
|
|
/* Handle any leading destination partial quadwords as
|
157 |
|
|
* well a very short copy (ie, such that the n characters
|
158 |
|
|
* all reside in a single (destination) quadword.
|
159 |
|
|
*/
|
160 |
|
|
soffset1 = (unsigned int)(src) & 15;
|
161 |
|
|
soffset2 = (unsigned int)(vSrc) & 15;
|
162 |
|
|
doffset1 = (unsigned int)(dest) & 15;
|
163 |
|
|
doffset2 = (unsigned int)(vDst) & 15;
|
164 |
|
|
|
165 |
|
|
/* Compute a shuffle pattern used to align the source string
|
166 |
|
|
* with the alignment of the destination string.
|
167 |
|
|
*/
|
168 |
|
|
adjust = (int)spu_extract(spu_cmpgt(spu_promote(soffset2, 0), spu_promote(doffset2, 0)), 0);
|
169 |
|
|
delta = (int)doffset2 - (int)soffset2;
|
170 |
|
|
delta += adjust & 16;
|
171 |
|
|
|
172 |
|
|
shuffle = (vec_uchar16)spu_sub(VEC_LITERAL(vec_uint4, 0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F),
|
173 |
|
|
(vec_uint4)spu_splats((unsigned char)delta));
|
174 |
|
|
|
175 |
|
|
vSrc -= adjust;
|
176 |
|
|
|
177 |
|
|
sdata2 = *vSrc--;
|
178 |
|
|
sdata1 = *vSrc--;
|
179 |
|
|
|
180 |
|
|
ddata = *vDst;
|
181 |
|
|
sdata = spu_shuffle(sdata1, sdata2, shuffle);
|
182 |
|
|
|
183 |
|
|
/* Construct a series of masks used to data insert. The masks
|
184 |
|
|
* contain 0 when the destination word is unchanged, 1 when it
|
185 |
|
|
* must be replaced by source bytes.
|
186 |
|
|
*
|
187 |
|
|
* mask1 = mask for leading unchanged bytes
|
188 |
|
|
* mask2 = mask for trailing unchange bytes
|
189 |
|
|
* mask3 = mask indicating the more than one qword is being changed.
|
190 |
|
|
*/
|
191 |
|
|
mask = one;
|
192 |
|
|
mask1 = spu_rlmaskqwbyte(mask, -doffset1);
|
193 |
|
|
mask2 = spu_slqwbyte(mask, 15-doffset2);
|
194 |
|
|
mask3 = (vec_uchar16)spu_cmpgt(spu_splats((int)(doffset2 - n)), -2);
|
195 |
|
|
|
196 |
|
|
*vDst-- = spu_sel(ddata, sdata, spu_and(mask2, spu_orc(mask1, mask3)));
|
197 |
|
|
|
198 |
|
|
n -= doffset2 + 1;
|
199 |
|
|
|
200 |
|
|
/* Handle complete destination quadwords
|
201 |
|
|
*/
|
202 |
|
|
while ((int)n > 15) {
|
203 |
|
|
sdata2 = sdata1;
|
204 |
|
|
sdata1 = *vSrc--;
|
205 |
|
|
*vDst-- = spu_shuffle(sdata1, sdata2, shuffle);
|
206 |
|
|
n -= 16;
|
207 |
|
|
}
|
208 |
|
|
|
209 |
|
|
/* Handle any trailing partial (destination) quadwords
|
210 |
|
|
*/
|
211 |
|
|
mask = spu_and((vec_uchar16)spu_cmpgt(spu_splats((int)n), 0), mask1);
|
212 |
|
|
*vDst = spu_sel(*vDst, spu_shuffle(*vSrc, sdata1, shuffle), mask);
|
213 |
|
|
}
|
214 |
|
|
return (dest);
|
215 |
|
|
}
|
216 |
|
|
|