/* -------------------------------------------------------------- */
/* (C)Copyright 2007,2008, */
/* International Business Machines Corporation */
/* All Rights Reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the */
/* following conditions are met: */
/* */
/* - Redistributions of source code must retain the above copyright*/
/* notice, this list of conditions and the following disclaimer. */
/* */
/* - Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* - Neither the name of IBM Corporation nor the names of its */
/* contributors may be used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/* -------------------------------------------------------------- */
/* PROLOG END TAG zYx */
#ifdef __SPU__
#ifndef _ERFF4_H_
#define _ERFF4_H_ 1

#include <spu_intrinsics.h>

/*
 * FUNCTION
 *      vector float _erff4(vector float x)
 *
 * DESCRIPTION
 *      The erff4 function computes the error function of each element of x.
 *
 *      The computation works on |x| and puts the sign back at the end.
 *      The magnitude of each element selects one of eight partitions
 *      (regions 0 through 7).  Every partition supplies:
 *        - an offset that is subtracted from |x| before evaluation,
 *        - eleven coefficients (_00 .. _10) of a degree-10 polynomial,
 *          evaluated with Horner's method,
 *        - a small unsigned correction that is added to the bit
 *          pattern of the polynomial value.
 *      Elements with |x| greater than 3.9199876 yield a magnitude of
 *      exactly 1.
 *
 * PARAMETERS
 *      x       vector of four single-precision inputs
 *
 * RETURNS
 *      A vector holding the approximated erf of the matching element
 *      of x.  The sign bit of every result is copied from the input.
 *
 *      C99 Special Cases:
 *      - erf(+0) returns +0
 *      - erf(-0) returns -0
 *      - erf(+infinite) returns +1
 *      - erf(-infinite) returns -1
 *
 * NOTE
 *      The SDM_ERFF4_* macros are defined inside the function body and
 *      are never #undef'd, so they stay defined for the remainder of
 *      the including translation unit.
 */

static __inline vector float _erff4(vector float x)
{
  vec_float4 sign_maskf = spu_splats(-0.0f);   /* only the sign bit set    */
  vec_float4 zerof = spu_splats(0.0f);
  vec_float4 onef = spu_splats(1.0f);
  vec_float4 clamp = spu_splats(3.9199876f);   /* above this, |erf| is 1   */
  vec_float4 xabs = spu_andc(x, sign_maskf);   /* x with sign bit cleared  */
  vec_float4 xsign = spu_and(x, sign_maskf);   /* sign bit of x, alone     */
  vec_float4 result;


  /*
   * First thing we do is setup the description of each partition.
   * This consists of:
   * - Start x of partition (_START); |x| must be strictly greater
   *   than this value to fall into the partition
   * - Offset (_OFF), used for evaluating power series expanded
   *   around a point
   * - Truncation adjustment (_TRUNC)
   * - Polynomial coefficients _00 (constant term) through _10
   *
   * Each region also carries a commented-out _11 coefficient; it is
   * not referenced anywhere, the evaluation stops at _10.
   */


  /***************************************************************
   * REGION 0: Approximation Near 0 from Above
   *           |x| <= 0.07, offset 0
   *
   * SDM_ERFF4_0_START is not referenced below: region 0 is the
   * default when no other region's start is exceeded.
   */
#define SDM_ERFF4_0_START 0.0f
#define SDM_ERFF4_0_OFF   0.0f
#define SDM_ERFF4_0_TRUNC 2u

#define SDM_ERFF4_0_00  0.0f
#define SDM_ERFF4_0_01  1.12837916709551257389615890312154f
#define SDM_ERFF4_0_02  0.0f
#define SDM_ERFF4_0_03 -0.37612638903183752463205296770955f
#define SDM_ERFF4_0_04  0.0f
#define SDM_ERFF4_0_05  0.11283791670955125738961589031073f
#define SDM_ERFF4_0_06  0.0f
#define SDM_ERFF4_0_07 -0.02686617064513125175943235483588f
#define SDM_ERFF4_0_08  0.0f
#define SDM_ERFF4_0_09  0.00522397762544218784211184677371f
#define SDM_ERFF4_0_10  0.0f
//#define SDM_ERFF4_0_11 -0.00085483270234508528325466583569f



  /***************************************************************
   * REGION 1: 0.07 < |x| <= 0.13, offset 0.0625
   */
#define SDM_ERFF4_1_START 0.07f
#define SDM_ERFF4_1_OFF   0.0625f
#define SDM_ERFF4_1_TRUNC 1u

#define SDM_ERFF4_1_00  0.0704319777223870780505900559232967439190042883f
#define SDM_ERFF4_1_01  1.1239800336253906104888456836298420746260842545f
#define SDM_ERFF4_1_02 -0.0702487521015869131555528552268651296641302713f
#define SDM_ERFF4_1_03 -0.3717329798708974154481338589088279778060226856f
#define SDM_ERFF4_1_04  0.0350329063214945152846051348331892508611482993f
#define SDM_ERFF4_1_05  0.1106440713032318617523250293018186620702780982f
#define SDM_ERFF4_1_06 -0.0116471931712158678624014740659716890227703402f
#define SDM_ERFF4_1_07 -0.0261358409084263503958678377968739965222786482f
#define SDM_ERFF4_1_08  0.0029041996223118476954500365511415181291113910f
#define SDM_ERFF4_1_09  0.0050416329596619035812041623972929782386498567f
#define SDM_ERFF4_1_10 -0.0005793225670734356072895029723913210064918149f
//#define SDM_ERFF4_1_11 -0.0008184112733188406359323913130525859730689332f



  /***************************************************************
   * REGION 2: 0.13 < |x| <= 0.25, offset 0.1875
   */
#define SDM_ERFF4_2_START 0.13f
#define SDM_ERFF4_2_OFF   0.1875f
#define SDM_ERFF4_2_TRUNC 1u

#define SDM_ERFF4_2_00  0.2091176770593758483008706390019410965937912290f
#define SDM_ERFF4_2_01  1.0893988034775673230502318110338693557898033315f
#define SDM_ERFF4_2_02 -0.2042622756520438730719184645688505042105881396f
#define SDM_ERFF4_2_03 -0.3376001500360169568827541289401834722369442864f
#define SDM_ERFF4_2_04  0.0997374392832245473983976877777590352590762400f
#define SDM_ERFF4_2_05  0.0937997370645632460099464120987231140266525679f
#define SDM_ERFF4_2_06 -0.0324591340420617488485277008302392706957527828f
#define SDM_ERFF4_2_07 -0.0205943885488331791711970665266474471714543313f
#define SDM_ERFF4_2_08  0.0079208906865255014554772269570592999495375181f
#define SDM_ERFF4_2_09  0.0036744273281123333893101007014150883409965011f
#define SDM_ERFF4_2_10 -0.0015459493690754127608506357908913858038162608f
//#define SDM_ERFF4_2_11 -0.0005485671070180836650399266219057172124875094f



  /***************************************************************
   * REGION 3: 0.25 < |x| <= 0.77, offset 0.5
   */
#define SDM_ERFF4_3_START 0.25f
#define SDM_ERFF4_3_OFF   0.5f
#define SDM_ERFF4_3_TRUNC 2u

#define SDM_ERFF4_3_00  0.5204998778130465376827466538919645287364515699f
#define SDM_ERFF4_3_01  0.8787825789354447940937239548244578983625218956f
#define SDM_ERFF4_3_02 -0.4393912894677223970468619774122289491812609947f
#define SDM_ERFF4_3_03 -0.1464637631559074656822873258040763163937536583f
#define SDM_ERFF4_3_04  0.1830797039448843321028591572550953954921920811f
#define SDM_ERFF4_3_05  0.0073231881577953732841143662902038158196876832f
#define SDM_ERFF4_3_06 -0.0500417857449350507747815029830594081011991688f
#define SDM_ERFF4_3_07  0.0054052103069442040906558417856266259621504328f
#define SDM_ERFF4_3_08  0.0100475885141180567975497704160236877764167320f
#define SDM_ERFF4_3_09 -0.0021674118390300459951330548378744759122422210f
#define SDM_ERFF4_3_10 -0.0015694967741624277200510981457278746801387524f
//#define SDM_ERFF4_3_11 0.0004973489167651373192082360776274483020158863f



  /***************************************************************
   * REGION 4: 0.77 < |x| <= 1.36, offset 1.0
   */
#define SDM_ERFF4_4_START 0.77f
#define SDM_ERFF4_4_OFF   1.0f
#define SDM_ERFF4_4_TRUNC 1u

#define SDM_ERFF4_4_00  0.8427007929497148693412206350826092590442f
#define SDM_ERFF4_4_01  0.4151074974205947033402682494413373653605f
#define SDM_ERFF4_4_02 -0.4151074974205947033402682494413373653605f
#define SDM_ERFF4_4_03  0.1383691658068649011134227498137791217898f
#define SDM_ERFF4_4_04  0.0691845829034324505567113749068895608946f
#define SDM_ERFF4_4_05 -0.0691845829034324505567113749068895608946f
#define SDM_ERFF4_4_06  0.0046123055268954967037807583271259707263f
#define SDM_ERFF4_4_07  0.0151547181597994891695653487891281895293f
#define SDM_ERFF4_4_08 -0.0047770307242846215860586425530947553951f
#define SDM_ERFF4_4_09 -0.0018851883701199847638468972527538689873f
#define SDM_ERFF4_4_10  0.0012262875805634852347353603488787303121f
//#define SDM_ERFF4_4_11 0.0000855239913717274641321540324726821411f



  /***************************************************************
   * REGION 5: 1.36 < |x| <= 2.0, offset 1.875
   */
#define SDM_ERFF4_5_START 1.36f
#define SDM_ERFF4_5_OFF   1.875f
#define SDM_ERFF4_5_TRUNC 1u

#define SDM_ERFF4_5_00  0.99199005767011997029646305969122440092668f
#define SDM_ERFF4_5_01  0.03354582842421607459425032786195496507386f
#define SDM_ERFF4_5_02 -0.06289842829540513986421936474116555951979f
#define SDM_ERFF4_5_03  0.06744109256118439996552409663913862770819f
#define SDM_ERFF4_5_04 -0.04225988151097532834627238568547061029869f
#define SDM_ERFF4_5_05  0.01146258336487617627004706027236136941544f
#define SDM_ERFF4_5_06  0.00410518713321247739022655684589964019683f
#define SDM_ERFF4_5_07 -0.00492839390823910723763257456562751425198f
#define SDM_ERFF4_5_08  0.00143050168737012207687743571780226012058f
#define SDM_ERFF4_5_09  0.00036225644575338665306295794978774160986f
#define SDM_ERFF4_5_10 -0.00039015757824554169745459780322413823624f
//#define SDM_ERFF4_5_11 0.00007372993782406230817649249567932577159f



  /***************************************************************
   * REGION 6: 2.0 < |x| <= 2.75, offset 2.5
   */
#define SDM_ERFF4_6_START 2.0f
#define SDM_ERFF4_6_OFF   2.5f
#define SDM_ERFF4_6_TRUNC 1u

#define SDM_ERFF4_6_00  0.999593047982555041060435784260025087279f
#define SDM_ERFF4_6_01  0.002178284230352709720386678564097264007f
#define SDM_ERFF4_6_02 -0.005445710575881774300966696410243160031f
#define SDM_ERFF4_6_03  0.008350089549685387261482267829039512051f
#define SDM_ERFF4_6_04 -0.008622375078479475976530602649551670054f
#define SDM_ERFF4_6_05  0.006117348213573859798085922300839816434f
#define SDM_ERFF4_6_06 -0.002798490157050356237996774544152735014f
#define SDM_ERFF4_6_07  0.000542410061327906884739143174194854432f
#define SDM_ERFF4_6_08  0.000260670173895134533751630061303802055f
#define SDM_ERFF4_6_09 -0.000250285386311056635227961206817778392f
#define SDM_ERFF4_6_10  0.000078801328907504400502579703621546608f
//#define SDM_ERFF4_6_11 5.137004620216358263402877651297096663210e-6f



  /***************************************************************
   * REGION 7: |x| > 2.75, offset 3.5
   *           (results for |x| > 3.9199876 are replaced by 1 below)
   */
#define SDM_ERFF4_7_START 2.75f
#define SDM_ERFF4_7_OFF   3.5f
#define SDM_ERFF4_7_TRUNC 1u

#define SDM_ERFF4_7_00  0.999999256901627658587254476316243904363263f
#define SDM_ERFF4_7_01  5.399426777384782511586818937495781413007869e-6f
#define SDM_ERFF4_7_02 -0.000018897993720846738790553866281235234945f
#define SDM_ERFF4_7_03  0.000042295509756180796340763415010383621069f
#define SDM_ERFF4_7_04 -0.000067717810833034147332818020841092925222f
#define SDM_ERFF4_7_05  0.000082116282239393567363716204674415008991f
#define SDM_ERFF4_7_06 -0.000077744246390483389302250766562526063763f
#define SDM_ERFF4_7_07  0.000058192750619199206596604051163855823527f
#define SDM_ERFF4_7_08 -0.000034259175422410008064403380504975403351f
#define SDM_ERFF4_7_09  0.000015330768263696827211862952666453348031f
#define SDM_ERFF4_7_10 -4.641017709492666503521243665632827470977627e-6f
//#define SDM_ERFF4_7_11 4.447037356176705948450355327103423490366212e-7f





  /***************************************************************
   * Now we load the description of each partition.
   *
   * Per-region values are packed four to a vector: the "a" vectors
   * hold regions 0-3 and the "b" vectors hold regions 4-7, one
   * region per word slot.
   */

  /* Start point for each partition */
  vec_float4 r1start = spu_splats(SDM_ERFF4_1_START);
  vec_float4 r2start = spu_splats(SDM_ERFF4_2_START);
  vec_float4 r3start = spu_splats(SDM_ERFF4_3_START);
  vec_float4 r4start = spu_splats(SDM_ERFF4_4_START);
  vec_float4 r5start = spu_splats(SDM_ERFF4_5_START);
  vec_float4 r6start = spu_splats(SDM_ERFF4_6_START);
  vec_float4 r7start = spu_splats(SDM_ERFF4_7_START);

  /* X Offset for each partition */
  vec_float4 xoffseta = (vec_float4) {SDM_ERFF4_0_OFF, SDM_ERFF4_1_OFF, SDM_ERFF4_2_OFF, SDM_ERFF4_3_OFF};
  vec_float4 xoffsetb = (vec_float4) {SDM_ERFF4_4_OFF, SDM_ERFF4_5_OFF, SDM_ERFF4_6_OFF, SDM_ERFF4_7_OFF};

  /* Truncation Correction for each partition */
  vec_uint4 tcorra = (vec_uint4) {SDM_ERFF4_0_TRUNC, SDM_ERFF4_1_TRUNC, SDM_ERFF4_2_TRUNC, SDM_ERFF4_3_TRUNC};
  vec_uint4 tcorrb = (vec_uint4) {SDM_ERFF4_4_TRUNC, SDM_ERFF4_5_TRUNC, SDM_ERFF4_6_TRUNC, SDM_ERFF4_7_TRUNC};

  /* The coefficients for each partition */
  vec_float4 c00a = (vec_float4) {SDM_ERFF4_0_00, SDM_ERFF4_1_00, SDM_ERFF4_2_00, SDM_ERFF4_3_00};
  vec_float4 c01a = (vec_float4) {SDM_ERFF4_0_01, SDM_ERFF4_1_01, SDM_ERFF4_2_01, SDM_ERFF4_3_01};
  vec_float4 c02a = (vec_float4) {SDM_ERFF4_0_02, SDM_ERFF4_1_02, SDM_ERFF4_2_02, SDM_ERFF4_3_02};
  vec_float4 c03a = (vec_float4) {SDM_ERFF4_0_03, SDM_ERFF4_1_03, SDM_ERFF4_2_03, SDM_ERFF4_3_03};
  vec_float4 c04a = (vec_float4) {SDM_ERFF4_0_04, SDM_ERFF4_1_04, SDM_ERFF4_2_04, SDM_ERFF4_3_04};
  vec_float4 c05a = (vec_float4) {SDM_ERFF4_0_05, SDM_ERFF4_1_05, SDM_ERFF4_2_05, SDM_ERFF4_3_05};
  vec_float4 c06a = (vec_float4) {SDM_ERFF4_0_06, SDM_ERFF4_1_06, SDM_ERFF4_2_06, SDM_ERFF4_3_06};
  vec_float4 c07a = (vec_float4) {SDM_ERFF4_0_07, SDM_ERFF4_1_07, SDM_ERFF4_2_07, SDM_ERFF4_3_07};
  vec_float4 c08a = (vec_float4) {SDM_ERFF4_0_08, SDM_ERFF4_1_08, SDM_ERFF4_2_08, SDM_ERFF4_3_08};
  vec_float4 c09a = (vec_float4) {SDM_ERFF4_0_09, SDM_ERFF4_1_09, SDM_ERFF4_2_09, SDM_ERFF4_3_09};
  vec_float4 c10a = (vec_float4) {SDM_ERFF4_0_10, SDM_ERFF4_1_10, SDM_ERFF4_2_10, SDM_ERFF4_3_10};

  vec_float4 c00b = (vec_float4) {SDM_ERFF4_4_00, SDM_ERFF4_5_00, SDM_ERFF4_6_00, SDM_ERFF4_7_00};
  vec_float4 c01b = (vec_float4) {SDM_ERFF4_4_01, SDM_ERFF4_5_01, SDM_ERFF4_6_01, SDM_ERFF4_7_01};
  vec_float4 c02b = (vec_float4) {SDM_ERFF4_4_02, SDM_ERFF4_5_02, SDM_ERFF4_6_02, SDM_ERFF4_7_02};
  vec_float4 c03b = (vec_float4) {SDM_ERFF4_4_03, SDM_ERFF4_5_03, SDM_ERFF4_6_03, SDM_ERFF4_7_03};
  vec_float4 c04b = (vec_float4) {SDM_ERFF4_4_04, SDM_ERFF4_5_04, SDM_ERFF4_6_04, SDM_ERFF4_7_04};
  vec_float4 c05b = (vec_float4) {SDM_ERFF4_4_05, SDM_ERFF4_5_05, SDM_ERFF4_6_05, SDM_ERFF4_7_05};
  vec_float4 c06b = (vec_float4) {SDM_ERFF4_4_06, SDM_ERFF4_5_06, SDM_ERFF4_6_06, SDM_ERFF4_7_06};
  vec_float4 c07b = (vec_float4) {SDM_ERFF4_4_07, SDM_ERFF4_5_07, SDM_ERFF4_6_07, SDM_ERFF4_7_07};
  vec_float4 c08b = (vec_float4) {SDM_ERFF4_4_08, SDM_ERFF4_5_08, SDM_ERFF4_6_08, SDM_ERFF4_7_08};
  vec_float4 c09b = (vec_float4) {SDM_ERFF4_4_09, SDM_ERFF4_5_09, SDM_ERFF4_6_09, SDM_ERFF4_7_09};
  vec_float4 c10b = (vec_float4) {SDM_ERFF4_4_10, SDM_ERFF4_5_10, SDM_ERFF4_6_10, SDM_ERFF4_7_10};


  /*
   * One shuffle pattern per region.  Pattern N holds, in every word
   * slot, the four byte indices of word N within the 32 bytes formed
   * by the two spu_shuffle operands: indices 0x00-0x0F address the
   * first operand (regions 0-3), 0x10-0x1F the second (regions 4-7).
   */
  vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
  vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
  vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
  vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
  vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
  vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
  vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
  vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);


  /*
   * Determine the shuffle pattern based on which partition
   * each element of x is in.
   *
   * The comparisons are on magnitudes and strictly "greater than",
   * so an element exactly equal to a start value stays in the
   * lower partition.
   */

  vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpabsgt(x, r1start);
  vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpabsgt(x, r2start);
  vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpabsgt(x, r3start);
  vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpabsgt(x, r4start);
  vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpabsgt(x, r5start);
  vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpabsgt(x, r6start);
  vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpabsgt(x, r7start);

  /*
   * Start from region 0 and let every exceeded start value override
   * the previous choice; the starts are tested in ascending order,
   * so the highest exceeded region wins.
   */
  vec_uchar16 shufflepattern;
  shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
  shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
  shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
  shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
  shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
  shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
  shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);



  /* Use the shuffle pattern to select the coefficients */

  vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
  vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
  vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
  vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
  vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
  vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
  vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
  vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
  vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
  vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
  vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);

  /* The same pattern picks the per-element offset and correction */
  vec_float4 xoffset     = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
  vec_uint4  tcorrection = spu_shuffle(tcorra,   tcorrb,   shufflepattern);


  /*
   * We've completed the coeff. setup. Now we actually do the
   * approximation below.
   */

  /* Adjust x value here (for approximations about a point) */
  vec_float4 xappr = spu_sub(xabs, xoffset);


  /* Now we do the multiplies.
   * Use Horner's method, from the highest coefficient down:
   * result = (...((c10 * t + c09) * t + c08) ...) * t + c00
   * with t = xappr.
   */
  result = coeff_10;
  result = spu_madd(xappr, result, coeff_09);
  result = spu_madd(xappr, result, coeff_08);
  result = spu_madd(xappr, result, coeff_07);
  result = spu_madd(xappr, result, coeff_06);
  result = spu_madd(xappr, result, coeff_05);
  result = spu_madd(xappr, result, coeff_04);
  result = spu_madd(xappr, result, coeff_03);
  result = spu_madd(xappr, result, coeff_02);
  result = spu_madd(xappr, result, coeff_01);
  result = spu_madd(xappr, result, coeff_00);


  /* Adjust due to systematic truncation. Note that the correction
   * value is always non-negative, so the result is cast as uint
   * to do the adjustment: the correction is added to the raw bit
   * pattern of the float rather than to its value.
   */
  result = (vec_float4)spu_add((vec_uint4)result, tcorrection);


  /*
   * Special Cases
   */

  /* Erf(0) = 0
   * Needed explicitly: for a zero input the polynomial yields 0 and
   * the truncation correction above would then leave a non-zero
   * bit pattern.
   */
  result = spu_sel(result, zerof, spu_cmpeq(xabs, zerof));

  /* Erf(infinity) = 1; applied to every |x| above the clamp value */
  result = spu_sel(result, onef, spu_cmpgt(xabs, clamp));


  /* Preserve sign in result, since erf(-x) = -erf(x) */
  result = spu_or(result, xsign);

  return result;
}

#endif /* _ERFF4_H_ */
#endif /* __SPU__ */