1 |
148 |
jeremybenn |
/* -------------------------------------------------------------- */
|
2 |
|
|
/* (C)Copyright 2007,2008, */
|
3 |
|
|
/* International Business Machines Corporation */
|
4 |
|
|
/* All Rights Reserved. */
|
5 |
|
|
/* */
|
6 |
|
|
/* Redistribution and use in source and binary forms, with or */
|
7 |
|
|
/* without modification, are permitted provided that the */
|
8 |
|
|
/* following conditions are met: */
|
9 |
|
|
/* */
|
10 |
|
|
/* - Redistributions of source code must retain the above copyright*/
|
11 |
|
|
/* notice, this list of conditions and the following disclaimer. */
|
12 |
|
|
/* */
|
13 |
|
|
/* - Redistributions in binary form must reproduce the above */
|
14 |
|
|
/* copyright notice, this list of conditions and the following */
|
15 |
|
|
/* disclaimer in the documentation and/or other materials */
|
16 |
|
|
/* provided with the distribution. */
|
17 |
|
|
/* */
|
18 |
|
|
/* - Neither the name of IBM Corporation nor the names of its */
|
19 |
|
|
/* contributors may be used to endorse or promote products */
|
20 |
|
|
/* derived from this software without specific prior written */
|
21 |
|
|
/* permission. */
|
22 |
|
|
/* */
|
23 |
|
|
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND */
|
24 |
|
|
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
25 |
|
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
26 |
|
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
27 |
|
|
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR */
|
28 |
|
|
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
|
29 |
|
|
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT */
|
30 |
|
|
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; */
|
31 |
|
|
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
|
32 |
|
|
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */
|
33 |
|
|
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR */
|
34 |
|
|
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, */
|
35 |
|
|
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
|
36 |
|
|
/* -------------------------------------------------------------- */
|
37 |
|
|
/* PROLOG END TAG zYx */
|
38 |
|
|
#ifdef __SPU__
|
39 |
|
|
|
40 |
|
|
#ifndef _LGAMMAF4_H_
|
41 |
|
|
#define _LGAMMAF4_H_ 1
|
42 |
|
|
|
43 |
|
|
#include <spu_intrinsics.h>
|
44 |
|
|
|
45 |
|
|
#include "logf4.h"
|
46 |
|
|
#include "divf4.h"
|
47 |
|
|
#include "recipf4.h"
|
48 |
|
|
#include "truncf4.h"
|
49 |
|
|
#include "sinf4.h"
|
50 |
|
|
|
51 |
|
|
|
52 |
|
|
/*
|
53 |
|
|
* FUNCTION
|
54 |
|
|
* vector float _lgammaf4(vector float x) - Natural Log of Gamma Function
|
55 |
|
|
*
|
56 |
|
|
* DESCRIPTION
|
57 |
|
|
* _lgammaf4 calculates the natural logarithm of the absolute value of the gamma
|
58 |
|
|
* function for the corresponding elements of the input vector.
|
59 |
|
|
*
|
60 |
|
|
* C99 Special Cases:
|
61 |
|
|
* lgamma(0) returns +infinity
|
62 |
|
|
* lgamma(1) returns +0
|
63 |
|
|
* lgamma(2) returns +0
|
64 |
|
|
* lgamma(negative integer) returns +infinity
|
65 |
|
|
* lgamma(+infinity) returns +infinity
|
66 |
|
|
* lgamma(-infinity) returns +infinity
|
67 |
|
|
*
|
68 |
|
|
* Other Cases:
|
69 |
|
|
* lgamma(Nan) returns Nan
|
70 |
|
|
* lgamma(Denorm) treated as lgamma(0) and returns +infinity
|
71 |
|
|
*
|
72 |
|
|
*/
|
73 |
|
|
|
74 |
|
|
static __inline vector float _lgammaf4(vector float x)
|
75 |
|
|
{
|
76 |
|
|
vec_float4 result;
|
77 |
|
|
vec_float4 halflog2pi = spu_splats(9.189385332046727417803297364056E-1f);
|
78 |
|
|
vec_float4 logpi = spu_splats(1.1447298858494001741434273513530587116472948129153f);
|
79 |
|
|
vec_float4 inff = (vec_float4)spu_splats(0x7F800000);
|
80 |
|
|
vec_float4 zerof = spu_splats(0.0f);
|
81 |
|
|
vec_float4 onef = spu_splats(1.0f);
|
82 |
|
|
vec_float4 twof = spu_splats(2.0f);
|
83 |
|
|
vec_float4 sign_maskf = spu_splats(-0.0f);
|
84 |
|
|
vec_float4 pi = spu_splats(3.14159265358979323846264338328f);
|
85 |
|
|
|
86 |
|
|
|
87 |
|
|
/*
|
88 |
|
|
* Unfortunately, some of the approximation methods for lgamma require
|
89 |
|
|
* other basic math computations. Get those out of the way now. The
|
90 |
|
|
* compiler seems to good a good job of scheduling this code with
|
91 |
|
|
* the code that follows.
|
92 |
|
|
*/
|
93 |
|
|
vec_uint4 gt0 = spu_cmpgt(x, zerof);
|
94 |
|
|
vec_float4 xabs = spu_andc(x, sign_maskf);
|
95 |
|
|
vec_float4 ln_x = _logf4(xabs);
|
96 |
|
|
vec_float4 inv_x = _recipf4(xabs);
|
97 |
|
|
vec_float4 xtrunc = _truncf4(x);
|
98 |
|
|
vec_float4 inv_xsqu = spu_mul(inv_x, inv_x);
|
99 |
|
|
vec_uint4 isnaninf = spu_cmpgt((vec_uint4)xabs, 0x7F7FFFFF);
|
100 |
|
|
vec_uint4 ret_zero = spu_or(spu_cmpeq(x, onef), spu_cmpeq(x, twof));
|
101 |
|
|
|
102 |
|
|
|
103 |
|
|
/*
|
104 |
|
|
* First thing we do is setup the description of each partition.
|
105 |
|
|
* This consists of:
|
106 |
|
|
* - Start x of partition
|
107 |
|
|
* - Offset (used for evaluating power series expanded around a point)
|
108 |
|
|
* - Truncation adjustment.
|
109 |
|
|
* - Is approx method in region a rational approximation or just a polynomial
|
110 |
|
|
* - The coefficients used in the poly or rational approximation
|
111 |
|
|
*/
|
112 |
|
|
|
113 |
|
|
|
114 |
|
|
/***************************************************************
|
115 |
|
|
* REGION 0: Approximation Near 0 from Above
|
116 |
|
|
*
|
117 |
|
|
* Use Maclaurin Expansion of lgamma()
|
118 |
|
|
*
|
119 |
|
|
* lgamma(z) = -ln(z) - z * EulerMascheroni + Sum[(-1)^n * z^n * Zeta(n)/n]
|
120 |
|
|
*/
|
121 |
|
|
|
122 |
|
|
#define SDM_LGF4_0_START 0.0f
|
123 |
|
|
#define SDM_LGF4_0_OFF 0.0f
|
124 |
|
|
#define SDM_LGF4_0_TRUNC 2u
|
125 |
|
|
#define SDM_LGF4_0_RATIONAL 0x0u
|
126 |
|
|
|
127 |
|
|
#define SDM_LGF4_0_00 0.0f
|
128 |
|
|
#define SDM_LGF4_0_01 -0.5772156649015328606065121f
|
129 |
|
|
#define SDM_LGF4_0_02 0.8224670334241132182362076f
|
130 |
|
|
#define SDM_LGF4_0_03 -0.4006856343865314284665794f
|
131 |
|
|
#define SDM_LGF4_0_04 0.2705808084277845478790009f
|
132 |
|
|
#define SDM_LGF4_0_05 -0.2073855510286739852662731f
|
133 |
|
|
#define SDM_LGF4_0_06 1.6955717699740818995241965496515E-1f
|
134 |
|
|
#define SDM_LGF4_0_07 -1.4404989676884611811997107854997E-1f
|
135 |
|
|
#define SDM_LGF4_0_08 1.2550966952474304242233565481358E-1f
|
136 |
|
|
#define SDM_LGF4_0_09 -1.1133426586956469049087252991471E-1f
|
137 |
|
|
#define SDM_LGF4_0_10 1.0009945751278180853371459589003E-1f
|
138 |
|
|
#define SDM_LGF4_0_11 -9.0954017145829042232609298411497E-2f
|
139 |
|
|
|
140 |
|
|
|
141 |
|
|
|
142 |
|
|
/***************************************************************
|
143 |
|
|
* REGION 1: Above 0 and Below 1
|
144 |
|
|
*/
|
145 |
|
|
#define SDM_LGF4_1_START 0.20f
|
146 |
|
|
#define SDM_LGF4_1_OFF 0.0f
|
147 |
|
|
#define SDM_LGF4_1_TRUNC 0u
|
148 |
|
|
#define SDM_LGF4_1_RATIONAL 0xFFFFFFFFu
|
149 |
|
|
|
150 |
|
|
/* Numerator */
|
151 |
|
|
#define SDM_LGF4_1_06 5.5247592697706124892083167601451981186889952720891079f
|
152 |
|
|
#define SDM_LGF4_1_07 188.42248906442882644741346270888237140890625699348872f
|
153 |
|
|
#define SDM_LGF4_1_08 730.89115027907050579364152184942040244662318995470771f
|
154 |
|
|
#define SDM_LGF4_1_09 -517.93391251349155395618464682404141737699116911423096f
|
155 |
|
|
#define SDM_LGF4_1_10 -866.81293419754982917624255525168901081630973644141406f
|
156 |
|
|
#define SDM_LGF4_1_11 459.90872804523394478152324135956113729930154636775805f
|
157 |
|
|
|
158 |
|
|
/* Denominator */
|
159 |
|
|
#define SDM_LGF4_1_00 1.0f
|
160 |
|
|
#define SDM_LGF4_1_01 62.356015559548850893358835861387218304619374633480009f
|
161 |
|
|
#define SDM_LGF4_1_02 553.64875642095755724931612658933597252336243693499682f
|
162 |
|
|
#define SDM_LGF4_1_03 997.28805670393557265195865662557219661414263910835386f
|
163 |
|
|
#define SDM_LGF4_1_04 257.10520661440946455560646958565998121417179154677712f
|
164 |
|
|
#define SDM_LGF4_1_05 -15.398409585547124178878369413880017200739911288666830f
|
165 |
|
|
|
166 |
|
|
|
167 |
|
|
|
168 |
|
|
/***************************************************************
|
169 |
|
|
* REGION 2: Above 0 and Below 1
|
170 |
|
|
*/
|
171 |
|
|
#define SDM_LGF4_2_START 0.60f
|
172 |
|
|
#define SDM_LGF4_2_OFF 0.69f
|
173 |
|
|
#define SDM_LGF4_2_TRUNC 1u
|
174 |
|
|
#define SDM_LGF4_2_RATIONAL 0x0u
|
175 |
|
|
|
176 |
|
|
/* This is a power series expanson of LogGamma around 0.69 */
|
177 |
|
|
#define SDM_LGF4_2_00 0.27321026793030387025442491383648273204234f
|
178 |
|
|
#define SDM_LGF4_2_01 -1.24869016926209356266849815723905575347988f
|
179 |
|
|
#define SDM_LGF4_2_02 1.44985879780363867173410158693003578927407f
|
180 |
|
|
#define SDM_LGF4_2_03 -1.11686573274718166516744313082147691068190f
|
181 |
|
|
#define SDM_LGF4_2_04 1.14079150485439143731395820215710950729505f
|
182 |
|
|
#define SDM_LGF4_2_05 -1.29512166953091144888197173527810141620764f
|
183 |
|
|
#define SDM_LGF4_2_06 1.55206382120790061136858894716459302629069f
|
184 |
|
|
#define SDM_LGF4_2_07 -1.92227237154565289482911310272968704445560f
|
185 |
|
|
#define SDM_LGF4_2_08 2.43478939488445894670349784581009987461638f
|
186 |
|
|
#define SDM_LGF4_2_09 -3.13512449573283650741385084753752461908870f
|
187 |
|
|
#define SDM_LGF4_2_10 4.08851456399492725127969680590409811177590f
|
188 |
|
|
#define SDM_LGF4_2_11 5.38629680478093362448042704719642976375265f
|
189 |
|
|
|
190 |
|
|
|
191 |
|
|
|
192 |
|
|
/***************************************************************
|
193 |
|
|
* REGION 3: Around 1
|
194 |
|
|
*/
|
195 |
|
|
#define SDM_LGF4_3_START 0.74f
|
196 |
|
|
#define SDM_LGF4_3_OFF 1.0f
|
197 |
|
|
#define SDM_LGF4_3_TRUNC 2u
|
198 |
|
|
#define SDM_LGF4_3_RATIONAL 0x0u
|
199 |
|
|
|
200 |
|
|
#define SDM_LGF4_3_11 -0.90954017145829042232609298411497266951691494159836e-1f
|
201 |
|
|
#define SDM_LGF4_3_10 0.10009945751278180853371459589003190170060195315645f
|
202 |
|
|
#define SDM_LGF4_3_09 -0.11133426586956469049087252991471245116506731682165f
|
203 |
|
|
#define SDM_LGF4_3_08 0.12550966952474304242233565481358155815737009883123f
|
204 |
|
|
#define SDM_LGF4_3_07 -0.14404989676884611811997107854997096565712336579503f
|
205 |
|
|
#define SDM_LGF4_3_06 0.16955717699740818995241965496515342131696958167214f
|
206 |
|
|
#define SDM_LGF4_3_05 -0.20738555102867398526627309729140683361141618390038f
|
207 |
|
|
#define SDM_LGF4_3_04 0.27058080842778454787900092413529197569368773797968f
|
208 |
|
|
#define SDM_LGF4_3_03 -0.40068563438653142846657938717048333025499543078016f
|
209 |
|
|
#define SDM_LGF4_3_02 0.82246703342411321823620758332301259460947495060340f
|
210 |
|
|
#define SDM_LGF4_3_01 -0.57721566490153286060651209008240243104215933593992f
|
211 |
|
|
#define SDM_LGF4_3_00 0.0f
|
212 |
|
|
|
213 |
|
|
|
214 |
|
|
|
215 |
|
|
/***************************************************************
|
216 |
|
|
* REGION 4: Above 1 to Below 2
|
217 |
|
|
*/
|
218 |
|
|
|
219 |
|
|
#define SDM_LGF4_4_START 1.25f
|
220 |
|
|
#define SDM_LGF4_4_OFF 1.4616321449683623412626595423257213284681962040064f
|
221 |
|
|
#define SDM_LGF4_4_TRUNC 1u
|
222 |
|
|
#define SDM_LGF4_4_RATIONAL 0x0u
|
223 |
|
|
|
224 |
|
|
#define SDM_LGF4_4_00 -0.12148629053584960809551455717769158215135617313000f
|
225 |
|
|
#define SDM_LGF4_4_01 0.0f
|
226 |
|
|
#define SDM_LGF4_4_02 0.48383612272381058521372238085482537020562860838860f
|
227 |
|
|
#define SDM_LGF4_4_03 -0.14758772299453070203095509395083641661852764909458f
|
228 |
|
|
#define SDM_LGF4_4_04 0.064624940238912752656100346425238557063086033931734f
|
229 |
|
|
#define SDM_LGF4_4_05 -0.032788541088481305500850258549331278505894787737970f
|
230 |
|
|
#define SDM_LGF4_4_06 0.017970675115210394292863824811126161810628596070981f
|
231 |
|
|
#define SDM_LGF4_4_07 -0.010314223036636387275160254800730296612070784399082f
|
232 |
|
|
#define SDM_LGF4_4_08 0.0061005360205178884031365656884883648099463048507839f
|
233 |
|
|
#define SDM_LGF4_4_09 -0.0036845696083163732546953776004972425913603137160767f
|
234 |
|
|
#define SDM_LGF4_4_10 0.00225976482322181046596248251178293952686321035f
|
235 |
|
|
#define SDM_LGF4_4_11 -0.00140225144590445083080002880374741201782467331f
|
236 |
|
|
|
237 |
|
|
|
238 |
|
|
|
239 |
|
|
/***************************************************************
|
240 |
|
|
* REGION 5: Around 2
|
241 |
|
|
*/
|
242 |
|
|
|
243 |
|
|
#define SDM_LGF4_5_START 1.50f
|
244 |
|
|
#define SDM_LGF4_5_OFF 2.0f
|
245 |
|
|
#define SDM_LGF4_5_TRUNC 1u
|
246 |
|
|
#define SDM_LGF4_5_RATIONAL 0x0u
|
247 |
|
|
|
248 |
|
|
#define SDM_LGF4_5_00 0.0f
|
249 |
|
|
#define SDM_LGF4_5_01 0.42278433509846713939348790991759756895784066406008f
|
250 |
|
|
#define SDM_LGF4_5_02 0.32246703342411321823620758332301259460947495060340f
|
251 |
|
|
#define SDM_LGF4_5_03 -0.6735230105319809513324605383714999692166209744683e-1f
|
252 |
|
|
#define SDM_LGF4_5_04 0.2058080842778454787900092413529197569368773797968e-1f
|
253 |
|
|
#define SDM_LGF4_5_05 -0.738555102867398526627309729140683361141618390038e-2f
|
254 |
|
|
#define SDM_LGF4_5_06 0.289051033074152328575298829848675465030291500547e-2f
|
255 |
|
|
#define SDM_LGF4_5_07 -0.119275391170326097711393569282810851426622293789e-2f
|
256 |
|
|
#define SDM_LGF4_5_08 0.50966952474304242233565481358155815737009883123e-3f
|
257 |
|
|
#define SDM_LGF4_5_09 -0.22315475845357937976141880360134005395620571054e-3f
|
258 |
|
|
#define SDM_LGF4_5_10 0.9945751278180853371459589003190170060195315645e-4f
|
259 |
|
|
#define SDM_LGF4_5_11 -0.44926236738133141700207502406357860782403250745e-4f
|
260 |
|
|
|
261 |
|
|
|
262 |
|
|
|
263 |
|
|
/***************************************************************
|
264 |
|
|
* REGION 6: Above 2 to Below Stirlings
|
265 |
|
|
*/
|
266 |
|
|
|
267 |
|
|
#define SDM_LGF4_6_START 2.48f
|
268 |
|
|
#define SDM_LGF4_6_OFF 0.0f
|
269 |
|
|
#define SDM_LGF4_6_TRUNC 2u
|
270 |
|
|
#define SDM_LGF4_6_RATIONAL 0xFFFFFFFFu
|
271 |
|
|
|
272 |
|
|
/* Numerator */
|
273 |
|
|
#define SDM_LGF4_6_06 2.8952045264375719070927153893062450394256201846894266f
|
274 |
|
|
#define SDM_LGF4_6_07 0.9017557380149600532583460408941390566399250566546766f
|
275 |
|
|
#define SDM_LGF4_6_08 -5.0120743649109868270726470406381462995568837028633266f
|
276 |
|
|
#define SDM_LGF4_6_09 0.5723176665030477945174549923532715487712277062412760f
|
277 |
|
|
#define SDM_LGF4_6_10 0.6107282478237180956153912232438073421489100296366786f
|
278 |
|
|
#define SDM_LGF4_6_11 0.0312308625200519550078820867041868696010490562277303f
|
279 |
|
|
|
280 |
|
|
/* Denominator */
|
281 |
|
|
#define SDM_LGF4_6_00 1.0f
|
282 |
|
|
#define SDM_LGF4_6_01 4.3592151369378598515798083402849838078885877442021500f
|
283 |
|
|
#define SDM_LGF4_6_02 2.6245676641191702420707093818412405820501009602499853f
|
284 |
|
|
#define SDM_LGF4_6_03 0.3438846837443412565179153619145215759074092780311669f
|
285 |
|
|
#define SDM_LGF4_6_04 0.0078092905528158343621764949220712317164193605131159f
|
286 |
|
|
#define SDM_LGF4_6_05 -0.000015217018272713076443927141674684568030697337620f
|
287 |
|
|
|
288 |
|
|
|
289 |
|
|
|
290 |
|
|
/***************************************************************
|
291 |
|
|
* REGION 7: Stirlings - Above 6.0
|
292 |
|
|
*
|
293 |
|
|
*/
|
294 |
|
|
|
295 |
|
|
#define SDM_LGF4_7_START 7.80f
|
296 |
|
|
#define SDM_LGF4_7_OFF 0.0f
|
297 |
|
|
#define SDM_LGF4_7_TRUNC 5u
|
298 |
|
|
#define SDM_LGF4_7_RATIONAL 0x0u
|
299 |
|
|
|
300 |
|
|
#define SDM_LGF4_7_00 8.3333333333333333333333333333333333333333333333333333333333333333333333E-2f
|
301 |
|
|
#define SDM_LGF4_7_01 -2.7777777777777777777777777777777777777777777777777777777777777777777778E-3f
|
302 |
|
|
#define SDM_LGF4_7_02 7.9365079365079365079365079365079365079365079365079365079365079365079365E-4f
|
303 |
|
|
#define SDM_LGF4_7_03 -5.9523809523809523809523809523809523809523809523809523809523809523809524E-4f
|
304 |
|
|
#define SDM_LGF4_7_04 8.4175084175084175084175084175084175084175084175084175084175084175084175E-4f
|
305 |
|
|
#define SDM_LGF4_7_05 -1.9175269175269175269175269175269175269175269175269175269175269175269175E-3f
|
306 |
|
|
#define SDM_LGF4_7_06 6.4102564102564102564102564102564102564102564102564102564102564102564103E-3f
|
307 |
|
|
#define SDM_LGF4_7_07 0.0f
|
308 |
|
|
#define SDM_LGF4_7_08 0.0f
|
309 |
|
|
#define SDM_LGF4_7_09 0.0f
|
310 |
|
|
#define SDM_LGF4_7_10 0.0f
|
311 |
|
|
#define SDM_LGF4_7_11 0.0f
|
312 |
|
|
|
313 |
|
|
|
314 |
|
|
/*
|
315 |
|
|
* Now we load the description of each partition.
|
316 |
|
|
*/
|
317 |
|
|
|
318 |
|
|
/* Start point for each partition */
|
319 |
|
|
vec_float4 r1start = spu_splats(SDM_LGF4_1_START);
|
320 |
|
|
vec_float4 r2start = spu_splats(SDM_LGF4_2_START);
|
321 |
|
|
vec_float4 r3start = spu_splats(SDM_LGF4_3_START);
|
322 |
|
|
vec_float4 r4start = spu_splats(SDM_LGF4_4_START);
|
323 |
|
|
vec_float4 r5start = spu_splats(SDM_LGF4_5_START);
|
324 |
|
|
vec_float4 r6start = spu_splats(SDM_LGF4_6_START);
|
325 |
|
|
vec_float4 r7start = spu_splats(SDM_LGF4_7_START);
|
326 |
|
|
|
327 |
|
|
/* X Offset for each partition */
|
328 |
|
|
vec_float4 xoffseta = (vec_float4) {SDM_LGF4_0_OFF, SDM_LGF4_1_OFF, SDM_LGF4_2_OFF, SDM_LGF4_3_OFF};
|
329 |
|
|
vec_float4 xoffsetb = (vec_float4) {SDM_LGF4_4_OFF, SDM_LGF4_5_OFF, SDM_LGF4_6_OFF, SDM_LGF4_7_OFF};
|
330 |
|
|
|
331 |
|
|
/* Truncation Correction for each partition */
|
332 |
|
|
vec_uint4 tcorra = (vec_uint4) {SDM_LGF4_0_TRUNC, SDM_LGF4_1_TRUNC, SDM_LGF4_2_TRUNC, SDM_LGF4_3_TRUNC};
|
333 |
|
|
vec_uint4 tcorrb = (vec_uint4) {SDM_LGF4_4_TRUNC, SDM_LGF4_5_TRUNC, SDM_LGF4_6_TRUNC, SDM_LGF4_7_TRUNC};
|
334 |
|
|
|
335 |
|
|
/* Is partition a Rational Approximation */
|
336 |
|
|
vec_uint4 israta = (vec_uint4) {SDM_LGF4_0_RATIONAL, SDM_LGF4_1_RATIONAL, SDM_LGF4_2_RATIONAL, SDM_LGF4_3_RATIONAL};
|
337 |
|
|
vec_uint4 isratb = (vec_uint4) {SDM_LGF4_4_RATIONAL, SDM_LGF4_5_RATIONAL, SDM_LGF4_6_RATIONAL, SDM_LGF4_7_RATIONAL};
|
338 |
|
|
|
339 |
|
|
/* The polynomial coefficients for all partitions */
|
340 |
|
|
vec_float4 c00a = (vec_float4) {SDM_LGF4_0_00, SDM_LGF4_1_00, SDM_LGF4_2_00, SDM_LGF4_3_00};
|
341 |
|
|
vec_float4 c01a = (vec_float4) {SDM_LGF4_0_01, SDM_LGF4_1_01, SDM_LGF4_2_01, SDM_LGF4_3_01};
|
342 |
|
|
vec_float4 c02a = (vec_float4) {SDM_LGF4_0_02, SDM_LGF4_1_02, SDM_LGF4_2_02, SDM_LGF4_3_02};
|
343 |
|
|
vec_float4 c03a = (vec_float4) {SDM_LGF4_0_03, SDM_LGF4_1_03, SDM_LGF4_2_03, SDM_LGF4_3_03};
|
344 |
|
|
vec_float4 c04a = (vec_float4) {SDM_LGF4_0_04, SDM_LGF4_1_04, SDM_LGF4_2_04, SDM_LGF4_3_04};
|
345 |
|
|
vec_float4 c05a = (vec_float4) {SDM_LGF4_0_05, SDM_LGF4_1_05, SDM_LGF4_2_05, SDM_LGF4_3_05};
|
346 |
|
|
vec_float4 c06a = (vec_float4) {SDM_LGF4_0_06, SDM_LGF4_1_06, SDM_LGF4_2_06, SDM_LGF4_3_06};
|
347 |
|
|
vec_float4 c07a = (vec_float4) {SDM_LGF4_0_07, SDM_LGF4_1_07, SDM_LGF4_2_07, SDM_LGF4_3_07};
|
348 |
|
|
vec_float4 c08a = (vec_float4) {SDM_LGF4_0_08, SDM_LGF4_1_08, SDM_LGF4_2_08, SDM_LGF4_3_08};
|
349 |
|
|
vec_float4 c09a = (vec_float4) {SDM_LGF4_0_09, SDM_LGF4_1_09, SDM_LGF4_2_09, SDM_LGF4_3_09};
|
350 |
|
|
vec_float4 c10a = (vec_float4) {SDM_LGF4_0_10, SDM_LGF4_1_10, SDM_LGF4_2_10, SDM_LGF4_3_10};
|
351 |
|
|
vec_float4 c11a = (vec_float4) {SDM_LGF4_0_11, SDM_LGF4_1_11, SDM_LGF4_2_11, SDM_LGF4_3_11};
|
352 |
|
|
|
353 |
|
|
vec_float4 c00b = (vec_float4) {SDM_LGF4_4_00, SDM_LGF4_5_00, SDM_LGF4_6_00, SDM_LGF4_7_00};
|
354 |
|
|
vec_float4 c01b = (vec_float4) {SDM_LGF4_4_01, SDM_LGF4_5_01, SDM_LGF4_6_01, SDM_LGF4_7_01};
|
355 |
|
|
vec_float4 c02b = (vec_float4) {SDM_LGF4_4_02, SDM_LGF4_5_02, SDM_LGF4_6_02, SDM_LGF4_7_02};
|
356 |
|
|
vec_float4 c03b = (vec_float4) {SDM_LGF4_4_03, SDM_LGF4_5_03, SDM_LGF4_6_03, SDM_LGF4_7_03};
|
357 |
|
|
vec_float4 c04b = (vec_float4) {SDM_LGF4_4_04, SDM_LGF4_5_04, SDM_LGF4_6_04, SDM_LGF4_7_04};
|
358 |
|
|
vec_float4 c05b = (vec_float4) {SDM_LGF4_4_05, SDM_LGF4_5_05, SDM_LGF4_6_05, SDM_LGF4_7_05};
|
359 |
|
|
vec_float4 c06b = (vec_float4) {SDM_LGF4_4_06, SDM_LGF4_5_06, SDM_LGF4_6_06, SDM_LGF4_7_06};
|
360 |
|
|
vec_float4 c07b = (vec_float4) {SDM_LGF4_4_07, SDM_LGF4_5_07, SDM_LGF4_6_07, SDM_LGF4_7_07};
|
361 |
|
|
vec_float4 c08b = (vec_float4) {SDM_LGF4_4_08, SDM_LGF4_5_08, SDM_LGF4_6_08, SDM_LGF4_7_08};
|
362 |
|
|
vec_float4 c09b = (vec_float4) {SDM_LGF4_4_09, SDM_LGF4_5_09, SDM_LGF4_6_09, SDM_LGF4_7_09};
|
363 |
|
|
vec_float4 c10b = (vec_float4) {SDM_LGF4_4_10, SDM_LGF4_5_10, SDM_LGF4_6_10, SDM_LGF4_7_10};
|
364 |
|
|
vec_float4 c11b = (vec_float4) {SDM_LGF4_4_11, SDM_LGF4_5_11, SDM_LGF4_6_11, SDM_LGF4_7_11};
|
365 |
|
|
|
366 |
|
|
|
367 |
|
|
vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
|
368 |
|
|
vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
|
369 |
|
|
vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
|
370 |
|
|
vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
|
371 |
|
|
vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
|
372 |
|
|
vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
|
373 |
|
|
vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
|
374 |
|
|
vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);
|
375 |
|
|
|
376 |
|
|
|
377 |
|
|
/*
|
378 |
|
|
* Determine the shuffle pattern based on which partition
|
379 |
|
|
* each element of x is in.
|
380 |
|
|
*/
|
381 |
|
|
|
382 |
|
|
vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpgt(xabs, r1start);
|
383 |
|
|
vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpgt(xabs, r2start);
|
384 |
|
|
vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpgt(xabs, r3start);
|
385 |
|
|
vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpgt(xabs, r4start);
|
386 |
|
|
vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpgt(xabs, r5start);
|
387 |
|
|
vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpgt(xabs, r6start);
|
388 |
|
|
vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpgt(xabs, r7start);
|
389 |
|
|
|
390 |
|
|
vec_uchar16 shufflepattern;
|
391 |
|
|
shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
|
392 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
|
393 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
|
394 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
|
395 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
|
396 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
|
397 |
|
|
shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);
|
398 |
|
|
|
399 |
|
|
|
400 |
|
|
|
401 |
|
|
/* Use the shuffle pattern to select the coefficients */
|
402 |
|
|
|
403 |
|
|
vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);
|
404 |
|
|
vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
|
405 |
|
|
vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
|
406 |
|
|
vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
|
407 |
|
|
vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
|
408 |
|
|
vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
|
409 |
|
|
vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
|
410 |
|
|
vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
|
411 |
|
|
vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
|
412 |
|
|
vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
|
413 |
|
|
vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
|
414 |
|
|
vec_float4 coeff_11 = spu_shuffle(c11a, c11b, shufflepattern);
|
415 |
|
|
|
416 |
|
|
vec_float4 xoffset = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
|
417 |
|
|
vec_uint4 tcorrection = spu_shuffle(tcorra, tcorrb, shufflepattern);
|
418 |
|
|
vec_uint4 isrational = spu_shuffle(israta, isratb, shufflepattern);
|
419 |
|
|
|
420 |
|
|
/*
|
421 |
|
|
* We've completed the coeff. setup. Now we actually do the
|
422 |
|
|
* approximation below.
|
423 |
|
|
*/
|
424 |
|
|
|
425 |
|
|
/* Adjust x value here (for approximations about a point) */
|
426 |
|
|
vec_float4 xappr = spu_sub(xabs, xoffset);
|
427 |
|
|
|
428 |
|
|
/* If in Stirling partition, do some setup before the madds */
|
429 |
|
|
xappr = spu_sel(xappr, inv_xsqu, (vector unsigned int)gt_r7start);
|
430 |
|
|
|
431 |
|
|
|
432 |
|
|
|
433 |
|
|
/* Now we do the multiplies - either a big polynomial or
|
434 |
|
|
* a rational approximation. Use Horner's method.
|
435 |
|
|
*/
|
436 |
|
|
result = coeff_11;
|
437 |
|
|
result = spu_madd(xappr, result, coeff_10);
|
438 |
|
|
result = spu_madd(xappr, result, coeff_09);
|
439 |
|
|
result = spu_madd(xappr, result, coeff_08);
|
440 |
|
|
result = spu_madd(xappr, result, coeff_07);
|
441 |
|
|
result = spu_madd(xappr, result, coeff_06);
|
442 |
|
|
|
443 |
|
|
/* For rational approximations, we save numerator. */
|
444 |
|
|
vec_float4 resultn = result;
|
445 |
|
|
|
446 |
|
|
/* For rational appr,, reset result for calculation of denominator. */
|
447 |
|
|
result = spu_sel(result, spu_splats(0.0f), isrational);
|
448 |
|
|
|
449 |
|
|
result = spu_madd(xappr, result, coeff_05);
|
450 |
|
|
result = spu_madd(xappr, result, coeff_04);
|
451 |
|
|
result = spu_madd(xappr, result, coeff_03);
|
452 |
|
|
result = spu_madd(xappr, result, coeff_02);
|
453 |
|
|
result = spu_madd(xappr, result, coeff_01);
|
454 |
|
|
result = spu_madd(xappr, result, coeff_00);
|
455 |
|
|
|
456 |
|
|
/* Select either the polynomial or rational result */
|
457 |
|
|
result = spu_sel(result, _divf4(resultn, result), isrational);
|
458 |
|
|
|
459 |
|
|
/*
|
460 |
|
|
* Now we have to do a bit of additional calculations for
|
461 |
|
|
* partitions that weren't simple polynomial or rational
|
462 |
|
|
* approximations.
|
463 |
|
|
*/
|
464 |
|
|
|
465 |
|
|
/* Finish the Near 0 formula */
|
466 |
|
|
result = spu_sel(spu_sub(result, ln_x), result, (vector unsigned int)gt_r1start);
|
467 |
|
|
|
468 |
|
|
/* Finish Stirling's Approximation */
|
469 |
|
|
vec_float4 resultstirling = spu_madd(spu_sub(xabs, spu_splats(0.5f)), ln_x, halflog2pi);
|
470 |
|
|
resultstirling = spu_sub(resultstirling, xabs);
|
471 |
|
|
resultstirling = spu_add(spu_mul(result,inv_x), resultstirling);
|
472 |
|
|
result = spu_sel(result, resultstirling, (vector unsigned int)gt_r7start);
|
473 |
|
|
|
474 |
|
|
|
475 |
|
|
/* Adjust due to systematic truncation */
|
476 |
|
|
result = (vec_float4)spu_add((vec_uint4)result, tcorrection);
|
477 |
|
|
|
478 |
|
|
|
479 |
|
|
/*
|
480 |
|
|
* Approximation for Negative X
|
481 |
|
|
*
|
482 |
|
|
* Use reflection relation:
|
483 |
|
|
*
|
484 |
|
|
* gamma(x) * gamma(-x) = -pi/(x sin(pi x))
|
485 |
|
|
*
|
486 |
|
|
* lgamma(x) = log(pi/(-x sin(pi x))) - lgamma(-x)
|
487 |
|
|
*
|
488 |
|
|
*/
|
489 |
|
|
vec_float4 nresult = spu_mul(x, _sinf4(spu_mul(x, pi)));
|
490 |
|
|
nresult = spu_andc(nresult, sign_maskf);
|
491 |
|
|
nresult = spu_sub(logpi, spu_add(result, _logf4(nresult)));
|
492 |
|
|
nresult = (vec_float4)spu_add((vec_uint4)nresult, spu_splats(1u));
|
493 |
|
|
|
494 |
|
|
result = spu_sel(nresult, result, gt0);
|
495 |
|
|
|
496 |
|
|
|
497 |
|
|
/*
|
498 |
|
|
* Special Cases
|
499 |
|
|
*/
|
500 |
|
|
|
501 |
|
|
/* x = non-positive integer, return infinity */
|
502 |
|
|
vec_uint4 isnonposint = spu_andc(spu_cmpeq(x, xtrunc), gt0);
|
503 |
|
|
result = spu_sel(result, inff, spu_or(isnonposint, spu_cmpgt(x, spu_splats(4.2e36f))));
|
504 |
|
|
result = spu_sel(result, inff, spu_andc(spu_cmpeq(x, xtrunc), gt0));
|
505 |
|
|
|
506 |
|
|
/* Zeros of function */
|
507 |
|
|
result = spu_sel(result, zerof, ret_zero);
|
508 |
|
|
|
509 |
|
|
/* x = +/- infinity or nan, return |x| */
|
510 |
|
|
result = spu_sel(result, xabs, isnaninf);
|
511 |
|
|
|
512 |
|
|
|
513 |
|
|
return result;
|
514 |
|
|
}
|
515 |
|
|
|
516 |
|
|
#endif /* _LGAMMAF4_H_ */
|
517 |
|
|
#endif /* __SPU__ */
|