OpenCores
URL https://opencores.org/ocsvn/openrisc_2011-10-31/openrisc_2011-10-31/trunk

Subversion Repositories openrisc_2011-10-31

[/] [openrisc/] [trunk/] [gnu-src/] [newlib-1.18.0/] [newlib/] [libm/] [machine/] [spu/] [headers/] [lgammaf4.h] - Blame information for rev 207

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 207 jeremybenn
/* --------------------------------------------------------------  */
2
/* (C)Copyright 2007,2008,                                         */
3
/* International Business Machines Corporation                     */
4
/* All Rights Reserved.                                            */
5
/*                                                                 */
6
/* Redistribution and use in source and binary forms, with or      */
7
/* without modification, are permitted provided that the           */
8
/* following conditions are met:                                   */
9
/*                                                                 */
10
/* - Redistributions of source code must retain the above copyright*/
11
/*   notice, this list of conditions and the following disclaimer. */
12
/*                                                                 */
13
/* - Redistributions in binary form must reproduce the above       */
14
/*   copyright notice, this list of conditions and the following   */
15
/*   disclaimer in the documentation and/or other materials        */
16
/*   provided with the distribution.                               */
17
/*                                                                 */
18
/* - Neither the name of IBM Corporation nor the names of its      */
19
/*   contributors may be used to endorse or promote products       */
20
/*   derived from this software without specific prior written     */
21
/*   permission.                                                   */
22
/*                                                                 */
23
/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND          */
24
/* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,     */
25
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF        */
26
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE        */
27
/* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR            */
28
/* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    */
29
/* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT    */
30
/* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    */
31
/* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)        */
32
/* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN       */
33
/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR    */
34
/* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,  */
35
/* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              */
36
/* --------------------------------------------------------------  */
37
/* PROLOG END TAG zYx                                              */
38
#ifdef __SPU__
39
 
40
#ifndef _LGAMMAF4_H_
41
#define _LGAMMAF4_H_    1
42
 
43
#include <spu_intrinsics.h>
44
 
45
#include "logf4.h"
46
#include "divf4.h"
47
#include "recipf4.h"
48
#include "truncf4.h"
49
#include "sinf4.h"
50
 
51
 
52
/*
53
 * FUNCTION
54
 *      vector float _lgammaf4(vector float x) - Natural Log of Gamma Function
55
 *
56
 * DESCRIPTION
57
 *      _lgammaf4 calculates the natural logarithm of the absolute value of the gamma
58
 *      function for the corresponding elements of the input vector.
59
 *
60
 * C99 Special Cases:
61
 *      lgamma(0) returns +infinity
62
 *      lgamma(1) returns +0
63
 *      lgamma(2) returns +0
64
 *      lgamma(negative integer) returns +infinity
65
 *      lgamma(+infinity) returns +infinity
66
 *      lgamma(-infinity) returns +infinity
67
 *
68
 * Other Cases:
69
 *  lgamma(Nan) returns Nan
70
 *  lgamma(Denorm) treated as lgamma(0) and returns +infinity
71
 *
72
 */
73
 
74
static __inline vector float _lgammaf4(vector float x)
75
{
76
  vec_float4 result;
77
  vec_float4 halflog2pi = spu_splats(9.189385332046727417803297364056E-1f);
78
  vec_float4 logpi      = spu_splats(1.1447298858494001741434273513530587116472948129153f);
79
  vec_float4 inff       = (vec_float4)spu_splats(0x7F800000);
80
  vec_float4 zerof      = spu_splats(0.0f);
81
  vec_float4 onef       = spu_splats(1.0f);
82
  vec_float4 twof       = spu_splats(2.0f);
83
  vec_float4 sign_maskf = spu_splats(-0.0f);
84
  vec_float4 pi         = spu_splats(3.14159265358979323846264338328f);
85
 
86
 
87
  /*
88
   * Unfortunately, some of the approximation methods for lgamma require
89
   * other basic math computations. Get those out of the way now. The
90
   * compiler seems to good a good job of scheduling this code with
91
   * the code that follows.
92
   */
93
  vec_uint4  gt0      = spu_cmpgt(x, zerof);
94
  vec_float4 xabs     = spu_andc(x, sign_maskf);
95
  vec_float4 ln_x     = _logf4(xabs);
96
  vec_float4 inv_x    = _recipf4(xabs);
97
  vec_float4 xtrunc   = _truncf4(x);
98
  vec_float4 inv_xsqu = spu_mul(inv_x, inv_x);
99
  vec_uint4 isnaninf  = spu_cmpgt((vec_uint4)xabs, 0x7F7FFFFF);
100
  vec_uint4 ret_zero  = spu_or(spu_cmpeq(x, onef), spu_cmpeq(x, twof));
101
 
102
 
103
  /*
104
   * First thing we do is setup the description of each partition.
105
   * This consists of:
106
   * - Start x of partition
107
   * - Offset (used for evaluating power series expanded around a point)
108
   * - Truncation adjustment.
109
   * - Is approx method in region a rational approximation or just a polynomial
110
   * - The coefficients used in the poly or rational approximation
111
   */
112
 
113
 
114
  /***************************************************************
115
   * REGION 0: Approximation Near 0 from Above
116
   *
117
   * Use Maclaurin Expansion of lgamma()
118
   *
119
   * lgamma(z) = -ln(z) - z * EulerMascheroni + Sum[(-1)^n * z^n * Zeta(n)/n]
120
   */
121
 
122
#define SDM_LGF4_0_START     0.0f
123
#define SDM_LGF4_0_OFF       0.0f
124
#define SDM_LGF4_0_TRUNC     2u
125
#define SDM_LGF4_0_RATIONAL  0x0u
126
 
127
#define SDM_LGF4_0_00   0.0f
128
#define SDM_LGF4_0_01  -0.5772156649015328606065121f
129
#define SDM_LGF4_0_02   0.8224670334241132182362076f
130
#define SDM_LGF4_0_03  -0.4006856343865314284665794f
131
#define SDM_LGF4_0_04   0.2705808084277845478790009f
132
#define SDM_LGF4_0_05  -0.2073855510286739852662731f
133
#define SDM_LGF4_0_06   1.6955717699740818995241965496515E-1f
134
#define SDM_LGF4_0_07  -1.4404989676884611811997107854997E-1f
135
#define SDM_LGF4_0_08   1.2550966952474304242233565481358E-1f
136
#define SDM_LGF4_0_09  -1.1133426586956469049087252991471E-1f
137
#define SDM_LGF4_0_10   1.0009945751278180853371459589003E-1f
138
#define SDM_LGF4_0_11  -9.0954017145829042232609298411497E-2f
139
 
140
 
141
 
142
  /***************************************************************
143
   * REGION 1: Above 0 and Below 1
144
   */
145
#define SDM_LGF4_1_START     0.20f
146
#define SDM_LGF4_1_OFF       0.0f
147
#define SDM_LGF4_1_TRUNC     0u
148
#define SDM_LGF4_1_RATIONAL  0xFFFFFFFFu
149
 
150
/* Numerator */
151
#define SDM_LGF4_1_06  5.5247592697706124892083167601451981186889952720891079f
152
#define SDM_LGF4_1_07  188.42248906442882644741346270888237140890625699348872f
153
#define SDM_LGF4_1_08  730.89115027907050579364152184942040244662318995470771f
154
#define SDM_LGF4_1_09  -517.93391251349155395618464682404141737699116911423096f
155
#define SDM_LGF4_1_10  -866.81293419754982917624255525168901081630973644141406f
156
#define SDM_LGF4_1_11  459.90872804523394478152324135956113729930154636775805f
157
 
158
/* Denominator */
159
#define SDM_LGF4_1_00   1.0f
160
#define SDM_LGF4_1_01   62.356015559548850893358835861387218304619374633480009f
161
#define SDM_LGF4_1_02   553.64875642095755724931612658933597252336243693499682f
162
#define SDM_LGF4_1_03   997.28805670393557265195865662557219661414263910835386f
163
#define SDM_LGF4_1_04   257.10520661440946455560646958565998121417179154677712f
164
#define SDM_LGF4_1_05   -15.398409585547124178878369413880017200739911288666830f
165
 
166
 
167
 
168
  /***************************************************************
169
   * REGION 2: Above 0 and Below 1
170
   */
171
#define SDM_LGF4_2_START     0.60f
172
#define SDM_LGF4_2_OFF       0.69f
173
#define SDM_LGF4_2_TRUNC     1u
174
#define SDM_LGF4_2_RATIONAL  0x0u
175
 
176
/* This is a power series expanson of LogGamma around 0.69 */
177
#define SDM_LGF4_2_00   0.27321026793030387025442491383648273204234f
178
#define SDM_LGF4_2_01  -1.24869016926209356266849815723905575347988f
179
#define SDM_LGF4_2_02   1.44985879780363867173410158693003578927407f
180
#define SDM_LGF4_2_03  -1.11686573274718166516744313082147691068190f
181
#define SDM_LGF4_2_04   1.14079150485439143731395820215710950729505f
182
#define SDM_LGF4_2_05  -1.29512166953091144888197173527810141620764f
183
#define SDM_LGF4_2_06   1.55206382120790061136858894716459302629069f
184
#define SDM_LGF4_2_07  -1.92227237154565289482911310272968704445560f
185
#define SDM_LGF4_2_08   2.43478939488445894670349784581009987461638f
186
#define SDM_LGF4_2_09  -3.13512449573283650741385084753752461908870f
187
#define SDM_LGF4_2_10   4.08851456399492725127969680590409811177590f
188
#define SDM_LGF4_2_11   5.38629680478093362448042704719642976375265f
189
 
190
 
191
 
192
  /***************************************************************
193
   * REGION 3: Around 1
194
   */
195
#define SDM_LGF4_3_START     0.74f
196
#define SDM_LGF4_3_OFF       1.0f
197
#define SDM_LGF4_3_TRUNC     2u
198
#define SDM_LGF4_3_RATIONAL  0x0u
199
 
200
#define SDM_LGF4_3_11   -0.90954017145829042232609298411497266951691494159836e-1f
201
#define SDM_LGF4_3_10    0.10009945751278180853371459589003190170060195315645f
202
#define SDM_LGF4_3_09   -0.11133426586956469049087252991471245116506731682165f
203
#define SDM_LGF4_3_08    0.12550966952474304242233565481358155815737009883123f
204
#define SDM_LGF4_3_07   -0.14404989676884611811997107854997096565712336579503f
205
#define SDM_LGF4_3_06    0.16955717699740818995241965496515342131696958167214f
206
#define SDM_LGF4_3_05   -0.20738555102867398526627309729140683361141618390038f
207
#define SDM_LGF4_3_04    0.27058080842778454787900092413529197569368773797968f
208
#define SDM_LGF4_3_03   -0.40068563438653142846657938717048333025499543078016f
209
#define SDM_LGF4_3_02    0.82246703342411321823620758332301259460947495060340f
210
#define SDM_LGF4_3_01   -0.57721566490153286060651209008240243104215933593992f
211
#define SDM_LGF4_3_00    0.0f
212
 
213
 
214
 
215
  /***************************************************************
216
   * REGION 4:  Above 1 to Below 2
217
   */
218
 
219
#define SDM_LGF4_4_START     1.25f
220
#define SDM_LGF4_4_OFF       1.4616321449683623412626595423257213284681962040064f
221
#define SDM_LGF4_4_TRUNC     1u
222
#define SDM_LGF4_4_RATIONAL  0x0u
223
 
224
#define SDM_LGF4_4_00   -0.12148629053584960809551455717769158215135617313000f
225
#define SDM_LGF4_4_01    0.0f
226
#define SDM_LGF4_4_02    0.48383612272381058521372238085482537020562860838860f
227
#define SDM_LGF4_4_03   -0.14758772299453070203095509395083641661852764909458f
228
#define SDM_LGF4_4_04    0.064624940238912752656100346425238557063086033931734f
229
#define SDM_LGF4_4_05   -0.032788541088481305500850258549331278505894787737970f
230
#define SDM_LGF4_4_06    0.017970675115210394292863824811126161810628596070981f
231
#define SDM_LGF4_4_07   -0.010314223036636387275160254800730296612070784399082f
232
#define SDM_LGF4_4_08    0.0061005360205178884031365656884883648099463048507839f
233
#define SDM_LGF4_4_09   -0.0036845696083163732546953776004972425913603137160767f
234
#define SDM_LGF4_4_10    0.00225976482322181046596248251178293952686321035f
235
#define SDM_LGF4_4_11   -0.00140225144590445083080002880374741201782467331f
236
 
237
 
238
 
239
  /***************************************************************
240
   * REGION 5: Around 2
241
   */
242
 
243
#define SDM_LGF4_5_START     1.50f
244
#define SDM_LGF4_5_OFF       2.0f
245
#define SDM_LGF4_5_TRUNC     1u
246
#define SDM_LGF4_5_RATIONAL  0x0u
247
 
248
#define SDM_LGF4_5_00    0.0f
249
#define SDM_LGF4_5_01    0.42278433509846713939348790991759756895784066406008f
250
#define SDM_LGF4_5_02    0.32246703342411321823620758332301259460947495060340f
251
#define SDM_LGF4_5_03   -0.6735230105319809513324605383714999692166209744683e-1f
252
#define SDM_LGF4_5_04    0.2058080842778454787900092413529197569368773797968e-1f
253
#define SDM_LGF4_5_05   -0.738555102867398526627309729140683361141618390038e-2f
254
#define SDM_LGF4_5_06    0.289051033074152328575298829848675465030291500547e-2f
255
#define SDM_LGF4_5_07   -0.119275391170326097711393569282810851426622293789e-2f
256
#define SDM_LGF4_5_08    0.50966952474304242233565481358155815737009883123e-3f
257
#define SDM_LGF4_5_09   -0.22315475845357937976141880360134005395620571054e-3f
258
#define SDM_LGF4_5_10    0.9945751278180853371459589003190170060195315645e-4f
259
#define SDM_LGF4_5_11   -0.44926236738133141700207502406357860782403250745e-4f
260
 
261
 
262
 
263
  /***************************************************************
264
   * REGION 6: Above 2 to Below Stirlings
265
   */
266
 
267
#define SDM_LGF4_6_START     2.48f
268
#define SDM_LGF4_6_OFF       0.0f
269
#define SDM_LGF4_6_TRUNC     2u
270
#define SDM_LGF4_6_RATIONAL  0xFFFFFFFFu
271
 
272
/* Numerator */
273
#define SDM_LGF4_6_06  2.8952045264375719070927153893062450394256201846894266f
274
#define SDM_LGF4_6_07  0.9017557380149600532583460408941390566399250566546766f
275
#define SDM_LGF4_6_08 -5.0120743649109868270726470406381462995568837028633266f
276
#define SDM_LGF4_6_09  0.5723176665030477945174549923532715487712277062412760f
277
#define SDM_LGF4_6_10  0.6107282478237180956153912232438073421489100296366786f
278
#define SDM_LGF4_6_11  0.0312308625200519550078820867041868696010490562277303f
279
 
280
/* Denominator */
281
#define SDM_LGF4_6_00  1.0f
282
#define SDM_LGF4_6_01  4.3592151369378598515798083402849838078885877442021500f
283
#define SDM_LGF4_6_02  2.6245676641191702420707093818412405820501009602499853f
284
#define SDM_LGF4_6_03  0.3438846837443412565179153619145215759074092780311669f
285
#define SDM_LGF4_6_04  0.0078092905528158343621764949220712317164193605131159f
286
#define SDM_LGF4_6_05  -0.000015217018272713076443927141674684568030697337620f
287
 
288
 
289
 
290
  /***************************************************************
291
   * REGION 7: Stirlings - Above 6.0
292
   *
293
   */
294
 
295
#define SDM_LGF4_7_START     7.80f
296
#define SDM_LGF4_7_OFF       0.0f
297
#define SDM_LGF4_7_TRUNC     5u
298
#define SDM_LGF4_7_RATIONAL  0x0u
299
 
300
#define SDM_LGF4_7_00    8.3333333333333333333333333333333333333333333333333333333333333333333333E-2f
301
#define SDM_LGF4_7_01   -2.7777777777777777777777777777777777777777777777777777777777777777777778E-3f
302
#define SDM_LGF4_7_02    7.9365079365079365079365079365079365079365079365079365079365079365079365E-4f
303
#define SDM_LGF4_7_03   -5.9523809523809523809523809523809523809523809523809523809523809523809524E-4f
304
#define SDM_LGF4_7_04    8.4175084175084175084175084175084175084175084175084175084175084175084175E-4f
305
#define SDM_LGF4_7_05   -1.9175269175269175269175269175269175269175269175269175269175269175269175E-3f
306
#define SDM_LGF4_7_06    6.4102564102564102564102564102564102564102564102564102564102564102564103E-3f
307
#define SDM_LGF4_7_07   0.0f
308
#define SDM_LGF4_7_08   0.0f
309
#define SDM_LGF4_7_09   0.0f
310
#define SDM_LGF4_7_10   0.0f
311
#define SDM_LGF4_7_11   0.0f
312
 
313
 
314
  /*
315
   * Now we load the description of each partition.
316
   */
317
 
318
  /* Start point for each partition */
319
  vec_float4 r1start = spu_splats(SDM_LGF4_1_START);
320
  vec_float4 r2start = spu_splats(SDM_LGF4_2_START);
321
  vec_float4 r3start = spu_splats(SDM_LGF4_3_START);
322
  vec_float4 r4start = spu_splats(SDM_LGF4_4_START);
323
  vec_float4 r5start = spu_splats(SDM_LGF4_5_START);
324
  vec_float4 r6start = spu_splats(SDM_LGF4_6_START);
325
  vec_float4 r7start = spu_splats(SDM_LGF4_7_START);
326
 
327
  /* X Offset for each partition */
328
  vec_float4 xoffseta = (vec_float4) {SDM_LGF4_0_OFF, SDM_LGF4_1_OFF, SDM_LGF4_2_OFF, SDM_LGF4_3_OFF};
329
  vec_float4 xoffsetb = (vec_float4) {SDM_LGF4_4_OFF, SDM_LGF4_5_OFF, SDM_LGF4_6_OFF, SDM_LGF4_7_OFF};
330
 
331
  /* Truncation Correction for each partition */
332
  vec_uint4 tcorra = (vec_uint4) {SDM_LGF4_0_TRUNC, SDM_LGF4_1_TRUNC, SDM_LGF4_2_TRUNC, SDM_LGF4_3_TRUNC};
333
  vec_uint4 tcorrb = (vec_uint4) {SDM_LGF4_4_TRUNC, SDM_LGF4_5_TRUNC, SDM_LGF4_6_TRUNC, SDM_LGF4_7_TRUNC};
334
 
335
  /* Is partition a Rational Approximation */
336
  vec_uint4 israta = (vec_uint4) {SDM_LGF4_0_RATIONAL, SDM_LGF4_1_RATIONAL, SDM_LGF4_2_RATIONAL, SDM_LGF4_3_RATIONAL};
337
  vec_uint4 isratb = (vec_uint4) {SDM_LGF4_4_RATIONAL, SDM_LGF4_5_RATIONAL, SDM_LGF4_6_RATIONAL, SDM_LGF4_7_RATIONAL};
338
 
339
  /* The polynomial coefficients for all partitions */
340
  vec_float4 c00a = (vec_float4) {SDM_LGF4_0_00, SDM_LGF4_1_00, SDM_LGF4_2_00, SDM_LGF4_3_00};
341
  vec_float4 c01a = (vec_float4) {SDM_LGF4_0_01, SDM_LGF4_1_01, SDM_LGF4_2_01, SDM_LGF4_3_01};
342
  vec_float4 c02a = (vec_float4) {SDM_LGF4_0_02, SDM_LGF4_1_02, SDM_LGF4_2_02, SDM_LGF4_3_02};
343
  vec_float4 c03a = (vec_float4) {SDM_LGF4_0_03, SDM_LGF4_1_03, SDM_LGF4_2_03, SDM_LGF4_3_03};
344
  vec_float4 c04a = (vec_float4) {SDM_LGF4_0_04, SDM_LGF4_1_04, SDM_LGF4_2_04, SDM_LGF4_3_04};
345
  vec_float4 c05a = (vec_float4) {SDM_LGF4_0_05, SDM_LGF4_1_05, SDM_LGF4_2_05, SDM_LGF4_3_05};
346
  vec_float4 c06a = (vec_float4) {SDM_LGF4_0_06, SDM_LGF4_1_06, SDM_LGF4_2_06, SDM_LGF4_3_06};
347
  vec_float4 c07a = (vec_float4) {SDM_LGF4_0_07, SDM_LGF4_1_07, SDM_LGF4_2_07, SDM_LGF4_3_07};
348
  vec_float4 c08a = (vec_float4) {SDM_LGF4_0_08, SDM_LGF4_1_08, SDM_LGF4_2_08, SDM_LGF4_3_08};
349
  vec_float4 c09a = (vec_float4) {SDM_LGF4_0_09, SDM_LGF4_1_09, SDM_LGF4_2_09, SDM_LGF4_3_09};
350
  vec_float4 c10a = (vec_float4) {SDM_LGF4_0_10, SDM_LGF4_1_10, SDM_LGF4_2_10, SDM_LGF4_3_10};
351
  vec_float4 c11a = (vec_float4) {SDM_LGF4_0_11, SDM_LGF4_1_11, SDM_LGF4_2_11, SDM_LGF4_3_11};
352
 
353
  vec_float4 c00b = (vec_float4) {SDM_LGF4_4_00, SDM_LGF4_5_00, SDM_LGF4_6_00, SDM_LGF4_7_00};
354
  vec_float4 c01b = (vec_float4) {SDM_LGF4_4_01, SDM_LGF4_5_01, SDM_LGF4_6_01, SDM_LGF4_7_01};
355
  vec_float4 c02b = (vec_float4) {SDM_LGF4_4_02, SDM_LGF4_5_02, SDM_LGF4_6_02, SDM_LGF4_7_02};
356
  vec_float4 c03b = (vec_float4) {SDM_LGF4_4_03, SDM_LGF4_5_03, SDM_LGF4_6_03, SDM_LGF4_7_03};
357
  vec_float4 c04b = (vec_float4) {SDM_LGF4_4_04, SDM_LGF4_5_04, SDM_LGF4_6_04, SDM_LGF4_7_04};
358
  vec_float4 c05b = (vec_float4) {SDM_LGF4_4_05, SDM_LGF4_5_05, SDM_LGF4_6_05, SDM_LGF4_7_05};
359
  vec_float4 c06b = (vec_float4) {SDM_LGF4_4_06, SDM_LGF4_5_06, SDM_LGF4_6_06, SDM_LGF4_7_06};
360
  vec_float4 c07b = (vec_float4) {SDM_LGF4_4_07, SDM_LGF4_5_07, SDM_LGF4_6_07, SDM_LGF4_7_07};
361
  vec_float4 c08b = (vec_float4) {SDM_LGF4_4_08, SDM_LGF4_5_08, SDM_LGF4_6_08, SDM_LGF4_7_08};
362
  vec_float4 c09b = (vec_float4) {SDM_LGF4_4_09, SDM_LGF4_5_09, SDM_LGF4_6_09, SDM_LGF4_7_09};
363
  vec_float4 c10b = (vec_float4) {SDM_LGF4_4_10, SDM_LGF4_5_10, SDM_LGF4_6_10, SDM_LGF4_7_10};
364
  vec_float4 c11b = (vec_float4) {SDM_LGF4_4_11, SDM_LGF4_5_11, SDM_LGF4_6_11, SDM_LGF4_7_11};
365
 
366
 
367
  vec_uchar16 shuffle0 = (vec_uchar16) spu_splats(0x00010203);
368
  vec_uchar16 shuffle1 = (vec_uchar16) spu_splats(0x04050607);
369
  vec_uchar16 shuffle2 = (vec_uchar16) spu_splats(0x08090A0B);
370
  vec_uchar16 shuffle3 = (vec_uchar16) spu_splats(0x0C0D0E0F);
371
  vec_uchar16 shuffle4 = (vec_uchar16) spu_splats(0x10111213);
372
  vec_uchar16 shuffle5 = (vec_uchar16) spu_splats(0x14151617);
373
  vec_uchar16 shuffle6 = (vec_uchar16) spu_splats(0x18191A1B);
374
  vec_uchar16 shuffle7 = (vec_uchar16) spu_splats(0x1C1D1E1F);
375
 
376
 
377
  /*
378
   * Determine the shuffle pattern based on which partition
379
   * each element of x is in.
380
   */
381
 
382
  vec_uchar16 gt_r1start = (vec_uchar16)spu_cmpgt(xabs, r1start);
383
  vec_uchar16 gt_r2start = (vec_uchar16)spu_cmpgt(xabs, r2start);
384
  vec_uchar16 gt_r3start = (vec_uchar16)spu_cmpgt(xabs, r3start);
385
  vec_uchar16 gt_r4start = (vec_uchar16)spu_cmpgt(xabs, r4start);
386
  vec_uchar16 gt_r5start = (vec_uchar16)spu_cmpgt(xabs, r5start);
387
  vec_uchar16 gt_r6start = (vec_uchar16)spu_cmpgt(xabs, r6start);
388
  vec_uchar16 gt_r7start = (vec_uchar16)spu_cmpgt(xabs, r7start);
389
 
390
  vec_uchar16 shufflepattern;
391
  shufflepattern = spu_sel(shuffle0, shuffle1, gt_r1start);
392
  shufflepattern = spu_sel(shufflepattern, shuffle2, gt_r2start);
393
  shufflepattern = spu_sel(shufflepattern, shuffle3, gt_r3start);
394
  shufflepattern = spu_sel(shufflepattern, shuffle4, gt_r4start);
395
  shufflepattern = spu_sel(shufflepattern, shuffle5, gt_r5start);
396
  shufflepattern = spu_sel(shufflepattern, shuffle6, gt_r6start);
397
  shufflepattern = spu_sel(shufflepattern, shuffle7, gt_r7start);
398
 
399
 
400
 
401
  /* Use the shuffle pattern to select the coefficients */
402
 
403
  vec_float4 coeff_00 = spu_shuffle(c00a, c00b, shufflepattern);
404
  vec_float4 coeff_01 = spu_shuffle(c01a, c01b, shufflepattern);
405
  vec_float4 coeff_02 = spu_shuffle(c02a, c02b, shufflepattern);
406
  vec_float4 coeff_03 = spu_shuffle(c03a, c03b, shufflepattern);
407
  vec_float4 coeff_04 = spu_shuffle(c04a, c04b, shufflepattern);
408
  vec_float4 coeff_06 = spu_shuffle(c06a, c06b, shufflepattern);
409
  vec_float4 coeff_07 = spu_shuffle(c07a, c07b, shufflepattern);
410
  vec_float4 coeff_05 = spu_shuffle(c05a, c05b, shufflepattern);
411
  vec_float4 coeff_08 = spu_shuffle(c08a, c08b, shufflepattern);
412
  vec_float4 coeff_09 = spu_shuffle(c09a, c09b, shufflepattern);
413
  vec_float4 coeff_10 = spu_shuffle(c10a, c10b, shufflepattern);
414
  vec_float4 coeff_11 = spu_shuffle(c11a, c11b, shufflepattern);
415
 
416
  vec_float4 xoffset     = spu_shuffle(xoffseta, xoffsetb, shufflepattern);
417
  vec_uint4  tcorrection = spu_shuffle(tcorra,   tcorrb,   shufflepattern);
418
  vec_uint4  isrational  = spu_shuffle(israta,   isratb,   shufflepattern);
419
 
420
  /*
421
   * We've completed the coeff. setup. Now we actually do the
422
   * approximation below.
423
   */
424
 
425
  /* Adjust x value here (for approximations about a point) */
426
  vec_float4 xappr = spu_sub(xabs, xoffset);
427
 
428
  /* If in Stirling partition, do some setup before the madds */
429
  xappr = spu_sel(xappr, inv_xsqu, (vector unsigned int)gt_r7start);
430
 
431
 
432
 
433
  /* Now we do the multiplies - either a big polynomial or
434
   * a rational approximation. Use Horner's method.
435
   */
436
  result = coeff_11;
437
  result = spu_madd(xappr, result, coeff_10);
438
  result = spu_madd(xappr, result, coeff_09);
439
  result = spu_madd(xappr, result, coeff_08);
440
  result = spu_madd(xappr, result, coeff_07);
441
  result = spu_madd(xappr, result, coeff_06);
442
 
443
  /* For rational approximations, we save numerator. */
444
  vec_float4 resultn = result;
445
 
446
  /* For rational appr,, reset result for calculation of denominator. */
447
  result = spu_sel(result, spu_splats(0.0f), isrational);
448
 
449
  result = spu_madd(xappr, result, coeff_05);
450
  result = spu_madd(xappr, result, coeff_04);
451
  result = spu_madd(xappr, result, coeff_03);
452
  result = spu_madd(xappr, result, coeff_02);
453
  result = spu_madd(xappr, result, coeff_01);
454
  result = spu_madd(xappr, result, coeff_00);
455
 
456
  /* Select either the polynomial or rational result */
457
  result = spu_sel(result, _divf4(resultn, result), isrational);
458
 
459
  /*
460
   * Now we have to do a bit of additional calculations for
461
   * partitions that weren't simple polynomial or rational
462
   * approximations.
463
   */
464
 
465
  /* Finish the Near 0 formula */
466
  result = spu_sel(spu_sub(result, ln_x), result, (vector unsigned int)gt_r1start);
467
 
468
  /* Finish Stirling's Approximation */
469
  vec_float4 resultstirling = spu_madd(spu_sub(xabs, spu_splats(0.5f)), ln_x, halflog2pi);
470
  resultstirling = spu_sub(resultstirling, xabs);
471
  resultstirling = spu_add(spu_mul(result,inv_x), resultstirling);
472
  result = spu_sel(result, resultstirling, (vector unsigned int)gt_r7start);
473
 
474
 
475
  /* Adjust due to systematic truncation */
476
  result = (vec_float4)spu_add((vec_uint4)result, tcorrection);
477
 
478
 
479
  /*
480
   * Approximation for Negative X
481
   *
482
   * Use reflection relation:
483
   *
484
   * gamma(x) * gamma(-x) = -pi/(x sin(pi x))
485
   *
486
   * lgamma(x) = log(pi/(-x sin(pi x))) - lgamma(-x)
487
   *
488
   */
489
  vec_float4 nresult = spu_mul(x, _sinf4(spu_mul(x, pi)));
490
  nresult = spu_andc(nresult, sign_maskf);
491
  nresult = spu_sub(logpi, spu_add(result, _logf4(nresult)));
492
  nresult = (vec_float4)spu_add((vec_uint4)nresult, spu_splats(1u));
493
 
494
  result = spu_sel(nresult, result, gt0);
495
 
496
 
497
  /*
498
   * Special Cases
499
   */
500
 
501
  /* x = non-positive integer, return infinity */
502
  vec_uint4 isnonposint = spu_andc(spu_cmpeq(x, xtrunc), gt0);
503
  result = spu_sel(result, inff, spu_or(isnonposint, spu_cmpgt(x, spu_splats(4.2e36f))));
504
  result = spu_sel(result, inff, spu_andc(spu_cmpeq(x, xtrunc), gt0));
505
 
506
  /* Zeros of function */
507
  result = spu_sel(result, zerof, ret_zero);
508
 
509
  /* x = +/- infinity or nan, return |x| */
510
  result   = spu_sel(result, xabs, isnaninf);
511
 
512
 
513
  return result;
514
}
515
 
516
#endif /* _LGAMMAF4_H_ */
517
#endif /* __SPU__ */

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.