OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [rtos/] [rtems/] [c/] [src/] [lib/] [libcpu/] [m68k/] [m68040/] [fpsp/] [setox.S] - Blame information for rev 173

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 30 unneback
//
2
//      $Id: setox.S,v 1.2 2001-09-27 12:01:22 chris Exp $
3
//
4
//      setox.sa 3.1 12/10/90
5
//
6
//      The entry point setox computes the exponential of a value.
7
//      setoxd does the same except the input value is a denormalized
8
//      number. setoxm1 computes exp(X)-1, and setoxm1d computes
9
//      exp(X)-1 for denormalized X.
10
//
11
//      INPUT
12
//      -----
13
//      Double-extended value in memory location pointed to by address
14
//      register a0.
15
//
16
//      OUTPUT
17
//      ------
18
//      exp(X) or exp(X)-1 returned in floating-point register fp0.
19
//
20
//      ACCURACY and MONOTONICITY
21
//      -------------------------
22
//      The returned result is within 0.85 ulps in 64 significant bit, i.e.
23
//      within 0.5001 ulp to 53 bits if the result is subsequently rounded
24
//      to double precision. The result is provably monotonic in double
25
//      precision.
26
//
27
//      SPEED
28
//      -----
29
//      Two timings are measured, both in the copy-back mode. The
30
//      first one is measured when the function is invoked the first time
31
//      (so the instructions and data are not in cache), and the
32
//      second one is measured when the function is reinvoked at the same
33
//      input argument.
34
//
35
//      The program setox takes approximately 210/190 cycles for input
36
//      argument X whose magnitude is less than 16380 log2, which
37
//      is the usual situation. For the less common arguments,
38
//      depending on their values, the program may run faster or slower --
39
//      but no worse than 10% slower even in the extreme cases.
40
//
41
//      The program setoxm1 takes approximately ???/??? cycles for input
42
//      argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
43
//      approximately ???/??? cycles. For the less common arguments,
44
//      depending on their values, the program may run faster or slower --
45
//      but no worse than 10% slower even in the extreme cases.
46
//
47
//      ALGORITHM and IMPLEMENTATION NOTES
48
//      ----------------------------------
49
//
50
//      setoxd
51
//      ------
52
//      Step 1. Set ans := 1.0
53
//
54
//      Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
55
//      Notes:  This will always generate one exception -- inexact.
56
//
57
//
58
//      setox
59
//      -----
60
//
61
//      Step 1. Filter out extreme cases of input argument.
62
//              1.1     If |X| >= 2^(-65), go to Step 1.3.
63
//              1.2     Go to Step 7.
64
//              1.3     If |X| < 16380 log(2), go to Step 2.
65
//              1.4     Go to Step 8.
66
//      Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
67
//               To avoid the use of floating-point comparisons, a
68
//               compact representation of |X| is used. This format is a
69
//               32-bit integer, the upper (more significant) 16 bits are
70
//               the sign and biased exponent field of |X|; the lower 16
71
//               bits are the 16 most significant fraction (including the
72
//               explicit bit) bits of |X|. Consequently, the comparisons
73
//               in Steps 1.1 and 1.3 can be performed by integer comparison.
74
//               Note also that the constant 16380 log(2) used in Step 1.3
75
//               is also in the compact form. Thus taking the branch
76
//               to Step 2 guarantees |X| < 16380 log(2). There is no harm
77
//               to have a small number of cases where |X| is less than,
78
//               but close to, 16380 log(2) and the branch to Step 9 is
79
//               taken.
80
//
81
//      Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
82
//              2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
83
//              2.2     N := round-to-nearest-integer( X * 64/log2 ).
84
//              2.3     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
85
//              2.4     Calculate       M = (N - J)/64; so N = 64M + J.
86
//              2.5     Calculate the address of the stored value of 2^(J/64).
87
//              2.6     Create the value Scale = 2^M.
88
//      Notes:  The calculation in 2.2 is really performed by
89
//
90
//                      Z := X * constant
91
//                      N := round-to-nearest-integer(Z)
92
//
93
//               where
94
//
95
//                      constant := single-precision( 64/log 2 ).
96
//
97
//               Using a single-precision constant avoids memory access.
98
//               Another effect of using a single-precision "constant" is
99
//               that the calculated value Z is
100
//
101
//                      Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
102
//
103
//               This error has to be considered later in Steps 3 and 4.
104
//
105
//      Step 3. Calculate X - N*log2/64.
106
//              3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
107
//              3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
108
//      Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
109
//               the value      -log2/64        to 88 bits of accuracy.
110
//               b) N*L1 is exact because N is no longer than 22 bits and
111
//               L1 is no longer than 24 bits.
112
//               c) The calculation X+N*L1 is also exact due to cancellation.
113
//               Thus, R is practically X+N(L1+L2) to full 64 bits.
114
//               d) It is important to estimate how large can |R| be after
115
//               Step 3.2.
116
//
117
//                      N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
118
//                      X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5
119
//                      X*64/log2 - N   =       f - eps*X 64/log2
120
//                      X - N*log2/64   =       f*log2/64 - eps*X
121
//
122
//
123
//               Now |X| <= 16446 log2, thus
124
//
125
//                      |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
126
//                                      <= 0.57 log2/64.
127
//               This bound will be used in Step 4.
128
//
129
//      Step 4. Approximate exp(R)-1 by a polynomial
130
//                      p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
131
//      Notes:  a) In order to reduce memory access, the coefficients are
132
//               made as "short" as possible: A1 (which is 1/2), A4 and A5
133
//               are single precision; A2 and A3 are double precision.
134
//               b) Even with the restrictions above,
135
//                      |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
136
//               Note that 0.0062 is slightly bigger than 0.57 log2/64.
137
//               c) To fully utilize the pipeline, p is separated into
138
//               two independent pieces of roughly equal complexities
139
//                      p = [ R + R*S*(A2 + S*A4) ]     +
140
//                              [ S*(A1 + S*(A3 + S*A5)) ]
141
//               where S = R*R.
142
//
143
//      Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
144
//                              ans := T + ( T*p + t)
145
//               where T and t are the stored values for 2^(J/64).
146
//      Notes:  2^(J/64) is stored as T and t where T+t approximates
147
//               2^(J/64) to roughly 85 bits; T is in extended precision
148
//               and t is in single precision. Note also that T is rounded
149
//               to 62 bits so that the last two bits of T are zero. The
150
//               reason for such a special form is that T-1, T-2, and T-8
151
//               will all be exact --- a property that will give much
152
//               more accurate computation of the function EXPM1.
153
//
154
//      Step 6. Reconstruction of exp(X)
155
//                      exp(X) = 2^M * 2^(J/64) * exp(R).
156
//              6.1     If AdjFlag = 0, go to 6.3
157
//              6.2     ans := ans * AdjScale
158
//              6.3     Restore the user FPCR
159
//              6.4     Return ans := ans * Scale. Exit.
160
//      Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
161
//               |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
162
//               neither overflow nor underflow. If AdjFlag = 1, that
163
//               means that
164
//                      X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
165
//               Hence, exp(X) may overflow or underflow or neither.
166
//               When that is the case, AdjScale = 2^(M1) where M1 is
167
//               approximately M. Thus 6.2 will never cause over/underflow.
168
//               Possible exception in 6.4 is overflow or underflow.
169
//               The inexact exception is not generated in 6.4. Although
170
//               one can argue that the inexact flag should always be
171
//               raised, to simulate that exception cost to much than the
172
//               flag is worth in practical uses.
173
//
174
//      Step 7. Return 1 + X.
175
//              7.1     ans := X
176
//              7.2     Restore user FPCR.
177
//              7.3     Return ans := 1 + ans. Exit
178
//      Notes:  For non-zero X, the inexact exception will always be
179
//               raised by 7.3. That is the only exception raised by 7.3.
180
//               Note also that we use the FMOVEM instruction to move X
181
//               in Step 7.1 to avoid unnecessary trapping. (Although
182
//               the FMOVEM may not seem relevant since X is normalized,
183
//               the precaution will be useful in the library version of
184
//               this code where the separate entry for denormalized inputs
185
//               will be done away with.)
186
//
187
//      Step 8. Handle exp(X) where |X| >= 16380log2.
188
//              8.1     If |X| > 16480 log2, go to Step 9.
189
//              (mimic 2.2 - 2.6)
190
//              8.2     N := round-to-integer( X * 64/log2 )
191
//              8.3     Calculate J = N mod 64, J = 0,1,...,63
192
//              8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
193
//              8.5     Calculate the address of the stored value 2^(J/64).
194
//              8.6     Create the values Scale = 2^M, AdjScale = 2^M1.
195
//              8.7     Go to Step 3.
196
//      Notes:  Refer to notes for 2.2 - 2.6.
197
//
198
//      Step 9. Handle exp(X), |X| > 16480 log2.
199
//              9.1     If X < 0, go to 9.3
200
//              9.2     ans := Huge, go to 9.4
201
//              9.3     ans := Tiny.
202
//              9.4     Restore user FPCR.
203
//              9.5     Return ans := ans * ans. Exit.
204
//      Notes:  Exp(X) will surely overflow or underflow, depending on
205
//               X's sign. "Huge" and "Tiny" are respectively large/tiny
206
//               extended-precision numbers whose square over/underflow
207
//               with an inexact result. Thus, 9.5 always raises the
208
//               inexact together with either overflow or underflow.
209
//
210
//
211
//      setoxm1d
212
//      --------
213
//
214
//      Step 1. Set ans := 0
215
//
216
//      Step 2. Return  ans := X + ans. Exit.
217
//      Notes:  This will return X with the appropriate rounding
218
//               precision prescribed by the user FPCR.
219
//
220
//      setoxm1
221
//      -------
222
//
223
//      Step 1. Check |X|
224
//              1.1     If |X| >= 1/4, go to Step 1.3.
225
//              1.2     Go to Step 7.
226
//              1.3     If |X| < 70 log(2), go to Step 2.
227
//              1.4     Go to Step 10.
228
//      Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
229
//               However, it is conceivable |X| can be small very often
230
//               because EXPM1 is intended to evaluate exp(X)-1 accurately
231
//               when |X| is small. For further details on the comparisons,
232
//               see the notes on Step 1 of setox.
233
//
234
//      Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
235
//              2.1     N := round-to-nearest-integer( X * 64/log2 ).
236
//              2.2     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
237
//              2.3     Calculate       M = (N - J)/64; so N = 64M + J.
238
//              2.4     Calculate the address of the stored value of 2^(J/64).
239
//              2.5     Create the values Sc = 2^M and OnebySc := -2^(-M).
240
//      Notes:  See the notes on Step 2 of setox.
241
//
242
//      Step 3. Calculate X - N*log2/64.
243
//              3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
244
//              3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
245
//      Notes:  Applying the analysis of Step 3 of setox in this case
246
//               shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
247
//               this case).
248
//
249
//      Step 4. Approximate exp(R)-1 by a polynomial
250
//                      p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
251
//      Notes:  a) In order to reduce memory access, the coefficients are
252
//               made as "short" as possible: A1 (which is 1/2), A5 and A6
253
//               are single precision; A2, A3 and A4 are double precision.
254
//               b) Even with the restriction above,
255
//                      |p - (exp(R)-1)| <      |R| * 2^(-72.7)
256
//               for all |R| <= 0.0055.
257
//               c) To fully utilize the pipeline, p is separated into
258
//               two independent pieces of roughly equal complexity
259
//                      p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +
260
//                              [ R + S*(A1 + S*(A3 + S*A5)) ]
261
//               where S = R*R.
262
//
263
//      Step 5. Compute 2^(J/64)*p by
264
//                              p := T*p
265
//               where T and t are the stored values for 2^(J/64).
266
//      Notes:  2^(J/64) is stored as T and t where T+t approximates
267
//               2^(J/64) to roughly 85 bits; T is in extended precision
268
//               and t is in single precision. Note also that T is rounded
269
//               to 62 bits so that the last two bits of T are zero. The
270
//               reason for such a special form is that T-1, T-2, and T-8
271
//               will all be exact --- a property that will be exploited
272
//               in Step 6 below. The total relative error in p is no
273
//               bigger than 2^(-67.7) compared to the final result.
274
//
275
//      Step 6. Reconstruction of exp(X)-1
276
//                      exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
277
//              6.1     If M <= 63, go to Step 6.3.
278
//              6.2     ans := T + (p + (t + OnebySc)). Go to 6.6
279
//              6.3     If M >= -3, go to 6.5.
280
//              6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6
281
//              6.5     ans := (T + OnebySc) + (p + t).
282
//              6.6     Restore user FPCR.
283
//              6.7     Return ans := Sc * ans. Exit.
284
//      Notes:  The various arrangements of the expressions give accurate
285
//               evaluations.
286
//
287
//      Step 7. exp(X)-1 for |X| < 1/4.
288
//              7.1     If |X| >= 2^(-65), go to Step 9.
289
//              7.2     Go to Step 8.
290
//
291
//      Step 8. Calculate exp(X)-1, |X| < 2^(-65).
292
//              8.1     If |X| < 2^(-16312), goto 8.3
293
//              8.2     Restore FPCR; return ans := X - 2^(-16382). Exit.
294
//              8.3     X := X * 2^(140).
295
//              8.4     Restore FPCR; ans := ans - 2^(-16382).
296
//               Return ans := ans*2^(140). Exit
297
//      Notes:  The idea is to return "X - tiny" under the user
298
//               precision and rounding modes. To avoid unnecessary
299
//               inefficiency, we stay away from denormalized numbers the
300
//               best we can. For |X| >= 2^(-16312), the straightforward
301
//               8.2 generates the inexact exception as the case warrants.
302
//
303
//      Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
304
//                      p = X + X*X*(B1 + X*(B2 + ... + X*B12))
305
//      Notes:  a) In order to reduce memory access, the coefficients are
306
//               made as "short" as possible: B1 (which is 1/2), B9 to B12
307
//               are single precision; B3 to B8 are double precision; and
308
//               B2 is double extended.
309
//               b) Even with the restriction above,
310
//                      |p - (exp(X)-1)| < |X| 2^(-70.6)
311
//               for all |X| <= 0.251.
312
//               Note that 0.251 is slightly bigger than 1/4.
313
//               c) To fully preserve accuracy, the polynomial is computed
314
//               as     X + ( S*B1 +    Q ) where S = X*X and
315
//                      Q       =       X*S*(B2 + X*(B3 + ... + X*B12))
316
//               d) To fully utilize the pipeline, Q is separated into
317
//               two independent pieces of roughly equal complexity
318
//                      Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
319
//                              [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
320
//
321
//      Step 10.        Calculate exp(X)-1 for |X| >= 70 log 2.
322
//              10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
323
//               purposes. Therefore, go to Step 1 of setox.
324
//              10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
325
//               ans := -1
326
//               Restore user FPCR
327
//               Return ans := ans + 2^(-126). Exit.
328
//      Notes:  10.2 will always create an inexact and return -1 + tiny
329
//               in the user rounding precision and mode.
330
//
331
//
332
 
333
//              Copyright (C) Motorola, Inc. 1990
334
//                      All Rights Reserved
335
//
336
//      THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
337
//      The copyright notice above does not evidence any
338
//      actual or intended publication of such source code.
339
 
340
//setox idnt    2,1 | Motorola 040 Floating Point Software Package
341
 
342
        |section        8
343
 
344
#include "fpsp.defs"
345
 
346
L2:     .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
347
 
348
EXPA3:  .long   0x3FA55555,0x55554431
349
EXPA2:  .long   0x3FC55555,0x55554018
350
 
351
HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
352
TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
353
 
354
EM1A4:  .long   0x3F811111,0x11174385
355
EM1A3:  .long   0x3FA55555,0x55554F5A
356
 
357
EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
358
 
359
EM1B8:  .long   0x3EC71DE3,0xA5774682
360
EM1B7:  .long   0x3EFA01A0,0x19D7CB68
361
 
362
EM1B6:  .long   0x3F2A01A0,0x1A019DF3
363
EM1B5:  .long   0x3F56C16C,0x16C170E2
364
 
365
EM1B4:  .long   0x3F811111,0x11111111
366
EM1B3:  .long   0x3FA55555,0x55555555
367
 
368
EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
369
        .long   0x00000000
370
 
371
TWO140: .long   0x48B00000,0x00000000
372
TWON140:        .long   0x37300000,0x00000000
373
 
374
EXPTBL:
375
        .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
376
        .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
377
        .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
378
        .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
379
        .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
380
        .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
381
        .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
382
        .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
383
        .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
384
        .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
385
        .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
386
        .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
387
        .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
388
        .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
389
        .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
390
        .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
391
        .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
392
        .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
393
        .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
394
        .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
395
        .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
396
        .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
397
        .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
398
        .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
399
        .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
400
        .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
401
        .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
402
        .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
403
        .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
404
        .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
405
        .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
406
        .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
407
        .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
408
        .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
409
        .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
410
        .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
411
        .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
412
        .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
413
        .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
414
        .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
415
        .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
416
        .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
417
        .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
418
        .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
419
        .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
420
        .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
421
        .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
422
        .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
423
        .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
424
        .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
425
        .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
426
        .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
427
        .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
428
        .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
429
        .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
430
        .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
431
        .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
432
        .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
433
        .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
434
        .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
435
        .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
436
        .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
437
        .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
438
        .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
439
 
440
        .set    ADJFLAG,L_SCR2
441
        .set    SCALE,FP_SCR1
442
        .set    ADJSCALE,FP_SCR2
443
        .set    SC,FP_SCR3
444
        .set    ONEBYSC,FP_SCR4
445
 
446
        | xref  t_frcinx
447
        |xref   t_extdnrm
448
        |xref   t_unfl
449
        |xref   t_ovfl
450
 
451
        .global setoxd
452
setoxd:
453
//--entry point for EXP(X), X is denormalized
454
        movel           (%a0),%d0
455
        andil           #0x80000000,%d0
456
        oril            #0x00800000,%d0         // ...sign(X)*2^(-126)
457
        movel           %d0,-(%sp)
458
        fmoves          #0x3F800000,%fp0
459
        fmovel          %d1,%fpcr
460
        fadds           (%sp)+,%fp0
461
        bra             t_frcinx
462
 
463
        .global setox
464
setox:
465
//--entry point for EXP(X), here X is finite, non-zero, and not NaN's
466
 
467
//--Step 1.
468
        movel           (%a0),%d0        // ...load part of input X
469
        andil           #0x7FFF0000,%d0 // ...biased expo. of X
470
        cmpil           #0x3FBE0000,%d0 // ...2^(-65)
471
        bges            EXPC1           // ...normal case
472
        bra             EXPSM
473
 
474
EXPC1:
475
//--The case |X| >= 2^(-65)
476
        movew           4(%a0),%d0      // ...expo. and partial sig. of |X|
477
        cmpil           #0x400CB167,%d0 // ...16380 log2 trunc. 16 bits
478
        blts            EXPMAIN  // ...normal case
479
        bra             EXPBIG
480
 
481
EXPMAIN:
482
//--Step 2.
483
//--This is the normal branch:  2^(-65) <= |X| < 16380 log2.
484
        fmovex          (%a0),%fp0      // ...load input from (a0)
485
 
486
        fmovex          %fp0,%fp1
487
        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
488
        fmovemx %fp2-%fp2/%fp3,-(%a7)           // ...save fp2
489
        movel           #0,ADJFLAG(%a6)
490
        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
491
        lea             EXPTBL,%a1
492
        fmovel          %d0,%fp0                // ...convert to floating-format
493
 
494
        movel           %d0,L_SCR1(%a6) // ...save N temporarily
495
        andil           #0x3F,%d0               // ...D0 is J = N mod 64
496
        lsll            #4,%d0
497
        addal           %d0,%a1         // ...address of 2^(J/64)
498
        movel           L_SCR1(%a6),%d0
499
        asrl            #6,%d0          // ...D0 is M
500
        addiw           #0x3FFF,%d0     // ...biased expo. of 2^(M)
501
        movew           L2,L_SCR1(%a6)  // ...prefetch L2, no need in CB
502
 
503
EXPCONT1:
504
//--Step 3.
505
//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
506
//--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
507
        fmovex          %fp0,%fp2
508
        fmuls           #0xBC317218,%fp0        // ...N * L1, L1 = lead(-log2/64)
509
        fmulx           L2,%fp2         // ...N * L2, L1+L2 = -log2/64
510
        faddx           %fp1,%fp0               // ...X + N*L1
511
        faddx           %fp2,%fp0               // ...fp0 is R, reduced arg.
512
//      MOVE.W          #$3FA5,EXPA3    ...load EXPA3 in cache
513
 
514
//--Step 4.
515
//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
516
//-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
517
//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
518
//--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
519
 
520
        fmovex          %fp0,%fp1
521
        fmulx           %fp1,%fp1               // ...fp1 IS S = R*R
522
 
523
        fmoves          #0x3AB60B70,%fp2        // ...fp2 IS A5
524
//      MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
525
 
526
        fmulx           %fp1,%fp2               // ...fp2 IS S*A5
527
        fmovex          %fp1,%fp3
528
        fmuls           #0x3C088895,%fp3        // ...fp3 IS S*A4
529
 
530
        faddd           EXPA3,%fp2      // ...fp2 IS A3+S*A5
531
        faddd           EXPA2,%fp3      // ...fp3 IS A2+S*A4
532
 
533
        fmulx           %fp1,%fp2               // ...fp2 IS S*(A3+S*A5)
534
        movew           %d0,SCALE(%a6)  // ...SCALE is 2^(M) in extended
535
        clrw            SCALE+2(%a6)
536
        movel           #0x80000000,SCALE+4(%a6)
537
        clrl            SCALE+8(%a6)
538
 
539
        fmulx           %fp1,%fp3               // ...fp3 IS S*(A2+S*A4)
540
 
541
        fadds           #0x3F000000,%fp2        // ...fp2 IS A1+S*(A3+S*A5)
542
        fmulx           %fp0,%fp3               // ...fp3 IS R*S*(A2+S*A4)
543
 
544
        fmulx           %fp1,%fp2               // ...fp2 IS S*(A1+S*(A3+S*A5))
545
        faddx           %fp3,%fp0               // ...fp0 IS R+R*S*(A2+S*A4),
546
//                                      ...fp3 released
547
 
548
        fmovex          (%a1)+,%fp1     // ...fp1 is lead. pt. of 2^(J/64)
549
        faddx           %fp2,%fp0               // ...fp0 is EXP(R) - 1
550
//                                      ...fp2 released
551
 
552
//--Step 5
553
//--final reconstruction process
554
//--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
555
 
556
        fmulx           %fp1,%fp0               // ...2^(J/64)*(Exp(R)-1)
557
        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
558
        fadds           (%a1),%fp0      // ...accurate 2^(J/64)
559
 
560
        faddx           %fp1,%fp0               // ...2^(J/64) + 2^(J/64)*...
561
        movel           ADJFLAG(%a6),%d0
562
 
563
//--Step 6
564
        tstl            %d0
565
        beqs            NORMAL
566
ADJUST:
567
        fmulx           ADJSCALE(%a6),%fp0
568
NORMAL:
569
        fmovel          %d1,%FPCR               // ...restore user FPCR
570
        fmulx           SCALE(%a6),%fp0 // ...multiply 2^(M)
571
        bra             t_frcinx
572
 
573
EXPSM:
574
//--Step 7
575
        fmovemx (%a0),%fp0-%fp0 // ...in case X is denormalized
576
        fmovel          %d1,%FPCR
577
        fadds           #0x3F800000,%fp0        // ...1+X in user mode
578
        bra             t_frcinx
579
 
580
EXPBIG:
581
//--Step 8
582
        cmpil           #0x400CB27C,%d0 // ...16480 log2
583
        bgts            EXP2BIG
584
//--Steps 8.2 -- 8.6
585
        fmovex          (%a0),%fp0      // ...load input from (a0)
586
 
587
        fmovex          %fp0,%fp1
588
        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
589
        fmovemx  %fp2-%fp2/%fp3,-(%a7)          // ...save fp2
590
        movel           #1,ADJFLAG(%a6)
591
        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
592
        lea             EXPTBL,%a1
593
        fmovel          %d0,%fp0                // ...convert to floating-format
594
        movel           %d0,L_SCR1(%a6)                 // ...save N temporarily
595
        andil           #0x3F,%d0                // ...D0 is J = N mod 64
596
        lsll            #4,%d0
597
        addal           %d0,%a1                 // ...address of 2^(J/64)
598
        movel           L_SCR1(%a6),%d0
599
        asrl            #6,%d0                  // ...D0 is K
600
        movel           %d0,L_SCR1(%a6)                 // ...save K temporarily
601
        asrl            #1,%d0                  // ...D0 is M1
602
        subl            %d0,L_SCR1(%a6)                 // ...a1 is M
603
        addiw           #0x3FFF,%d0             // ...biased expo. of 2^(M1)
604
        movew           %d0,ADJSCALE(%a6)               // ...ADJSCALE := 2^(M1)
605
        clrw            ADJSCALE+2(%a6)
606
        movel           #0x80000000,ADJSCALE+4(%a6)
607
        clrl            ADJSCALE+8(%a6)
608
        movel           L_SCR1(%a6),%d0                 // ...D0 is M
609
        addiw           #0x3FFF,%d0             // ...biased expo. of 2^(M)
610
        bra             EXPCONT1                // ...go back to Step 3
611
 
612
EXP2BIG:
613
//--Step 9
614
        fmovel          %d1,%FPCR
615
        movel           (%a0),%d0
616
        bclrb           #sign_bit,(%a0)         // ...setox always returns positive
617
        cmpil           #0,%d0
618
        blt             t_unfl
619
        bra             t_ovfl
620
 
621
        .global setoxm1d
622
setoxm1d:
623
//--entry point for EXPM1(X), here X is denormalized
624
//--Step 0.
625
        bra             t_extdnrm
626
 
627
 
628
        .global setoxm1
629
setoxm1:
630
//--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
631
 
632
//--Step 1.
633
//--Step 1.1
634
        movel           (%a0),%d0        // ...load part of input X
635
        andil           #0x7FFF0000,%d0 // ...biased expo. of X
636
        cmpil           #0x3FFD0000,%d0 // ...1/4
637
        bges            EM1CON1  // ...|X| >= 1/4
638
        bra             EM1SM
639
 
640
EM1CON1:
641
//--Step 1.3
642
//--The case |X| >= 1/4
643
        movew           4(%a0),%d0      // ...expo. and partial sig. of |X|
644
        cmpil           #0x4004C215,%d0 // ...70log2 rounded up to 16 bits
645
        bles            EM1MAIN  // ...1/4 <= |X| <= 70log2
646
        bra             EM1BIG
647
 
648
EM1MAIN:
649
//--Step 2.
650
//--This is the case:   1/4 <= |X| <= 70 log2.
651
        fmovex          (%a0),%fp0      // ...load input from (a0)
652
 
653
        fmovex          %fp0,%fp1
654
        fmuls           #0x42B8AA3B,%fp0        // ...64/log2 * X
655
        fmovemx %fp2-%fp2/%fp3,-(%a7)           // ...save fp2
656
//      MOVE.W          #$3F81,EM1A4            ...prefetch in CB mode
657
        fmovel          %fp0,%d0                // ...N = int( X * 64/log2 )
658
        lea             EXPTBL,%a1
659
        fmovel          %d0,%fp0                // ...convert to floating-format
660
 
661
        movel           %d0,L_SCR1(%a6)                 // ...save N temporarily
662
        andil           #0x3F,%d0                // ...D0 is J = N mod 64
663
        lsll            #4,%d0
664
        addal           %d0,%a1                 // ...address of 2^(J/64)
665
        movel           L_SCR1(%a6),%d0
666
        asrl            #6,%d0                  // ...D0 is M
667
        movel           %d0,L_SCR1(%a6)                 // ...save a copy of M
668
//      MOVE.W          #$3FDC,L2               ...prefetch L2 in CB mode
669
 
670
//--Step 3.
671
//--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
672
//--a0 points to 2^(J/64), D0 and a1 both contain M
673
        fmovex          %fp0,%fp2
674
        fmuls           #0xBC317218,%fp0        // ...N * L1, L1 = lead(-log2/64)
675
        fmulx           L2,%fp2         // ...N * L2, L1+L2 = -log2/64
676
        faddx           %fp1,%fp0        // ...X + N*L1
677
        faddx           %fp2,%fp0        // ...fp0 is R, reduced arg.
678
//      MOVE.W          #$3FC5,EM1A2            ...load EM1A2 in cache
679
        addiw           #0x3FFF,%d0             // ...D0 is biased expo. of 2^M
680
 
681
//--Step 4.
682
//--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
683
//-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
684
//--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
685
//--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
686
 
687
        fmovex          %fp0,%fp1
688
        fmulx           %fp1,%fp1               // ...fp1 IS S = R*R
689
 
690
        fmoves          #0x3950097B,%fp2        // ...fp2 IS a6
691
//      MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
692
 
693
        fmulx           %fp1,%fp2               // ...fp2 IS S*A6
694
        fmovex          %fp1,%fp3
695
        fmuls           #0x3AB60B6A,%fp3        // ...fp3 IS S*A5
696
 
697
        faddd           EM1A4,%fp2      // ...fp2 IS A4+S*A6
698
        faddd           EM1A3,%fp3      // ...fp3 IS A3+S*A5
699
        movew           %d0,SC(%a6)             // ...SC is 2^(M) in extended
700
        clrw            SC+2(%a6)
701
        movel           #0x80000000,SC+4(%a6)
702
        clrl            SC+8(%a6)
703
 
704
        fmulx           %fp1,%fp2               // ...fp2 IS S*(A4+S*A6)
705
        movel           L_SCR1(%a6),%d0         // ...D0 is     M
706
        negw            %d0             // ...D0 is -M
707
        fmulx           %fp1,%fp3               // ...fp3 IS S*(A3+S*A5)
708
        addiw           #0x3FFF,%d0     // ...biased expo. of 2^(-M)
709
        faddd           EM1A2,%fp2      // ...fp2 IS A2+S*(A4+S*A6)
710
        fadds           #0x3F000000,%fp3        // ...fp3 IS A1+S*(A3+S*A5)
711
 
712
        fmulx           %fp1,%fp2               // ...fp2 IS S*(A2+S*(A4+S*A6))
713
        oriw            #0x8000,%d0     // ...signed/expo. of -2^(-M)
714
        movew           %d0,ONEBYSC(%a6)        // ...OnebySc is -2^(-M)
715
        clrw            ONEBYSC+2(%a6)
716
        movel           #0x80000000,ONEBYSC+4(%a6)
717
        clrl            ONEBYSC+8(%a6)
718
        fmulx           %fp3,%fp1               // ...fp1 IS S*(A1+S*(A3+S*A5))
719
//                                      ...fp3 released
720
 
721
        fmulx           %fp0,%fp2               // ...fp2 IS R*S*(A2+S*(A4+S*A6))
722
        faddx           %fp1,%fp0               // ...fp0 IS R+S*(A1+S*(A3+S*A5))
723
//                                      ...fp1 released
724
 
725
        faddx           %fp2,%fp0               // ...fp0 IS EXP(R)-1
726
//                                      ...fp2 released
727
        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
728
 
729
//--Step 5
730
//--Compute 2^(J/64)*p
731
 
732
        fmulx           (%a1),%fp0      // ...2^(J/64)*(Exp(R)-1)
733
 
734
//--Step 6
735
//--Step 6.1
736
        movel           L_SCR1(%a6),%d0         // ...retrieve M
737
        cmpil           #63,%d0
738
        bles            MLE63
739
//--Step 6.2    M >= 64
740
        fmoves          12(%a1),%fp1    // ...fp1 is t
741
        faddx           ONEBYSC(%a6),%fp1       // ...fp1 is t+OnebySc
742
        faddx           %fp1,%fp0               // ...p+(t+OnebySc), fp1 released
743
        faddx           (%a1),%fp0      // ...T+(p+(t+OnebySc))
744
        bras            EM1SCALE
745
MLE63:
746
//--Step 6.3    M <= 63
747
        cmpil           #-3,%d0
748
        bges            MGEN3
749
MLTN3:
750
//--Step 6.4    M <= -4
751
        fadds           12(%a1),%fp0    // ...p+t
752
        faddx           (%a1),%fp0      // ...T+(p+t)
753
        faddx           ONEBYSC(%a6),%fp0       // ...OnebySc + (T+(p+t))
754
        bras            EM1SCALE
755
MGEN3:
756
//--Step 6.5    -3 <= M <= 63
757
        fmovex          (%a1)+,%fp1     // ...fp1 is T
758
        fadds           (%a1),%fp0      // ...fp0 is p+t
759
        faddx           ONEBYSC(%a6),%fp1       // ...fp1 is T+OnebySc
760
        faddx           %fp1,%fp0               // ...(T+OnebySc)+(p+t)
761
 
762
EM1SCALE:
763
//--Step 6.6
764
        fmovel          %d1,%FPCR
765
        fmulx           SC(%a6),%fp0
766
 
767
        bra             t_frcinx
768
 
769
EM1SM:
770
//--Step 7      |X| < 1/4.
771
        cmpil           #0x3FBE0000,%d0 // ...2^(-65)
772
        bges            EM1POLY
773
 
774
EM1TINY:
775
//--Step 8      |X| < 2^(-65)
776
        cmpil           #0x00330000,%d0 // ...2^(-16312)
777
        blts            EM12TINY
778
//--Step 8.2
779
        movel           #0x80010000,SC(%a6)     // ...SC is -2^(-16382)
780
        movel           #0x80000000,SC+4(%a6)
781
        clrl            SC+8(%a6)
782
        fmovex          (%a0),%fp0
783
        fmovel          %d1,%FPCR
784
        faddx           SC(%a6),%fp0
785
 
786
        bra             t_frcinx
787
 
788
EM12TINY:
789
//--Step 8.3
790
        fmovex          (%a0),%fp0
791
        fmuld           TWO140,%fp0
792
        movel           #0x80010000,SC(%a6)
793
        movel           #0x80000000,SC+4(%a6)
794
        clrl            SC+8(%a6)
795
        faddx           SC(%a6),%fp0
796
        fmovel          %d1,%FPCR
797
        fmuld           TWON140,%fp0
798
 
799
        bra             t_frcinx
800
 
801
EM1POLY:
802
//--Step 9      exp(X)-1 by a simple polynomial
803
        fmovex          (%a0),%fp0      // ...fp0 is X
804
        fmulx           %fp0,%fp0               // ...fp0 is S := X*X
805
        fmovemx %fp2-%fp2/%fp3,-(%a7)   // ...save fp2
806
        fmoves          #0x2F30CAA8,%fp1        // ...fp1 is B12
807
        fmulx           %fp0,%fp1               // ...fp1 is S*B12
808
        fmoves          #0x310F8290,%fp2        // ...fp2 is B11
809
        fadds           #0x32D73220,%fp1        // ...fp1 is B10+S*B12
810
 
811
        fmulx           %fp0,%fp2               // ...fp2 is S*B11
812
        fmulx           %fp0,%fp1               // ...fp1 is S*(B10 + ...
813
 
814
        fadds           #0x3493F281,%fp2        // ...fp2 is B9+S*...
815
        faddd           EM1B8,%fp1      // ...fp1 is B8+S*...
816
 
817
        fmulx           %fp0,%fp2               // ...fp2 is S*(B9+...
818
        fmulx           %fp0,%fp1               // ...fp1 is S*(B8+...
819
 
820
        faddd           EM1B7,%fp2      // ...fp2 is B7+S*...
821
        faddd           EM1B6,%fp1      // ...fp1 is B6+S*...
822
 
823
        fmulx           %fp0,%fp2               // ...fp2 is S*(B7+...
824
        fmulx           %fp0,%fp1               // ...fp1 is S*(B6+...
825
 
826
        faddd           EM1B5,%fp2      // ...fp2 is B5+S*...
827
        faddd           EM1B4,%fp1      // ...fp1 is B4+S*...
828
 
829
        fmulx           %fp0,%fp2               // ...fp2 is S*(B5+...
830
        fmulx           %fp0,%fp1               // ...fp1 is S*(B4+...
831
 
832
        faddd           EM1B3,%fp2      // ...fp2 is B3+S*...
833
        faddx           EM1B2,%fp1      // ...fp1 is B2+S*...
834
 
835
        fmulx           %fp0,%fp2               // ...fp2 is S*(B3+...
836
        fmulx           %fp0,%fp1               // ...fp1 is S*(B2+...
837
 
838
        fmulx           %fp0,%fp2               // ...fp2 is S*S*(B3+...)
839
        fmulx           (%a0),%fp1      // ...fp1 is X*S*(B2...
840
 
841
        fmuls           #0x3F000000,%fp0        // ...fp0 is S*B1
842
        faddx           %fp2,%fp1               // ...fp1 is Q
843
//                                      ...fp2 released
844
 
845
        fmovemx (%a7)+,%fp2-%fp2/%fp3   // ...fp2 restored
846
 
847
        faddx           %fp1,%fp0               // ...fp0 is S*B1+Q
848
//                                      ...fp1 released
849
 
850
        fmovel          %d1,%FPCR
851
        faddx           (%a0),%fp0
852
 
853
        bra             t_frcinx
854
 
855
EM1BIG:
856
//--Step 10     |X| > 70 log2
857
        movel           (%a0),%d0
858
        cmpil           #0,%d0
859
        bgt             EXPC1
860
//--Step 10.2
861
        fmoves          #0xBF800000,%fp0        // ...fp0 is -1
862
        fmovel          %d1,%FPCR
863
        fadds           #0x00800000,%fp0        // ...-1 + 2^(-126)
864
 
865
        bra             t_frcinx
866
 
867
        |end

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.