OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [m68k/] [fpsp040/] [setox.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
|
2
|       setox.sa 3.1 12/10/90
3
|
4
|       The entry point setox computes the exponential of a value.
5
|       setoxd does the same except the input value is a denormalized
6
|       number. setoxm1 computes exp(X)-1, and setoxm1d computes
7
|       exp(X)-1 for denormalized X.
8
|
9
|       INPUT
10
|       -----
11
|       Double-extended value in memory location pointed to by address
12
|       register a0.
13
|
14
|       OUTPUT
15
|       ------
16
|       exp(X) or exp(X)-1 returned in floating-point register fp0.
17
|
18
|       ACCURACY and MONOTONICITY
19
|       -------------------------
20
|       The returned result is within 0.85 ulps in 64 significant bit, i.e.
21
|       within 0.5001 ulp to 53 bits if the result is subsequently rounded
22
|       to double precision. The result is provably monotonic in double
23
|       precision.
24
|
25
|       SPEED
26
|       -----
27
|       Two timings are measured, both in the copy-back mode. The
28
|       first one is measured when the function is invoked the first time
29
|       (so the instructions and data are not in cache), and the
30
|       second one is measured when the function is reinvoked at the same
31
|       input argument.
32
|
33
|       The program setox takes approximately 210/190 cycles for input
34
|       argument X whose magnitude is less than 16380 log2, which
35
|       is the usual situation. For the less common arguments,
36
|       depending on their values, the program may run faster or slower --
37
|       but no worse than 10% slower even in the extreme cases.
38
|
39
|       The program setoxm1 takes approximately ???/??? cycles for input
40
|       argument X, 0.25 <= |X| < 70log2. For |X| < 0.25, it takes
41
|       approximately ???/??? cycles. For the less common arguments,
42
|       depending on their values, the program may run faster or slower --
43
|       but no worse than 10% slower even in the extreme cases.
44
|
45
|       ALGORITHM and IMPLEMENTATION NOTES
46
|       ----------------------------------
47
|
48
|       setoxd
49
|       ------
50
|       Step 1. Set ans := 1.0
51
|
52
|       Step 2. Return  ans := ans + sign(X)*2^(-126). Exit.
53
|       Notes:  This will always generate one exception -- inexact.
54
|
55
|
56
|       setox
57
|       -----
58
|
59
|       Step 1. Filter out extreme cases of input argument.
60
|               1.1     If |X| >= 2^(-65), go to Step 1.3.
61
|               1.2     Go to Step 7.
62
|               1.3     If |X| < 16380 log(2), go to Step 2.
63
|               1.4     Go to Step 8.
64
|       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
65
|                To avoid the use of floating-point comparisons, a
66
|                compact representation of |X| is used. This format is a
67
|                32-bit integer, the upper (more significant) 16 bits are
68
|                the sign and biased exponent field of |X|; the lower 16
69
|                bits are the 16 most significant fraction (including the
70
|                explicit bit) bits of |X|. Consequently, the comparisons
71
|                in Steps 1.1 and 1.3 can be performed by integer comparison.
72
|                Note also that the constant 16380 log(2) used in Step 1.3
73
|                is also in the compact form. Thus taking the branch
74
|                to Step 2 guarantees |X| < 16380 log(2). There is no harm
75
|                to have a small number of cases where |X| is less than,
76
|                but close to, 16380 log(2) and the branch to Step 9 is
77
|                taken.
78
|
79
|       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
80
|               2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
81
|               2.2     N := round-to-nearest-integer( X * 64/log2 ).
82
|               2.3     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
83
|               2.4     Calculate       M = (N - J)/64; so N = 64M + J.
84
|               2.5     Calculate the address of the stored value of 2^(J/64).
85
|               2.6     Create the value Scale = 2^M.
86
|       Notes:  The calculation in 2.2 is really performed by
87
|
88
|                       Z := X * constant
89
|                       N := round-to-nearest-integer(Z)
90
|
91
|                where
92
|
93
|                       constant := single-precision( 64/log 2 ).
94
|
95
|                Using a single-precision constant avoids memory access.
96
|                Another effect of using a single-precision "constant" is
97
|                that the calculated value Z is
98
|
99
|                       Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).
100
|
101
|                This error has to be considered later in Steps 3 and 4.
102
|
103
|       Step 3. Calculate X - N*log2/64.
104
|               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
105
|               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
106
|       Notes:  a) The way L1 and L2 are chosen ensures L1+L2 approximate
107
|                the value      -log2/64        to 88 bits of accuracy.
108
|                b) N*L1 is exact because N is no longer than 22 bits and
109
|                L1 is no longer than 24 bits.
110
|                c) The calculation X+N*L1 is also exact due to cancellation.
111
|                Thus, R is practically X+N(L1+L2) to full 64 bits.
112
|                d) It is important to estimate how large can |R| be after
113
|                Step 3.2.
114
|
115
|                       N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)
116
|                       X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5
117
|                       X*64/log2 - N   =       f - eps*X 64/log2
118
|                       X - N*log2/64   =       f*log2/64 - eps*X
119
|
120
|
121
|                Now |X| <= 16446 log2, thus
122
|
123
|                       |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64
124
|                                       <= 0.57 log2/64.
125
|                This bound will be used in Step 4.
126
|
127
|       Step 4. Approximate exp(R)-1 by a polynomial
128
|                       p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
129
|       Notes:  a) In order to reduce memory access, the coefficients are
130
|                made as "short" as possible: A1 (which is 1/2), A4 and A5
131
|                are single precision; A2 and A3 are double precision.
132
|                b) Even with the restrictions above,
133
|                       |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.
134
|                Note that 0.0062 is slightly bigger than 0.57 log2/64.
135
|                c) To fully utilize the pipeline, p is separated into
136
|                two independent pieces of roughly equal complexities
137
|                       p = [ R + R*S*(A2 + S*A4) ]     +
138
|                               [ S*(A1 + S*(A3 + S*A5)) ]
139
|                where S = R*R.
140
|
141
|       Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by
142
|                               ans := T + ( T*p + t)
143
|                where T and t are the stored values for 2^(J/64).
144
|       Notes:  2^(J/64) is stored as T and t where T+t approximates
145
|                2^(J/64) to roughly 85 bits; T is in extended precision
146
|                and t is in single precision. Note also that T is rounded
147
|                to 62 bits so that the last two bits of T are zero. The
148
|                reason for such a special form is that T-1, T-2, and T-8
149
|                will all be exact --- a property that will give much
150
|                more accurate computation of the function EXPM1.
151
|
152
|       Step 6. Reconstruction of exp(X)
153
|                       exp(X) = 2^M * 2^(J/64) * exp(R).
154
|               6.1     If AdjFlag = 0, go to 6.3
155
|               6.2     ans := ans * AdjScale
156
|               6.3     Restore the user FPCR
157
|               6.4     Return ans := ans * Scale. Exit.
158
|       Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
159
|                |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will
160
|                neither overflow nor underflow. If AdjFlag = 1, that
161
|                means that
162
|                       X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380.
163
|                Hence, exp(X) may overflow or underflow or neither.
164
|                When that is the case, AdjScale = 2^(M1) where M1 is
165
|                approximately M. Thus 6.2 will never cause over/underflow.
166
|                Possible exception in 6.4 is overflow or underflow.
167
|                The inexact exception is not generated in 6.4. Although
168
|                one can argue that the inexact flag should always be
169
|                raised, to simulate that exception cost to much than the
170
|                flag is worth in practical uses.
171
|
172
|       Step 7. Return 1 + X.
173
|               7.1     ans := X
174
|               7.2     Restore user FPCR.
175
|               7.3     Return ans := 1 + ans. Exit
176
|       Notes:  For non-zero X, the inexact exception will always be
177
|                raised by 7.3. That is the only exception raised by 7.3.
178
|                Note also that we use the FMOVEM instruction to move X
179
|                in Step 7.1 to avoid unnecessary trapping. (Although
180
|                the FMOVEM may not seem relevant since X is normalized,
181
|                the precaution will be useful in the library version of
182
|                this code where the separate entry for denormalized inputs
183
|                will be done away with.)
184
|
185
|       Step 8. Handle exp(X) where |X| >= 16380log2.
186
|               8.1     If |X| > 16480 log2, go to Step 9.
187
|               (mimic 2.2 - 2.6)
188
|               8.2     N := round-to-integer( X * 64/log2 )
189
|               8.3     Calculate J = N mod 64, J = 0,1,...,63
190
|               8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
191
|               8.5     Calculate the address of the stored value 2^(J/64).
192
|               8.6     Create the values Scale = 2^M, AdjScale = 2^M1.
193
|               8.7     Go to Step 3.
194
|       Notes:  Refer to notes for 2.2 - 2.6.
195
|
196
|       Step 9. Handle exp(X), |X| > 16480 log2.
197
|               9.1     If X < 0, go to 9.3
198
|               9.2     ans := Huge, go to 9.4
199
|               9.3     ans := Tiny.
200
|               9.4     Restore user FPCR.
201
|               9.5     Return ans := ans * ans. Exit.
202
|       Notes:  Exp(X) will surely overflow or underflow, depending on
203
|                X's sign. "Huge" and "Tiny" are respectively large/tiny
204
|                extended-precision numbers whose square over/underflow
205
|                with an inexact result. Thus, 9.5 always raises the
206
|                inexact together with either overflow or underflow.
207
|
208
|
209
|       setoxm1d
210
|       --------
211
|
212
|       Step 1. Set ans := 0
213
|
214
|       Step 2. Return  ans := X + ans. Exit.
215
|       Notes:  This will return X with the appropriate rounding
216
|                precision prescribed by the user FPCR.
217
|
218
|       setoxm1
219
|       -------
220
|
221
|       Step 1. Check |X|
222
|               1.1     If |X| >= 1/4, go to Step 1.3.
223
|               1.2     Go to Step 7.
224
|               1.3     If |X| < 70 log(2), go to Step 2.
225
|               1.4     Go to Step 10.
226
|       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.
227
|                However, it is conceivable |X| can be small very often
228
|                because EXPM1 is intended to evaluate exp(X)-1 accurately
229
|                when |X| is small. For further details on the comparisons,
230
|                see the notes on Step 1 of setox.
231
|
232
|       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
233
|               2.1     N := round-to-nearest-integer( X * 64/log2 ).
234
|               2.2     Calculate       J = N mod 64; so J = 0,1,2,..., or 63.
235
|               2.3     Calculate       M = (N - J)/64; so N = 64M + J.
236
|               2.4     Calculate the address of the stored value of 2^(J/64).
237
|               2.5     Create the values Sc = 2^M and OnebySc := -2^(-M).
238
|       Notes:  See the notes on Step 2 of setox.
239
|
240
|       Step 3. Calculate X - N*log2/64.
241
|               3.1     R := X + N*L1, where L1 := single-precision(-log2/64).
242
|               3.2     R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
243
|       Notes:  Applying the analysis of Step 3 of setox in this case
244
|                shows that |R| <= 0.0055 (note that |X| <= 70 log2 in
245
|                this case).
246
|
247
|       Step 4. Approximate exp(R)-1 by a polynomial
248
|                       p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6)))))
249
|       Notes:  a) In order to reduce memory access, the coefficients are
250
|                made as "short" as possible: A1 (which is 1/2), A5 and A6
251
|                are single precision; A2, A3 and A4 are double precision.
252
|                b) Even with the restriction above,
253
|                       |p - (exp(R)-1)| <      |R| * 2^(-72.7)
254
|                for all |R| <= 0.0055.
255
|                c) To fully utilize the pipeline, p is separated into
256
|                two independent pieces of roughly equal complexity
257
|                       p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +
258
|                               [ R + S*(A1 + S*(A3 + S*A5)) ]
259
|                where S = R*R.
260
|
261
|       Step 5. Compute 2^(J/64)*p by
262
|                               p := T*p
263
|                where T and t are the stored values for 2^(J/64).
264
|       Notes:  2^(J/64) is stored as T and t where T+t approximates
265
|                2^(J/64) to roughly 85 bits; T is in extended precision
266
|                and t is in single precision. Note also that T is rounded
267
|                to 62 bits so that the last two bits of T are zero. The
268
|                reason for such a special form is that T-1, T-2, and T-8
269
|                will all be exact --- a property that will be exploited
270
|                in Step 6 below. The total relative error in p is no
271
|                bigger than 2^(-67.7) compared to the final result.
272
|
273
|       Step 6. Reconstruction of exp(X)-1
274
|                       exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
275
|               6.1     If M <= 63, go to Step 6.3.
276
|               6.2     ans := T + (p + (t + OnebySc)). Go to 6.6
277
|               6.3     If M >= -3, go to 6.5.
278
|               6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6
279
|               6.5     ans := (T + OnebySc) + (p + t).
280
|               6.6     Restore user FPCR.
281
|               6.7     Return ans := Sc * ans. Exit.
282
|       Notes:  The various arrangements of the expressions give accurate
283
|                evaluations.
284
|
285
|       Step 7. exp(X)-1 for |X| < 1/4.
286
|               7.1     If |X| >= 2^(-65), go to Step 9.
287
|               7.2     Go to Step 8.
288
|
289
|       Step 8. Calculate exp(X)-1, |X| < 2^(-65).
290
|               8.1     If |X| < 2^(-16312), goto 8.3
291
|               8.2     Restore FPCR; return ans := X - 2^(-16382). Exit.
292
|               8.3     X := X * 2^(140).
293
|               8.4     Restore FPCR; ans := ans - 2^(-16382).
294
|                Return ans := ans*2^(140). Exit
295
|       Notes:  The idea is to return "X - tiny" under the user
296
|                precision and rounding modes. To avoid unnecessary
297
|                inefficiency, we stay away from denormalized numbers the
298
|                best we can. For |X| >= 2^(-16312), the straightforward
299
|                8.2 generates the inexact exception as the case warrants.
300
|
301
|       Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial
302
|                       p = X + X*X*(B1 + X*(B2 + ... + X*B12))
303
|       Notes:  a) In order to reduce memory access, the coefficients are
304
|                made as "short" as possible: B1 (which is 1/2), B9 to B12
305
|                are single precision; B3 to B8 are double precision; and
306
|                B2 is double extended.
307
|                b) Even with the restriction above,
308
|                       |p - (exp(X)-1)| < |X| 2^(-70.6)
309
|                for all |X| <= 0.251.
310
|                Note that 0.251 is slightly bigger than 1/4.
311
|                c) To fully preserve accuracy, the polynomial is computed
312
|                as     X + ( S*B1 +    Q ) where S = X*X and
313
|                       Q       =       X*S*(B2 + X*(B3 + ... + X*B12))
314
|                d) To fully utilize the pipeline, Q is separated into
315
|                two independent pieces of roughly equal complexity
316
|                       Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +
317
|                               [ S*S*(B3 + S*(B5 + ... + S*B11)) ]
318
|
319
|       Step 10.        Calculate exp(X)-1 for |X| >= 70 log 2.
320
|               10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
321
|                purposes. Therefore, go to Step 1 of setox.
322
|               10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
323
|                ans := -1
324
|                Restore user FPCR
325
|                Return ans := ans + 2^(-126). Exit.
326
|       Notes:  10.2 will always create an inexact and return -1 + tiny
327
|                in the user rounding precision and mode.
328
|
329
|
330
 
331
|               Copyright (C) Motorola, Inc. 1990
332
|                       All Rights Reserved
333
|
334
|       THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF MOTOROLA
335
|       The copyright notice above does not evidence any
336
|       actual or intended publication of such source code.
337
 
338
|setox  idnt    2,1 | Motorola 040 Floating Point Software Package
339
 
340
        |section        8
341
 
342
        .include "fpsp.h"
343
 
344
L2:     .long   0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
345
 
346
EXPA3:  .long   0x3FA55555,0x55554431
347
EXPA2:  .long   0x3FC55555,0x55554018
348
 
349
HUGE:   .long   0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
350
TINY:   .long   0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
351
 
352
EM1A4:  .long   0x3F811111,0x11174385
353
EM1A3:  .long   0x3FA55555,0x55554F5A
354
 
355
EM1A2:  .long   0x3FC55555,0x55555555,0x00000000,0x00000000
356
 
357
EM1B8:  .long   0x3EC71DE3,0xA5774682
358
EM1B7:  .long   0x3EFA01A0,0x19D7CB68
359
 
360
EM1B6:  .long   0x3F2A01A0,0x1A019DF3
361
EM1B5:  .long   0x3F56C16C,0x16C170E2
362
 
363
EM1B4:  .long   0x3F811111,0x11111111
364
EM1B3:  .long   0x3FA55555,0x55555555
365
 
366
EM1B2:  .long   0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
367
        .long   0x00000000
368
 
369
TWO140: .long   0x48B00000,0x00000000
370
TWON140:        .long   0x37300000,0x00000000
371
 
372
EXPTBL:
373
        .long   0x3FFF0000,0x80000000,0x00000000,0x00000000
374
        .long   0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
375
        .long   0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
376
        .long   0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
377
        .long   0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
378
        .long   0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
379
        .long   0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
380
        .long   0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
381
        .long   0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
382
        .long   0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
383
        .long   0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
384
        .long   0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
385
        .long   0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
386
        .long   0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
387
        .long   0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
388
        .long   0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
389
        .long   0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
390
        .long   0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
391
        .long   0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
392
        .long   0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
393
        .long   0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
394
        .long   0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
395
        .long   0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
396
        .long   0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
397
        .long   0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
398
        .long   0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
399
        .long   0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
400
        .long   0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
401
        .long   0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
402
        .long   0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
403
        .long   0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
404
        .long   0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
405
        .long   0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
406
        .long   0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
407
        .long   0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
408
        .long   0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
409
        .long   0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
410
        .long   0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
411
        .long   0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
412
        .long   0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
413
        .long   0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
414
        .long   0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
415
        .long   0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
416
        .long   0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
417
        .long   0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
418
        .long   0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
419
        .long   0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
420
        .long   0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
421
        .long   0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
422
        .long   0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
423
        .long   0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
424
        .long   0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
425
        .long   0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
426
        .long   0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
427
        .long   0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
428
        .long   0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
429
        .long   0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
430
        .long   0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
431
        .long   0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
432
        .long   0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
433
        .long   0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
434
        .long   0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
435
        .long   0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
436
        .long   0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
437
 
438
        .set    ADJFLAG,L_SCR2
439
        .set    SCALE,FP_SCR1
440
        .set    ADJSCALE,FP_SCR2
441
        .set    SC,FP_SCR3
442
        .set    ONEBYSC,FP_SCR4
443
 
444
        | xref  t_frcinx
445
        |xref   t_extdnrm
446
        |xref   t_unfl
447
        |xref   t_ovfl
448
 
449
        .global setoxd
450
setoxd:
451
|--entry point for EXP(X), X is denormalized
452
        movel           (%a0),%d0
453
        andil           #0x80000000,%d0
454
        oril            #0x00800000,%d0         | ...sign(X)*2^(-126)
455
        movel           %d0,-(%sp)
456
        fmoves          #0x3F800000,%fp0
457
        fmovel          %d1,%fpcr
458
        fadds           (%sp)+,%fp0
459
        bra             t_frcinx
460
 
461
        .global setox
462
setox:
463
|--entry point for EXP(X), here X is finite, non-zero, and not NaN's
464
 
465
|--Step 1.
466
        movel           (%a0),%d0        | ...load part of input X
467
        andil           #0x7FFF0000,%d0 | ...biased expo. of X
468
        cmpil           #0x3FBE0000,%d0 | ...2^(-65)
469
        bges            EXPC1           | ...normal case
470
        bra             EXPSM
471
 
472
EXPC1:
473
|--The case |X| >= 2^(-65)
474
        movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
475
        cmpil           #0x400CB167,%d0 | ...16380 log2 trunc. 16 bits
476
        blts            EXPMAIN  | ...normal case
477
        bra             EXPBIG
478
 
479
EXPMAIN:
480
|--Step 2.
481
|--This is the normal branch:   2^(-65) <= |X| < 16380 log2.
482
        fmovex          (%a0),%fp0      | ...load input from (a0)
483
 
484
        fmovex          %fp0,%fp1
485
        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
486
        fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
487
        movel           #0,ADJFLAG(%a6)
488
        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
489
        lea             EXPTBL,%a1
490
        fmovel          %d0,%fp0                | ...convert to floating-format
491
 
492
        movel           %d0,L_SCR1(%a6) | ...save N temporarily
493
        andil           #0x3F,%d0               | ...D0 is J = N mod 64
494
        lsll            #4,%d0
495
        addal           %d0,%a1         | ...address of 2^(J/64)
496
        movel           L_SCR1(%a6),%d0
497
        asrl            #6,%d0          | ...D0 is M
498
        addiw           #0x3FFF,%d0     | ...biased expo. of 2^(M)
499
        movew           L2,L_SCR1(%a6)  | ...prefetch L2, no need in CB
500
 
501
EXPCONT1:
502
|--Step 3.
503
|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
504
|--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
505
        fmovex          %fp0,%fp2
506
        fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
507
        fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
508
        faddx           %fp1,%fp0               | ...X + N*L1
509
        faddx           %fp2,%fp0               | ...fp0 is R, reduced arg.
510
|       MOVE.W          #$3FA5,EXPA3    ...load EXPA3 in cache
511
 
512
|--Step 4.
513
|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
514
|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
515
|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
516
|--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
517
 
518
        fmovex          %fp0,%fp1
519
        fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
520
 
521
        fmoves          #0x3AB60B70,%fp2        | ...fp2 IS A5
522
|       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
523
 
524
        fmulx           %fp1,%fp2               | ...fp2 IS S*A5
525
        fmovex          %fp1,%fp3
526
        fmuls           #0x3C088895,%fp3        | ...fp3 IS S*A4
527
 
528
        faddd           EXPA3,%fp2      | ...fp2 IS A3+S*A5
529
        faddd           EXPA2,%fp3      | ...fp3 IS A2+S*A4
530
 
531
        fmulx           %fp1,%fp2               | ...fp2 IS S*(A3+S*A5)
532
        movew           %d0,SCALE(%a6)  | ...SCALE is 2^(M) in extended
533
        clrw            SCALE+2(%a6)
534
        movel           #0x80000000,SCALE+4(%a6)
535
        clrl            SCALE+8(%a6)
536
 
537
        fmulx           %fp1,%fp3               | ...fp3 IS S*(A2+S*A4)
538
 
539
        fadds           #0x3F000000,%fp2        | ...fp2 IS A1+S*(A3+S*A5)
540
        fmulx           %fp0,%fp3               | ...fp3 IS R*S*(A2+S*A4)
541
 
542
        fmulx           %fp1,%fp2               | ...fp2 IS S*(A1+S*(A3+S*A5))
543
        faddx           %fp3,%fp0               | ...fp0 IS R+R*S*(A2+S*A4),
544
|                                       ...fp3 released
545
 
546
        fmovex          (%a1)+,%fp1     | ...fp1 is lead. pt. of 2^(J/64)
547
        faddx           %fp2,%fp0               | ...fp0 is EXP(R) - 1
548
|                                       ...fp2 released
549
 
550
|--Step 5
551
|--final reconstruction process
552
|--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
553
 
554
        fmulx           %fp1,%fp0               | ...2^(J/64)*(Exp(R)-1)
555
        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
556
        fadds           (%a1),%fp0      | ...accurate 2^(J/64)
557
 
558
        faddx           %fp1,%fp0               | ...2^(J/64) + 2^(J/64)*...
559
        movel           ADJFLAG(%a6),%d0
560
 
561
|--Step 6
562
        tstl            %d0
563
        beqs            NORMAL
564
ADJUST:
565
        fmulx           ADJSCALE(%a6),%fp0
566
NORMAL:
567
        fmovel          %d1,%FPCR               | ...restore user FPCR
568
        fmulx           SCALE(%a6),%fp0 | ...multiply 2^(M)
569
        bra             t_frcinx
570
 
571
EXPSM:
572
|--Step 7
573
        fmovemx (%a0),%fp0-%fp0 | ...in case X is denormalized
574
        fmovel          %d1,%FPCR
575
        fadds           #0x3F800000,%fp0        | ...1+X in user mode
576
        bra             t_frcinx
577
 
578
EXPBIG:
579
|--Step 8
580
        cmpil           #0x400CB27C,%d0 | ...16480 log2
581
        bgts            EXP2BIG
582
|--Steps 8.2 -- 8.6
583
        fmovex          (%a0),%fp0      | ...load input from (a0)
584
 
585
        fmovex          %fp0,%fp1
586
        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
587
        fmovemx  %fp2-%fp2/%fp3,-(%a7)          | ...save fp2
588
        movel           #1,ADJFLAG(%a6)
589
        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
590
        lea             EXPTBL,%a1
591
        fmovel          %d0,%fp0                | ...convert to floating-format
592
        movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
593
        andil           #0x3F,%d0                | ...D0 is J = N mod 64
594
        lsll            #4,%d0
595
        addal           %d0,%a1                 | ...address of 2^(J/64)
596
        movel           L_SCR1(%a6),%d0
597
        asrl            #6,%d0                  | ...D0 is K
598
        movel           %d0,L_SCR1(%a6)                 | ...save K temporarily
599
        asrl            #1,%d0                  | ...D0 is M1
600
        subl            %d0,L_SCR1(%a6)                 | ...a1 is M
601
        addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M1)
602
        movew           %d0,ADJSCALE(%a6)               | ...ADJSCALE := 2^(M1)
603
        clrw            ADJSCALE+2(%a6)
604
        movel           #0x80000000,ADJSCALE+4(%a6)
605
        clrl            ADJSCALE+8(%a6)
606
        movel           L_SCR1(%a6),%d0                 | ...D0 is M
607
        addiw           #0x3FFF,%d0             | ...biased expo. of 2^(M)
608
        bra             EXPCONT1                | ...go back to Step 3
609
 
610
EXP2BIG:
611
|--Step 9
612
        fmovel          %d1,%FPCR
613
        movel           (%a0),%d0
614
        bclrb           #sign_bit,(%a0)         | ...setox always returns positive
615
        cmpil           #0,%d0
616
        blt             t_unfl
617
        bra             t_ovfl
618
 
619
        .global setoxm1d
620
setoxm1d:
621
|--entry point for EXPM1(X), here X is denormalized
622
|--Step 0.
623
        bra             t_extdnrm
624
 
625
 
626
        .global setoxm1
627
setoxm1:
628
|--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
629
 
630
|--Step 1.
631
|--Step 1.1
632
        movel           (%a0),%d0        | ...load part of input X
633
        andil           #0x7FFF0000,%d0 | ...biased expo. of X
634
        cmpil           #0x3FFD0000,%d0 | ...1/4
635
        bges            EM1CON1  | ...|X| >= 1/4
636
        bra             EM1SM
637
 
638
EM1CON1:
639
|--Step 1.3
640
|--The case |X| >= 1/4
641
        movew           4(%a0),%d0      | ...expo. and partial sig. of |X|
642
        cmpil           #0x4004C215,%d0 | ...70log2 rounded up to 16 bits
643
        bles            EM1MAIN  | ...1/4 <= |X| <= 70log2
644
        bra             EM1BIG
645
 
646
EM1MAIN:
647
|--Step 2.
648
|--This is the case:    1/4 <= |X| <= 70 log2.
649
        fmovex          (%a0),%fp0      | ...load input from (a0)
650
 
651
        fmovex          %fp0,%fp1
652
        fmuls           #0x42B8AA3B,%fp0        | ...64/log2 * X
653
        fmovemx %fp2-%fp2/%fp3,-(%a7)           | ...save fp2
654
|       MOVE.W          #$3F81,EM1A4            ...prefetch in CB mode
655
        fmovel          %fp0,%d0                | ...N = int( X * 64/log2 )
656
        lea             EXPTBL,%a1
657
        fmovel          %d0,%fp0                | ...convert to floating-format
658
 
659
        movel           %d0,L_SCR1(%a6)                 | ...save N temporarily
660
        andil           #0x3F,%d0                | ...D0 is J = N mod 64
661
        lsll            #4,%d0
662
        addal           %d0,%a1                 | ...address of 2^(J/64)
663
        movel           L_SCR1(%a6),%d0
664
        asrl            #6,%d0                  | ...D0 is M
665
        movel           %d0,L_SCR1(%a6)                 | ...save a copy of M
666
|       MOVE.W          #$3FDC,L2               ...prefetch L2 in CB mode
667
 
668
|--Step 3.
669
|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
670
|--a0 points to 2^(J/64), D0 and a1 both contain M
671
        fmovex          %fp0,%fp2
672
        fmuls           #0xBC317218,%fp0        | ...N * L1, L1 = lead(-log2/64)
673
        fmulx           L2,%fp2         | ...N * L2, L1+L2 = -log2/64
674
        faddx           %fp1,%fp0        | ...X + N*L1
675
        faddx           %fp2,%fp0        | ...fp0 is R, reduced arg.
676
|       MOVE.W          #$3FC5,EM1A2            ...load EM1A2 in cache
677
        addiw           #0x3FFF,%d0             | ...D0 is biased expo. of 2^M
678
 
679
|--Step 4.
680
|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
681
|-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
682
|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
683
|--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
684
 
685
        fmovex          %fp0,%fp1
686
        fmulx           %fp1,%fp1               | ...fp1 IS S = R*R
687
 
688
        fmoves          #0x3950097B,%fp2        | ...fp2 IS a6
689
|       MOVE.W          #0,2(%a1)       ...load 2^(J/64) in cache
690
 
691
        fmulx           %fp1,%fp2               | ...fp2 IS S*A6
692
        fmovex          %fp1,%fp3
693
        fmuls           #0x3AB60B6A,%fp3        | ...fp3 IS S*A5
694
 
695
        faddd           EM1A4,%fp2      | ...fp2 IS A4+S*A6
696
        faddd           EM1A3,%fp3      | ...fp3 IS A3+S*A5
697
        movew           %d0,SC(%a6)             | ...SC is 2^(M) in extended
698
        clrw            SC+2(%a6)
699
        movel           #0x80000000,SC+4(%a6)
700
        clrl            SC+8(%a6)
701
 
702
        fmulx           %fp1,%fp2               | ...fp2 IS S*(A4+S*A6)
703
        movel           L_SCR1(%a6),%d0         | ...D0 is      M
704
        negw            %d0             | ...D0 is -M
705
        fmulx           %fp1,%fp3               | ...fp3 IS S*(A3+S*A5)
706
        addiw           #0x3FFF,%d0     | ...biased expo. of 2^(-M)
707
        faddd           EM1A2,%fp2      | ...fp2 IS A2+S*(A4+S*A6)
708
        fadds           #0x3F000000,%fp3        | ...fp3 IS A1+S*(A3+S*A5)
709
 
710
        fmulx           %fp1,%fp2               | ...fp2 IS S*(A2+S*(A4+S*A6))
711
        oriw            #0x8000,%d0     | ...signed/expo. of -2^(-M)
712
        movew           %d0,ONEBYSC(%a6)        | ...OnebySc is -2^(-M)
713
        clrw            ONEBYSC+2(%a6)
714
        movel           #0x80000000,ONEBYSC+4(%a6)
715
        clrl            ONEBYSC+8(%a6)
716
        fmulx           %fp3,%fp1               | ...fp1 IS S*(A1+S*(A3+S*A5))
717
|                                       ...fp3 released
718
 
719
        fmulx           %fp0,%fp2               | ...fp2 IS R*S*(A2+S*(A4+S*A6))
720
        faddx           %fp1,%fp0               | ...fp0 IS R+S*(A1+S*(A3+S*A5))
721
|                                       ...fp1 released
722
 
723
        faddx           %fp2,%fp0               | ...fp0 IS EXP(R)-1
724
|                                       ...fp2 released
725
        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
726
 
727
|--Step 5
728
|--Compute 2^(J/64)*p
729
 
730
        fmulx           (%a1),%fp0      | ...2^(J/64)*(Exp(R)-1)
731
 
732
|--Step 6
733
|--Step 6.1
734
        movel           L_SCR1(%a6),%d0         | ...retrieve M
735
        cmpil           #63,%d0
736
        bles            MLE63
737
|--Step 6.2     M >= 64
738
        fmoves          12(%a1),%fp1    | ...fp1 is t
739
        faddx           ONEBYSC(%a6),%fp1       | ...fp1 is t+OnebySc
740
        faddx           %fp1,%fp0               | ...p+(t+OnebySc), fp1 released
741
        faddx           (%a1),%fp0      | ...T+(p+(t+OnebySc))
742
        bras            EM1SCALE
743
MLE63:
744
|--Step 6.3     M <= 63
745
        cmpil           #-3,%d0
746
        bges            MGEN3
747
MLTN3:
748
|--Step 6.4     M <= -4
749
        fadds           12(%a1),%fp0    | ...p+t
750
        faddx           (%a1),%fp0      | ...T+(p+t)
751
        faddx           ONEBYSC(%a6),%fp0       | ...OnebySc + (T+(p+t))
752
        bras            EM1SCALE
753
MGEN3:
754
|--Step 6.5     -3 <= M <= 63
755
        fmovex          (%a1)+,%fp1     | ...fp1 is T
756
        fadds           (%a1),%fp0      | ...fp0 is p+t
757
        faddx           ONEBYSC(%a6),%fp1       | ...fp1 is T+OnebySc
758
        faddx           %fp1,%fp0               | ...(T+OnebySc)+(p+t)
759
 
760
EM1SCALE:
761
|--Step 6.6
762
        fmovel          %d1,%FPCR
763
        fmulx           SC(%a6),%fp0
764
 
765
        bra             t_frcinx
766
 
767
EM1SM:
768
|--Step 7       |X| < 1/4.
769
        cmpil           #0x3FBE0000,%d0 | ...2^(-65)
770
        bges            EM1POLY
771
 
772
EM1TINY:
773
|--Step 8       |X| < 2^(-65)
774
        cmpil           #0x00330000,%d0 | ...2^(-16312)
775
        blts            EM12TINY
776
|--Step 8.2
777
        movel           #0x80010000,SC(%a6)     | ...SC is -2^(-16382)
778
        movel           #0x80000000,SC+4(%a6)
779
        clrl            SC+8(%a6)
780
        fmovex          (%a0),%fp0
781
        fmovel          %d1,%FPCR
782
        faddx           SC(%a6),%fp0
783
 
784
        bra             t_frcinx
785
 
786
EM12TINY:
787
|--Step 8.3
788
        fmovex          (%a0),%fp0
789
        fmuld           TWO140,%fp0
790
        movel           #0x80010000,SC(%a6)
791
        movel           #0x80000000,SC+4(%a6)
792
        clrl            SC+8(%a6)
793
        faddx           SC(%a6),%fp0
794
        fmovel          %d1,%FPCR
795
        fmuld           TWON140,%fp0
796
 
797
        bra             t_frcinx
798
 
799
EM1POLY:
800
|--Step 9       exp(X)-1 by a simple polynomial
801
        fmovex          (%a0),%fp0      | ...fp0 is X
802
        fmulx           %fp0,%fp0               | ...fp0 is S := X*X
803
        fmovemx %fp2-%fp2/%fp3,-(%a7)   | ...save fp2
804
        fmoves          #0x2F30CAA8,%fp1        | ...fp1 is B12
805
        fmulx           %fp0,%fp1               | ...fp1 is S*B12
806
        fmoves          #0x310F8290,%fp2        | ...fp2 is B11
807
        fadds           #0x32D73220,%fp1        | ...fp1 is B10+S*B12
808
 
809
        fmulx           %fp0,%fp2               | ...fp2 is S*B11
810
        fmulx           %fp0,%fp1               | ...fp1 is S*(B10 + ...
811
 
812
        fadds           #0x3493F281,%fp2        | ...fp2 is B9+S*...
813
        faddd           EM1B8,%fp1      | ...fp1 is B8+S*...
814
 
815
        fmulx           %fp0,%fp2               | ...fp2 is S*(B9+...
816
        fmulx           %fp0,%fp1               | ...fp1 is S*(B8+...
817
 
818
        faddd           EM1B7,%fp2      | ...fp2 is B7+S*...
819
        faddd           EM1B6,%fp1      | ...fp1 is B6+S*...
820
 
821
        fmulx           %fp0,%fp2               | ...fp2 is S*(B7+...
822
        fmulx           %fp0,%fp1               | ...fp1 is S*(B6+...
823
 
824
        faddd           EM1B5,%fp2      | ...fp2 is B5+S*...
825
        faddd           EM1B4,%fp1      | ...fp1 is B4+S*...
826
 
827
        fmulx           %fp0,%fp2               | ...fp2 is S*(B5+...
828
        fmulx           %fp0,%fp1               | ...fp1 is S*(B4+...
829
 
830
        faddd           EM1B3,%fp2      | ...fp2 is B3+S*...
831
        faddx           EM1B2,%fp1      | ...fp1 is B2+S*...
832
 
833
        fmulx           %fp0,%fp2               | ...fp2 is S*(B3+...
834
        fmulx           %fp0,%fp1               | ...fp1 is S*(B2+...
835
 
836
        fmulx           %fp0,%fp2               | ...fp2 is S*S*(B3+...)
837
        fmulx           (%a0),%fp1      | ...fp1 is X*S*(B2...
838
 
839
        fmuls           #0x3F000000,%fp0        | ...fp0 is S*B1
840
        faddx           %fp2,%fp1               | ...fp1 is Q
841
|                                       ...fp2 released
842
 
843
        fmovemx (%a7)+,%fp2-%fp2/%fp3   | ...fp2 restored
844
 
845
        faddx           %fp1,%fp0               | ...fp0 is S*B1+Q
846
|                                       ...fp1 released
847
 
848
        fmovel          %d1,%FPCR
849
        faddx           (%a0),%fp0
850
 
851
        bra             t_frcinx
852
 
853
EM1BIG:
854
|--Step 10      |X| > 70 log2
855
        movel           (%a0),%d0
856
        cmpil           #0,%d0
857
        bgt             EXPC1
858
|--Step 10.2
859
        fmoves          #0xBF800000,%fp0        | ...fp0 is -1
860
        fmovel          %d1,%FPCR
861
        fadds           #0x00800000,%fp0        | ...-1 + 2^(-126)
862
 
863
        bra             t_frcinx
864
 
865
        |end

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.