OpenCores
URL https://opencores.org/ocsvn/fpu100/fpu100/trunk

Subversion Repositories fpu100

[/] [fpu100/] [branches/] [avendor/] [test_bench/] [SoftFloat/] [softfloat/] [bits32/] [softfloat.c] - Blame information for rev 6

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 6 jidan
 
2
/*============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
 
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30
 
31
=============================================================================*/
32
 
33
#include "milieu.h"
34
#include "softfloat.h"
35
 
36
/*----------------------------------------------------------------------------
37
| Floating-point rounding mode and exception flags.
38
*----------------------------------------------------------------------------*/
39
int8 float_rounding_mode = float_round_nearest_even;
40
int8 float_exception_flags = 0;
41
 
42
/*----------------------------------------------------------------------------
43
| Primitive arithmetic functions, including multi-word arithmetic, and
44
| division and square root approximations.  (Can be specialized to target if
45
| desired.)
46
*----------------------------------------------------------------------------*/
47
#include "softfloat-macros"
48
 
49
/*----------------------------------------------------------------------------
50
| Functions and definitions to determine:  (1) whether tininess for underflow
51
| is detected before or after rounding by default, (2) what (if anything)
52
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
53
| from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
54
| are propagated from function inputs to output.  These details are target-
55
| specific.
56
*----------------------------------------------------------------------------*/
57
#include "softfloat-specialize"
58
 
59
/*----------------------------------------------------------------------------
60
| Returns the fraction bits of the single-precision floating-point value `a'.
61
*----------------------------------------------------------------------------*/
62
 
63
INLINE bits32 extractFloat32Frac( float32 a )
64
{
65
 
66
    return a & 0x007FFFFF;
67
 
68
}
69
 
70
/*----------------------------------------------------------------------------
71
| Returns the exponent bits of the single-precision floating-point value `a'.
72
*----------------------------------------------------------------------------*/
73
 
74
INLINE int16 extractFloat32Exp( float32 a )
75
{
76
 
77
    return ( a>>23 ) & 0xFF;
78
 
79
}
80
 
81
/*----------------------------------------------------------------------------
82
| Returns the sign bit of the single-precision floating-point value `a'.
83
*----------------------------------------------------------------------------*/
84
 
85
INLINE flag extractFloat32Sign( float32 a )
86
{
87
 
88
    return a>>31;
89
 
90
}
91
 
92
/*----------------------------------------------------------------------------
93
| Normalizes the subnormal single-precision floating-point value represented
94
| by the denormalized significand `aSig'.  The normalized exponent and
95
| significand are stored at the locations pointed to by `zExpPtr' and
96
| `zSigPtr', respectively.
97
*----------------------------------------------------------------------------*/
98
 
99
static void
100
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
101
{
102
    int8 shiftCount;
103
 
104
    shiftCount = countLeadingZeros32( aSig ) - 8;
105
    *zSigPtr = aSig<<shiftCount;
106
    *zExpPtr = 1 - shiftCount;
107
 
108
}
109
 
110
/*----------------------------------------------------------------------------
111
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
112
| single-precision floating-point value, returning the result.  After being
113
| shifted into the proper positions, the three fields are simply added
114
| together to form the result.  This means that any integer portion of `zSig'
115
| will be added into the exponent.  Since a properly normalized significand
116
| will have an integer portion equal to 1, the `zExp' input should be 1 less
117
| than the desired result exponent whenever `zSig' is a complete, normalized
118
| significand.
119
*----------------------------------------------------------------------------*/
120
 
121
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
122
{
123
 
124
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
125
 
126
}
127
 
128
/*----------------------------------------------------------------------------
129
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
130
| and significand `zSig', and returns the proper single-precision floating-
131
| point value corresponding to the abstract input.  Ordinarily, the abstract
132
| value is simply rounded and packed into the single-precision format, with
133
| the inexact exception raised if the abstract input cannot be represented
134
| exactly.  However, if the abstract value is too large, the overflow and
135
| inexact exceptions are raised and an infinity or maximal finite value is
136
| returned.  If the abstract value is too small, the input value is rounded to
137
| a subnormal number, and the underflow and inexact exceptions are raised if
138
| the abstract input cannot be represented exactly as a subnormal single-
139
| precision floating-point number.
140
|     The input significand `zSig' has its binary point between bits 30
141
| and 29, which is 7 bits to the left of the usual location.  This shifted
142
| significand must be normalized or smaller.  If `zSig' is not normalized,
143
| `zExp' must be 0; in that case, the result returned is a subnormal number,
144
| and it must not require rounding.  In the usual case that `zSig' is
145
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
146
| The handling of underflow and overflow follows the IEC/IEEE Standard for
147
| Binary Floating-Point Arithmetic.
148
*----------------------------------------------------------------------------*/
149
 
150
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
151
{
152
    int8 roundingMode;
153
    flag roundNearestEven;
154
    int8 roundIncrement, roundBits;
155
    flag isTiny;
156
 
157
    roundingMode = float_rounding_mode;
158
    roundNearestEven = roundingMode == float_round_nearest_even;
159
    roundIncrement = 0x40;
160
    if ( ! roundNearestEven ) {
161
        if ( roundingMode == float_round_to_zero ) {
162
            roundIncrement = 0;
163
        }
164
        else {
165
            roundIncrement = 0x7F;
166
            if ( zSign ) {
167
                if ( roundingMode == float_round_up ) roundIncrement = 0;
168
            }
169
            else {
170
                if ( roundingMode == float_round_down ) roundIncrement = 0;
171
            }
172
        }
173
    }
174
    roundBits = zSig & 0x7F;
175
    if ( 0xFD <= (bits16) zExp ) {
176
        if (    ( 0xFD < zExp )
177
             || (    ( zExp == 0xFD )
178
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
179
           ) {
180
            float_raise(float_flag_inexact );
181
            float_raise( float_flag_overflow);
182
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
183
        }
184
        if ( zExp < 0 ) {
185
            isTiny =
186
                   ( float_detect_tininess == float_tininess_before_rounding )
187
                || ( zExp < -1 )
188
                || ( zSig + roundIncrement < 0x80000000 );
189
            shift32RightJamming( zSig, - zExp, &zSig );
190
            zExp = 0;
191
            roundBits = zSig & 0x7F;
192
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
193
        }
194
    }
195
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
196
    zSig = ( zSig + roundIncrement )>>7;
197
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
198
    if ( zSig == 0 ) zExp = 0;
199
    return packFloat32( zSign, zExp, zSig );
200
 
201
}
202
 
203
/*----------------------------------------------------------------------------
204
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
205
| and significand `zSig', and returns the proper single-precision floating-
206
| point value corresponding to the abstract input.  This routine is just like
207
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
208
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
209
| floating-point exponent.
210
*----------------------------------------------------------------------------*/
211
 
212
static float32
213
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
214
{
215
    int8 shiftCount;
216
 
217
    shiftCount = countLeadingZeros32( zSig ) - 1;
218
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
219
 
220
}
221
 
222
/*----------------------------------------------------------------------------
223
| Returns the least-significant 32 fraction bits of the double-precision
224
| floating-point value `a'.
225
*----------------------------------------------------------------------------*/
226
 
227
INLINE bits32 extractFloat64Frac1( float64 a )
228
{
229
 
230
    return a.low;
231
 
232
}
233
 
234
/*----------------------------------------------------------------------------
235
| Returns the most-significant 20 fraction bits of the double-precision
236
| floating-point value `a'.
237
*----------------------------------------------------------------------------*/
238
 
239
INLINE bits32 extractFloat64Frac0( float64 a )
240
{
241
 
242
    return a.high & 0x000FFFFF;
243
 
244
}
245
 
246
/*----------------------------------------------------------------------------
247
| Returns the exponent bits of the double-precision floating-point value `a'.
248
*----------------------------------------------------------------------------*/
249
 
250
INLINE int16 extractFloat64Exp( float64 a )
251
{
252
 
253
    return ( a.high>>20 ) & 0x7FF;
254
 
255
}
256
 
257
/*----------------------------------------------------------------------------
258
| Returns the sign bit of the double-precision floating-point value `a'.
259
*----------------------------------------------------------------------------*/
260
 
261
INLINE flag extractFloat64Sign( float64 a )
262
{
263
 
264
    return a.high>>31;
265
 
266
}
267
 
268
/*----------------------------------------------------------------------------
269
| Normalizes the subnormal double-precision floating-point value represented
270
| by the denormalized significand formed by the concatenation of `aSig0' and
271
| `aSig1'.  The normalized exponent is stored at the location pointed to by
272
| `zExpPtr'.  The most significant 21 bits of the normalized significand are
273
| stored at the location pointed to by `zSig0Ptr', and the least significant
274
| 32 bits of the normalized significand are stored at the location pointed to
275
| by `zSig1Ptr'.
276
*----------------------------------------------------------------------------*/
277
 
278
static void
279
 normalizeFloat64Subnormal(
280
     bits32 aSig0,
281
     bits32 aSig1,
282
     int16 *zExpPtr,
283
     bits32 *zSig0Ptr,
284
     bits32 *zSig1Ptr
285
 )
286
{
287
    int8 shiftCount;
288
 
289
    if ( aSig0 == 0 ) {
290
        shiftCount = countLeadingZeros32( aSig1 ) - 11;
291
        if ( shiftCount < 0 ) {
292
            *zSig0Ptr = aSig1>>( - shiftCount );
293
            *zSig1Ptr = aSig1<<( shiftCount & 31 );
294
        }
295
        else {
296
            *zSig0Ptr = aSig1<<shiftCount;
297
            *zSig1Ptr = 0;
298
        }
299
        *zExpPtr = - shiftCount - 31;
300
    }
301
    else {
302
        shiftCount = countLeadingZeros32( aSig0 ) - 11;
303
        shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
304
        *zExpPtr = 1 - shiftCount;
305
    }
306
 
307
}
308
 
309
/*----------------------------------------------------------------------------
310
| Packs the sign `zSign', the exponent `zExp', and the significand formed by
311
| the concatenation of `zSig0' and `zSig1' into a double-precision floating-
312
| point value, returning the result.  After being shifted into the proper
313
| positions, the three fields `zSign', `zExp', and `zSig0' are simply added
314
| together to form the most significant 32 bits of the result.  This means
315
| that any integer portion of `zSig0' will be added into the exponent.  Since
316
| a properly normalized significand will have an integer portion equal to 1,
317
| the `zExp' input should be 1 less than the desired result exponent whenever
318
| `zSig0' and `zSig1' concatenated form a complete, normalized significand.
319
*----------------------------------------------------------------------------*/
320
 
321
INLINE float64
322
 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
323
{
324
    float64 z;
325
 
326
    z.low = zSig1;
327
    z.high = ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<20 ) + zSig0;
328
    return z;
329
 
330
}
331
 
332
/*----------------------------------------------------------------------------
333
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
334
| and extended significand formed by the concatenation of `zSig0', `zSig1',
335
| and `zSig2', and returns the proper double-precision floating-point value
336
| corresponding to the abstract input.  Ordinarily, the abstract value is
337
| simply rounded and packed into the double-precision format, with the inexact
338
| exception raised if the abstract input cannot be represented exactly.
339
| However, if the abstract value is too large, the overflow and inexact
340
| exceptions are raised and an infinity or maximal finite value is returned.
341
| If the abstract value is too small, the input value is rounded to a
342
| subnormal number, and the underflow and inexact exceptions are raised if the
343
| abstract input cannot be represented exactly as a subnormal double-precision
344
| floating-point number.
345
|     The input significand must be normalized or smaller.  If the input
346
| significand is not normalized, `zExp' must be 0; in that case, the result
347
| returned is a subnormal number, and it must not require rounding.  In the
348
| usual case that the input significand is normalized, `zExp' must be 1 less
349
| than the ``true'' floating-point exponent.  The handling of underflow and
350
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
351
*----------------------------------------------------------------------------*/
352
 
353
static float64
354
 roundAndPackFloat64(
355
     flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
356
{
357
    int8 roundingMode;
358
    flag roundNearestEven, increment, isTiny;
359
 
360
    roundingMode = float_rounding_mode;
361
    roundNearestEven = ( roundingMode == float_round_nearest_even );
362
    increment = ( (sbits32) zSig2 < 0 );
363
    if ( ! roundNearestEven ) {
364
        if ( roundingMode == float_round_to_zero ) {
365
            increment = 0;
366
        }
367
        else {
368
            if ( zSign ) {
369
                increment = ( roundingMode == float_round_down ) && zSig2;
370
            }
371
            else {
372
                increment = ( roundingMode == float_round_up ) && zSig2;
373
            }
374
        }
375
    }
376
    if ( 0x7FD <= (bits16) zExp ) {
377
        if (    ( 0x7FD < zExp )
378
             || (    ( zExp == 0x7FD )
379
                  && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
380
                  && increment
381
                )
382
           ) {
383
            float_raise(float_flag_inexact );
384
                        float_raise( float_flag_overflow);
385
            if (    ( roundingMode == float_round_to_zero )
386
                 || ( zSign && ( roundingMode == float_round_up ) )
387
                 || ( ! zSign && ( roundingMode == float_round_down ) )
388
               ) {
389
                return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
390
            }
391
            return packFloat64( zSign, 0x7FF, 0, 0 );
392
        }
393
        if ( zExp < 0 ) {
394
            isTiny =
395
                   ( float_detect_tininess == float_tininess_before_rounding )
396
                || ( zExp < -1 )
397
                || ! increment
398
                || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
399
            shift64ExtraRightJamming(
400
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
401
            zExp = 0;
402
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
403
            if ( roundNearestEven ) {
404
                increment = ( (sbits32) zSig2 < 0 );
405
            }
406
            else {
407
                if ( zSign ) {
408
                    increment = ( roundingMode == float_round_down ) && zSig2;
409
                }
410
                else {
411
                    increment = ( roundingMode == float_round_up ) && zSig2;
412
                }
413
            }
414
        }
415
    }
416
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
417
    if ( increment ) {
418
        add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
419
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
420
    }
421
    else {
422
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
423
    }
424
    return packFloat64( zSign, zExp, zSig0, zSig1 );
425
 
426
}
427
 
428
/*----------------------------------------------------------------------------
429
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
430
| and significand formed by the concatenation of `zSig0' and `zSig1', and
431
| returns the proper double-precision floating-point value corresponding
432
| to the abstract input.  This routine is just like `roundAndPackFloat64'
433
| except that the input significand has fewer bits and does not have to be
434
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
435
| point exponent.
436
*----------------------------------------------------------------------------*/
437
 
438
static float64
439
 normalizeRoundAndPackFloat64(
440
     flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
441
{
442
    int8 shiftCount;
443
    bits32 zSig2;
444
 
445
    if ( zSig0 == 0 ) {
446
        zSig0 = zSig1;
447
        zSig1 = 0;
448
        zExp -= 32;
449
    }
450
    shiftCount = countLeadingZeros32( zSig0 ) - 11;
451
    if ( 0 <= shiftCount ) {
452
        zSig2 = 0;
453
        shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
454
    }
455
    else {
456
        shift64ExtraRightJamming(
457
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
458
    }
459
    zExp -= shiftCount;
460
    return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
461
 
462
}
463
 
464
/*----------------------------------------------------------------------------
465
| Returns the result of converting the 32-bit two's complement integer `a' to
466
| the single-precision floating-point format.  The conversion is performed
467
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
468
*----------------------------------------------------------------------------*/
469
 
470
float32 int32_to_float32( int32 a )
471
{
472
    flag zSign;
473
 
474
    if ( a == 0 ) return 0;
475
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
476
    zSign = ( a < 0 );
477
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
478
 
479
}
480
 
481
/*----------------------------------------------------------------------------
482
| Returns the result of converting the 32-bit two's complement integer `a' to
483
| the double-precision floating-point format.  The conversion is performed
484
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
485
*----------------------------------------------------------------------------*/
486
 
487
float64 int32_to_float64( int32 a )
488
{
489
    flag zSign;
490
    bits32 absA;
491
    int8 shiftCount;
492
    bits32 zSig0, zSig1;
493
 
494
    if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
495
    zSign = ( a < 0 );
496
    absA = zSign ? - a : a;
497
    shiftCount = countLeadingZeros32( absA ) - 11;
498
    if ( 0 <= shiftCount ) {
499
        zSig0 = absA<<shiftCount;
500
        zSig1 = 0;
501
    }
502
    else {
503
        shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
504
    }
505
    return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
506
 
507
}
508
 
509
/*----------------------------------------------------------------------------
510
| Returns the result of converting the single-precision floating-point value
511
| `a' to the 32-bit two's complement integer format.  The conversion is
512
| performed according to the IEC/IEEE Standard for Binary Floating-Point
513
| Arithmetic---which means in particular that the conversion is rounded
514
| according to the current rounding mode.  If `a' is a NaN, the largest
515
| positive integer is returned.  Otherwise, if the conversion overflows, the
516
| largest integer with the same sign as `a' is returned.
517
*----------------------------------------------------------------------------*/
518
 
519
int32 float32_to_int32( float32 a )
520
{
521
    flag aSign;
522
    int16 aExp, shiftCount;
523
    bits32 aSig, aSigExtra;
524
    int32 z;
525
    int8 roundingMode;
526
 
527
    aSig = extractFloat32Frac( a );
528
    aExp = extractFloat32Exp( a );
529
    aSign = extractFloat32Sign( a );
530
    shiftCount = aExp - 0x96;
531
    if ( 0 <= shiftCount ) {
532
        if ( 0x9E <= aExp ) {
533
            if ( a != 0xCF000000 ) {
534
                float_raise( float_flag_invalid );
535
                if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
536
                    return 0x7FFFFFFF;
537
                }
538
            }
539
            return (sbits32) 0x80000000;
540
        }
541
        z = ( aSig | 0x00800000 )<<shiftCount;
542
        if ( aSign ) z = - z;
543
    }
544
    else {
545
        if ( aExp < 0x7E ) {
546
            aSigExtra = aExp | aSig;
547
            z = 0;
548
        }
549
        else {
550
            aSig |= 0x00800000;
551
            aSigExtra = aSig<<( shiftCount & 31 );
552
            z = aSig>>( - shiftCount );
553
        }
554
        if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
555
        roundingMode = float_rounding_mode;
556
        if ( roundingMode == float_round_nearest_even ) {
557
            if ( (sbits32) aSigExtra < 0 ) {
558
                ++z;
559
                if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
560
            }
561
            if ( aSign ) z = - z;
562
        }
563
        else {
564
            aSigExtra = ( aSigExtra != 0 );
565
            if ( aSign ) {
566
                z += ( roundingMode == float_round_down ) & aSigExtra;
567
                z = - z;
568
            }
569
            else {
570
                z += ( roundingMode == float_round_up ) & aSigExtra;
571
            }
572
        }
573
    }
574
    return z;
575
 
576
}
577
 
578
/*----------------------------------------------------------------------------
579
| Returns the result of converting the single-precision floating-point value
580
| `a' to the 32-bit two's complement integer format.  The conversion is
581
| performed according to the IEC/IEEE Standard for Binary Floating-Point
582
| Arithmetic, except that the conversion is always rounded toward zero.
583
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
584
| the conversion overflows, the largest integer with the same sign as `a' is
585
| returned.
586
*----------------------------------------------------------------------------*/
587
 
588
int32 float32_to_int32_round_to_zero( float32 a )
589
{
590
    flag aSign;
591
    int16 aExp, shiftCount;
592
    bits32 aSig;
593
    int32 z;
594
 
595
    aSig = extractFloat32Frac( a );
596
    aExp = extractFloat32Exp( a );
597
    aSign = extractFloat32Sign( a );
598
    shiftCount = aExp - 0x9E;
599
    if ( 0 <= shiftCount ) {
600
        if ( a != 0xCF000000 ) {
601
            float_raise( float_flag_invalid );
602
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
603
        }
604
        return (sbits32) 0x80000000;
605
    }
606
    else if ( aExp <= 0x7E ) {
607
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
608
        return 0;
609
    }
610
    aSig = ( aSig | 0x00800000 )<<8;
611
    z = aSig>>( - shiftCount );
612
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
613
        float_exception_flags |= float_flag_inexact;
614
    }
615
    if ( aSign ) z = - z;
616
    return z;
617
 
618
}
619
 
620
/*----------------------------------------------------------------------------
621
| Returns the result of converting the single-precision floating-point value
622
| `a' to the double-precision floating-point format.  The conversion is
623
| performed according to the IEC/IEEE Standard for Binary Floating-Point
624
| Arithmetic.
625
*----------------------------------------------------------------------------*/
626
 
627
float64 float32_to_float64( float32 a )
628
{
629
    flag aSign;
630
    int16 aExp;
631
    bits32 aSig, zSig0, zSig1;
632
 
633
    aSig = extractFloat32Frac( a );
634
    aExp = extractFloat32Exp( a );
635
    aSign = extractFloat32Sign( a );
636
    if ( aExp == 0xFF ) {
637
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
638
        return packFloat64( aSign, 0x7FF, 0, 0 );
639
    }
640
    if ( aExp == 0 ) {
641
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
642
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
643
        --aExp;
644
    }
645
    shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
646
    return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
647
 
648
}
649
 
650
/*----------------------------------------------------------------------------
651
| Rounds the single-precision floating-point value `a' to an integer,
652
| and returns the result as a single-precision floating-point value.  The
653
| operation is performed according to the IEC/IEEE Standard for Binary
654
| Floating-Point Arithmetic.
655
*----------------------------------------------------------------------------*/
656
 
657
float32 float32_round_to_int( float32 a )
658
{
659
    flag aSign;
660
    int16 aExp;
661
    bits32 lastBitMask, roundBitsMask;
662
    int8 roundingMode;
663
    float32 z;
664
 
665
    aExp = extractFloat32Exp( a );
666
    if ( 0x96 <= aExp ) {
667
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
668
            return propagateFloat32NaN( a, a );
669
        }
670
        return a;
671
    }
672
    if ( aExp <= 0x7E ) {
673
        if ( (bits32) ( a<<1 ) == 0 ) return a;
674
        float_exception_flags |= float_flag_inexact;
675
        aSign = extractFloat32Sign( a );
676
        switch ( float_rounding_mode ) {
677
         case float_round_nearest_even:
678
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
679
                return packFloat32( aSign, 0x7F, 0 );
680
            }
681
            break;
682
         case float_round_down:
683
            return aSign ? 0xBF800000 : 0;
684
         case float_round_up:
685
            return aSign ? 0x80000000 : 0x3F800000;
686
        }
687
        return packFloat32( aSign, 0, 0 );
688
    }
689
    lastBitMask = 1;
690
    lastBitMask <<= 0x96 - aExp;
691
    roundBitsMask = lastBitMask - 1;
692
    z = a;
693
    roundingMode = float_rounding_mode;
694
    if ( roundingMode == float_round_nearest_even ) {
695
        z += lastBitMask>>1;
696
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
697
    }
698
    else if ( roundingMode != float_round_to_zero ) {
699
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
700
            z += roundBitsMask;
701
        }
702
    }
703
    z &= ~ roundBitsMask;
704
    if ( z != a ) float_exception_flags |= float_flag_inexact;
705
    return z;
706
 
707
}
708
 
709
/*----------------------------------------------------------------------------
710
| Returns the result of adding the absolute values of the single-precision
711
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
712
| before being returned.  `zSign' is ignored if the result is a NaN.
713
| The addition is performed according to the IEC/IEEE Standard for Binary
714
| Floating-Point Arithmetic.
715
*----------------------------------------------------------------------------*/
716
 
717
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
718
{
719
    int16 aExp, bExp, zExp;
720
    bits32 aSig, bSig, zSig;
721
    int16 expDiff;
722
 
723
    aSig = extractFloat32Frac( a );
724
    aExp = extractFloat32Exp( a );
725
    bSig = extractFloat32Frac( b );
726
    bExp = extractFloat32Exp( b );
727
    expDiff = aExp - bExp;
728
    aSig <<= 6;
729
    bSig <<= 6;
730
    if ( 0 < expDiff ) {
731
        if ( aExp == 0xFF ) {
732
            if ( aSig ) return propagateFloat32NaN( a, b );
733
            return a;
734
        }
735
        if ( bExp == 0 ) {
736
            --expDiff;
737
        }
738
        else {
739
            bSig |= 0x20000000;
740
        }
741
        shift32RightJamming( bSig, expDiff, &bSig );
742
        zExp = aExp;
743
    }
744
    else if ( expDiff < 0 ) {
745
        if ( bExp == 0xFF ) {
746
            if ( bSig ) return propagateFloat32NaN( a, b );
747
            return packFloat32( zSign, 0xFF, 0 );
748
        }
749
        if ( aExp == 0 ) {
750
            ++expDiff;
751
        }
752
        else {
753
            aSig |= 0x20000000;
754
        }
755
        shift32RightJamming( aSig, - expDiff, &aSig );
756
        zExp = bExp;
757
    }
758
    else {
759
        if ( aExp == 0xFF ) {
760
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
761
            return a;
762
        }
763
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
764
        zSig = 0x40000000 + aSig + bSig;
765
        zExp = aExp;
766
        goto roundAndPack;
767
    }
768
    aSig |= 0x20000000;
769
    zSig = ( aSig + bSig )<<1;
770
    --zExp;
771
    if ( (sbits32) zSig < 0 ) {
772
        zSig = aSig + bSig;
773
        ++zExp;
774
    }
775
 roundAndPack:
776
    return roundAndPackFloat32( zSign, zExp, zSig );
777
 
778
}
779
 
780
/*----------------------------------------------------------------------------
781
| Returns the result of subtracting the absolute values of the single-
782
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
783
| difference is negated before being returned.  `zSign' is ignored if the
784
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
785
| Standard for Binary Floating-Point Arithmetic.
786
*----------------------------------------------------------------------------*/
787
 
788
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
789
{
790
    int16 aExp, bExp, zExp;
791
    bits32 aSig, bSig, zSig;
792
    int16 expDiff;
793
 
794
    aSig = extractFloat32Frac( a );
795
    aExp = extractFloat32Exp( a );
796
    bSig = extractFloat32Frac( b );
797
    bExp = extractFloat32Exp( b );
798
    expDiff = aExp - bExp;
799
    aSig <<= 7;
800
    bSig <<= 7;
801
    if ( 0 < expDiff ) goto aExpBigger;
802
    if ( expDiff < 0 ) goto bExpBigger;
803
    if ( aExp == 0xFF ) {
804
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
805
        float_raise( float_flag_invalid );
806
        return float32_default_nan;
807
    }
808
    if ( aExp == 0 ) {
809
        aExp = 1;
810
        bExp = 1;
811
    }
812
    if ( bSig < aSig ) goto aBigger;
813
    if ( aSig < bSig ) goto bBigger;
814
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
815
 bExpBigger:
816
    if ( bExp == 0xFF ) {
817
        if ( bSig ) return propagateFloat32NaN( a, b );
818
        return packFloat32( zSign ^ 1, 0xFF, 0 );
819
    }
820
    if ( aExp == 0 ) {
821
        ++expDiff;
822
    }
823
    else {
824
        aSig |= 0x40000000;
825
    }
826
    shift32RightJamming( aSig, - expDiff, &aSig );
827
    bSig |= 0x40000000;
828
 bBigger:
829
    zSig = bSig - aSig;
830
    zExp = bExp;
831
    zSign ^= 1;
832
    goto normalizeRoundAndPack;
833
 aExpBigger:
834
    if ( aExp == 0xFF ) {
835
        if ( aSig ) return propagateFloat32NaN( a, b );
836
        return a;
837
    }
838
    if ( bExp == 0 ) {
839
        --expDiff;
840
    }
841
    else {
842
        bSig |= 0x40000000;
843
    }
844
    shift32RightJamming( bSig, expDiff, &bSig );
845
    aSig |= 0x40000000;
846
 aBigger:
847
    zSig = aSig - bSig;
848
    zExp = aExp;
849
 normalizeRoundAndPack:
850
    --zExp;
851
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
852
 
853
}
854
 
855
/*----------------------------------------------------------------------------
856
| Returns the result of adding the single-precision floating-point values `a'
857
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
858
| Binary Floating-Point Arithmetic.
859
*----------------------------------------------------------------------------*/
860
/*
861
        -- fpu operations (fpu_op_i):
862
                -- ========================
863
                -- 000 = add,
864
                -- 001 = substract,
865
                -- 010 = multiply,
866
                -- 011 = divide,
867
                -- 100 = square root
868
                -- 101 = unused
869
                -- 110 = unused
870
                -- 111 = unused
871
 
872
        -- Rounding Mode:
873
        -- ==============
874
        -- 00 = round to nearest even(default),
875
        -- 01 = round to zero,
876
        -- 10 = round up,
877
        -- 11 = round down
878
 
879
*/
880
void print_roundmode(void)
881
{
882
                if (float_rounding_mode==float_round_nearest_even) printf("00");         else
883
                if (float_rounding_mode==float_round_to_zero) printf("01");     else
884
                if (float_rounding_mode==float_round_up) printf("10");  else
885
                if (float_rounding_mode==float_round_down) printf("11");
886
 
887
}
888
 
889
 
890
float32 float32_add2( float32 a, float32 b )
891
{
892
    flag aSign, bSign;
893
 
894
    aSign = extractFloat32Sign( a );
895
    bSign = extractFloat32Sign( b );
896
    if ( aSign == bSign ) {
897
        return addFloat32Sigs( a, b, aSign );
898
    }
899
    else {
900
        return subFloat32Sigs( a, b, aSign );
901
    }
902
 
903
}
904
 
905
// for FPU testbench
906
float32 float32_add( float32 a, float32 b )
907
{
908
        float32 output = float32_add2( a, b );
909
                /*//This can be used for copy&paste test mode
910
                printf("\t\t\twait for CLK_PERIOD; start_i <= '1';\n");
911
                printf("\t\t\topa_i <= X\"%x\";\n",a);
912
                printf("\t\t\topb_i <= X\"%x\";\n",b);
913
                printf("\t\t\tfpu_op_i <= \"000\";\n");
914
                printf("\t\t\trmode_i <= \""); print_roundmode(); printf("\";\n");
915
                printf("\t\t\twait for CLK_PERIOD; start_i <= '0'; wait until ready_o='1';\n");
916
                printf("\t\t\tassert output_o=x\"%x\"\n" ,output);
917
                //for exceptions, but didn't work!!
918
                //printf("\t\t\tassert output_o=x\"%x\" and ine_o='%d' and overflow_o='%d' and underflow_o='%d' and div_zero_o='%d' and qnan_o='%d' \n" ,output,exceptions.ine ,exceptions.overflow, exceptions.underflow, exceptions.div_zero, exceptions.invalid);
919
                printf("\t\t\treport \"Error!!!\"\n");
920
                printf("\t\t\tseverity failure;\n\n");*/
921
 
922
 
923
                printf("%x       \n",a);
924
                printf("%x       \n",b);
925
                printf("000\n");
926
                print_roundmode(); printf("\n");
927
                printf("%x       \n\n", output);
928
 
929
        return output;
930
}
931
 
932
/*----------------------------------------------------------------------------
933
| Returns the result of subtracting the single-precision floating-point values
934
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
935
| for Binary Floating-Point Arithmetic.
936
*----------------------------------------------------------------------------*/
937
 
938
float32 float32_sub2( float32 a, float32 b )
939
{
940
    flag aSign, bSign;
941
 
942
    aSign = extractFloat32Sign( a );
943
    bSign = extractFloat32Sign( b );
944
    if ( aSign == bSign ) {
945
        return subFloat32Sigs( a, b, aSign );
946
    }
947
    else {
948
        return addFloat32Sigs( a, b, aSign );
949
    }
950
 
951
}
952
// for FPU testbench
953
float32 float32_sub( float32 a, float32 b )
954
{
955
        float32 output = float32_sub2( a, b );
956
                /*printf("\t\t\twait for CLK_PERIOD; start_i <= '1';\n");
957
                printf("\t\t\topa_i <= X\"%x\";\n",a);
958
                printf("\t\t\topb_i <= X\"%x\";\n",b);
959
                printf("\t\t\tfpu_op_i <= \"001\";\n");
960
                printf("\t\t\trmode_i <= \""); print_roundmode(); printf("\";\n");
961
                printf("\t\t\twait for CLK_PERIOD; start_i <= '0'; wait until ready_o='1';\n");
962
                printf("\t\t\tassert output_o=x\"%x\"\n" ,output);
963
                printf("\t\t\treport \"Error!!!\"\n");
964
                printf("\t\t\tseverity failure;\n\n");*/
965
 
966
                printf("%x       \n",a);
967
                printf("%x       \n",b);
968
                printf("001\n");
969
                print_roundmode(); printf("\n");
970
                printf("%x       \n\n", output);
971
 
972
        return output;
973
}
974
 
975
/*----------------------------------------------------------------------------
976
| Returns the result of multiplying the single-precision floating-point values
977
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
978
| for Binary Floating-Point Arithmetic.
979
*----------------------------------------------------------------------------*/
980
 
981
float32 float32_mul2( float32 a, float32 b )
982
{
983
    flag aSign, bSign, zSign;
984
    int16 aExp, bExp, zExp;
985
    bits32 aSig, bSig, zSig0, zSig1;
986
 
987
    aSig = extractFloat32Frac( a );
988
    aExp = extractFloat32Exp( a );
989
    aSign = extractFloat32Sign( a );
990
    bSig = extractFloat32Frac( b );
991
    bExp = extractFloat32Exp( b );
992
    bSign = extractFloat32Sign( b );
993
    zSign = aSign ^ bSign;
994
    if ( aExp == 0xFF ) {
995
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
996
            return propagateFloat32NaN( a, b );
997
        }
998
        if ( ( bExp | bSig ) == 0 ) {
999
            float_raise( float_flag_invalid );
1000
            return float32_default_nan;
1001
        }
1002
        return packFloat32( zSign, 0xFF, 0 );
1003
    }
1004
    if ( bExp == 0xFF ) {
1005
        if ( bSig ) return propagateFloat32NaN( a, b );
1006
        if ( ( aExp | aSig ) == 0 ) {
1007
            float_raise( float_flag_invalid );
1008
            return float32_default_nan;
1009
        }
1010
        return packFloat32( zSign, 0xFF, 0 );
1011
    }
1012
    if ( aExp == 0 ) {
1013
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1014
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1015
    }
1016
    if ( bExp == 0 ) {
1017
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1018
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1019
    }
1020
    zExp = aExp + bExp - 0x7F;
1021
    aSig = ( aSig | 0x00800000 )<<7;
1022
    bSig = ( bSig | 0x00800000 )<<8;
1023
    mul32To64( aSig, bSig, &zSig0, &zSig1 );
1024
    zSig0 |= ( zSig1 != 0 );
1025
    if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1026
        zSig0 <<= 1;
1027
        --zExp;
1028
    }
1029
    return roundAndPackFloat32( zSign, zExp, zSig0 );
1030
}
1031
// for FPU testbench
1032
float32 float32_mul( float32 a, float32 b )
1033
{
1034
        float32 output = float32_mul2( a, b );
1035
                /*printf("\t\t\twait for CLK_PERIOD; start_i <= '1';\n");
1036
                printf("\t\t\topa_i <= X\"%x\";\n",a);
1037
                printf("\t\t\topb_i <= X\"%x\";\n",b);
1038
                printf("\t\t\tfpu_op_i <= \"010\";\n");
1039
                printf("\t\t\trmode_i <= \""); print_roundmode(); printf("\";\n");
1040
                printf("\t\t\twait for CLK_PERIOD; start_i <= '0'; wait until ready_o='1';\n");
1041
                printf("\t\t\tassert output_o=x\"%x\"\n" ,output);
1042
                printf("\t\t\treport \"Error!!!\"\n");
1043
                printf("\t\t\tseverity failure;\n\n");*/
1044
 
1045
                printf("%x       \n",a);
1046
                printf("%x       \n",b);
1047
                printf("010\n");
1048
                print_roundmode(); printf("\n");
1049
                printf("%x       \n\n", output);
1050
        return output;
1051
}
1052
 
1053
/*----------------------------------------------------------------------------
1054
| Returns the result of dividing the single-precision floating-point value `a'
1055
| by the corresponding value `b'.  The operation is performed according to the
1056
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1057
*----------------------------------------------------------------------------*/
1058
 
1059
 
1060
float32 float32_div2( float32 a, float32 b )
1061
{
1062
    flag aSign, bSign, zSign;
1063
    int16 aExp, bExp, zExp;
1064
    bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1065
 
1066
    aSig = extractFloat32Frac( a );
1067
    aExp = extractFloat32Exp( a );
1068
    aSign = extractFloat32Sign( a );
1069
    bSig = extractFloat32Frac( b );
1070
    bExp = extractFloat32Exp( b );
1071
    bSign = extractFloat32Sign( b );
1072
    zSign = aSign ^ bSign;
1073
    if ( aExp == 0xFF ) {
1074
        if ( aSig ) return propagateFloat32NaN( a, b );
1075
        if ( bExp == 0xFF ) {
1076
            if ( bSig ) return propagateFloat32NaN( a, b );
1077
            float_raise( float_flag_invalid );
1078
            return float32_default_nan;
1079
        }
1080
        return packFloat32( zSign, 0xFF, 0 );
1081
    }
1082
    if ( bExp == 0xFF ) {
1083
        if ( bSig ) return propagateFloat32NaN( a, b );
1084
        return packFloat32( zSign, 0, 0 );
1085
    }
1086
    if ( bExp == 0 ) {
1087
        if ( bSig == 0 ) {
1088
            if ( ( aExp | aSig ) == 0 ) {
1089
                float_raise( float_flag_invalid );
1090
                return float32_default_nan;
1091
            }
1092
            float_raise( float_flag_divbyzero );
1093
            return packFloat32( zSign, 0xFF, 0 );
1094
        }
1095
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1096
    }
1097
    if ( aExp == 0 ) {
1098
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1099
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1100
    }
1101
    zExp = aExp - bExp + 0x7D;
1102
    aSig = ( aSig | 0x00800000 )<<7;
1103
    bSig = ( bSig | 0x00800000 )<<8;
1104
    if ( bSig <= ( aSig + aSig ) ) {
1105
        aSig >>= 1;
1106
        ++zExp;
1107
    }
1108
    zSig = estimateDiv64To32( aSig, 0, bSig );
1109
    if ( ( zSig & 0x3F ) <= 2 ) {
1110
        mul32To64( bSig, zSig, &term0, &term1 );
1111
        sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1112
        while ( (sbits32) rem0 < 0 ) {
1113
            --zSig;
1114
            add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1115
        }
1116
        zSig |= ( rem1 != 0 );
1117
    }
1118
    return roundAndPackFloat32( zSign, zExp, zSig );
1119
 
1120
}
1121
 
1122
// for FPU testbench
1123
float32 float32_div( float32 a, float32 b ){
1124
        float32 output = float32_div2( a, b );
1125
                /*printf("\t\t\twait for CLK_PERIOD; start_i <= '1';\n");
1126
                printf("\t\t\topa_i <= X\"%x\";\n",a);
1127
                printf("\t\t\topb_i <= X\"%x\";\n",b);
1128
                printf("\t\t\tfpu_op_i <= \"011\";\n");
1129
                printf("\t\t\trmode_i <= \""); print_roundmode(); printf("\";\n");
1130
                printf("\t\t\twait for CLK_PERIOD; start_i <= '0'; wait until ready_o='1';\n");
1131
                printf("\t\t\tassert output_o=x\"%x\"\n" ,output);
1132
                printf("\t\t\treport \"Error!!!\"\n");
1133
                printf("\t\t\tseverity failure;\n\n");*/
1134
 
1135
                printf("%x       \n",a);
1136
                printf("%x       \n",b);
1137
                printf("011\n");
1138
                print_roundmode(); printf("\n");
1139
                printf("%x       \n\n", output);
1140
        return output;
1141
}
1142
 
1143
/*----------------------------------------------------------------------------
1144
| Returns the remainder of the single-precision floating-point value `a'
1145
| with respect to the corresponding value `b'.  The operation is performed
1146
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147
*----------------------------------------------------------------------------*/
1148
 
1149
float32 float32_rem( float32 a, float32 b )
1150
{
1151
    flag aSign, bSign, zSign;
1152
    int16 aExp, bExp, expDiff;
1153
    bits32 aSig, bSig, q, allZero, alternateASig;
1154
    sbits32 sigMean;
1155
 
1156
    aSig = extractFloat32Frac( a );
1157
    aExp = extractFloat32Exp( a );
1158
    aSign = extractFloat32Sign( a );
1159
    bSig = extractFloat32Frac( b );
1160
    bExp = extractFloat32Exp( b );
1161
    bSign = extractFloat32Sign( b );
1162
    if ( aExp == 0xFF ) {
1163
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1164
            return propagateFloat32NaN( a, b );
1165
        }
1166
        float_raise( float_flag_invalid );
1167
        return float32_default_nan;
1168
    }
1169
    if ( bExp == 0xFF ) {
1170
        if ( bSig ) return propagateFloat32NaN( a, b );
1171
        return a;
1172
    }
1173
    if ( bExp == 0 ) {
1174
        if ( bSig == 0 ) {
1175
            float_raise( float_flag_invalid );
1176
            return float32_default_nan;
1177
        }
1178
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1179
    }
1180
    if ( aExp == 0 ) {
1181
        if ( aSig == 0 ) return a;
1182
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1183
    }
1184
    expDiff = aExp - bExp;
1185
    aSig = ( aSig | 0x00800000 )<<8;
1186
    bSig = ( bSig | 0x00800000 )<<8;
1187
    if ( expDiff < 0 ) {
1188
        if ( expDiff < -1 ) return a;
1189
        aSig >>= 1;
1190
    }
1191
    q = ( bSig <= aSig );
1192
    if ( q ) aSig -= bSig;
1193
    expDiff -= 32;
1194
    while ( 0 < expDiff ) {
1195
        q = estimateDiv64To32( aSig, 0, bSig );
1196
        q = ( 2 < q ) ? q - 2 : 0;
1197
        aSig = - ( ( bSig>>2 ) * q );
1198
        expDiff -= 30;
1199
    }
1200
    expDiff += 32;
1201
    if ( 0 < expDiff ) {
1202
        q = estimateDiv64To32( aSig, 0, bSig );
1203
        q = ( 2 < q ) ? q - 2 : 0;
1204
        q >>= 32 - expDiff;
1205
        bSig >>= 2;
1206
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1207
    }
1208
    else {
1209
        aSig >>= 2;
1210
        bSig >>= 2;
1211
    }
1212
    do {
1213
        alternateASig = aSig;
1214
        ++q;
1215
        aSig -= bSig;
1216
    } while ( 0 <= (sbits32) aSig );
1217
    sigMean = aSig + alternateASig;
1218
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1219
        aSig = alternateASig;
1220
    }
1221
    zSign = ( (sbits32) aSig < 0 );
1222
    if ( zSign ) aSig = - aSig;
1223
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1224
 
1225
}
1226
 
1227
/*----------------------------------------------------------------------------
1228
| Returns the square root of the single-precision floating-point value `a'.
1229
| The operation is performed according to the IEC/IEEE Standard for Binary
1230
| Floating-Point Arithmetic.
1231
*----------------------------------------------------------------------------*/
1232
 
1233
float32 float32_sqrt2( float32 a )
1234
{
1235
    flag aSign;
1236
    int16 aExp, zExp;
1237
    bits32 aSig, zSig, rem0, rem1, term0, term1;
1238
 
1239
    aSig = extractFloat32Frac( a );
1240
    aExp = extractFloat32Exp( a );
1241
    aSign = extractFloat32Sign( a );
1242
    if ( aExp == 0xFF ) {
1243
        if ( aSig ) return propagateFloat32NaN( a, 0 );
1244
        if ( ! aSign ) return a;
1245
        float_raise( float_flag_invalid );
1246
        return float32_default_nan;
1247
    }
1248
    if ( aSign ) {
1249
        if ( ( aExp | aSig ) == 0 ) return a;
1250
        float_raise( float_flag_invalid );
1251
        return float32_default_nan;
1252
    }
1253
    if ( aExp == 0 ) {
1254
        if ( aSig == 0 ) return 0;
1255
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1256
    }
1257
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1258
    aSig = ( aSig | 0x00800000 )<<8;
1259
    zSig = estimateSqrt32( aExp, aSig ) + 2;
1260
    if ( ( zSig & 0x7F ) <= 5 ) {
1261
        if ( zSig < 2 ) {
1262
            zSig = 0x7FFFFFFF;
1263
            goto roundAndPack;
1264
        }
1265
        else {
1266
            aSig >>= aExp & 1;
1267
            mul32To64( zSig, zSig, &term0, &term1 );
1268
            sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1269
            while ( (sbits32) rem0 < 0 ) {
1270
                --zSig;
1271
                shortShift64Left( 0, zSig, 1, &term0, &term1 );
1272
                term1 |= 1;
1273
                add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1274
            }
1275
            zSig |= ( ( rem0 | rem1 ) != 0 );
1276
        }
1277
    }
1278
    shift32RightJamming( zSig, 1, &zSig );
1279
 roundAndPack:
1280
    return roundAndPackFloat32( 0, zExp, zSig );
1281
 
1282
}
1283
// for FPU testbench
1284
float32 float32_sqrt( float32 a ){
1285
        float32 output = float32_sqrt2( a );
1286
                /*printf("\t\t\twait for CLK_PERIOD; start_i <= '1';\n");
1287
                if(a==0) printf("\t\t\topa_i <= X\"00000000\";\n"); else printf("\t\t\topa_i <= X\"%x\";\n",a);
1288
                //printf("\t\t\topb_i <= X\"%x\";\n",b);
1289
                printf("\t\t\tfpu_op_i <= \"100\";\n");
1290
                printf("\t\t\trmode_i <= \""); print_roundmode(); printf("\";\n");
1291
                printf("\t\t\twait for CLK_PERIOD; start_i <= '0'; wait until ready_o='1';\n");
1292
                printf("\t\t\tassert output_o=x\"%x\"\n" ,output);
1293
                printf("\t\t\treport \"Error!!!\"\n");
1294
                printf("\t\t\tseverity failure;\n\n");*/
1295
 
1296
                printf("%x       \n",a);
1297
                printf("00000000       \n");
1298
                printf("100\n");
1299
                print_roundmode(); printf("\n");
1300
                printf("%x       \n\n", output);
1301
        return output;
1302
}
1303
 
1304
/*----------------------------------------------------------------------------
1305
| Returns 1 if the single-precision floating-point value `a' is equal to
1306
| the corresponding value `b', and 0 otherwise.  The comparison is performed
1307
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308
*----------------------------------------------------------------------------*/
1309
 
1310
flag float32_eq( float32 a, float32 b )
1311
{
1312
 
1313
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1314
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1315
       ) {
1316
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1317
            float_raise( float_flag_invalid );
1318
        }
1319
        return 0;
1320
    }
1321
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1322
 
1323
}
1324
 
1325
/*----------------------------------------------------------------------------
1326
| Returns 1 if the single-precision floating-point value `a' is less than
1327
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
1328
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1329
| Arithmetic.
1330
*----------------------------------------------------------------------------*/
1331
 
1332
flag float32_le( float32 a, float32 b )
1333
{
1334
    flag aSign, bSign;
1335
 
1336
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1337
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1338
       ) {
1339
        float_raise( float_flag_invalid );
1340
        return 0;
1341
    }
1342
    aSign = extractFloat32Sign( a );
1343
    bSign = extractFloat32Sign( b );
1344
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1345
    return ( a == b ) || ( aSign ^ ( a < b ) );
1346
 
1347
}
1348
 
1349
/*----------------------------------------------------------------------------
1350
| Returns 1 if the single-precision floating-point value `a' is less than
1351
| the corresponding value `b', and 0 otherwise.  The comparison is performed
1352
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1353
*----------------------------------------------------------------------------*/
1354
 
1355
flag float32_lt( float32 a, float32 b )
1356
{
1357
    flag aSign, bSign;
1358
 
1359
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1360
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1361
       ) {
1362
        float_raise( float_flag_invalid );
1363
        return 0;
1364
    }
1365
    aSign = extractFloat32Sign( a );
1366
    bSign = extractFloat32Sign( b );
1367
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1368
    return ( a != b ) && ( aSign ^ ( a < b ) );
1369
 
1370
}
1371
 
1372
/*----------------------------------------------------------------------------
1373
| Returns 1 if the single-precision floating-point value `a' is equal to
1374
| the corresponding value `b', and 0 otherwise.  The invalid exception is
1375
| raised if either operand is a NaN.  Otherwise, the comparison is performed
1376
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1377
*----------------------------------------------------------------------------*/
1378
 
1379
flag float32_eq_signaling( float32 a, float32 b )
1380
{
1381
 
1382
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1383
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1384
       ) {
1385
        float_raise( float_flag_invalid );
1386
        return 0;
1387
    }
1388
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1389
 
1390
}
1391
 
1392
/*----------------------------------------------------------------------------
1393
| Returns 1 if the single-precision floating-point value `a' is less than or
1394
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1395
| cause an exception.  Otherwise, the comparison is performed according to the
1396
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1397
*----------------------------------------------------------------------------*/
1398
 
1399
flag float32_le_quiet( float32 a, float32 b )
1400
{
1401
    flag aSign, bSign;
1402
    int16 aExp, bExp;
1403
 
1404
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1405
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1406
       ) {
1407
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1408
            float_raise( float_flag_invalid );
1409
        }
1410
        return 0;
1411
    }
1412
    aSign = extractFloat32Sign( a );
1413
    bSign = extractFloat32Sign( b );
1414
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1415
    return ( a == b ) || ( aSign ^ ( a < b ) );
1416
 
1417
}
1418
 
1419
/*----------------------------------------------------------------------------
1420
| Returns 1 if the single-precision floating-point value `a' is less than
1421
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1422
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1423
| Standard for Binary Floating-Point Arithmetic.
1424
*----------------------------------------------------------------------------*/
1425
 
1426
flag float32_lt_quiet( float32 a, float32 b )
1427
{
1428
    flag aSign, bSign;
1429
 
1430
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1431
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1432
       ) {
1433
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1434
            float_raise( float_flag_invalid );
1435
        }
1436
        return 0;
1437
    }
1438
    aSign = extractFloat32Sign( a );
1439
    bSign = extractFloat32Sign( b );
1440
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1441
    return ( a != b ) && ( aSign ^ ( a < b ) );
1442
 
1443
}
1444
 
1445
/*----------------------------------------------------------------------------
1446
| Returns the result of converting the double-precision floating-point value
1447
| `a' to the 32-bit two's complement integer format.  The conversion is
1448
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1449
| Arithmetic---which means in particular that the conversion is rounded
1450
| according to the current rounding mode.  If `a' is a NaN, the largest
1451
| positive integer is returned.  Otherwise, if the conversion overflows, the
1452
| largest integer with the same sign as `a' is returned.
1453
*----------------------------------------------------------------------------*/
1454
 
1455
int32 float64_to_int32( float64 a )
1456
{
1457
    flag aSign;
1458
    int16 aExp, shiftCount;
1459
    bits32 aSig0, aSig1, absZ, aSigExtra;
1460
    int32 z;
1461
    int8 roundingMode;
1462
 
1463
    aSig1 = extractFloat64Frac1( a );
1464
    aSig0 = extractFloat64Frac0( a );
1465
    aExp = extractFloat64Exp( a );
1466
    aSign = extractFloat64Sign( a );
1467
    shiftCount = aExp - 0x413;
1468
    if ( 0 <= shiftCount ) {
1469
        if ( 0x41E < aExp ) {
1470
            if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1471
            goto invalid;
1472
        }
1473
        shortShift64Left(
1474
            aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1475
        if ( 0x80000000 < absZ ) goto invalid;
1476
    }
1477
    else {
1478
        aSig1 = ( aSig1 != 0 );
1479
        if ( aExp < 0x3FE ) {
1480
            aSigExtra = aExp | aSig0 | aSig1;
1481
            absZ = 0;
1482
        }
1483
        else {
1484
            aSig0 |= 0x00100000;
1485
            aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1486
            absZ = aSig0>>( - shiftCount );
1487
        }
1488
    }
1489
    roundingMode = float_rounding_mode;
1490
    if ( roundingMode == float_round_nearest_even ) {
1491
        if ( (sbits32) aSigExtra < 0 ) {
1492
            ++absZ;
1493
            if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1494
        }
1495
        z = aSign ? - absZ : absZ;
1496
    }
1497
    else {
1498
        aSigExtra = ( aSigExtra != 0 );
1499
        if ( aSign ) {
1500
            z = - (   absZ
1501
                    + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1502
        }
1503
        else {
1504
            z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1505
        }
1506
    }
1507
    if ( ( aSign ^ ( z < 0 ) ) && z ) {
1508
 invalid:
1509
        float_raise( float_flag_invalid );
1510
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1511
    }
1512
    if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1513
    return z;
1514
 
1515
}
1516
 
1517
/*----------------------------------------------------------------------------
1518
| Returns the result of converting the double-precision floating-point value
1519
| `a' to the 32-bit two's complement integer format.  The conversion is
1520
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1521
| Arithmetic, except that the conversion is always rounded toward zero.
1522
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1523
| the conversion overflows, the largest integer with the same sign as `a' is
1524
| returned.
1525
*----------------------------------------------------------------------------*/
1526
 
1527
int32 float64_to_int32_round_to_zero( float64 a )
1528
{
1529
    flag aSign;
1530
    int16 aExp, shiftCount;
1531
    bits32 aSig0, aSig1, absZ, aSigExtra;
1532
    int32 z;
1533
 
1534
    aSig1 = extractFloat64Frac1( a );
1535
    aSig0 = extractFloat64Frac0( a );
1536
    aExp = extractFloat64Exp( a );
1537
    aSign = extractFloat64Sign( a );
1538
    shiftCount = aExp - 0x413;
1539
    if ( 0 <= shiftCount ) {
1540
        if ( 0x41E < aExp ) {
1541
            if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1542
            goto invalid;
1543
        }
1544
        shortShift64Left(
1545
            aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1546
    }
1547
    else {
1548
        if ( aExp < 0x3FF ) {
1549
            if ( aExp | aSig0 | aSig1 ) {
1550
                float_exception_flags |= float_flag_inexact;
1551
            }
1552
            return 0;
1553
        }
1554
        aSig0 |= 0x00100000;
1555
        aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1556
        absZ = aSig0>>( - shiftCount );
1557
    }
1558
    z = aSign ? - absZ : absZ;
1559
    if ( ( aSign ^ ( z < 0 ) ) && z ) {
1560
 invalid:
1561
        float_raise( float_flag_invalid );
1562
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1563
    }
1564
    if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1565
    return z;
1566
 
1567
}
1568
 
1569
/*----------------------------------------------------------------------------
1570
| Returns the result of converting the double-precision floating-point value
1571
| `a' to the single-precision floating-point format.  The conversion is
1572
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1573
| Arithmetic.
1574
*----------------------------------------------------------------------------*/
1575
 
1576
float32 float64_to_float32( float64 a )
1577
{
1578
    flag aSign;
1579
    int16 aExp;
1580
    bits32 aSig0, aSig1, zSig;
1581
    bits32 allZero;
1582
 
1583
    aSig1 = extractFloat64Frac1( a );
1584
    aSig0 = extractFloat64Frac0( a );
1585
    aExp = extractFloat64Exp( a );
1586
    aSign = extractFloat64Sign( a );
1587
    if ( aExp == 0x7FF ) {
1588
        if ( aSig0 | aSig1 ) {
1589
            return commonNaNToFloat32( float64ToCommonNaN( a ) );
1590
        }
1591
        return packFloat32( aSign, 0xFF, 0 );
1592
    }
1593
    shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1594
    if ( aExp ) zSig |= 0x40000000;
1595
    return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1596
 
1597
}
1598
 
1599
/*----------------------------------------------------------------------------
1600
| Rounds the double-precision floating-point value `a' to an integer,
1601
| and returns the result as a double-precision floating-point value.  The
1602
| operation is performed according to the IEC/IEEE Standard for Binary
1603
| Floating-Point Arithmetic.
1604
*----------------------------------------------------------------------------*/
1605
 
1606
float64 float64_round_to_int( float64 a )
1607
{
1608
    flag aSign;
1609
    int16 aExp;
1610
    bits32 lastBitMask, roundBitsMask;
1611
    int8 roundingMode;
1612
    float64 z;
1613
 
1614
    aExp = extractFloat64Exp( a );
1615
    if ( 0x413 <= aExp ) {
1616
        if ( 0x433 <= aExp ) {
1617
            if (    ( aExp == 0x7FF )
1618
                 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1619
                return propagateFloat64NaN( a, a );
1620
            }
1621
            return a;
1622
        }
1623
        lastBitMask = 1;
1624
        lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1625
        roundBitsMask = lastBitMask - 1;
1626
        z = a;
1627
        roundingMode = float_rounding_mode;
1628
        if ( roundingMode == float_round_nearest_even ) {
1629
            if ( lastBitMask ) {
1630
                add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1631
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1632
            }
1633
            else {
1634
                if ( (sbits32) z.low < 0 ) {
1635
                    ++z.high;
1636
                    if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1637
                }
1638
            }
1639
        }
1640
        else if ( roundingMode != float_round_to_zero ) {
1641
            if (   extractFloat64Sign( z )
1642
                 ^ ( roundingMode == float_round_up ) ) {
1643
                add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1644
            }
1645
        }
1646
        z.low &= ~ roundBitsMask;
1647
    }
1648
    else {
1649
        if ( aExp <= 0x3FE ) {
1650
            if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1651
            float_exception_flags |= float_flag_inexact;
1652
            aSign = extractFloat64Sign( a );
1653
            switch ( float_rounding_mode ) {
1654
             case float_round_nearest_even:
1655
                if (    ( aExp == 0x3FE )
1656
                     && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1657
                   ) {
1658
                    return packFloat64( aSign, 0x3FF, 0, 0 );
1659
                }
1660
                break;
1661
             case float_round_down:
1662
                return
1663
                      aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1664
                    : packFloat64( 0, 0, 0, 0 );
1665
             case float_round_up:
1666
                return
1667
                      aSign ? packFloat64( 1, 0, 0, 0 )
1668
                    : packFloat64( 0, 0x3FF, 0, 0 );
1669
            }
1670
            return packFloat64( aSign, 0, 0, 0 );
1671
        }
1672
        lastBitMask = 1;
1673
        lastBitMask <<= 0x413 - aExp;
1674
        roundBitsMask = lastBitMask - 1;
1675
        z.low = 0;
1676
        z.high = a.high;
1677
        roundingMode = float_rounding_mode;
1678
        if ( roundingMode == float_round_nearest_even ) {
1679
            z.high += lastBitMask>>1;
1680
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1681
                z.high &= ~ lastBitMask;
1682
            }
1683
        }
1684
        else if ( roundingMode != float_round_to_zero ) {
1685
            if (   extractFloat64Sign( z )
1686
                 ^ ( roundingMode == float_round_up ) ) {
1687
                z.high |= ( a.low != 0 );
1688
                z.high += roundBitsMask;
1689
            }
1690
        }
1691
        z.high &= ~ roundBitsMask;
1692
    }
1693
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1694
        float_exception_flags |= float_flag_inexact;
1695
    }
1696
    return z;
1697
 
1698
}
1699
 
1700
/*----------------------------------------------------------------------------
1701
| Returns the result of adding the absolute values of the double-precision
1702
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1703
| before being returned.  `zSign' is ignored if the result is a NaN.
1704
| The addition is performed according to the IEC/IEEE Standard for Binary
1705
| Floating-Point Arithmetic.
1706
*----------------------------------------------------------------------------*/
1707
 
1708
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1709
{
1710
    int16 aExp, bExp, zExp;
1711
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1712
    int16 expDiff;
1713
 
1714
    aSig1 = extractFloat64Frac1( a );
1715
    aSig0 = extractFloat64Frac0( a );
1716
    aExp = extractFloat64Exp( a );
1717
    bSig1 = extractFloat64Frac1( b );
1718
    bSig0 = extractFloat64Frac0( b );
1719
    bExp = extractFloat64Exp( b );
1720
    expDiff = aExp - bExp;
1721
    if ( 0 < expDiff ) {
1722
        if ( aExp == 0x7FF ) {
1723
            if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1724
            return a;
1725
        }
1726
        if ( bExp == 0 ) {
1727
            --expDiff;
1728
        }
1729
        else {
1730
            bSig0 |= 0x00100000;
1731
        }
1732
        shift64ExtraRightJamming(
1733
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1734
        zExp = aExp;
1735
    }
1736
    else if ( expDiff < 0 ) {
1737
        if ( bExp == 0x7FF ) {
1738
            if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1739
            return packFloat64( zSign, 0x7FF, 0, 0 );
1740
        }
1741
        if ( aExp == 0 ) {
1742
            ++expDiff;
1743
        }
1744
        else {
1745
            aSig0 |= 0x00100000;
1746
        }
1747
        shift64ExtraRightJamming(
1748
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1749
        zExp = bExp;
1750
    }
1751
    else {
1752
        if ( aExp == 0x7FF ) {
1753
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1754
                return propagateFloat64NaN( a, b );
1755
            }
1756
            return a;
1757
        }
1758
        add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1759
        if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1760
        zSig2 = 0;
1761
        zSig0 |= 0x00200000;
1762
        zExp = aExp;
1763
        goto shiftRight1;
1764
    }
1765
    aSig0 |= 0x00100000;
1766
    add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1767
    --zExp;
1768
    if ( zSig0 < 0x00200000 ) goto roundAndPack;
1769
    ++zExp;
1770
 shiftRight1:
1771
    shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1772
 roundAndPack:
1773
    return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1774
 
1775
}
1776
 
1777
/*----------------------------------------------------------------------------
1778
| Returns the result of subtracting the absolute values of the double-
1779
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1780
| difference is negated before being returned.  `zSign' is ignored if the
1781
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1782
| Standard for Binary Floating-Point Arithmetic.
1783
*----------------------------------------------------------------------------*/
1784
 
1785
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1786
{
1787
    int16 aExp, bExp, zExp;
1788
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1789
    int16 expDiff;
1790
    float64 z;
1791
 
1792
    aSig1 = extractFloat64Frac1( a );
1793
    aSig0 = extractFloat64Frac0( a );
1794
    aExp = extractFloat64Exp( a );
1795
    bSig1 = extractFloat64Frac1( b );
1796
    bSig0 = extractFloat64Frac0( b );
1797
    bExp = extractFloat64Exp( b );
1798
    expDiff = aExp - bExp;
1799
    shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1800
    shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1801
    if ( 0 < expDiff ) goto aExpBigger;
1802
    if ( expDiff < 0 ) goto bExpBigger;
1803
    if ( aExp == 0x7FF ) {
1804
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1805
            return propagateFloat64NaN( a, b );
1806
        }
1807
        float_raise( float_flag_invalid );
1808
        z.low = float64_default_nan_low;
1809
        z.high = float64_default_nan_high;
1810
        return z;
1811
    }
1812
    if ( aExp == 0 ) {
1813
        aExp = 1;
1814
        bExp = 1;
1815
    }
1816
    if ( bSig0 < aSig0 ) goto aBigger;
1817
    if ( aSig0 < bSig0 ) goto bBigger;
1818
    if ( bSig1 < aSig1 ) goto aBigger;
1819
    if ( aSig1 < bSig1 ) goto bBigger;
1820
    return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1821
 bExpBigger:
1822
    if ( bExp == 0x7FF ) {
1823
        if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1824
        return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1825
    }
1826
    if ( aExp == 0 ) {
1827
        ++expDiff;
1828
    }
1829
    else {
1830
        aSig0 |= 0x40000000;
1831
    }
1832
    shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1833
    bSig0 |= 0x40000000;
1834
 bBigger:
1835
    sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1836
    zExp = bExp;
1837
    zSign ^= 1;
1838
    goto normalizeRoundAndPack;
1839
 aExpBigger:
1840
    if ( aExp == 0x7FF ) {
1841
        if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1842
        return a;
1843
    }
1844
    if ( bExp == 0 ) {
1845
        --expDiff;
1846
    }
1847
    else {
1848
        bSig0 |= 0x40000000;
1849
    }
1850
    shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1851
    aSig0 |= 0x40000000;
1852
 aBigger:
1853
    sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1854
    zExp = aExp;
1855
 normalizeRoundAndPack:
1856
    --zExp;
1857
    return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1858
 
1859
}
1860
 
1861
/*----------------------------------------------------------------------------
1862
| Returns the result of adding the double-precision floating-point values `a'
1863
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1864
| Binary Floating-Point Arithmetic.
1865
*----------------------------------------------------------------------------*/
1866
 
1867
float64 float64_add( float64 a, float64 b )
1868
{
1869
    flag aSign, bSign;
1870
 
1871
    aSign = extractFloat64Sign( a );
1872
    bSign = extractFloat64Sign( b );
1873
    if ( aSign == bSign ) {
1874
        return addFloat64Sigs( a, b, aSign );
1875
    }
1876
    else {
1877
        return subFloat64Sigs( a, b, aSign );
1878
    }
1879
 
1880
}
1881
 
1882
/*----------------------------------------------------------------------------
1883
| Returns the result of subtracting the double-precision floating-point values
1884
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1885
| for Binary Floating-Point Arithmetic.
1886
*----------------------------------------------------------------------------*/
1887
 
1888
float64 float64_sub( float64 a, float64 b )
1889
{
1890
    flag aSign, bSign;
1891
 
1892
    aSign = extractFloat64Sign( a );
1893
    bSign = extractFloat64Sign( b );
1894
    if ( aSign == bSign ) {
1895
        return subFloat64Sigs( a, b, aSign );
1896
    }
1897
    else {
1898
        return addFloat64Sigs( a, b, aSign );
1899
    }
1900
 
1901
}
1902
 
1903
/*----------------------------------------------------------------------------
1904
| Returns the result of multiplying the double-precision floating-point values
1905
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1906
| for Binary Floating-Point Arithmetic.
1907
*----------------------------------------------------------------------------*/
1908
 
1909
float64 float64_mul( float64 a, float64 b )
1910
{
1911
    flag aSign, bSign, zSign;
1912
    int16 aExp, bExp, zExp;
1913
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1914
    float64 z;
1915
 
1916
    aSig1 = extractFloat64Frac1( a );
1917
    aSig0 = extractFloat64Frac0( a );
1918
    aExp = extractFloat64Exp( a );
1919
    aSign = extractFloat64Sign( a );
1920
    bSig1 = extractFloat64Frac1( b );
1921
    bSig0 = extractFloat64Frac0( b );
1922
    bExp = extractFloat64Exp( b );
1923
    bSign = extractFloat64Sign( b );
1924
    zSign = aSign ^ bSign;
1925
    if ( aExp == 0x7FF ) {
1926
        if (    ( aSig0 | aSig1 )
1927
             || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1928
            return propagateFloat64NaN( a, b );
1929
        }
1930
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1931
        return packFloat64( zSign, 0x7FF, 0, 0 );
1932
    }
1933
    if ( bExp == 0x7FF ) {
1934
        if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1935
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1936
 invalid:
1937
            float_raise( float_flag_invalid );
1938
            z.low = float64_default_nan_low;
1939
            z.high = float64_default_nan_high;
1940
            return z;
1941
        }
1942
        return packFloat64( zSign, 0x7FF, 0, 0 );
1943
    }
1944
    if ( aExp == 0 ) {
1945
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1946
        normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1947
    }
1948
    if ( bExp == 0 ) {
1949
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1950
        normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1951
    }
1952
    zExp = aExp + bExp - 0x400;
1953
    aSig0 |= 0x00100000;
1954
    shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1955
    mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1956
    add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1957
    zSig2 |= ( zSig3 != 0 );
1958
    if ( 0x00200000 <= zSig0 ) {
1959
        shift64ExtraRightJamming(
1960
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1961
        ++zExp;
1962
    }
1963
    return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1964
 
1965
}
1966
 
1967
/*----------------------------------------------------------------------------
1968
| Returns the result of dividing the double-precision floating-point value `a'
1969
| by the corresponding value `b'.  The operation is performed according to the
1970
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971
*----------------------------------------------------------------------------*/
1972
 
1973
float64 float64_div( float64 a, float64 b )
1974
{
1975
    flag aSign, bSign, zSign;
1976
    int16 aExp, bExp, zExp;
1977
    bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1978
    bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1979
    float64 z;
1980
 
1981
    aSig1 = extractFloat64Frac1( a );
1982
    aSig0 = extractFloat64Frac0( a );
1983
    aExp = extractFloat64Exp( a );
1984
    aSign = extractFloat64Sign( a );
1985
    bSig1 = extractFloat64Frac1( b );
1986
    bSig0 = extractFloat64Frac0( b );
1987
    bExp = extractFloat64Exp( b );
1988
    bSign = extractFloat64Sign( b );
1989
    zSign = aSign ^ bSign;
1990
    if ( aExp == 0x7FF ) {
1991
        if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1992
        if ( bExp == 0x7FF ) {
1993
            if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1994
            goto invalid;
1995
        }
1996
        return packFloat64( zSign, 0x7FF, 0, 0 );
1997
    }
1998
    if ( bExp == 0x7FF ) {
1999
        if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2000
        return packFloat64( zSign, 0, 0, 0 );
2001
    }
2002
    if ( bExp == 0 ) {
2003
        if ( ( bSig0 | bSig1 ) == 0 ) {
2004
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
2005
 invalid:
2006
                float_raise( float_flag_invalid );
2007
                z.low = float64_default_nan_low;
2008
                z.high = float64_default_nan_high;
2009
                return z;
2010
            }
2011
            float_raise( float_flag_divbyzero );
2012
            return packFloat64( zSign, 0x7FF, 0, 0 );
2013
        }
2014
        normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2015
    }
2016
    if ( aExp == 0 ) {
2017
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
2018
        normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2019
    }
2020
    zExp = aExp - bExp + 0x3FD;
2021
    shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
2022
    shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2023
    if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
2024
        shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
2025
        ++zExp;
2026
    }
2027
    zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
2028
    mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
2029
    sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
2030
    while ( (sbits32) rem0 < 0 ) {
2031
        --zSig0;
2032
        add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
2033
    }
2034
    zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
2035
    if ( ( zSig1 & 0x3FF ) <= 4 ) {
2036
        mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
2037
        sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
2038
        while ( (sbits32) rem1 < 0 ) {
2039
            --zSig1;
2040
            add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
2041
        }
2042
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2043
    }
2044
    shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2045
    return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2046
 
2047
}
2048
 
2049
/*----------------------------------------------------------------------------
2050
| Returns the remainder of the double-precision floating-point value `a'
2051
| with respect to the corresponding value `b'.  The operation is performed
2052
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2053
*----------------------------------------------------------------------------*/
2054
 
2055
float64 float64_rem( float64 a, float64 b )
2056
{
2057
    flag aSign, bSign, zSign;
2058
    int16 aExp, bExp, expDiff;
2059
    bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2060
    bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2061
    sbits32 sigMean0;
2062
    float64 z;
2063
 
2064
    aSig1 = extractFloat64Frac1( a );
2065
    aSig0 = extractFloat64Frac0( a );
2066
    aExp = extractFloat64Exp( a );
2067
    aSign = extractFloat64Sign( a );
2068
    bSig1 = extractFloat64Frac1( b );
2069
    bSig0 = extractFloat64Frac0( b );
2070
    bExp = extractFloat64Exp( b );
2071
    bSign = extractFloat64Sign( b );
2072
    if ( aExp == 0x7FF ) {
2073
        if (    ( aSig0 | aSig1 )
2074
             || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2075
            return propagateFloat64NaN( a, b );
2076
        }
2077
        goto invalid;
2078
    }
2079
    if ( bExp == 0x7FF ) {
2080
        if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2081
        return a;
2082
    }
2083
    if ( bExp == 0 ) {
2084
        if ( ( bSig0 | bSig1 ) == 0 ) {
2085
 invalid:
2086
            float_raise( float_flag_invalid );
2087
            z.low = float64_default_nan_low;
2088
            z.high = float64_default_nan_high;
2089
            return z;
2090
        }
2091
        normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2092
    }
2093
    if ( aExp == 0 ) {
2094
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
2095
        normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2096
    }
2097
    expDiff = aExp - bExp;
2098
    if ( expDiff < -1 ) return a;
2099
    shortShift64Left(
2100
        aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2101
    shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2102
    q = le64( bSig0, bSig1, aSig0, aSig1 );
2103
    if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2104
    expDiff -= 32;
2105
    while ( 0 < expDiff ) {
2106
        q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2107
        q = ( 4 < q ) ? q - 4 : 0;
2108
        mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2109
        shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2110
        shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2111
        sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2112
        expDiff -= 29;
2113
    }
2114
    if ( -32 < expDiff ) {
2115
        q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2116
        q = ( 4 < q ) ? q - 4 : 0;
2117
        q >>= - expDiff;
2118
        shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2119
        expDiff += 24;
2120
        if ( expDiff < 0 ) {
2121
            shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2122
        }
2123
        else {
2124
            shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2125
        }
2126
        mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2127
        sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2128
    }
2129
    else {
2130
        shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2131
        shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2132
    }
2133
    do {
2134
        alternateASig0 = aSig0;
2135
        alternateASig1 = aSig1;
2136
        ++q;
2137
        sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2138
    } while ( 0 <= (sbits32) aSig0 );
2139
    add64(
2140
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2141
    if (    ( sigMean0 < 0 )
2142
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2143
        aSig0 = alternateASig0;
2144
        aSig1 = alternateASig1;
2145
    }
2146
    zSign = ( (sbits32) aSig0 < 0 );
2147
    if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2148
    return
2149
        normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2150
 
2151
}
2152
 
2153
/*----------------------------------------------------------------------------
2154
| Returns the square root of the double-precision floating-point value `a'.
2155
| The operation is performed according to the IEC/IEEE Standard for Binary
2156
| Floating-Point Arithmetic.
2157
*----------------------------------------------------------------------------*/
2158
 
2159
float64 float64_sqrt( float64 a )
2160
{
2161
    flag aSign;
2162
    int16 aExp, zExp;
2163
    bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2164
    bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2165
    float64 z;
2166
 
2167
    aSig1 = extractFloat64Frac1( a );
2168
    aSig0 = extractFloat64Frac0( a );
2169
    aExp = extractFloat64Exp( a );
2170
    aSign = extractFloat64Sign( a );
2171
    if ( aExp == 0x7FF ) {
2172
        if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2173
        if ( ! aSign ) return a;
2174
        goto invalid;
2175
    }
2176
    if ( aSign ) {
2177
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2178
 invalid:
2179
        float_raise( float_flag_invalid );
2180
        z.low = float64_default_nan_low;
2181
        z.high = float64_default_nan_high;
2182
        return z;
2183
    }
2184
    if ( aExp == 0 ) {
2185
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2186
        normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2187
    }
2188
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2189
    aSig0 |= 0x00100000;
2190
    shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2191
    zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2192
    if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2193
    doubleZSig0 = zSig0 + zSig0;
2194
    shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2195
    mul32To64( zSig0, zSig0, &term0, &term1 );
2196
    sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2197
    while ( (sbits32) rem0 < 0 ) {
2198
        --zSig0;
2199
        doubleZSig0 -= 2;
2200
        add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2201
    }
2202
    zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2203
    if ( ( zSig1 & 0x1FF ) <= 5 ) {
2204
        if ( zSig1 == 0 ) zSig1 = 1;
2205
        mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2206
        sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2207
        mul32To64( zSig1, zSig1, &term2, &term3 );
2208
        sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2209
        while ( (sbits32) rem1 < 0 ) {
2210
            --zSig1;
2211
            shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2212
            term3 |= 1;
2213
            term2 |= doubleZSig0;
2214
            add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2215
        }
2216
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2217
    }
2218
    shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2219
    return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2220
 
2221
}
2222
 
2223
/*----------------------------------------------------------------------------
2224
| Returns 1 if the double-precision floating-point value `a' is equal to
2225
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2226
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2227
*----------------------------------------------------------------------------*/
2228
 
2229
flag float64_eq( float64 a, float64 b )
2230
{
2231
 
2232
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2233
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2234
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2235
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2236
       ) {
2237
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2238
            float_raise( float_flag_invalid );
2239
        }
2240
        return 0;
2241
    }
2242
    return
2243
           ( a.low == b.low )
2244
        && (    ( a.high == b.high )
2245
             || (    ( a.low == 0 )
2246
                  && ( (bits32) ( ( a.high | b.high )<<1 ) == 0 ) )
2247
           );
2248
 
2249
}
2250
 
2251
/*----------------------------------------------------------------------------
2252
| Returns 1 if the double-precision floating-point value `a' is less than
2253
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2254
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2255
| Arithmetic.
2256
*----------------------------------------------------------------------------*/
2257
 
2258
flag float64_le( float64 a, float64 b )
2259
{
2260
    flag aSign, bSign;
2261
 
2262
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2263
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2264
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2265
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2266
       ) {
2267
        float_raise( float_flag_invalid );
2268
        return 0;
2269
    }
2270
    aSign = extractFloat64Sign( a );
2271
    bSign = extractFloat64Sign( b );
2272
    if ( aSign != bSign ) {
2273
        return
2274
               aSign
2275
            || (    ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2276
                 == 0 );
2277
    }
2278
    return
2279
          aSign ? le64( b.high, b.low, a.high, a.low )
2280
        : le64( a.high, a.low, b.high, b.low );
2281
 
2282
}
2283
 
2284
/*----------------------------------------------------------------------------
2285
| Returns 1 if the double-precision floating-point value `a' is less than
2286
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2287
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2288
*----------------------------------------------------------------------------*/
2289
 
2290
flag float64_lt( float64 a, float64 b )
2291
{
2292
    flag aSign, bSign;
2293
 
2294
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2295
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2296
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2297
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2298
       ) {
2299
        float_raise( float_flag_invalid );
2300
        return 0;
2301
    }
2302
    aSign = extractFloat64Sign( a );
2303
    bSign = extractFloat64Sign( b );
2304
    if ( aSign != bSign ) {
2305
        return
2306
               aSign
2307
            && (    ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2308
                 != 0 );
2309
    }
2310
    return
2311
          aSign ? lt64( b.high, b.low, a.high, a.low )
2312
        : lt64( a.high, a.low, b.high, b.low );
2313
 
2314
}
2315
 
2316
/*----------------------------------------------------------------------------
2317
| Returns 1 if the double-precision floating-point value `a' is equal to
2318
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2319
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2320
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2321
*----------------------------------------------------------------------------*/
2322
 
2323
flag float64_eq_signaling( float64 a, float64 b )
2324
{
2325
 
2326
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2327
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2328
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2329
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2330
       ) {
2331
        float_raise( float_flag_invalid );
2332
        return 0;
2333
    }
2334
    return
2335
           ( a.low == b.low )
2336
        && (    ( a.high == b.high )
2337
             || (    ( a.low == 0 )
2338
                  && ( (bits32) ( ( a.high | b.high )<<1 ) == 0 ) )
2339
           );
2340
 
2341
}
2342
 
2343
/*----------------------------------------------------------------------------
2344
| Returns 1 if the double-precision floating-point value `a' is less than or
2345
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2346
| cause an exception.  Otherwise, the comparison is performed according to the
2347
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2348
*----------------------------------------------------------------------------*/
2349
 
2350
flag float64_le_quiet( float64 a, float64 b )
2351
{
2352
    flag aSign, bSign;
2353
 
2354
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2355
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2356
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2357
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2358
       ) {
2359
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2360
            float_raise( float_flag_invalid );
2361
        }
2362
        return 0;
2363
    }
2364
    aSign = extractFloat64Sign( a );
2365
    bSign = extractFloat64Sign( b );
2366
    if ( aSign != bSign ) {
2367
        return
2368
               aSign
2369
            || (    ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2370
                 == 0 );
2371
    }
2372
    return
2373
          aSign ? le64( b.high, b.low, a.high, a.low )
2374
        : le64( a.high, a.low, b.high, b.low );
2375
 
2376
}
2377
 
2378
/*----------------------------------------------------------------------------
2379
| Returns 1 if the double-precision floating-point value `a' is less than
2380
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2381
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2382
| Standard for Binary Floating-Point Arithmetic.
2383
*----------------------------------------------------------------------------*/
2384
 
2385
flag float64_lt_quiet( float64 a, float64 b )
2386
{
2387
    flag aSign, bSign;
2388
 
2389
    if (    (    ( extractFloat64Exp( a ) == 0x7FF )
2390
              && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2391
         || (    ( extractFloat64Exp( b ) == 0x7FF )
2392
              && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2393
       ) {
2394
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2395
            float_raise( float_flag_invalid );
2396
        }
2397
        return 0;
2398
    }
2399
    aSign = extractFloat64Sign( a );
2400
    bSign = extractFloat64Sign( b );
2401
    if ( aSign != bSign ) {
2402
        return
2403
               aSign
2404
            && (    ( ( (bits32) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2405
                 != 0 );
2406
    }
2407
    return
2408
          aSign ? lt64( b.high, b.low, a.high, a.low )
2409
        : lt64( a.high, a.low, b.high, b.low );
2410
 
2411
}
2412
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.