OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [arm/] [nwfpe/] [softfloat.c] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1275 phoenix
/*
2
===============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point
5
Arithmetic Package, Release 2.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15
arithmetic/softfloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22
 
23
Derivative works are acceptable, even for commercial purposes, so long as
24
(1) they include prominent notice that the work is derivative, and (2) they
25
include prominent notice akin to these three paragraphs for those parts of
26
this code that are retained.
27
 
28
===============================================================================
29
*/
30
 
31
#include "fpa11.h"
32
//#include "milieu.h"
33
//#include "softfloat.h"
34
 
35
/*
36
-------------------------------------------------------------------------------
37
Floating-point rounding mode, extended double-precision rounding precision,
38
and exception flags.
39
-------------------------------------------------------------------------------
40
*/
41
int8 float_rounding_mode = float_round_nearest_even;
42
int8 floatx80_rounding_precision = 80;
43
int8 float_exception_flags;
44
 
45
/*
46
-------------------------------------------------------------------------------
47
Primitive arithmetic functions, including multi-word arithmetic, and
48
division and square root approximations.  (Can be specialized to target if
49
desired.)
50
-------------------------------------------------------------------------------
51
*/
52
#include "softfloat-macros"
53
 
54
/*
55
-------------------------------------------------------------------------------
56
Functions and definitions to determine:  (1) whether tininess for underflow
57
is detected before or after rounding by default, (2) what (if anything)
58
happens when exceptions are raised, (3) how signaling NaNs are distinguished
59
from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60
are propagated from function inputs to output.  These details are target-
61
specific.
62
-------------------------------------------------------------------------------
63
*/
64
#include "softfloat-specialize"
65
 
66
/*
67
-------------------------------------------------------------------------------
68
Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69
and 7, and returns the properly rounded 32-bit integer corresponding to the
70
input.  If `zSign' is nonzero, the input is negated before being converted
71
to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
72
input is simply rounded to an integer, with the inexact exception raised if
73
the input cannot be represented exactly as an integer.  If the fixed-point
74
input is too large, however, the invalid exception is raised and the largest
75
positive or negative integer is returned.
76
-------------------------------------------------------------------------------
77
*/
78
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
79
{
80
    int8 roundingMode;
81
    flag roundNearestEven;
82
    int8 roundIncrement, roundBits;
83
    int32 z;
84
 
85
    roundingMode = float_rounding_mode;
86
    roundNearestEven = ( roundingMode == float_round_nearest_even );
87
    roundIncrement = 0x40;
88
    if ( ! roundNearestEven ) {
89
        if ( roundingMode == float_round_to_zero ) {
90
            roundIncrement = 0;
91
        }
92
        else {
93
            roundIncrement = 0x7F;
94
            if ( zSign ) {
95
                if ( roundingMode == float_round_up ) roundIncrement = 0;
96
            }
97
            else {
98
                if ( roundingMode == float_round_down ) roundIncrement = 0;
99
            }
100
        }
101
    }
102
    roundBits = absZ & 0x7F;
103
    absZ = ( absZ + roundIncrement )>>7;
104
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
105
    z = absZ;
106
    if ( zSign ) z = - z;
107
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108
        float_exception_flags |= float_flag_invalid;
109
        return zSign ? 0x80000000 : 0x7FFFFFFF;
110
    }
111
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
112
    return z;
113
 
114
}
115
 
116
/*
117
-------------------------------------------------------------------------------
118
Returns the fraction bits of the single-precision floating-point value `a'.
119
-------------------------------------------------------------------------------
120
*/
121
INLINE bits32 extractFloat32Frac( float32 a )
122
{
123
 
124
    return a & 0x007FFFFF;
125
 
126
}
127
 
128
/*
129
-------------------------------------------------------------------------------
130
Returns the exponent bits of the single-precision floating-point value `a'.
131
-------------------------------------------------------------------------------
132
*/
133
INLINE int16 extractFloat32Exp( float32 a )
134
{
135
 
136
    return ( a>>23 ) & 0xFF;
137
 
138
}
139
 
140
/*
141
-------------------------------------------------------------------------------
142
Returns the sign bit of the single-precision floating-point value `a'.
143
-------------------------------------------------------------------------------
144
*/
145
#if 0   /* in softfloat.h */
146
INLINE flag extractFloat32Sign( float32 a )
147
{
148
 
149
    return a>>31;
150
 
151
}
152
#endif
153
 
154
/*
155
-------------------------------------------------------------------------------
156
Normalizes the subnormal single-precision floating-point value represented
157
by the denormalized significand `aSig'.  The normalized exponent and
158
significand are stored at the locations pointed to by `zExpPtr' and
159
`zSigPtr', respectively.
160
-------------------------------------------------------------------------------
161
*/
162
static void
163
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
164
{
165
    int8 shiftCount;
166
 
167
    shiftCount = countLeadingZeros32( aSig ) - 8;
168
    *zSigPtr = aSig<<shiftCount;
169
    *zExpPtr = 1 - shiftCount;
170
 
171
}
172
 
173
/*
174
-------------------------------------------------------------------------------
175
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
176
single-precision floating-point value, returning the result.  After being
177
shifted into the proper positions, the three fields are simply added
178
together to form the result.  This means that any integer portion of `zSig'
179
will be added into the exponent.  Since a properly normalized significand
180
will have an integer portion equal to 1, the `zExp' input should be 1 less
181
than the desired result exponent whenever `zSig' is a complete, normalized
182
significand.
183
-------------------------------------------------------------------------------
184
*/
185
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
186
{
187
#if 0
188
   float32 f;
189
   __asm__("@ packFloat32                               \n\
190
            mov %0, %1, asl #31                         \n\
191
            orr %0, %2, asl #23                         \n\
192
            orr %0, %3"
193
            : /* no outputs */
194
            : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
195
            : "cc");
196
   return f;
197
#else
198
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
199
#endif 
200
}
201
 
202
/*
203
-------------------------------------------------------------------------------
204
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
205
and significand `zSig', and returns the proper single-precision floating-
206
point value corresponding to the abstract input.  Ordinarily, the abstract
207
value is simply rounded and packed into the single-precision format, with
208
the inexact exception raised if the abstract input cannot be represented
209
exactly.  If the abstract value is too large, however, the overflow and
210
inexact exceptions are raised and an infinity or maximal finite value is
211
returned.  If the abstract value is too small, the input value is rounded to
212
a subnormal number, and the underflow and inexact exceptions are raised if
213
the abstract input cannot be represented exactly as a subnormal single-
214
precision floating-point number.
215
    The input significand `zSig' has its binary point between bits 30
216
and 29, which is 7 bits to the left of the usual location.  This shifted
217
significand must be normalized or smaller.  If `zSig' is not normalized,
218
`zExp' must be 0; in that case, the result returned is a subnormal number,
219
and it must not require rounding.  In the usual case that `zSig' is
220
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
221
The handling of underflow and overflow follows the IEC/IEEE Standard for
222
Binary Floating-point Arithmetic.
223
-------------------------------------------------------------------------------
224
*/
225
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
226
{
227
    int8 roundingMode;
228
    flag roundNearestEven;
229
    int8 roundIncrement, roundBits;
230
    flag isTiny;
231
 
232
    roundingMode = float_rounding_mode;
233
    roundNearestEven = ( roundingMode == float_round_nearest_even );
234
    roundIncrement = 0x40;
235
    if ( ! roundNearestEven ) {
236
        if ( roundingMode == float_round_to_zero ) {
237
            roundIncrement = 0;
238
        }
239
        else {
240
            roundIncrement = 0x7F;
241
            if ( zSign ) {
242
                if ( roundingMode == float_round_up ) roundIncrement = 0;
243
            }
244
            else {
245
                if ( roundingMode == float_round_down ) roundIncrement = 0;
246
            }
247
        }
248
    }
249
    roundBits = zSig & 0x7F;
250
    if ( 0xFD <= (bits16) zExp ) {
251
        if (    ( 0xFD < zExp )
252
             || (    ( zExp == 0xFD )
253
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
254
           ) {
255
            float_raise( float_flag_overflow | float_flag_inexact );
256
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
257
        }
258
        if ( zExp < 0 ) {
259
            isTiny =
260
                   ( float_detect_tininess == float_tininess_before_rounding )
261
                || ( zExp < -1 )
262
                || ( zSig + roundIncrement < 0x80000000 );
263
            shift32RightJamming( zSig, - zExp, &zSig );
264
            zExp = 0;
265
            roundBits = zSig & 0x7F;
266
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
267
        }
268
    }
269
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
270
    zSig = ( zSig + roundIncrement )>>7;
271
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
272
    if ( zSig == 0 ) zExp = 0;
273
    return packFloat32( zSign, zExp, zSig );
274
 
275
}
276
 
277
/*
278
-------------------------------------------------------------------------------
279
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
280
and significand `zSig', and returns the proper single-precision floating-
281
point value corresponding to the abstract input.  This routine is just like
282
`roundAndPackFloat32' except that `zSig' does not have to be normalized in
283
any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
284
point exponent.
285
-------------------------------------------------------------------------------
286
*/
287
static float32
288
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
289
{
290
    int8 shiftCount;
291
 
292
    shiftCount = countLeadingZeros32( zSig ) - 1;
293
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
294
 
295
}
296
 
297
/*
298
-------------------------------------------------------------------------------
299
Returns the fraction bits of the double-precision floating-point value `a'.
300
-------------------------------------------------------------------------------
301
*/
302
INLINE bits64 extractFloat64Frac( float64 a )
303
{
304
 
305
    return a & LIT64( 0x000FFFFFFFFFFFFF );
306
 
307
}
308
 
309
/*
310
-------------------------------------------------------------------------------
311
Returns the exponent bits of the double-precision floating-point value `a'.
312
-------------------------------------------------------------------------------
313
*/
314
INLINE int16 extractFloat64Exp( float64 a )
315
{
316
 
317
    return ( a>>52 ) & 0x7FF;
318
 
319
}
320
 
321
/*
322
-------------------------------------------------------------------------------
323
Returns the sign bit of the double-precision floating-point value `a'.
324
-------------------------------------------------------------------------------
325
*/
326
#if 0   /* in softfloat.h */
327
INLINE flag extractFloat64Sign( float64 a )
328
{
329
 
330
    return a>>63;
331
 
332
}
333
#endif
334
 
335
/*
336
-------------------------------------------------------------------------------
337
Normalizes the subnormal double-precision floating-point value represented
338
by the denormalized significand `aSig'.  The normalized exponent and
339
significand are stored at the locations pointed to by `zExpPtr' and
340
`zSigPtr', respectively.
341
-------------------------------------------------------------------------------
342
*/
343
static void
344
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
345
{
346
    int8 shiftCount;
347
 
348
    shiftCount = countLeadingZeros64( aSig ) - 11;
349
    *zSigPtr = aSig<<shiftCount;
350
    *zExpPtr = 1 - shiftCount;
351
 
352
}
353
 
354
/*
355
-------------------------------------------------------------------------------
356
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
357
double-precision floating-point value, returning the result.  After being
358
shifted into the proper positions, the three fields are simply added
359
together to form the result.  This means that any integer portion of `zSig'
360
will be added into the exponent.  Since a properly normalized significand
361
will have an integer portion equal to 1, the `zExp' input should be 1 less
362
than the desired result exponent whenever `zSig' is a complete, normalized
363
significand.
364
-------------------------------------------------------------------------------
365
*/
366
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
367
{
368
 
369
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
370
 
371
}
372
 
373
/*
374
-------------------------------------------------------------------------------
375
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
376
and significand `zSig', and returns the proper double-precision floating-
377
point value corresponding to the abstract input.  Ordinarily, the abstract
378
value is simply rounded and packed into the double-precision format, with
379
the inexact exception raised if the abstract input cannot be represented
380
exactly.  If the abstract value is too large, however, the overflow and
381
inexact exceptions are raised and an infinity or maximal finite value is
382
returned.  If the abstract value is too small, the input value is rounded to
383
a subnormal number, and the underflow and inexact exceptions are raised if
384
the abstract input cannot be represented exactly as a subnormal double-
385
precision floating-point number.
386
    The input significand `zSig' has its binary point between bits 62
387
and 61, which is 10 bits to the left of the usual location.  This shifted
388
significand must be normalized or smaller.  If `zSig' is not normalized,
389
`zExp' must be 0; in that case, the result returned is a subnormal number,
390
and it must not require rounding.  In the usual case that `zSig' is
391
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
392
The handling of underflow and overflow follows the IEC/IEEE Standard for
393
Binary Floating-point Arithmetic.
394
-------------------------------------------------------------------------------
395
*/
396
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
397
{
398
    int8 roundingMode;
399
    flag roundNearestEven;
400
    int16 roundIncrement, roundBits;
401
    flag isTiny;
402
 
403
    roundingMode = float_rounding_mode;
404
    roundNearestEven = ( roundingMode == float_round_nearest_even );
405
    roundIncrement = 0x200;
406
    if ( ! roundNearestEven ) {
407
        if ( roundingMode == float_round_to_zero ) {
408
            roundIncrement = 0;
409
        }
410
        else {
411
            roundIncrement = 0x3FF;
412
            if ( zSign ) {
413
                if ( roundingMode == float_round_up ) roundIncrement = 0;
414
            }
415
            else {
416
                if ( roundingMode == float_round_down ) roundIncrement = 0;
417
            }
418
        }
419
    }
420
    roundBits = zSig & 0x3FF;
421
    if ( 0x7FD <= (bits16) zExp ) {
422
        if (    ( 0x7FD < zExp )
423
             || (    ( zExp == 0x7FD )
424
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
425
           ) {
426
            //register int lr = __builtin_return_address(0);
427
            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
428
            float_raise( float_flag_overflow | float_flag_inexact );
429
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
430
        }
431
        if ( zExp < 0 ) {
432
            isTiny =
433
                   ( float_detect_tininess == float_tininess_before_rounding )
434
                || ( zExp < -1 )
435
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
436
            shift64RightJamming( zSig, - zExp, &zSig );
437
            zExp = 0;
438
            roundBits = zSig & 0x3FF;
439
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
440
        }
441
    }
442
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
443
    zSig = ( zSig + roundIncrement )>>10;
444
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
445
    if ( zSig == 0 ) zExp = 0;
446
    return packFloat64( zSign, zExp, zSig );
447
 
448
}
449
 
450
/*
451
-------------------------------------------------------------------------------
452
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
453
and significand `zSig', and returns the proper double-precision floating-
454
point value corresponding to the abstract input.  This routine is just like
455
`roundAndPackFloat64' except that `zSig' does not have to be normalized in
456
any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
457
point exponent.
458
-------------------------------------------------------------------------------
459
*/
460
static float64
461
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
462
{
463
    int8 shiftCount;
464
 
465
    shiftCount = countLeadingZeros64( zSig ) - 1;
466
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
467
 
468
}
469
 
470
#ifdef FLOATX80
471
 
472
/*
473
-------------------------------------------------------------------------------
474
Returns the fraction bits of the extended double-precision floating-point
475
value `a'.
476
-------------------------------------------------------------------------------
477
*/
478
INLINE bits64 extractFloatx80Frac( floatx80 a )
479
{
480
 
481
    return a.low;
482
 
483
}
484
 
485
/*
486
-------------------------------------------------------------------------------
487
Returns the exponent bits of the extended double-precision floating-point
488
value `a'.
489
-------------------------------------------------------------------------------
490
*/
491
INLINE int32 extractFloatx80Exp( floatx80 a )
492
{
493
 
494
    return a.high & 0x7FFF;
495
 
496
}
497
 
498
/*
499
-------------------------------------------------------------------------------
500
Returns the sign bit of the extended double-precision floating-point value
501
`a'.
502
-------------------------------------------------------------------------------
503
*/
504
INLINE flag extractFloatx80Sign( floatx80 a )
505
{
506
 
507
    return a.high>>15;
508
 
509
}
510
 
511
/*
512
-------------------------------------------------------------------------------
513
Normalizes the subnormal extended double-precision floating-point value
514
represented by the denormalized significand `aSig'.  The normalized exponent
515
and significand are stored at the locations pointed to by `zExpPtr' and
516
`zSigPtr', respectively.
517
-------------------------------------------------------------------------------
518
*/
519
static void
520
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
521
{
522
    int8 shiftCount;
523
 
524
    shiftCount = countLeadingZeros64( aSig );
525
    *zSigPtr = aSig<<shiftCount;
526
    *zExpPtr = 1 - shiftCount;
527
 
528
}
529
 
530
/*
531
-------------------------------------------------------------------------------
532
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
533
extended double-precision floating-point value, returning the result.
534
-------------------------------------------------------------------------------
535
*/
536
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
537
{
538
    floatx80 z;
539
 
540
    z.low = zSig;
541
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
542
    return z;
543
 
544
}
545
 
546
/*
547
-------------------------------------------------------------------------------
548
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549
and extended significand formed by the concatenation of `zSig0' and `zSig1',
550
and returns the proper extended double-precision floating-point value
551
corresponding to the abstract input.  Ordinarily, the abstract value is
552
rounded and packed into the extended double-precision format, with the
553
inexact exception raised if the abstract input cannot be represented
554
exactly.  If the abstract value is too large, however, the overflow and
555
inexact exceptions are raised and an infinity or maximal finite value is
556
returned.  If the abstract value is too small, the input value is rounded to
557
a subnormal number, and the underflow and inexact exceptions are raised if
558
the abstract input cannot be represented exactly as a subnormal extended
559
double-precision floating-point number.
560
    If `roundingPrecision' is 32 or 64, the result is rounded to the same
561
number of bits as single or double precision, respectively.  Otherwise, the
562
result is rounded to the full precision of the extended double-precision
563
format.
564
    The input significand must be normalized or smaller.  If the input
565
significand is not normalized, `zExp' must be 0; in that case, the result
566
returned is a subnormal number, and it must not require rounding.  The
567
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
568
Floating-point Arithmetic.
569
-------------------------------------------------------------------------------
570
*/
571
static floatx80
572
 roundAndPackFloatx80(
573
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
574
 )
575
{
576
    int8 roundingMode;
577
    flag roundNearestEven, increment, isTiny;
578
    int64 roundIncrement, roundMask, roundBits;
579
 
580
    roundingMode = float_rounding_mode;
581
    roundNearestEven = ( roundingMode == float_round_nearest_even );
582
    if ( roundingPrecision == 80 ) goto precision80;
583
    if ( roundingPrecision == 64 ) {
584
        roundIncrement = LIT64( 0x0000000000000400 );
585
        roundMask = LIT64( 0x00000000000007FF );
586
    }
587
    else if ( roundingPrecision == 32 ) {
588
        roundIncrement = LIT64( 0x0000008000000000 );
589
        roundMask = LIT64( 0x000000FFFFFFFFFF );
590
    }
591
    else {
592
        goto precision80;
593
    }
594
    zSig0 |= ( zSig1 != 0 );
595
    if ( ! roundNearestEven ) {
596
        if ( roundingMode == float_round_to_zero ) {
597
            roundIncrement = 0;
598
        }
599
        else {
600
            roundIncrement = roundMask;
601
            if ( zSign ) {
602
                if ( roundingMode == float_round_up ) roundIncrement = 0;
603
            }
604
            else {
605
                if ( roundingMode == float_round_down ) roundIncrement = 0;
606
            }
607
        }
608
    }
609
    roundBits = zSig0 & roundMask;
610
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
611
        if (    ( 0x7FFE < zExp )
612
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
613
           ) {
614
            goto overflow;
615
        }
616
        if ( zExp <= 0 ) {
617
            isTiny =
618
                   ( float_detect_tininess == float_tininess_before_rounding )
619
                || ( zExp < 0 )
620
                || ( zSig0 <= zSig0 + roundIncrement );
621
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
622
            zExp = 0;
623
            roundBits = zSig0 & roundMask;
624
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
625
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
626
            zSig0 += roundIncrement;
627
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
628
            roundIncrement = roundMask + 1;
629
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
630
                roundMask |= roundIncrement;
631
            }
632
            zSig0 &= ~ roundMask;
633
            return packFloatx80( zSign, zExp, zSig0 );
634
        }
635
    }
636
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
637
    zSig0 += roundIncrement;
638
    if ( zSig0 < roundIncrement ) {
639
        ++zExp;
640
        zSig0 = LIT64( 0x8000000000000000 );
641
    }
642
    roundIncrement = roundMask + 1;
643
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
644
        roundMask |= roundIncrement;
645
    }
646
    zSig0 &= ~ roundMask;
647
    if ( zSig0 == 0 ) zExp = 0;
648
    return packFloatx80( zSign, zExp, zSig0 );
649
 precision80:
650
    increment = ( (sbits64) zSig1 < 0 );
651
    if ( ! roundNearestEven ) {
652
        if ( roundingMode == float_round_to_zero ) {
653
            increment = 0;
654
        }
655
        else {
656
            if ( zSign ) {
657
                increment = ( roundingMode == float_round_down ) && zSig1;
658
            }
659
            else {
660
                increment = ( roundingMode == float_round_up ) && zSig1;
661
            }
662
        }
663
    }
664
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
665
        if (    ( 0x7FFE < zExp )
666
             || (    ( zExp == 0x7FFE )
667
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
668
                  && increment
669
                )
670
           ) {
671
            roundMask = 0;
672
 overflow:
673
            float_raise( float_flag_overflow | float_flag_inexact );
674
            if (    ( roundingMode == float_round_to_zero )
675
                 || ( zSign && ( roundingMode == float_round_up ) )
676
                 || ( ! zSign && ( roundingMode == float_round_down ) )
677
               ) {
678
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
679
            }
680
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
681
        }
682
        if ( zExp <= 0 ) {
683
            isTiny =
684
                   ( float_detect_tininess == float_tininess_before_rounding )
685
                || ( zExp < 0 )
686
                || ! increment
687
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
688
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
689
            zExp = 0;
690
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
691
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
692
            if ( roundNearestEven ) {
693
                increment = ( (sbits64) zSig1 < 0 );
694
            }
695
            else {
696
                if ( zSign ) {
697
                    increment = ( roundingMode == float_round_down ) && zSig1;
698
                }
699
                else {
700
                    increment = ( roundingMode == float_round_up ) && zSig1;
701
                }
702
            }
703
            if ( increment ) {
704
                ++zSig0;
705
                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
706
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
707
            }
708
            return packFloatx80( zSign, zExp, zSig0 );
709
        }
710
    }
711
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
712
    if ( increment ) {
713
        ++zSig0;
714
        if ( zSig0 == 0 ) {
715
            ++zExp;
716
            zSig0 = LIT64( 0x8000000000000000 );
717
        }
718
        else {
719
            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
720
        }
721
    }
722
    else {
723
        if ( zSig0 == 0 ) zExp = 0;
724
    }
725
 
726
    return packFloatx80( zSign, zExp, zSig0 );
727
}
728
 
729
/*
730
-------------------------------------------------------------------------------
731
Takes an abstract floating-point value having sign `zSign', exponent
732
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
733
and returns the proper extended double-precision floating-point value
734
corresponding to the abstract input.  This routine is just like
735
`roundAndPackFloatx80' except that the input significand does not have to be
736
normalized.
737
-------------------------------------------------------------------------------
738
*/
739
static floatx80
740
 normalizeRoundAndPackFloatx80(
741
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
742
 )
743
{
744
    int8 shiftCount;
745
 
746
    if ( zSig0 == 0 ) {
747
        zSig0 = zSig1;
748
        zSig1 = 0;
749
        zExp -= 64;
750
    }
751
    shiftCount = countLeadingZeros64( zSig0 );
752
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
753
    zExp -= shiftCount;
754
    return
755
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
756
 
757
}
758
 
759
#endif
760
 
761
/*
762
-------------------------------------------------------------------------------
763
Returns the result of converting the 32-bit two's complement integer `a' to
764
the single-precision floating-point format.  The conversion is performed
765
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
766
-------------------------------------------------------------------------------
767
*/
768
float32 int32_to_float32( int32 a )
769
{
770
    flag zSign;
771
 
772
    if ( a == 0 ) return 0;
773
    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
774
    zSign = ( a < 0 );
775
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
776
 
777
}
778
 
779
/*
780
-------------------------------------------------------------------------------
781
Returns the result of converting the 32-bit two's complement integer `a' to
782
the double-precision floating-point format.  The conversion is performed
783
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
784
-------------------------------------------------------------------------------
785
*/
786
float64 int32_to_float64( int32 a )
787
{
788
    flag aSign;
789
    uint32 absA;
790
    int8 shiftCount;
791
    bits64 zSig;
792
 
793
    if ( a == 0 ) return 0;
794
    aSign = ( a < 0 );
795
    absA = aSign ? - a : a;
796
    shiftCount = countLeadingZeros32( absA ) + 21;
797
    zSig = absA;
798
    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
799
 
800
}
801
 
802
#ifdef FLOATX80
803
 
804
/*
805
-------------------------------------------------------------------------------
806
Returns the result of converting the 32-bit two's complement integer `a'
807
to the extended double-precision floating-point format.  The conversion
808
is performed according to the IEC/IEEE Standard for Binary Floating-point
809
Arithmetic.
810
-------------------------------------------------------------------------------
811
*/
812
floatx80 int32_to_floatx80( int32 a )
813
{
814
    flag zSign;
815
    uint32 absA;
816
    int8 shiftCount;
817
    bits64 zSig;
818
 
819
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
820
    zSign = ( a < 0 );
821
    absA = zSign ? - a : a;
822
    shiftCount = countLeadingZeros32( absA ) + 32;
823
    zSig = absA;
824
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
825
 
826
}
827
 
828
#endif
829
 
830
/*
831
-------------------------------------------------------------------------------
832
Returns the result of converting the single-precision floating-point value
833
`a' to the 32-bit two's complement integer format.  The conversion is
834
performed according to the IEC/IEEE Standard for Binary Floating-point
835
Arithmetic---which means in particular that the conversion is rounded
836
according to the current rounding mode.  If `a' is a NaN, the largest
837
positive integer is returned.  Otherwise, if the conversion overflows, the
838
largest integer with the same sign as `a' is returned.
839
-------------------------------------------------------------------------------
840
*/
841
int32 float32_to_int32( float32 a )
842
{
843
    flag aSign;
844
    int16 aExp, shiftCount;
845
    bits32 aSig;
846
    bits64 zSig;
847
 
848
    aSig = extractFloat32Frac( a );
849
    aExp = extractFloat32Exp( a );
850
    aSign = extractFloat32Sign( a );
851
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
852
    if ( aExp ) aSig |= 0x00800000;
853
    shiftCount = 0xAF - aExp;
854
    zSig = aSig;
855
    zSig <<= 32;
856
    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
857
    return roundAndPackInt32( aSign, zSig );
858
 
859
}
860
 
861
/*
862
-------------------------------------------------------------------------------
863
Returns the result of converting the single-precision floating-point value
864
`a' to the 32-bit two's complement integer format.  The conversion is
865
performed according to the IEC/IEEE Standard for Binary Floating-point
866
Arithmetic, except that the conversion is always rounded toward zero.  If
867
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
868
conversion overflows, the largest integer with the same sign as `a' is
869
returned.
870
-------------------------------------------------------------------------------
871
*/
872
int32 float32_to_int32_round_to_zero( float32 a )
873
{
874
    flag aSign;
875
    int16 aExp, shiftCount;
876
    bits32 aSig;
877
    int32 z;
878
 
879
    aSig = extractFloat32Frac( a );
880
    aExp = extractFloat32Exp( a );
881
    aSign = extractFloat32Sign( a );
882
    shiftCount = aExp - 0x9E;
883
    if ( 0 <= shiftCount ) {
884
        if ( a == 0xCF000000 ) return 0x80000000;
885
        float_raise( float_flag_invalid );
886
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
887
        return 0x80000000;
888
    }
889
    else if ( aExp <= 0x7E ) {
890
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
891
        return 0;
892
    }
893
    aSig = ( aSig | 0x00800000 )<<8;
894
    z = aSig>>( - shiftCount );
895
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
896
        float_exception_flags |= float_flag_inexact;
897
    }
898
    return aSign ? - z : z;
899
 
900
}
901
 
902
/*
903
-------------------------------------------------------------------------------
904
Returns the result of converting the single-precision floating-point value
905
`a' to the double-precision floating-point format.  The conversion is
906
performed according to the IEC/IEEE Standard for Binary Floating-point
907
Arithmetic.
908
-------------------------------------------------------------------------------
909
*/
910
float64 float32_to_float64( float32 a )
911
{
912
    flag aSign;
913
    int16 aExp;
914
    bits32 aSig;
915
 
916
    aSig = extractFloat32Frac( a );
917
    aExp = extractFloat32Exp( a );
918
    aSign = extractFloat32Sign( a );
919
    if ( aExp == 0xFF ) {
920
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
921
        return packFloat64( aSign, 0x7FF, 0 );
922
    }
923
    if ( aExp == 0 ) {
924
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
925
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
926
        --aExp;
927
    }
928
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
929
 
930
}
931
 
932
#ifdef FLOATX80
933
 
934
/*
935
-------------------------------------------------------------------------------
936
Returns the result of converting the single-precision floating-point value
937
`a' to the extended double-precision floating-point format.  The conversion
938
is performed according to the IEC/IEEE Standard for Binary Floating-point
939
Arithmetic.
940
-------------------------------------------------------------------------------
941
*/
942
floatx80 float32_to_floatx80( float32 a )
943
{
944
    flag aSign;
945
    int16 aExp;
946
    bits32 aSig;
947
 
948
    aSig = extractFloat32Frac( a );
949
    aExp = extractFloat32Exp( a );
950
    aSign = extractFloat32Sign( a );
951
    if ( aExp == 0xFF ) {
952
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
953
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
954
    }
955
    if ( aExp == 0 ) {
956
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
957
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
958
    }
959
    aSig |= 0x00800000;
960
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
961
 
962
}
963
 
964
#endif
965
 
966
/*
967
-------------------------------------------------------------------------------
968
Rounds the single-precision floating-point value `a' to an integer, and
969
returns the result as a single-precision floating-point value.  The
970
operation is performed according to the IEC/IEEE Standard for Binary
971
Floating-point Arithmetic.
972
-------------------------------------------------------------------------------
973
*/
974
float32 float32_round_to_int( float32 a )
975
{
976
    flag aSign;
977
    int16 aExp;
978
    bits32 lastBitMask, roundBitsMask;
979
    int8 roundingMode;
980
    float32 z;
981
 
982
    aExp = extractFloat32Exp( a );
983
    if ( 0x96 <= aExp ) {
984
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
985
            return propagateFloat32NaN( a, a );
986
        }
987
        return a;
988
    }
989
    if ( aExp <= 0x7E ) {
990
        if ( (bits32) ( a<<1 ) == 0 ) return a;
991
        float_exception_flags |= float_flag_inexact;
992
        aSign = extractFloat32Sign( a );
993
        switch ( float_rounding_mode ) {
994
         case float_round_nearest_even:
995
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
996
                return packFloat32( aSign, 0x7F, 0 );
997
            }
998
            break;
999
         case float_round_down:
1000
            return aSign ? 0xBF800000 : 0;
1001
         case float_round_up:
1002
            return aSign ? 0x80000000 : 0x3F800000;
1003
        }
1004
        return packFloat32( aSign, 0, 0 );
1005
    }
1006
    lastBitMask = 1;
1007
    lastBitMask <<= 0x96 - aExp;
1008
    roundBitsMask = lastBitMask - 1;
1009
    z = a;
1010
    roundingMode = float_rounding_mode;
1011
    if ( roundingMode == float_round_nearest_even ) {
1012
        z += lastBitMask>>1;
1013
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1014
    }
1015
    else if ( roundingMode != float_round_to_zero ) {
1016
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1017
            z += roundBitsMask;
1018
        }
1019
    }
1020
    z &= ~ roundBitsMask;
1021
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1022
    return z;
1023
 
1024
}
1025
 
1026
/*
1027
-------------------------------------------------------------------------------
1028
Returns the result of adding the absolute values of the single-precision
1029
floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1030
before being returned.  `zSign' is ignored if the result is a NaN.  The
1031
addition is performed according to the IEC/IEEE Standard for Binary
1032
Floating-point Arithmetic.
1033
-------------------------------------------------------------------------------
1034
*/
1035
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1036
{
1037
    int16 aExp, bExp, zExp;
1038
    bits32 aSig, bSig, zSig;
1039
    int16 expDiff;
1040
 
1041
    aSig = extractFloat32Frac( a );
1042
    aExp = extractFloat32Exp( a );
1043
    bSig = extractFloat32Frac( b );
1044
    bExp = extractFloat32Exp( b );
1045
    expDiff = aExp - bExp;
1046
    aSig <<= 6;
1047
    bSig <<= 6;
1048
    if ( 0 < expDiff ) {
1049
        if ( aExp == 0xFF ) {
1050
            if ( aSig ) return propagateFloat32NaN( a, b );
1051
            return a;
1052
        }
1053
        if ( bExp == 0 ) {
1054
            --expDiff;
1055
        }
1056
        else {
1057
            bSig |= 0x20000000;
1058
        }
1059
        shift32RightJamming( bSig, expDiff, &bSig );
1060
        zExp = aExp;
1061
    }
1062
    else if ( expDiff < 0 ) {
1063
        if ( bExp == 0xFF ) {
1064
            if ( bSig ) return propagateFloat32NaN( a, b );
1065
            return packFloat32( zSign, 0xFF, 0 );
1066
        }
1067
        if ( aExp == 0 ) {
1068
            ++expDiff;
1069
        }
1070
        else {
1071
            aSig |= 0x20000000;
1072
        }
1073
        shift32RightJamming( aSig, - expDiff, &aSig );
1074
        zExp = bExp;
1075
    }
1076
    else {
1077
        if ( aExp == 0xFF ) {
1078
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1079
            return a;
1080
        }
1081
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1082
        zSig = 0x40000000 + aSig + bSig;
1083
        zExp = aExp;
1084
        goto roundAndPack;
1085
    }
1086
    aSig |= 0x20000000;
1087
    zSig = ( aSig + bSig )<<1;
1088
    --zExp;
1089
    if ( (sbits32) zSig < 0 ) {
1090
        zSig = aSig + bSig;
1091
        ++zExp;
1092
    }
1093
 roundAndPack:
1094
    return roundAndPackFloat32( zSign, zExp, zSig );
1095
 
1096
}
1097
 
1098
/*
1099
-------------------------------------------------------------------------------
1100
Returns the result of subtracting the absolute values of the single-
1101
precision floating-point values `a' and `b'.  If `zSign' is true, the
1102
difference is negated before being returned.  `zSign' is ignored if the
1103
result is a NaN.  The subtraction is performed according to the IEC/IEEE
1104
Standard for Binary Floating-point Arithmetic.
1105
-------------------------------------------------------------------------------
1106
*/
1107
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1108
{
1109
    int16 aExp, bExp, zExp;
1110
    bits32 aSig, bSig, zSig;
1111
    int16 expDiff;
1112
 
1113
    aSig = extractFloat32Frac( a );
1114
    aExp = extractFloat32Exp( a );
1115
    bSig = extractFloat32Frac( b );
1116
    bExp = extractFloat32Exp( b );
1117
    expDiff = aExp - bExp;
1118
    aSig <<= 7;
1119
    bSig <<= 7;
1120
    if ( 0 < expDiff ) goto aExpBigger;
1121
    if ( expDiff < 0 ) goto bExpBigger;
1122
    if ( aExp == 0xFF ) {
1123
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1124
        float_raise( float_flag_invalid );
1125
        return float32_default_nan;
1126
    }
1127
    if ( aExp == 0 ) {
1128
        aExp = 1;
1129
        bExp = 1;
1130
    }
1131
    if ( bSig < aSig ) goto aBigger;
1132
    if ( aSig < bSig ) goto bBigger;
1133
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1134
 bExpBigger:
1135
    if ( bExp == 0xFF ) {
1136
        if ( bSig ) return propagateFloat32NaN( a, b );
1137
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1138
    }
1139
    if ( aExp == 0 ) {
1140
        ++expDiff;
1141
    }
1142
    else {
1143
        aSig |= 0x40000000;
1144
    }
1145
    shift32RightJamming( aSig, - expDiff, &aSig );
1146
    bSig |= 0x40000000;
1147
 bBigger:
1148
    zSig = bSig - aSig;
1149
    zExp = bExp;
1150
    zSign ^= 1;
1151
    goto normalizeRoundAndPack;
1152
 aExpBigger:
1153
    if ( aExp == 0xFF ) {
1154
        if ( aSig ) return propagateFloat32NaN( a, b );
1155
        return a;
1156
    }
1157
    if ( bExp == 0 ) {
1158
        --expDiff;
1159
    }
1160
    else {
1161
        bSig |= 0x40000000;
1162
    }
1163
    shift32RightJamming( bSig, expDiff, &bSig );
1164
    aSig |= 0x40000000;
1165
 aBigger:
1166
    zSig = aSig - bSig;
1167
    zExp = aExp;
1168
 normalizeRoundAndPack:
1169
    --zExp;
1170
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1171
 
1172
}
1173
 
1174
/*
1175
-------------------------------------------------------------------------------
1176
Returns the result of adding the single-precision floating-point values `a'
1177
and `b'.  The operation is performed according to the IEC/IEEE Standard for
1178
Binary Floating-point Arithmetic.
1179
-------------------------------------------------------------------------------
1180
*/
1181
float32 float32_add( float32 a, float32 b )
1182
{
1183
    flag aSign, bSign;
1184
 
1185
    aSign = extractFloat32Sign( a );
1186
    bSign = extractFloat32Sign( b );
1187
    if ( aSign == bSign ) {
1188
        return addFloat32Sigs( a, b, aSign );
1189
    }
1190
    else {
1191
        return subFloat32Sigs( a, b, aSign );
1192
    }
1193
 
1194
}
1195
 
1196
/*
1197
-------------------------------------------------------------------------------
1198
Returns the result of subtracting the single-precision floating-point values
1199
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1200
for Binary Floating-point Arithmetic.
1201
-------------------------------------------------------------------------------
1202
*/
1203
float32 float32_sub( float32 a, float32 b )
1204
{
1205
    flag aSign, bSign;
1206
 
1207
    aSign = extractFloat32Sign( a );
1208
    bSign = extractFloat32Sign( b );
1209
    if ( aSign == bSign ) {
1210
        return subFloat32Sigs( a, b, aSign );
1211
    }
1212
    else {
1213
        return addFloat32Sigs( a, b, aSign );
1214
    }
1215
 
1216
}
1217
 
1218
/*
1219
-------------------------------------------------------------------------------
1220
Returns the result of multiplying the single-precision floating-point values
1221
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1222
for Binary Floating-point Arithmetic.
1223
-------------------------------------------------------------------------------
1224
*/
1225
float32 float32_mul( float32 a, float32 b )
1226
{
1227
    flag aSign, bSign, zSign;
1228
    int16 aExp, bExp, zExp;
1229
    bits32 aSig, bSig;
1230
    bits64 zSig64;
1231
    bits32 zSig;
1232
 
1233
    aSig = extractFloat32Frac( a );
1234
    aExp = extractFloat32Exp( a );
1235
    aSign = extractFloat32Sign( a );
1236
    bSig = extractFloat32Frac( b );
1237
    bExp = extractFloat32Exp( b );
1238
    bSign = extractFloat32Sign( b );
1239
    zSign = aSign ^ bSign;
1240
    if ( aExp == 0xFF ) {
1241
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1242
            return propagateFloat32NaN( a, b );
1243
        }
1244
        if ( ( bExp | bSig ) == 0 ) {
1245
            float_raise( float_flag_invalid );
1246
            return float32_default_nan;
1247
        }
1248
        return packFloat32( zSign, 0xFF, 0 );
1249
    }
1250
    if ( bExp == 0xFF ) {
1251
        if ( bSig ) return propagateFloat32NaN( a, b );
1252
        if ( ( aExp | aSig ) == 0 ) {
1253
            float_raise( float_flag_invalid );
1254
            return float32_default_nan;
1255
        }
1256
        return packFloat32( zSign, 0xFF, 0 );
1257
    }
1258
    if ( aExp == 0 ) {
1259
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1260
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1261
    }
1262
    if ( bExp == 0 ) {
1263
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1264
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1265
    }
1266
    zExp = aExp + bExp - 0x7F;
1267
    aSig = ( aSig | 0x00800000 )<<7;
1268
    bSig = ( bSig | 0x00800000 )<<8;
1269
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1270
    zSig = zSig64;
1271
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1272
        zSig <<= 1;
1273
        --zExp;
1274
    }
1275
    return roundAndPackFloat32( zSign, zExp, zSig );
1276
 
1277
}
1278
 
1279
/*
1280
-------------------------------------------------------------------------------
1281
Returns the result of dividing the single-precision floating-point value `a'
1282
by the corresponding value `b'.  The operation is performed according to the
1283
IEC/IEEE Standard for Binary Floating-point Arithmetic.
1284
-------------------------------------------------------------------------------
1285
*/
1286
float32 float32_div( float32 a, float32 b )
1287
{
1288
    flag aSign, bSign, zSign;
1289
    int16 aExp, bExp, zExp;
1290
    bits32 aSig, bSig, zSig;
1291
 
1292
    aSig = extractFloat32Frac( a );
1293
    aExp = extractFloat32Exp( a );
1294
    aSign = extractFloat32Sign( a );
1295
    bSig = extractFloat32Frac( b );
1296
    bExp = extractFloat32Exp( b );
1297
    bSign = extractFloat32Sign( b );
1298
    zSign = aSign ^ bSign;
1299
    if ( aExp == 0xFF ) {
1300
        if ( aSig ) return propagateFloat32NaN( a, b );
1301
        if ( bExp == 0xFF ) {
1302
            if ( bSig ) return propagateFloat32NaN( a, b );
1303
            float_raise( float_flag_invalid );
1304
            return float32_default_nan;
1305
        }
1306
        return packFloat32( zSign, 0xFF, 0 );
1307
    }
1308
    if ( bExp == 0xFF ) {
1309
        if ( bSig ) return propagateFloat32NaN( a, b );
1310
        return packFloat32( zSign, 0, 0 );
1311
    }
1312
    if ( bExp == 0 ) {
1313
        if ( bSig == 0 ) {
1314
            if ( ( aExp | aSig ) == 0 ) {
1315
                float_raise( float_flag_invalid );
1316
                return float32_default_nan;
1317
            }
1318
            float_raise( float_flag_divbyzero );
1319
            return packFloat32( zSign, 0xFF, 0 );
1320
        }
1321
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1322
    }
1323
    if ( aExp == 0 ) {
1324
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1325
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1326
    }
1327
    zExp = aExp - bExp + 0x7D;
1328
    aSig = ( aSig | 0x00800000 )<<7;
1329
    bSig = ( bSig | 0x00800000 )<<8;
1330
    if ( bSig <= ( aSig + aSig ) ) {
1331
        aSig >>= 1;
1332
        ++zExp;
1333
    }
1334
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1335
    if ( ( zSig & 0x3F ) == 0 ) {
1336
        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1337
    }
1338
    return roundAndPackFloat32( zSign, zExp, zSig );
1339
 
1340
}
1341
 
1342
/*
1343
-------------------------------------------------------------------------------
1344
Returns the remainder of the single-precision floating-point value `a'
1345
with respect to the corresponding value `b'.  The operation is performed
1346
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1347
-------------------------------------------------------------------------------
1348
*/
1349
float32 float32_rem( float32 a, float32 b )
1350
{
1351
    flag aSign, bSign, zSign;
1352
    int16 aExp, bExp, expDiff;
1353
    bits32 aSig, bSig;
1354
    bits32 q;
1355
    bits64 aSig64, bSig64, q64;
1356
    bits32 alternateASig;
1357
    sbits32 sigMean;
1358
 
1359
    aSig = extractFloat32Frac( a );
1360
    aExp = extractFloat32Exp( a );
1361
    aSign = extractFloat32Sign( a );
1362
    bSig = extractFloat32Frac( b );
1363
    bExp = extractFloat32Exp( b );
1364
    bSign = extractFloat32Sign( b );
1365
    if ( aExp == 0xFF ) {
1366
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1367
            return propagateFloat32NaN( a, b );
1368
        }
1369
        float_raise( float_flag_invalid );
1370
        return float32_default_nan;
1371
    }
1372
    if ( bExp == 0xFF ) {
1373
        if ( bSig ) return propagateFloat32NaN( a, b );
1374
        return a;
1375
    }
1376
    if ( bExp == 0 ) {
1377
        if ( bSig == 0 ) {
1378
            float_raise( float_flag_invalid );
1379
            return float32_default_nan;
1380
        }
1381
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1382
    }
1383
    if ( aExp == 0 ) {
1384
        if ( aSig == 0 ) return a;
1385
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1386
    }
1387
    expDiff = aExp - bExp;
1388
    aSig |= 0x00800000;
1389
    bSig |= 0x00800000;
1390
    if ( expDiff < 32 ) {
1391
        aSig <<= 8;
1392
        bSig <<= 8;
1393
        if ( expDiff < 0 ) {
1394
            if ( expDiff < -1 ) return a;
1395
            aSig >>= 1;
1396
        }
1397
        q = ( bSig <= aSig );
1398
        if ( q ) aSig -= bSig;
1399
        if ( 0 < expDiff ) {
1400
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1401
            q >>= 32 - expDiff;
1402
            bSig >>= 2;
1403
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1404
        }
1405
        else {
1406
            aSig >>= 2;
1407
            bSig >>= 2;
1408
        }
1409
    }
1410
    else {
1411
        if ( bSig <= aSig ) aSig -= bSig;
1412
        aSig64 = ( (bits64) aSig )<<40;
1413
        bSig64 = ( (bits64) bSig )<<40;
1414
        expDiff -= 64;
1415
        while ( 0 < expDiff ) {
1416
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418
            aSig64 = - ( ( bSig * q64 )<<38 );
1419
            expDiff -= 62;
1420
        }
1421
        expDiff += 64;
1422
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424
        q = q64>>( 64 - expDiff );
1425
        bSig <<= 6;
1426
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1427
    }
1428
    do {
1429
        alternateASig = aSig;
1430
        ++q;
1431
        aSig -= bSig;
1432
    } while ( 0 <= (sbits32) aSig );
1433
    sigMean = aSig + alternateASig;
1434
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435
        aSig = alternateASig;
1436
    }
1437
    zSign = ( (sbits32) aSig < 0 );
1438
    if ( zSign ) aSig = - aSig;
1439
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1440
 
1441
}
1442
 
1443
/*
1444
-------------------------------------------------------------------------------
1445
Returns the square root of the single-precision floating-point value `a'.
1446
The operation is performed according to the IEC/IEEE Standard for Binary
1447
Floating-point Arithmetic.
1448
-------------------------------------------------------------------------------
1449
*/
1450
float32 float32_sqrt( float32 a )
1451
{
1452
    flag aSign;
1453
    int16 aExp, zExp;
1454
    bits32 aSig, zSig;
1455
    bits64 rem, term;
1456
 
1457
    aSig = extractFloat32Frac( a );
1458
    aExp = extractFloat32Exp( a );
1459
    aSign = extractFloat32Sign( a );
1460
    if ( aExp == 0xFF ) {
1461
        if ( aSig ) return propagateFloat32NaN( a, 0 );
1462
        if ( ! aSign ) return a;
1463
        float_raise( float_flag_invalid );
1464
        return float32_default_nan;
1465
    }
1466
    if ( aSign ) {
1467
        if ( ( aExp | aSig ) == 0 ) return a;
1468
        float_raise( float_flag_invalid );
1469
        return float32_default_nan;
1470
    }
1471
    if ( aExp == 0 ) {
1472
        if ( aSig == 0 ) return 0;
1473
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474
    }
1475
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476
    aSig = ( aSig | 0x00800000 )<<8;
1477
    zSig = estimateSqrt32( aExp, aSig ) + 2;
1478
    if ( ( zSig & 0x7F ) <= 5 ) {
1479
        if ( zSig < 2 ) {
1480
            zSig = 0xFFFFFFFF;
1481
        }
1482
        else {
1483
            aSig >>= aExp & 1;
1484
            term = ( (bits64) zSig ) * zSig;
1485
            rem = ( ( (bits64) aSig )<<32 ) - term;
1486
            while ( (sbits64) rem < 0 ) {
1487
                --zSig;
1488
                rem += ( ( (bits64) zSig )<<1 ) | 1;
1489
            }
1490
            zSig |= ( rem != 0 );
1491
        }
1492
    }
1493
    shift32RightJamming( zSig, 1, &zSig );
1494
    return roundAndPackFloat32( 0, zExp, zSig );
1495
 
1496
}
1497
 
1498
/*
1499
-------------------------------------------------------------------------------
1500
Returns 1 if the single-precision floating-point value `a' is equal to the
1501
corresponding value `b', and 0 otherwise.  The comparison is performed
1502
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503
-------------------------------------------------------------------------------
1504
*/
1505
flag float32_eq( float32 a, float32 b )
1506
{
1507
 
1508
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510
       ) {
1511
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512
            float_raise( float_flag_invalid );
1513
        }
1514
        return 0;
1515
    }
1516
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517
 
1518
}
1519
 
1520
/*
1521
-------------------------------------------------------------------------------
1522
Returns 1 if the single-precision floating-point value `a' is less than or
1523
equal to the corresponding value `b', and 0 otherwise.  The comparison is
1524
performed according to the IEC/IEEE Standard for Binary Floating-point
1525
Arithmetic.
1526
-------------------------------------------------------------------------------
1527
*/
1528
flag float32_le( float32 a, float32 b )
1529
{
1530
    flag aSign, bSign;
1531
 
1532
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534
       ) {
1535
        float_raise( float_flag_invalid );
1536
        return 0;
1537
    }
1538
    aSign = extractFloat32Sign( a );
1539
    bSign = extractFloat32Sign( b );
1540
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541
    return ( a == b ) || ( aSign ^ ( a < b ) );
1542
 
1543
}
1544
 
1545
/*
1546
-------------------------------------------------------------------------------
1547
Returns 1 if the single-precision floating-point value `a' is less than
1548
the corresponding value `b', and 0 otherwise.  The comparison is performed
1549
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550
-------------------------------------------------------------------------------
1551
*/
1552
flag float32_lt( float32 a, float32 b )
1553
{
1554
    flag aSign, bSign;
1555
 
1556
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558
       ) {
1559
        float_raise( float_flag_invalid );
1560
        return 0;
1561
    }
1562
    aSign = extractFloat32Sign( a );
1563
    bSign = extractFloat32Sign( b );
1564
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565
    return ( a != b ) && ( aSign ^ ( a < b ) );
1566
 
1567
}
1568
 
1569
/*
1570
-------------------------------------------------------------------------------
1571
Returns 1 if the single-precision floating-point value `a' is equal to the
1572
corresponding value `b', and 0 otherwise.  The invalid exception is raised
1573
if either operand is a NaN.  Otherwise, the comparison is performed
1574
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575
-------------------------------------------------------------------------------
1576
*/
1577
flag float32_eq_signaling( float32 a, float32 b )
1578
{
1579
 
1580
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582
       ) {
1583
        float_raise( float_flag_invalid );
1584
        return 0;
1585
    }
1586
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587
 
1588
}
1589
 
1590
/*
1591
-------------------------------------------------------------------------------
1592
Returns 1 if the single-precision floating-point value `a' is less than or
1593
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1594
cause an exception.  Otherwise, the comparison is performed according to the
1595
IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596
-------------------------------------------------------------------------------
1597
*/
1598
flag float32_le_quiet( float32 a, float32 b )
1599
{
1600
    flag aSign, bSign;
1601
    //int16 aExp, bExp;
1602
 
1603
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605
       ) {
1606
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1607
            float_raise( float_flag_invalid );
1608
        }
1609
        return 0;
1610
    }
1611
    aSign = extractFloat32Sign( a );
1612
    bSign = extractFloat32Sign( b );
1613
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1614
    return ( a == b ) || ( aSign ^ ( a < b ) );
1615
 
1616
}
1617
 
1618
/*
1619
-------------------------------------------------------------------------------
1620
Returns 1 if the single-precision floating-point value `a' is less than
1621
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1622
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1623
Standard for Binary Floating-point Arithmetic.
1624
-------------------------------------------------------------------------------
1625
*/
1626
flag float32_lt_quiet( float32 a, float32 b )
1627
{
1628
    flag aSign, bSign;
1629
 
1630
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1631
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1632
       ) {
1633
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1634
            float_raise( float_flag_invalid );
1635
        }
1636
        return 0;
1637
    }
1638
    aSign = extractFloat32Sign( a );
1639
    bSign = extractFloat32Sign( b );
1640
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1641
    return ( a != b ) && ( aSign ^ ( a < b ) );
1642
 
1643
}
1644
 
1645
/*
1646
-------------------------------------------------------------------------------
1647
Returns the result of converting the double-precision floating-point value
1648
`a' to the 32-bit two's complement integer format.  The conversion is
1649
performed according to the IEC/IEEE Standard for Binary Floating-point
1650
Arithmetic---which means in particular that the conversion is rounded
1651
according to the current rounding mode.  If `a' is a NaN, the largest
1652
positive integer is returned.  Otherwise, if the conversion overflows, the
1653
largest integer with the same sign as `a' is returned.
1654
-------------------------------------------------------------------------------
1655
*/
1656
int32 float64_to_int32( float64 a )
1657
{
1658
    flag aSign;
1659
    int16 aExp, shiftCount;
1660
    bits64 aSig;
1661
 
1662
    aSig = extractFloat64Frac( a );
1663
    aExp = extractFloat64Exp( a );
1664
    aSign = extractFloat64Sign( a );
1665
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1666
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1667
    shiftCount = 0x42C - aExp;
1668
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1669
    return roundAndPackInt32( aSign, aSig );
1670
 
1671
}
1672
 
1673
/*
1674
-------------------------------------------------------------------------------
1675
Returns the result of converting the double-precision floating-point value
1676
`a' to the 32-bit two's complement integer format.  The conversion is
1677
performed according to the IEC/IEEE Standard for Binary Floating-point
1678
Arithmetic, except that the conversion is always rounded toward zero.  If
1679
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1680
conversion overflows, the largest integer with the same sign as `a' is
1681
returned.
1682
-------------------------------------------------------------------------------
1683
*/
1684
int32 float64_to_int32_round_to_zero( float64 a )
1685
{
1686
    flag aSign;
1687
    int16 aExp, shiftCount;
1688
    bits64 aSig, savedASig;
1689
    int32 z;
1690
 
1691
    aSig = extractFloat64Frac( a );
1692
    aExp = extractFloat64Exp( a );
1693
    aSign = extractFloat64Sign( a );
1694
    shiftCount = 0x433 - aExp;
1695
    if ( shiftCount < 21 ) {
1696
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1697
        goto invalid;
1698
    }
1699
    else if ( 52 < shiftCount ) {
1700
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1701
        return 0;
1702
    }
1703
    aSig |= LIT64( 0x0010000000000000 );
1704
    savedASig = aSig;
1705
    aSig >>= shiftCount;
1706
    z = aSig;
1707
    if ( aSign ) z = - z;
1708
    if ( ( z < 0 ) ^ aSign ) {
1709
 invalid:
1710
        float_exception_flags |= float_flag_invalid;
1711
        return aSign ? 0x80000000 : 0x7FFFFFFF;
1712
    }
1713
    if ( ( aSig<<shiftCount ) != savedASig ) {
1714
        float_exception_flags |= float_flag_inexact;
1715
    }
1716
    return z;
1717
 
1718
}
1719
 
1720
/*
1721
-------------------------------------------------------------------------------
1722
Returns the result of converting the double-precision floating-point value
1723
`a' to the 32-bit two's complement unsigned integer format.  The conversion
1724
is performed according to the IEC/IEEE Standard for Binary Floating-point
1725
Arithmetic---which means in particular that the conversion is rounded
1726
according to the current rounding mode.  If `a' is a NaN, the largest
1727
positive integer is returned.  Otherwise, if the conversion overflows, the
1728
largest positive integer is returned.
1729
-------------------------------------------------------------------------------
1730
*/
1731
int32 float64_to_uint32( float64 a )
1732
{
1733
    flag aSign;
1734
    int16 aExp, shiftCount;
1735
    bits64 aSig;
1736
 
1737
    aSig = extractFloat64Frac( a );
1738
    aExp = extractFloat64Exp( a );
1739
    aSign = 0; //extractFloat64Sign( a );
1740
    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1741
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1742
    shiftCount = 0x42C - aExp;
1743
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1744
    return roundAndPackInt32( aSign, aSig );
1745
}
1746
 
1747
/*
1748
-------------------------------------------------------------------------------
1749
Returns the result of converting the double-precision floating-point value
1750
`a' to the 32-bit two's complement integer format.  The conversion is
1751
performed according to the IEC/IEEE Standard for Binary Floating-point
1752
Arithmetic, except that the conversion is always rounded toward zero.  If
1753
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1754
conversion overflows, the largest positive integer is returned.
1755
-------------------------------------------------------------------------------
1756
*/
1757
int32 float64_to_uint32_round_to_zero( float64 a )
1758
{
1759
    flag aSign;
1760
    int16 aExp, shiftCount;
1761
    bits64 aSig, savedASig;
1762
    int32 z;
1763
 
1764
    aSig = extractFloat64Frac( a );
1765
    aExp = extractFloat64Exp( a );
1766
    aSign = extractFloat64Sign( a );
1767
    shiftCount = 0x433 - aExp;
1768
    if ( shiftCount < 21 ) {
1769
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1770
        goto invalid;
1771
    }
1772
    else if ( 52 < shiftCount ) {
1773
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1774
        return 0;
1775
    }
1776
    aSig |= LIT64( 0x0010000000000000 );
1777
    savedASig = aSig;
1778
    aSig >>= shiftCount;
1779
    z = aSig;
1780
    if ( aSign ) z = - z;
1781
    if ( ( z < 0 ) ^ aSign ) {
1782
 invalid:
1783
        float_exception_flags |= float_flag_invalid;
1784
        return aSign ? 0x80000000 : 0x7FFFFFFF;
1785
    }
1786
    if ( ( aSig<<shiftCount ) != savedASig ) {
1787
        float_exception_flags |= float_flag_inexact;
1788
    }
1789
    return z;
1790
}
1791
 
1792
/*
1793
-------------------------------------------------------------------------------
1794
Returns the result of converting the double-precision floating-point value
1795
`a' to the single-precision floating-point format.  The conversion is
1796
performed according to the IEC/IEEE Standard for Binary Floating-point
1797
Arithmetic.
1798
-------------------------------------------------------------------------------
1799
*/
1800
float32 float64_to_float32( float64 a )
1801
{
1802
    flag aSign;
1803
    int16 aExp;
1804
    bits64 aSig;
1805
    bits32 zSig;
1806
 
1807
    aSig = extractFloat64Frac( a );
1808
    aExp = extractFloat64Exp( a );
1809
    aSign = extractFloat64Sign( a );
1810
    if ( aExp == 0x7FF ) {
1811
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1812
        return packFloat32( aSign, 0xFF, 0 );
1813
    }
1814
    shift64RightJamming( aSig, 22, &aSig );
1815
    zSig = aSig;
1816
    if ( aExp || zSig ) {
1817
        zSig |= 0x40000000;
1818
        aExp -= 0x381;
1819
    }
1820
    return roundAndPackFloat32( aSign, aExp, zSig );
1821
 
1822
}
1823
 
1824
#ifdef FLOATX80
1825
 
1826
/*
1827
-------------------------------------------------------------------------------
1828
Returns the result of converting the double-precision floating-point value
1829
`a' to the extended double-precision floating-point format.  The conversion
1830
is performed according to the IEC/IEEE Standard for Binary Floating-point
1831
Arithmetic.
1832
-------------------------------------------------------------------------------
1833
*/
1834
floatx80 float64_to_floatx80( float64 a )
1835
{
1836
    flag aSign;
1837
    int16 aExp;
1838
    bits64 aSig;
1839
 
1840
    aSig = extractFloat64Frac( a );
1841
    aExp = extractFloat64Exp( a );
1842
    aSign = extractFloat64Sign( a );
1843
    if ( aExp == 0x7FF ) {
1844
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1845
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1846
    }
1847
    if ( aExp == 0 ) {
1848
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1849
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1850
    }
1851
    return
1852
        packFloatx80(
1853
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1854
 
1855
}
1856
 
1857
#endif
1858
 
1859
/*
1860
-------------------------------------------------------------------------------
1861
Rounds the double-precision floating-point value `a' to an integer, and
1862
returns the result as a double-precision floating-point value.  The
1863
operation is performed according to the IEC/IEEE Standard for Binary
1864
Floating-point Arithmetic.
1865
-------------------------------------------------------------------------------
1866
*/
1867
float64 float64_round_to_int( float64 a )
1868
{
1869
    flag aSign;
1870
    int16 aExp;
1871
    bits64 lastBitMask, roundBitsMask;
1872
    int8 roundingMode;
1873
    float64 z;
1874
 
1875
    aExp = extractFloat64Exp( a );
1876
    if ( 0x433 <= aExp ) {
1877
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1878
            return propagateFloat64NaN( a, a );
1879
        }
1880
        return a;
1881
    }
1882
    if ( aExp <= 0x3FE ) {
1883
        if ( (bits64) ( a<<1 ) == 0 ) return a;
1884
        float_exception_flags |= float_flag_inexact;
1885
        aSign = extractFloat64Sign( a );
1886
        switch ( float_rounding_mode ) {
1887
         case float_round_nearest_even:
1888
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1889
                return packFloat64( aSign, 0x3FF, 0 );
1890
            }
1891
            break;
1892
         case float_round_down:
1893
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1894
         case float_round_up:
1895
            return
1896
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1897
        }
1898
        return packFloat64( aSign, 0, 0 );
1899
    }
1900
    lastBitMask = 1;
1901
    lastBitMask <<= 0x433 - aExp;
1902
    roundBitsMask = lastBitMask - 1;
1903
    z = a;
1904
    roundingMode = float_rounding_mode;
1905
    if ( roundingMode == float_round_nearest_even ) {
1906
        z += lastBitMask>>1;
1907
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1908
    }
1909
    else if ( roundingMode != float_round_to_zero ) {
1910
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1911
            z += roundBitsMask;
1912
        }
1913
    }
1914
    z &= ~ roundBitsMask;
1915
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1916
    return z;
1917
 
1918
}
1919
 
1920
/*
1921
-------------------------------------------------------------------------------
1922
Returns the result of adding the absolute values of the double-precision
1923
floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1924
before being returned.  `zSign' is ignored if the result is a NaN.  The
1925
addition is performed according to the IEC/IEEE Standard for Binary
1926
Floating-point Arithmetic.
1927
-------------------------------------------------------------------------------
1928
*/
1929
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1930
{
1931
    int16 aExp, bExp, zExp;
1932
    bits64 aSig, bSig, zSig;
1933
    int16 expDiff;
1934
 
1935
    aSig = extractFloat64Frac( a );
1936
    aExp = extractFloat64Exp( a );
1937
    bSig = extractFloat64Frac( b );
1938
    bExp = extractFloat64Exp( b );
1939
    expDiff = aExp - bExp;
1940
    aSig <<= 9;
1941
    bSig <<= 9;
1942
    if ( 0 < expDiff ) {
1943
        if ( aExp == 0x7FF ) {
1944
            if ( aSig ) return propagateFloat64NaN( a, b );
1945
            return a;
1946
        }
1947
        if ( bExp == 0 ) {
1948
            --expDiff;
1949
        }
1950
        else {
1951
            bSig |= LIT64( 0x2000000000000000 );
1952
        }
1953
        shift64RightJamming( bSig, expDiff, &bSig );
1954
        zExp = aExp;
1955
    }
1956
    else if ( expDiff < 0 ) {
1957
        if ( bExp == 0x7FF ) {
1958
            if ( bSig ) return propagateFloat64NaN( a, b );
1959
            return packFloat64( zSign, 0x7FF, 0 );
1960
        }
1961
        if ( aExp == 0 ) {
1962
            ++expDiff;
1963
        }
1964
        else {
1965
            aSig |= LIT64( 0x2000000000000000 );
1966
        }
1967
        shift64RightJamming( aSig, - expDiff, &aSig );
1968
        zExp = bExp;
1969
    }
1970
    else {
1971
        if ( aExp == 0x7FF ) {
1972
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1973
            return a;
1974
        }
1975
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1976
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1977
        zExp = aExp;
1978
        goto roundAndPack;
1979
    }
1980
    aSig |= LIT64( 0x2000000000000000 );
1981
    zSig = ( aSig + bSig )<<1;
1982
    --zExp;
1983
    if ( (sbits64) zSig < 0 ) {
1984
        zSig = aSig + bSig;
1985
        ++zExp;
1986
    }
1987
 roundAndPack:
1988
    return roundAndPackFloat64( zSign, zExp, zSig );
1989
 
1990
}
1991
 
1992
/*
1993
-------------------------------------------------------------------------------
1994
Returns the result of subtracting the absolute values of the double-
1995
precision floating-point values `a' and `b'.  If `zSign' is true, the
1996
difference is negated before being returned.  `zSign' is ignored if the
1997
result is a NaN.  The subtraction is performed according to the IEC/IEEE
1998
Standard for Binary Floating-point Arithmetic.
1999
-------------------------------------------------------------------------------
2000
*/
2001
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2002
{
2003
    int16 aExp, bExp, zExp;
2004
    bits64 aSig, bSig, zSig;
2005
    int16 expDiff;
2006
 
2007
    aSig = extractFloat64Frac( a );
2008
    aExp = extractFloat64Exp( a );
2009
    bSig = extractFloat64Frac( b );
2010
    bExp = extractFloat64Exp( b );
2011
    expDiff = aExp - bExp;
2012
    aSig <<= 10;
2013
    bSig <<= 10;
2014
    if ( 0 < expDiff ) goto aExpBigger;
2015
    if ( expDiff < 0 ) goto bExpBigger;
2016
    if ( aExp == 0x7FF ) {
2017
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2018
        float_raise( float_flag_invalid );
2019
        return float64_default_nan;
2020
    }
2021
    if ( aExp == 0 ) {
2022
        aExp = 1;
2023
        bExp = 1;
2024
    }
2025
    if ( bSig < aSig ) goto aBigger;
2026
    if ( aSig < bSig ) goto bBigger;
2027
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2028
 bExpBigger:
2029
    if ( bExp == 0x7FF ) {
2030
        if ( bSig ) return propagateFloat64NaN( a, b );
2031
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2032
    }
2033
    if ( aExp == 0 ) {
2034
        ++expDiff;
2035
    }
2036
    else {
2037
        aSig |= LIT64( 0x4000000000000000 );
2038
    }
2039
    shift64RightJamming( aSig, - expDiff, &aSig );
2040
    bSig |= LIT64( 0x4000000000000000 );
2041
 bBigger:
2042
    zSig = bSig - aSig;
2043
    zExp = bExp;
2044
    zSign ^= 1;
2045
    goto normalizeRoundAndPack;
2046
 aExpBigger:
2047
    if ( aExp == 0x7FF ) {
2048
        if ( aSig ) return propagateFloat64NaN( a, b );
2049
        return a;
2050
    }
2051
    if ( bExp == 0 ) {
2052
        --expDiff;
2053
    }
2054
    else {
2055
        bSig |= LIT64( 0x4000000000000000 );
2056
    }
2057
    shift64RightJamming( bSig, expDiff, &bSig );
2058
    aSig |= LIT64( 0x4000000000000000 );
2059
 aBigger:
2060
    zSig = aSig - bSig;
2061
    zExp = aExp;
2062
 normalizeRoundAndPack:
2063
    --zExp;
2064
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2065
 
2066
}
2067
 
2068
/*
2069
-------------------------------------------------------------------------------
2070
Returns the result of adding the double-precision floating-point values `a'
2071
and `b'.  The operation is performed according to the IEC/IEEE Standard for
2072
Binary Floating-point Arithmetic.
2073
-------------------------------------------------------------------------------
2074
*/
2075
float64 float64_add( float64 a, float64 b )
2076
{
2077
    flag aSign, bSign;
2078
 
2079
    aSign = extractFloat64Sign( a );
2080
    bSign = extractFloat64Sign( b );
2081
    if ( aSign == bSign ) {
2082
        return addFloat64Sigs( a, b, aSign );
2083
    }
2084
    else {
2085
        return subFloat64Sigs( a, b, aSign );
2086
    }
2087
 
2088
}
2089
 
2090
/*
2091
-------------------------------------------------------------------------------
2092
Returns the result of subtracting the double-precision floating-point values
2093
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2094
for Binary Floating-point Arithmetic.
2095
-------------------------------------------------------------------------------
2096
*/
2097
float64 float64_sub( float64 a, float64 b )
2098
{
2099
    flag aSign, bSign;
2100
 
2101
    aSign = extractFloat64Sign( a );
2102
    bSign = extractFloat64Sign( b );
2103
    if ( aSign == bSign ) {
2104
        return subFloat64Sigs( a, b, aSign );
2105
    }
2106
    else {
2107
        return addFloat64Sigs( a, b, aSign );
2108
    }
2109
 
2110
}
2111
 
2112
/*
2113
-------------------------------------------------------------------------------
2114
Returns the result of multiplying the double-precision floating-point values
2115
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2116
for Binary Floating-point Arithmetic.
2117
-------------------------------------------------------------------------------
2118
*/
2119
float64 float64_mul( float64 a, float64 b )
2120
{
2121
    flag aSign, bSign, zSign;
2122
    int16 aExp, bExp, zExp;
2123
    bits64 aSig, bSig, zSig0, zSig1;
2124
 
2125
    aSig = extractFloat64Frac( a );
2126
    aExp = extractFloat64Exp( a );
2127
    aSign = extractFloat64Sign( a );
2128
    bSig = extractFloat64Frac( b );
2129
    bExp = extractFloat64Exp( b );
2130
    bSign = extractFloat64Sign( b );
2131
    zSign = aSign ^ bSign;
2132
    if ( aExp == 0x7FF ) {
2133
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2134
            return propagateFloat64NaN( a, b );
2135
        }
2136
        if ( ( bExp | bSig ) == 0 ) {
2137
            float_raise( float_flag_invalid );
2138
            return float64_default_nan;
2139
        }
2140
        return packFloat64( zSign, 0x7FF, 0 );
2141
    }
2142
    if ( bExp == 0x7FF ) {
2143
        if ( bSig ) return propagateFloat64NaN( a, b );
2144
        if ( ( aExp | aSig ) == 0 ) {
2145
            float_raise( float_flag_invalid );
2146
            return float64_default_nan;
2147
        }
2148
        return packFloat64( zSign, 0x7FF, 0 );
2149
    }
2150
    if ( aExp == 0 ) {
2151
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2152
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2153
    }
2154
    if ( bExp == 0 ) {
2155
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2156
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2157
    }
2158
    zExp = aExp + bExp - 0x3FF;
2159
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2160
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2161
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2162
    zSig0 |= ( zSig1 != 0 );
2163
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2164
        zSig0 <<= 1;
2165
        --zExp;
2166
    }
2167
    return roundAndPackFloat64( zSign, zExp, zSig0 );
2168
 
2169
}
2170
 
2171
/*
2172
-------------------------------------------------------------------------------
2173
Returns the result of dividing the double-precision floating-point value `a'
2174
by the corresponding value `b'.  The operation is performed according to
2175
the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2176
-------------------------------------------------------------------------------
2177
*/
2178
float64 float64_div( float64 a, float64 b )
2179
{
2180
    flag aSign, bSign, zSign;
2181
    int16 aExp, bExp, zExp;
2182
    bits64 aSig, bSig, zSig;
2183
    bits64 rem0, rem1;
2184
    bits64 term0, term1;
2185
 
2186
    aSig = extractFloat64Frac( a );
2187
    aExp = extractFloat64Exp( a );
2188
    aSign = extractFloat64Sign( a );
2189
    bSig = extractFloat64Frac( b );
2190
    bExp = extractFloat64Exp( b );
2191
    bSign = extractFloat64Sign( b );
2192
    zSign = aSign ^ bSign;
2193
    if ( aExp == 0x7FF ) {
2194
        if ( aSig ) return propagateFloat64NaN( a, b );
2195
        if ( bExp == 0x7FF ) {
2196
            if ( bSig ) return propagateFloat64NaN( a, b );
2197
            float_raise( float_flag_invalid );
2198
            return float64_default_nan;
2199
        }
2200
        return packFloat64( zSign, 0x7FF, 0 );
2201
    }
2202
    if ( bExp == 0x7FF ) {
2203
        if ( bSig ) return propagateFloat64NaN( a, b );
2204
        return packFloat64( zSign, 0, 0 );
2205
    }
2206
    if ( bExp == 0 ) {
2207
        if ( bSig == 0 ) {
2208
            if ( ( aExp | aSig ) == 0 ) {
2209
                float_raise( float_flag_invalid );
2210
                return float64_default_nan;
2211
            }
2212
            float_raise( float_flag_divbyzero );
2213
            return packFloat64( zSign, 0x7FF, 0 );
2214
        }
2215
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2216
    }
2217
    if ( aExp == 0 ) {
2218
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2219
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2220
    }
2221
    zExp = aExp - bExp + 0x3FD;
2222
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2223
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2224
    if ( bSig <= ( aSig + aSig ) ) {
2225
        aSig >>= 1;
2226
        ++zExp;
2227
    }
2228
    zSig = estimateDiv128To64( aSig, 0, bSig );
2229
    if ( ( zSig & 0x1FF ) <= 2 ) {
2230
        mul64To128( bSig, zSig, &term0, &term1 );
2231
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2232
        while ( (sbits64) rem0 < 0 ) {
2233
            --zSig;
2234
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2235
        }
2236
        zSig |= ( rem1 != 0 );
2237
    }
2238
    return roundAndPackFloat64( zSign, zExp, zSig );
2239
 
2240
}
2241
 
2242
/*
2243
-------------------------------------------------------------------------------
2244
Returns the remainder of the double-precision floating-point value `a'
2245
with respect to the corresponding value `b'.  The operation is performed
2246
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2247
-------------------------------------------------------------------------------
2248
*/
2249
float64 float64_rem( float64 a, float64 b )
2250
{
2251
    flag aSign, bSign, zSign;
2252
    int16 aExp, bExp, expDiff;
2253
    bits64 aSig, bSig;
2254
    bits64 q, alternateASig;
2255
    sbits64 sigMean;
2256
 
2257
    aSig = extractFloat64Frac( a );
2258
    aExp = extractFloat64Exp( a );
2259
    aSign = extractFloat64Sign( a );
2260
    bSig = extractFloat64Frac( b );
2261
    bExp = extractFloat64Exp( b );
2262
    bSign = extractFloat64Sign( b );
2263
    if ( aExp == 0x7FF ) {
2264
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2265
            return propagateFloat64NaN( a, b );
2266
        }
2267
        float_raise( float_flag_invalid );
2268
        return float64_default_nan;
2269
    }
2270
    if ( bExp == 0x7FF ) {
2271
        if ( bSig ) return propagateFloat64NaN( a, b );
2272
        return a;
2273
    }
2274
    if ( bExp == 0 ) {
2275
        if ( bSig == 0 ) {
2276
            float_raise( float_flag_invalid );
2277
            return float64_default_nan;
2278
        }
2279
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2280
    }
2281
    if ( aExp == 0 ) {
2282
        if ( aSig == 0 ) return a;
2283
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2284
    }
2285
    expDiff = aExp - bExp;
2286
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2287
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2288
    if ( expDiff < 0 ) {
2289
        if ( expDiff < -1 ) return a;
2290
        aSig >>= 1;
2291
    }
2292
    q = ( bSig <= aSig );
2293
    if ( q ) aSig -= bSig;
2294
    expDiff -= 64;
2295
    while ( 0 < expDiff ) {
2296
        q = estimateDiv128To64( aSig, 0, bSig );
2297
        q = ( 2 < q ) ? q - 2 : 0;
2298
        aSig = - ( ( bSig>>2 ) * q );
2299
        expDiff -= 62;
2300
    }
2301
    expDiff += 64;
2302
    if ( 0 < expDiff ) {
2303
        q = estimateDiv128To64( aSig, 0, bSig );
2304
        q = ( 2 < q ) ? q - 2 : 0;
2305
        q >>= 64 - expDiff;
2306
        bSig >>= 2;
2307
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2308
    }
2309
    else {
2310
        aSig >>= 2;
2311
        bSig >>= 2;
2312
    }
2313
    do {
2314
        alternateASig = aSig;
2315
        ++q;
2316
        aSig -= bSig;
2317
    } while ( 0 <= (sbits64) aSig );
2318
    sigMean = aSig + alternateASig;
2319
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2320
        aSig = alternateASig;
2321
    }
2322
    zSign = ( (sbits64) aSig < 0 );
2323
    if ( zSign ) aSig = - aSig;
2324
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2325
 
2326
}
2327
 
2328
/*
2329
-------------------------------------------------------------------------------
2330
Returns the square root of the double-precision floating-point value `a'.
2331
The operation is performed according to the IEC/IEEE Standard for Binary
2332
Floating-point Arithmetic.
2333
-------------------------------------------------------------------------------
2334
*/
2335
float64 float64_sqrt( float64 a )
2336
{
2337
    flag aSign;
2338
    int16 aExp, zExp;
2339
    bits64 aSig, zSig;
2340
    bits64 rem0, rem1, term0, term1; //, shiftedRem;
2341
    //float64 z;
2342
 
2343
    aSig = extractFloat64Frac( a );
2344
    aExp = extractFloat64Exp( a );
2345
    aSign = extractFloat64Sign( a );
2346
    if ( aExp == 0x7FF ) {
2347
        if ( aSig ) return propagateFloat64NaN( a, a );
2348
        if ( ! aSign ) return a;
2349
        float_raise( float_flag_invalid );
2350
        return float64_default_nan;
2351
    }
2352
    if ( aSign ) {
2353
        if ( ( aExp | aSig ) == 0 ) return a;
2354
        float_raise( float_flag_invalid );
2355
        return float64_default_nan;
2356
    }
2357
    if ( aExp == 0 ) {
2358
        if ( aSig == 0 ) return 0;
2359
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2360
    }
2361
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2362
    aSig |= LIT64( 0x0010000000000000 );
2363
    zSig = estimateSqrt32( aExp, aSig>>21 );
2364
    zSig <<= 31;
2365
    aSig <<= 9 - ( aExp & 1 );
2366
    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2367
    if ( ( zSig & 0x3FF ) <= 5 ) {
2368
        if ( zSig < 2 ) {
2369
            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2370
        }
2371
        else {
2372
            aSig <<= 2;
2373
            mul64To128( zSig, zSig, &term0, &term1 );
2374
            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2375
            while ( (sbits64) rem0 < 0 ) {
2376
                --zSig;
2377
                shortShift128Left( 0, zSig, 1, &term0, &term1 );
2378
                term1 |= 1;
2379
                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2380
            }
2381
            zSig |= ( ( rem0 | rem1 ) != 0 );
2382
        }
2383
    }
2384
    shift64RightJamming( zSig, 1, &zSig );
2385
    return roundAndPackFloat64( 0, zExp, zSig );
2386
 
2387
}
2388
 
2389
/*
2390
-------------------------------------------------------------------------------
2391
Returns 1 if the double-precision floating-point value `a' is equal to the
2392
corresponding value `b', and 0 otherwise.  The comparison is performed
2393
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2394
-------------------------------------------------------------------------------
2395
*/
2396
flag float64_eq( float64 a, float64 b )
2397
{
2398
 
2399
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2400
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2401
       ) {
2402
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2403
            float_raise( float_flag_invalid );
2404
        }
2405
        return 0;
2406
    }
2407
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2408
 
2409
}
2410
 
2411
/*
2412
-------------------------------------------------------------------------------
2413
Returns 1 if the double-precision floating-point value `a' is less than or
2414
equal to the corresponding value `b', and 0 otherwise.  The comparison is
2415
performed according to the IEC/IEEE Standard for Binary Floating-point
2416
Arithmetic.
2417
-------------------------------------------------------------------------------
2418
*/
2419
flag float64_le( float64 a, float64 b )
2420
{
2421
    flag aSign, bSign;
2422
 
2423
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2424
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2425
       ) {
2426
        float_raise( float_flag_invalid );
2427
        return 0;
2428
    }
2429
    aSign = extractFloat64Sign( a );
2430
    bSign = extractFloat64Sign( b );
2431
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2432
    return ( a == b ) || ( aSign ^ ( a < b ) );
2433
 
2434
}
2435
 
2436
/*
2437
-------------------------------------------------------------------------------
2438
Returns 1 if the double-precision floating-point value `a' is less than
2439
the corresponding value `b', and 0 otherwise.  The comparison is performed
2440
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2441
-------------------------------------------------------------------------------
2442
*/
2443
flag float64_lt( float64 a, float64 b )
2444
{
2445
    flag aSign, bSign;
2446
 
2447
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2448
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2449
       ) {
2450
        float_raise( float_flag_invalid );
2451
        return 0;
2452
    }
2453
    aSign = extractFloat64Sign( a );
2454
    bSign = extractFloat64Sign( b );
2455
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2456
    return ( a != b ) && ( aSign ^ ( a < b ) );
2457
 
2458
}
2459
 
2460
/*
2461
-------------------------------------------------------------------------------
2462
Returns 1 if the double-precision floating-point value `a' is equal to the
2463
corresponding value `b', and 0 otherwise.  The invalid exception is raised
2464
if either operand is a NaN.  Otherwise, the comparison is performed
2465
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2466
-------------------------------------------------------------------------------
2467
*/
2468
flag float64_eq_signaling( float64 a, float64 b )
2469
{
2470
 
2471
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2472
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2473
       ) {
2474
        float_raise( float_flag_invalid );
2475
        return 0;
2476
    }
2477
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2478
 
2479
}
2480
 
2481
/*
2482
-------------------------------------------------------------------------------
2483
Returns 1 if the double-precision floating-point value `a' is less than or
2484
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2485
cause an exception.  Otherwise, the comparison is performed according to the
2486
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2487
-------------------------------------------------------------------------------
2488
*/
2489
flag float64_le_quiet( float64 a, float64 b )
2490
{
2491
    flag aSign, bSign;
2492
    //int16 aExp, bExp;
2493
 
2494
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2495
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2496
       ) {
2497
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2498
            float_raise( float_flag_invalid );
2499
        }
2500
        return 0;
2501
    }
2502
    aSign = extractFloat64Sign( a );
2503
    bSign = extractFloat64Sign( b );
2504
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2505
    return ( a == b ) || ( aSign ^ ( a < b ) );
2506
 
2507
}
2508
 
2509
/*
2510
-------------------------------------------------------------------------------
2511
Returns 1 if the double-precision floating-point value `a' is less than
2512
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2513
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2514
Standard for Binary Floating-point Arithmetic.
2515
-------------------------------------------------------------------------------
2516
*/
2517
flag float64_lt_quiet( float64 a, float64 b )
2518
{
2519
    flag aSign, bSign;
2520
 
2521
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2522
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2523
       ) {
2524
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2525
            float_raise( float_flag_invalid );
2526
        }
2527
        return 0;
2528
    }
2529
    aSign = extractFloat64Sign( a );
2530
    bSign = extractFloat64Sign( b );
2531
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2532
    return ( a != b ) && ( aSign ^ ( a < b ) );
2533
 
2534
}
2535
 
2536
#ifdef FLOATX80
2537
 
2538
/*
2539
-------------------------------------------------------------------------------
2540
Returns the result of converting the extended double-precision floating-
2541
point value `a' to the 32-bit two's complement integer format.  The
2542
conversion is performed according to the IEC/IEEE Standard for Binary
2543
Floating-point Arithmetic---which means in particular that the conversion
2544
is rounded according to the current rounding mode.  If `a' is a NaN, the
2545
largest positive integer is returned.  Otherwise, if the conversion
2546
overflows, the largest integer with the same sign as `a' is returned.
2547
-------------------------------------------------------------------------------
2548
*/
2549
int32 floatx80_to_int32( floatx80 a )
2550
{
2551
    flag aSign;
2552
    int32 aExp, shiftCount;
2553
    bits64 aSig;
2554
 
2555
    aSig = extractFloatx80Frac( a );
2556
    aExp = extractFloatx80Exp( a );
2557
    aSign = extractFloatx80Sign( a );
2558
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2559
    shiftCount = 0x4037 - aExp;
2560
    if ( shiftCount <= 0 ) shiftCount = 1;
2561
    shift64RightJamming( aSig, shiftCount, &aSig );
2562
    return roundAndPackInt32( aSign, aSig );
2563
 
2564
}
2565
 
2566
/*
2567
-------------------------------------------------------------------------------
2568
Returns the result of converting the extended double-precision floating-
2569
point value `a' to the 32-bit two's complement integer format.  The
2570
conversion is performed according to the IEC/IEEE Standard for Binary
2571
Floating-point Arithmetic, except that the conversion is always rounded
2572
toward zero.  If `a' is a NaN, the largest positive integer is returned.
2573
Otherwise, if the conversion overflows, the largest integer with the same
2574
sign as `a' is returned.
2575
-------------------------------------------------------------------------------
2576
*/
2577
int32 floatx80_to_int32_round_to_zero( floatx80 a )
2578
{
2579
    flag aSign;
2580
    int32 aExp, shiftCount;
2581
    bits64 aSig, savedASig;
2582
    int32 z;
2583
 
2584
    aSig = extractFloatx80Frac( a );
2585
    aExp = extractFloatx80Exp( a );
2586
    aSign = extractFloatx80Sign( a );
2587
    shiftCount = 0x403E - aExp;
2588
    if ( shiftCount < 32 ) {
2589
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2590
        goto invalid;
2591
    }
2592
    else if ( 63 < shiftCount ) {
2593
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2594
        return 0;
2595
    }
2596
    savedASig = aSig;
2597
    aSig >>= shiftCount;
2598
    z = aSig;
2599
    if ( aSign ) z = - z;
2600
    if ( ( z < 0 ) ^ aSign ) {
2601
 invalid:
2602
        float_exception_flags |= float_flag_invalid;
2603
        return aSign ? 0x80000000 : 0x7FFFFFFF;
2604
    }
2605
    if ( ( aSig<<shiftCount ) != savedASig ) {
2606
        float_exception_flags |= float_flag_inexact;
2607
    }
2608
    return z;
2609
 
2610
}
2611
 
2612
/*
2613
-------------------------------------------------------------------------------
2614
Returns the result of converting the extended double-precision floating-
2615
point value `a' to the single-precision floating-point format.  The
2616
conversion is performed according to the IEC/IEEE Standard for Binary
2617
Floating-point Arithmetic.
2618
-------------------------------------------------------------------------------
2619
*/
2620
float32 floatx80_to_float32( floatx80 a )
2621
{
2622
    flag aSign;
2623
    int32 aExp;
2624
    bits64 aSig;
2625
 
2626
    aSig = extractFloatx80Frac( a );
2627
    aExp = extractFloatx80Exp( a );
2628
    aSign = extractFloatx80Sign( a );
2629
    if ( aExp == 0x7FFF ) {
2630
        if ( (bits64) ( aSig<<1 ) ) {
2631
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2632
        }
2633
        return packFloat32( aSign, 0xFF, 0 );
2634
    }
2635
    shift64RightJamming( aSig, 33, &aSig );
2636
    if ( aExp || aSig ) aExp -= 0x3F81;
2637
    return roundAndPackFloat32( aSign, aExp, aSig );
2638
 
2639
}
2640
 
2641
/*
2642
-------------------------------------------------------------------------------
2643
Returns the result of converting the extended double-precision floating-
2644
point value `a' to the double-precision floating-point format.  The
2645
conversion is performed according to the IEC/IEEE Standard for Binary
2646
Floating-point Arithmetic.
2647
-------------------------------------------------------------------------------
2648
*/
2649
float64 floatx80_to_float64( floatx80 a )
2650
{
2651
    flag aSign;
2652
    int32 aExp;
2653
    bits64 aSig, zSig;
2654
 
2655
    aSig = extractFloatx80Frac( a );
2656
    aExp = extractFloatx80Exp( a );
2657
    aSign = extractFloatx80Sign( a );
2658
    if ( aExp == 0x7FFF ) {
2659
        if ( (bits64) ( aSig<<1 ) ) {
2660
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2661
        }
2662
        return packFloat64( aSign, 0x7FF, 0 );
2663
    }
2664
    shift64RightJamming( aSig, 1, &zSig );
2665
    if ( aExp || aSig ) aExp -= 0x3C01;
2666
    return roundAndPackFloat64( aSign, aExp, zSig );
2667
 
2668
}
2669
 
2670
/*
2671
-------------------------------------------------------------------------------
2672
Rounds the extended double-precision floating-point value `a' to an integer,
2673
and returns the result as an extended quadruple-precision floating-point
2674
value.  The operation is performed according to the IEC/IEEE Standard for
2675
Binary Floating-point Arithmetic.
2676
-------------------------------------------------------------------------------
2677
*/
2678
floatx80 floatx80_round_to_int( floatx80 a )
2679
{
2680
    flag aSign;
2681
    int32 aExp;
2682
    bits64 lastBitMask, roundBitsMask;
2683
    int8 roundingMode;
2684
    floatx80 z;
2685
 
2686
    aExp = extractFloatx80Exp( a );
2687
    if ( 0x403E <= aExp ) {
2688
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2689
            return propagateFloatx80NaN( a, a );
2690
        }
2691
        return a;
2692
    }
2693
    if ( aExp <= 0x3FFE ) {
2694
        if (    ( aExp == 0 )
2695
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2696
            return a;
2697
        }
2698
        float_exception_flags |= float_flag_inexact;
2699
        aSign = extractFloatx80Sign( a );
2700
        switch ( float_rounding_mode ) {
2701
         case float_round_nearest_even:
2702
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2703
               ) {
2704
                return
2705
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2706
            }
2707
            break;
2708
         case float_round_down:
2709
            return
2710
                  aSign ?
2711
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2712
                : packFloatx80( 0, 0, 0 );
2713
         case float_round_up:
2714
            return
2715
                  aSign ? packFloatx80( 1, 0, 0 )
2716
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2717
        }
2718
        return packFloatx80( aSign, 0, 0 );
2719
    }
2720
    lastBitMask = 1;
2721
    lastBitMask <<= 0x403E - aExp;
2722
    roundBitsMask = lastBitMask - 1;
2723
    z = a;
2724
    roundingMode = float_rounding_mode;
2725
    if ( roundingMode == float_round_nearest_even ) {
2726
        z.low += lastBitMask>>1;
2727
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2728
    }
2729
    else if ( roundingMode != float_round_to_zero ) {
2730
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2731
            z.low += roundBitsMask;
2732
        }
2733
    }
2734
    z.low &= ~ roundBitsMask;
2735
    if ( z.low == 0 ) {
2736
        ++z.high;
2737
        z.low = LIT64( 0x8000000000000000 );
2738
    }
2739
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2740
    return z;
2741
 
2742
}
2743
 
2744
/*
2745
-------------------------------------------------------------------------------
2746
Returns the result of adding the absolute values of the extended double-
2747
precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2748
negated before being returned.  `zSign' is ignored if the result is a NaN.
2749
The addition is performed according to the IEC/IEEE Standard for Binary
2750
Floating-point Arithmetic.
2751
-------------------------------------------------------------------------------
2752
*/
2753
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2754
{
2755
    int32 aExp, bExp, zExp;
2756
    bits64 aSig, bSig, zSig0, zSig1;
2757
    int32 expDiff;
2758
 
2759
    aSig = extractFloatx80Frac( a );
2760
    aExp = extractFloatx80Exp( a );
2761
    bSig = extractFloatx80Frac( b );
2762
    bExp = extractFloatx80Exp( b );
2763
    expDiff = aExp - bExp;
2764
    if ( 0 < expDiff ) {
2765
        if ( aExp == 0x7FFF ) {
2766
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2767
            return a;
2768
        }
2769
        if ( bExp == 0 ) --expDiff;
2770
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2771
        zExp = aExp;
2772
    }
2773
    else if ( expDiff < 0 ) {
2774
        if ( bExp == 0x7FFF ) {
2775
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2776
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2777
        }
2778
        if ( aExp == 0 ) ++expDiff;
2779
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2780
        zExp = bExp;
2781
    }
2782
    else {
2783
        if ( aExp == 0x7FFF ) {
2784
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2785
                return propagateFloatx80NaN( a, b );
2786
            }
2787
            return a;
2788
        }
2789
        zSig1 = 0;
2790
        zSig0 = aSig + bSig;
2791
        if ( aExp == 0 ) {
2792
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2793
            goto roundAndPack;
2794
        }
2795
        zExp = aExp;
2796
        goto shiftRight1;
2797
    }
2798
 
2799
    zSig0 = aSig + bSig;
2800
 
2801
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2802
 shiftRight1:
2803
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2804
    zSig0 |= LIT64( 0x8000000000000000 );
2805
    ++zExp;
2806
 roundAndPack:
2807
    return
2808
        roundAndPackFloatx80(
2809
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2810
 
2811
}
2812
 
2813
/*
2814
-------------------------------------------------------------------------------
2815
Returns the result of subtracting the absolute values of the extended
2816
double-precision floating-point values `a' and `b'.  If `zSign' is true,
2817
the difference is negated before being returned.  `zSign' is ignored if the
2818
result is a NaN.  The subtraction is performed according to the IEC/IEEE
2819
Standard for Binary Floating-point Arithmetic.
2820
-------------------------------------------------------------------------------
2821
*/
2822
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2823
{
2824
    int32 aExp, bExp, zExp;
2825
    bits64 aSig, bSig, zSig0, zSig1;
2826
    int32 expDiff;
2827
    floatx80 z;
2828
 
2829
    aSig = extractFloatx80Frac( a );
2830
    aExp = extractFloatx80Exp( a );
2831
    bSig = extractFloatx80Frac( b );
2832
    bExp = extractFloatx80Exp( b );
2833
    expDiff = aExp - bExp;
2834
    if ( 0 < expDiff ) goto aExpBigger;
2835
    if ( expDiff < 0 ) goto bExpBigger;
2836
    if ( aExp == 0x7FFF ) {
2837
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2838
            return propagateFloatx80NaN( a, b );
2839
        }
2840
        float_raise( float_flag_invalid );
2841
        z.low = floatx80_default_nan_low;
2842
        z.high = floatx80_default_nan_high;
2843
        return z;
2844
    }
2845
    if ( aExp == 0 ) {
2846
        aExp = 1;
2847
        bExp = 1;
2848
    }
2849
    zSig1 = 0;
2850
    if ( bSig < aSig ) goto aBigger;
2851
    if ( aSig < bSig ) goto bBigger;
2852
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2853
 bExpBigger:
2854
    if ( bExp == 0x7FFF ) {
2855
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2856
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2857
    }
2858
    if ( aExp == 0 ) ++expDiff;
2859
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2860
 bBigger:
2861
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2862
    zExp = bExp;
2863
    zSign ^= 1;
2864
    goto normalizeRoundAndPack;
2865
 aExpBigger:
2866
    if ( aExp == 0x7FFF ) {
2867
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2868
        return a;
2869
    }
2870
    if ( bExp == 0 ) --expDiff;
2871
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2872
 aBigger:
2873
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2874
    zExp = aExp;
2875
 normalizeRoundAndPack:
2876
    return
2877
        normalizeRoundAndPackFloatx80(
2878
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2879
 
2880
}
2881
 
2882
/*
2883
-------------------------------------------------------------------------------
2884
Returns the result of adding the extended double-precision floating-point
2885
values `a' and `b'.  The operation is performed according to the IEC/IEEE
2886
Standard for Binary Floating-point Arithmetic.
2887
-------------------------------------------------------------------------------
2888
*/
2889
floatx80 floatx80_add( floatx80 a, floatx80 b )
2890
{
2891
    flag aSign, bSign;
2892
 
2893
    aSign = extractFloatx80Sign( a );
2894
    bSign = extractFloatx80Sign( b );
2895
    if ( aSign == bSign ) {
2896
        return addFloatx80Sigs( a, b, aSign );
2897
    }
2898
    else {
2899
        return subFloatx80Sigs( a, b, aSign );
2900
    }
2901
 
2902
}
2903
 
2904
/*
2905
-------------------------------------------------------------------------------
2906
Returns the result of subtracting the extended double-precision floating-
2907
point values `a' and `b'.  The operation is performed according to the
2908
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2909
-------------------------------------------------------------------------------
2910
*/
2911
floatx80 floatx80_sub( floatx80 a, floatx80 b )
2912
{
2913
    flag aSign, bSign;
2914
 
2915
    aSign = extractFloatx80Sign( a );
2916
    bSign = extractFloatx80Sign( b );
2917
    if ( aSign == bSign ) {
2918
        return subFloatx80Sigs( a, b, aSign );
2919
    }
2920
    else {
2921
        return addFloatx80Sigs( a, b, aSign );
2922
    }
2923
 
2924
}
2925
 
2926
/*
2927
-------------------------------------------------------------------------------
2928
Returns the result of multiplying the extended double-precision floating-
2929
point values `a' and `b'.  The operation is performed according to the
2930
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2931
-------------------------------------------------------------------------------
2932
*/
2933
floatx80 floatx80_mul( floatx80 a, floatx80 b )
2934
{
2935
    flag aSign, bSign, zSign;
2936
    int32 aExp, bExp, zExp;
2937
    bits64 aSig, bSig, zSig0, zSig1;
2938
    floatx80 z;
2939
 
2940
    aSig = extractFloatx80Frac( a );
2941
    aExp = extractFloatx80Exp( a );
2942
    aSign = extractFloatx80Sign( a );
2943
    bSig = extractFloatx80Frac( b );
2944
    bExp = extractFloatx80Exp( b );
2945
    bSign = extractFloatx80Sign( b );
2946
    zSign = aSign ^ bSign;
2947
    if ( aExp == 0x7FFF ) {
2948
        if (    (bits64) ( aSig<<1 )
2949
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2950
            return propagateFloatx80NaN( a, b );
2951
        }
2952
        if ( ( bExp | bSig ) == 0 ) goto invalid;
2953
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2954
    }
2955
    if ( bExp == 0x7FFF ) {
2956
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2957
        if ( ( aExp | aSig ) == 0 ) {
2958
 invalid:
2959
            float_raise( float_flag_invalid );
2960
            z.low = floatx80_default_nan_low;
2961
            z.high = floatx80_default_nan_high;
2962
            return z;
2963
        }
2964
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2965
    }
2966
    if ( aExp == 0 ) {
2967
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2968
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2969
    }
2970
    if ( bExp == 0 ) {
2971
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2972
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2973
    }
2974
    zExp = aExp + bExp - 0x3FFE;
2975
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2976
    if ( 0 < (sbits64) zSig0 ) {
2977
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2978
        --zExp;
2979
    }
2980
    return
2981
        roundAndPackFloatx80(
2982
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2983
 
2984
}
2985
 
2986
/*
2987
-------------------------------------------------------------------------------
2988
Returns the result of dividing the extended double-precision floating-point
2989
value `a' by the corresponding value `b'.  The operation is performed
2990
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2991
-------------------------------------------------------------------------------
2992
*/
2993
floatx80 floatx80_div( floatx80 a, floatx80 b )
2994
{
2995
    flag aSign, bSign, zSign;
2996
    int32 aExp, bExp, zExp;
2997
    bits64 aSig, bSig, zSig0, zSig1;
2998
    bits64 rem0, rem1, rem2, term0, term1, term2;
2999
    floatx80 z;
3000
 
3001
    aSig = extractFloatx80Frac( a );
3002
    aExp = extractFloatx80Exp( a );
3003
    aSign = extractFloatx80Sign( a );
3004
    bSig = extractFloatx80Frac( b );
3005
    bExp = extractFloatx80Exp( b );
3006
    bSign = extractFloatx80Sign( b );
3007
    zSign = aSign ^ bSign;
3008
    if ( aExp == 0x7FFF ) {
3009
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3010
        if ( bExp == 0x7FFF ) {
3011
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012
            goto invalid;
3013
        }
3014
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3015
    }
3016
    if ( bExp == 0x7FFF ) {
3017
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3018
        return packFloatx80( zSign, 0, 0 );
3019
    }
3020
    if ( bExp == 0 ) {
3021
        if ( bSig == 0 ) {
3022
            if ( ( aExp | aSig ) == 0 ) {
3023
 invalid:
3024
                float_raise( float_flag_invalid );
3025
                z.low = floatx80_default_nan_low;
3026
                z.high = floatx80_default_nan_high;
3027
                return z;
3028
            }
3029
            float_raise( float_flag_divbyzero );
3030
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3031
        }
3032
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3033
    }
3034
    if ( aExp == 0 ) {
3035
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3036
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3037
    }
3038
    zExp = aExp - bExp + 0x3FFE;
3039
    rem1 = 0;
3040
    if ( bSig <= aSig ) {
3041
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3042
        ++zExp;
3043
    }
3044
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3045
    mul64To128( bSig, zSig0, &term0, &term1 );
3046
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3047
    while ( (sbits64) rem0 < 0 ) {
3048
        --zSig0;
3049
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3050
    }
3051
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3052
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3053
        mul64To128( bSig, zSig1, &term1, &term2 );
3054
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3055
        while ( (sbits64) rem1 < 0 ) {
3056
            --zSig1;
3057
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3058
        }
3059
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3060
    }
3061
    return
3062
        roundAndPackFloatx80(
3063
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3064
 
3065
}
3066
 
3067
/*
3068
-------------------------------------------------------------------------------
3069
Returns the remainder of the extended double-precision floating-point value
3070
`a' with respect to the corresponding value `b'.  The operation is performed
3071
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3072
-------------------------------------------------------------------------------
3073
*/
3074
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3075
{
3076
    flag aSign, bSign, zSign;
3077
    int32 aExp, bExp, expDiff;
3078
    bits64 aSig0, aSig1, bSig;
3079
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3080
    floatx80 z;
3081
 
3082
    aSig0 = extractFloatx80Frac( a );
3083
    aExp = extractFloatx80Exp( a );
3084
    aSign = extractFloatx80Sign( a );
3085
    bSig = extractFloatx80Frac( b );
3086
    bExp = extractFloatx80Exp( b );
3087
    bSign = extractFloatx80Sign( b );
3088
    if ( aExp == 0x7FFF ) {
3089
        if (    (bits64) ( aSig0<<1 )
3090
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3091
            return propagateFloatx80NaN( a, b );
3092
        }
3093
        goto invalid;
3094
    }
3095
    if ( bExp == 0x7FFF ) {
3096
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3097
        return a;
3098
    }
3099
    if ( bExp == 0 ) {
3100
        if ( bSig == 0 ) {
3101
 invalid:
3102
            float_raise( float_flag_invalid );
3103
            z.low = floatx80_default_nan_low;
3104
            z.high = floatx80_default_nan_high;
3105
            return z;
3106
        }
3107
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3108
    }
3109
    if ( aExp == 0 ) {
3110
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3111
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3112
    }
3113
    bSig |= LIT64( 0x8000000000000000 );
3114
    zSign = aSign;
3115
    expDiff = aExp - bExp;
3116
    aSig1 = 0;
3117
    if ( expDiff < 0 ) {
3118
        if ( expDiff < -1 ) return a;
3119
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3120
        expDiff = 0;
3121
    }
3122
    q = ( bSig <= aSig0 );
3123
    if ( q ) aSig0 -= bSig;
3124
    expDiff -= 64;
3125
    while ( 0 < expDiff ) {
3126
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3127
        q = ( 2 < q ) ? q - 2 : 0;
3128
        mul64To128( bSig, q, &term0, &term1 );
3129
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3130
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3131
        expDiff -= 62;
3132
    }
3133
    expDiff += 64;
3134
    if ( 0 < expDiff ) {
3135
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3136
        q = ( 2 < q ) ? q - 2 : 0;
3137
        q >>= 64 - expDiff;
3138
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3139
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3141
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3142
            ++q;
3143
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3144
        }
3145
    }
3146
    else {
3147
        term1 = 0;
3148
        term0 = bSig;
3149
    }
3150
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3151
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3152
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3153
              && ( q & 1 ) )
3154
       ) {
3155
        aSig0 = alternateASig0;
3156
        aSig1 = alternateASig1;
3157
        zSign = ! zSign;
3158
    }
3159
    return
3160
        normalizeRoundAndPackFloatx80(
3161
            80, zSign, bExp + expDiff, aSig0, aSig1 );
3162
 
3163
}
3164
 
3165
/*
3166
-------------------------------------------------------------------------------
3167
Returns the square root of the extended double-precision floating-point
3168
value `a'.  The operation is performed according to the IEC/IEEE Standard
3169
for Binary Floating-point Arithmetic.
3170
-------------------------------------------------------------------------------
3171
*/
3172
floatx80 floatx80_sqrt( floatx80 a )
3173
{
3174
    flag aSign;
3175
    int32 aExp, zExp;
3176
    bits64 aSig0, aSig1, zSig0, zSig1;
3177
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3178
    bits64 shiftedRem0, shiftedRem1;
3179
    floatx80 z;
3180
 
3181
    aSig0 = extractFloatx80Frac( a );
3182
    aExp = extractFloatx80Exp( a );
3183
    aSign = extractFloatx80Sign( a );
3184
    if ( aExp == 0x7FFF ) {
3185
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3186
        if ( ! aSign ) return a;
3187
        goto invalid;
3188
    }
3189
    if ( aSign ) {
3190
        if ( ( aExp | aSig0 ) == 0 ) return a;
3191
 invalid:
3192
        float_raise( float_flag_invalid );
3193
        z.low = floatx80_default_nan_low;
3194
        z.high = floatx80_default_nan_high;
3195
        return z;
3196
    }
3197
    if ( aExp == 0 ) {
3198
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3199
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3200
    }
3201
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3202
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3203
    zSig0 <<= 31;
3204
    aSig1 = 0;
3205
    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3206
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3207
    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3208
    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3209
    mul64To128( zSig0, zSig0, &term0, &term1 );
3210
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3211
    while ( (sbits64) rem0 < 0 ) {
3212
        --zSig0;
3213
        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3214
        term1 |= 1;
3215
        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3216
    }
3217
    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3218
    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3219
    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3220
        if ( zSig1 == 0 ) zSig1 = 1;
3221
        mul64To128( zSig0, zSig1, &term1, &term2 );
3222
        shortShift128Left( term1, term2, 1, &term1, &term2 );
3223
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3224
        mul64To128( zSig1, zSig1, &term2, &term3 );
3225
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3226
        while ( (sbits64) rem1 < 0 ) {
3227
            --zSig1;
3228
            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3229
            term3 |= 1;
3230
            add192(
3231
                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3232
        }
3233
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3234
    }
3235
    return
3236
        roundAndPackFloatx80(
3237
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3238
 
3239
}
3240
 
3241
/*
3242
-------------------------------------------------------------------------------
3243
Returns 1 if the extended double-precision floating-point value `a' is
3244
equal to the corresponding value `b', and 0 otherwise.  The comparison is
3245
performed according to the IEC/IEEE Standard for Binary Floating-point
3246
Arithmetic.
3247
-------------------------------------------------------------------------------
3248
*/
3249
flag floatx80_eq( floatx80 a, floatx80 b )
3250
{
3251
 
3252
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3253
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3254
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3255
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3256
       ) {
3257
        if (    floatx80_is_signaling_nan( a )
3258
             || floatx80_is_signaling_nan( b ) ) {
3259
            float_raise( float_flag_invalid );
3260
        }
3261
        return 0;
3262
    }
3263
    return
3264
           ( a.low == b.low )
3265
        && (    ( a.high == b.high )
3266
             || (    ( a.low == 0 )
3267
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3268
           );
3269
 
3270
}
3271
 
3272
/*
3273
-------------------------------------------------------------------------------
3274
Returns 1 if the extended double-precision floating-point value `a' is
3275
less than or equal to the corresponding value `b', and 0 otherwise.  The
3276
comparison is performed according to the IEC/IEEE Standard for Binary
3277
Floating-point Arithmetic.
3278
-------------------------------------------------------------------------------
3279
*/
3280
flag floatx80_le( floatx80 a, floatx80 b )
3281
{
3282
    flag aSign, bSign;
3283
 
3284
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3285
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3286
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3287
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3288
       ) {
3289
        float_raise( float_flag_invalid );
3290
        return 0;
3291
    }
3292
    aSign = extractFloatx80Sign( a );
3293
    bSign = extractFloatx80Sign( b );
3294
    if ( aSign != bSign ) {
3295
        return
3296
               aSign
3297
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3298
                 == 0 );
3299
    }
3300
    return
3301
          aSign ? le128( b.high, b.low, a.high, a.low )
3302
        : le128( a.high, a.low, b.high, b.low );
3303
 
3304
}
3305
 
3306
/*
3307
-------------------------------------------------------------------------------
3308
Returns 1 if the extended double-precision floating-point value `a' is
3309
less than the corresponding value `b', and 0 otherwise.  The comparison
3310
is performed according to the IEC/IEEE Standard for Binary Floating-point
3311
Arithmetic.
3312
-------------------------------------------------------------------------------
3313
*/
3314
flag floatx80_lt( floatx80 a, floatx80 b )
3315
{
3316
    flag aSign, bSign;
3317
 
3318
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3319
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3320
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3321
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3322
       ) {
3323
        float_raise( float_flag_invalid );
3324
        return 0;
3325
    }
3326
    aSign = extractFloatx80Sign( a );
3327
    bSign = extractFloatx80Sign( b );
3328
    if ( aSign != bSign ) {
3329
        return
3330
               aSign
3331
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3332
                 != 0 );
3333
    }
3334
    return
3335
          aSign ? lt128( b.high, b.low, a.high, a.low )
3336
        : lt128( a.high, a.low, b.high, b.low );
3337
 
3338
}
3339
 
3340
/*
3341
-------------------------------------------------------------------------------
3342
Returns 1 if the extended double-precision floating-point value `a' is equal
3343
to the corresponding value `b', and 0 otherwise.  The invalid exception is
3344
raised if either operand is a NaN.  Otherwise, the comparison is performed
3345
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3346
-------------------------------------------------------------------------------
3347
*/
3348
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3349
{
3350
 
3351
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3352
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3353
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3354
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3355
       ) {
3356
        float_raise( float_flag_invalid );
3357
        return 0;
3358
    }
3359
    return
3360
           ( a.low == b.low )
3361
        && (    ( a.high == b.high )
3362
             || (    ( a.low == 0 )
3363
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3364
           );
3365
 
3366
}
3367
 
3368
/*
3369
-------------------------------------------------------------------------------
3370
Returns 1 if the extended double-precision floating-point value `a' is less
3371
than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3372
do not cause an exception.  Otherwise, the comparison is performed according
3373
to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3374
-------------------------------------------------------------------------------
3375
*/
3376
flag floatx80_le_quiet( floatx80 a, floatx80 b )
3377
{
3378
    flag aSign, bSign;
3379
 
3380
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3381
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3382
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3383
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3384
       ) {
3385
        if (    floatx80_is_signaling_nan( a )
3386
             || floatx80_is_signaling_nan( b ) ) {
3387
            float_raise( float_flag_invalid );
3388
        }
3389
        return 0;
3390
    }
3391
    aSign = extractFloatx80Sign( a );
3392
    bSign = extractFloatx80Sign( b );
3393
    if ( aSign != bSign ) {
3394
        return
3395
               aSign
3396
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3397
                 == 0 );
3398
    }
3399
    return
3400
          aSign ? le128( b.high, b.low, a.high, a.low )
3401
        : le128( a.high, a.low, b.high, b.low );
3402
 
3403
}
3404
 
3405
/*
3406
-------------------------------------------------------------------------------
3407
Returns 1 if the extended double-precision floating-point value `a' is less
3408
than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3409
an exception.  Otherwise, the comparison is performed according to the
3410
IEC/IEEE Standard for Binary Floating-point Arithmetic.
3411
-------------------------------------------------------------------------------
3412
*/
3413
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3414
{
3415
    flag aSign, bSign;
3416
 
3417
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3418
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3419
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3420
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3421
       ) {
3422
        if (    floatx80_is_signaling_nan( a )
3423
             || floatx80_is_signaling_nan( b ) ) {
3424
            float_raise( float_flag_invalid );
3425
        }
3426
        return 0;
3427
    }
3428
    aSign = extractFloatx80Sign( a );
3429
    bSign = extractFloatx80Sign( b );
3430
    if ( aSign != bSign ) {
3431
        return
3432
               aSign
3433
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3434
                 != 0 );
3435
    }
3436
    return
3437
          aSign ? lt128( b.high, b.low, a.high, a.low )
3438
        : lt128( a.high, a.low, b.high, b.low );
3439
 
3440
}
3441
 
3442
#endif
3443
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.