OpenCores
URL https://opencores.org/ocsvn/fpu100/fpu100/trunk

Subversion Repositories fpu100

[/] [fpu100/] [tags/] [arelease/] [test_bench/] [SoftFloat/] [softfloat/] [bits64/] [softfloat.c] - Blame information for rev 6

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 6 jidan
 
2
/*============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
 
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30
 
31
=============================================================================*/
32
 
33
#include "milieu.h"
34
#include "softfloat.h"
35
 
36
/*----------------------------------------------------------------------------
37
| Floating-point rounding mode, extended double-precision rounding precision,
38
| and exception flags.
39
*----------------------------------------------------------------------------*/
40
int8 float_rounding_mode = float_round_nearest_even;
41
int8 float_exception_flags = 0;
42
#ifdef FLOATX80
43
int8 floatx80_rounding_precision = 80;
44
#endif
45
 
46
/*----------------------------------------------------------------------------
47
| Primitive arithmetic functions, including multi-word arithmetic, and
48
| division and square root approximations.  (Can be specialized to target if
49
| desired.)
50
*----------------------------------------------------------------------------*/
51
#include "softfloat-macros"
52
 
53
/*----------------------------------------------------------------------------
54
| Functions and definitions to determine:  (1) whether tininess for underflow
55
| is detected before or after rounding by default, (2) what (if anything)
56
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
57
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
58
| are propagated from function inputs to output.  These details are target-
59
| specific.
60
*----------------------------------------------------------------------------*/
61
#include "softfloat-specialize"
62
 
63
/*----------------------------------------------------------------------------
64
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
65
| and 7, and returns the properly rounded 32-bit integer corresponding to the
66
| input.  If `zSign' is 1, the input is negated before being converted to an
67
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
68
| is simply rounded to an integer, with the inexact exception raised if the
69
| input cannot be represented exactly as an integer.  However, if the fixed-
70
| point input is too large, the invalid exception is raised and the largest
71
| positive or negative integer is returned.
72
*----------------------------------------------------------------------------*/
73
 
74
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
75
{
76
    int8 roundingMode;
77
    flag roundNearestEven;
78
    int8 roundIncrement, roundBits;
79
    int32 z;
80
 
81
    roundingMode = float_rounding_mode;
82
    roundNearestEven = ( roundingMode == float_round_nearest_even );
83
    roundIncrement = 0x40;
84
    if ( ! roundNearestEven ) {
85
        if ( roundingMode == float_round_to_zero ) {
86
            roundIncrement = 0;
87
        }
88
        else {
89
            roundIncrement = 0x7F;
90
            if ( zSign ) {
91
                if ( roundingMode == float_round_up ) roundIncrement = 0;
92
            }
93
            else {
94
                if ( roundingMode == float_round_down ) roundIncrement = 0;
95
            }
96
        }
97
    }
98
    roundBits = absZ & 0x7F;
99
    absZ = ( absZ + roundIncrement )>>7;
100
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
101
    z = absZ;
102
    if ( zSign ) z = - z;
103
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
104
        float_raise( float_flag_invalid );
105
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
106
    }
107
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
108
    return z;
109
 
110
}
111
 
112
/*----------------------------------------------------------------------------
113
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
114
| `absZ1', with binary point between bits 63 and 64 (between the input words),
115
| and returns the properly rounded 64-bit integer corresponding to the input.
116
| If `zSign' is 1, the input is negated before being converted to an integer.
117
| Ordinarily, the fixed-point input is simply rounded to an integer, with
118
| the inexact exception raised if the input cannot be represented exactly as
119
| an integer.  However, if the fixed-point input is too large, the invalid
120
| exception is raised and the largest positive or negative integer is
121
| returned.
122
*----------------------------------------------------------------------------*/
123
 
124
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
125
{
126
    int8 roundingMode;
127
    flag roundNearestEven, increment;
128
    int64 z;
129
 
130
    roundingMode = float_rounding_mode;
131
    roundNearestEven = ( roundingMode == float_round_nearest_even );
132
    increment = ( (sbits64) absZ1 < 0 );
133
    if ( ! roundNearestEven ) {
134
        if ( roundingMode == float_round_to_zero ) {
135
            increment = 0;
136
        }
137
        else {
138
            if ( zSign ) {
139
                increment = ( roundingMode == float_round_down ) && absZ1;
140
            }
141
            else {
142
                increment = ( roundingMode == float_round_up ) && absZ1;
143
            }
144
        }
145
    }
146
    if ( increment ) {
147
        ++absZ0;
148
        if ( absZ0 == 0 ) goto overflow;
149
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
150
    }
151
    z = absZ0;
152
    if ( zSign ) z = - z;
153
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
154
 overflow:
155
        float_raise( float_flag_invalid );
156
        return
157
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
158
            : LIT64( 0x7FFFFFFFFFFFFFFF );
159
    }
160
    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
161
    return z;
162
 
163
}
164
 
165
/*----------------------------------------------------------------------------
166
| Returns the fraction bits of the single-precision floating-point value `a'.
167
*----------------------------------------------------------------------------*/
168
 
169
INLINE bits32 extractFloat32Frac( float32 a )
170
{
171
 
172
    return a & 0x007FFFFF;
173
 
174
}
175
 
176
/*----------------------------------------------------------------------------
177
| Returns the exponent bits of the single-precision floating-point value `a'.
178
*----------------------------------------------------------------------------*/
179
 
180
INLINE int16 extractFloat32Exp( float32 a )
181
{
182
 
183
    return ( a>>23 ) & 0xFF;
184
 
185
}
186
 
187
/*----------------------------------------------------------------------------
188
| Returns the sign bit of the single-precision floating-point value `a'.
189
*----------------------------------------------------------------------------*/
190
 
191
INLINE flag extractFloat32Sign( float32 a )
192
{
193
 
194
    return a>>31;
195
 
196
}
197
 
198
/*----------------------------------------------------------------------------
199
| Normalizes the subnormal single-precision floating-point value represented
200
| by the denormalized significand `aSig'.  The normalized exponent and
201
| significand are stored at the locations pointed to by `zExpPtr' and
202
| `zSigPtr', respectively.
203
*----------------------------------------------------------------------------*/
204
 
205
static void
206
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
207
{
208
    int8 shiftCount;
209
 
210
    shiftCount = countLeadingZeros32( aSig ) - 8;
211
    *zSigPtr = aSig<<shiftCount;
212
    *zExpPtr = 1 - shiftCount;
213
 
214
}
215
 
216
/*----------------------------------------------------------------------------
217
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
218
| single-precision floating-point value, returning the result.  After being
219
| shifted into the proper positions, the three fields are simply added
220
| together to form the result.  This means that any integer portion of `zSig'
221
| will be added into the exponent.  Since a properly normalized significand
222
| will have an integer portion equal to 1, the `zExp' input should be 1 less
223
| than the desired result exponent whenever `zSig' is a complete, normalized
224
| significand.
225
*----------------------------------------------------------------------------*/
226
 
227
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
228
{
229
 
230
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
231
 
232
}
233
 
234
/*----------------------------------------------------------------------------
235
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
236
| and significand `zSig', and returns the proper single-precision floating-
237
| point value corresponding to the abstract input.  Ordinarily, the abstract
238
| value is simply rounded and packed into the single-precision format, with
239
| the inexact exception raised if the abstract input cannot be represented
240
| exactly.  However, if the abstract value is too large, the overflow and
241
| inexact exceptions are raised and an infinity or maximal finite value is
242
| returned.  If the abstract value is too small, the input value is rounded to
243
| a subnormal number, and the underflow and inexact exceptions are raised if
244
| the abstract input cannot be represented exactly as a subnormal single-
245
| precision floating-point number.
246
|     The input significand `zSig' has its binary point between bits 30
247
| and 29, which is 7 bits to the left of the usual location.  This shifted
248
| significand must be normalized or smaller.  If `zSig' is not normalized,
249
| `zExp' must be 0; in that case, the result returned is a subnormal number,
250
| and it must not require rounding.  In the usual case that `zSig' is
251
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
252
| The handling of underflow and overflow follows the IEC/IEEE Standard for
253
| Binary Floating-Point Arithmetic.
254
*----------------------------------------------------------------------------*/
255
 
256
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
257
{
258
    int8 roundingMode;
259
    flag roundNearestEven;
260
    int8 roundIncrement, roundBits;
261
    flag isTiny;
262
 
263
    roundingMode = float_rounding_mode;
264
    roundNearestEven = ( roundingMode == float_round_nearest_even );
265
    roundIncrement = 0x40;
266
    if ( ! roundNearestEven ) {
267
        if ( roundingMode == float_round_to_zero ) {
268
            roundIncrement = 0;
269
        }
270
        else {
271
            roundIncrement = 0x7F;
272
            if ( zSign ) {
273
                if ( roundingMode == float_round_up ) roundIncrement = 0;
274
            }
275
            else {
276
                if ( roundingMode == float_round_down ) roundIncrement = 0;
277
            }
278
        }
279
    }
280
    roundBits = zSig & 0x7F;
281
    if ( 0xFD <= (bits16) zExp ) {
282
        if (    ( 0xFD < zExp )
283
             || (    ( zExp == 0xFD )
284
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
285
           ) {
286
            float_raise( float_flag_overflow | float_flag_inexact );
287
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
288
        }
289
        if ( zExp < 0 ) {
290
            isTiny =
291
                   ( float_detect_tininess == float_tininess_before_rounding )
292
                || ( zExp < -1 )
293
                || ( zSig + roundIncrement < 0x80000000 );
294
            shift32RightJamming( zSig, - zExp, &zSig );
295
            zExp = 0;
296
            roundBits = zSig & 0x7F;
297
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
298
        }
299
    }
300
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
301
    zSig = ( zSig + roundIncrement )>>7;
302
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
303
    if ( zSig == 0 ) zExp = 0;
304
    return packFloat32( zSign, zExp, zSig );
305
 
306
}
307
 
308
/*----------------------------------------------------------------------------
309
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
310
| and significand `zSig', and returns the proper single-precision floating-
311
| point value corresponding to the abstract input.  This routine is just like
312
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
313
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
314
| floating-point exponent.
315
*----------------------------------------------------------------------------*/
316
 
317
static float32
318
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
319
{
320
    int8 shiftCount;
321
 
322
    shiftCount = countLeadingZeros32( zSig ) - 1;
323
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
324
 
325
}
326
 
327
/*----------------------------------------------------------------------------
328
| Returns the fraction bits of the double-precision floating-point value `a'.
329
*----------------------------------------------------------------------------*/
330
 
331
INLINE bits64 extractFloat64Frac( float64 a )
332
{
333
 
334
    return a & LIT64( 0x000FFFFFFFFFFFFF );
335
 
336
}
337
 
338
/*----------------------------------------------------------------------------
339
| Returns the exponent bits of the double-precision floating-point value `a'.
340
*----------------------------------------------------------------------------*/
341
 
342
INLINE int16 extractFloat64Exp( float64 a )
343
{
344
 
345
    return ( a>>52 ) & 0x7FF;
346
 
347
}
348
 
349
/*----------------------------------------------------------------------------
350
| Returns the sign bit of the double-precision floating-point value `a'.
351
*----------------------------------------------------------------------------*/
352
 
353
INLINE flag extractFloat64Sign( float64 a )
354
{
355
 
356
    return a>>63;
357
 
358
}
359
 
360
/*----------------------------------------------------------------------------
361
| Normalizes the subnormal double-precision floating-point value represented
362
| by the denormalized significand `aSig'.  The normalized exponent and
363
| significand are stored at the locations pointed to by `zExpPtr' and
364
| `zSigPtr', respectively.
365
*----------------------------------------------------------------------------*/
366
 
367
static void
368
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
369
{
370
    int8 shiftCount;
371
 
372
    shiftCount = countLeadingZeros64( aSig ) - 11;
373
    *zSigPtr = aSig<<shiftCount;
374
    *zExpPtr = 1 - shiftCount;
375
 
376
}
377
 
378
/*----------------------------------------------------------------------------
379
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
380
| double-precision floating-point value, returning the result.  After being
381
| shifted into the proper positions, the three fields are simply added
382
| together to form the result.  This means that any integer portion of `zSig'
383
| will be added into the exponent.  Since a properly normalized significand
384
| will have an integer portion equal to 1, the `zExp' input should be 1 less
385
| than the desired result exponent whenever `zSig' is a complete, normalized
386
| significand.
387
*----------------------------------------------------------------------------*/
388
 
389
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
390
{
391
 
392
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
393
 
394
}
395
 
396
/*----------------------------------------------------------------------------
397
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
398
| and significand `zSig', and returns the proper double-precision floating-
399
| point value corresponding to the abstract input.  Ordinarily, the abstract
400
| value is simply rounded and packed into the double-precision format, with
401
| the inexact exception raised if the abstract input cannot be represented
402
| exactly.  However, if the abstract value is too large, the overflow and
403
| inexact exceptions are raised and an infinity or maximal finite value is
404
| returned.  If the abstract value is too small, the input value is rounded
405
| to a subnormal number, and the underflow and inexact exceptions are raised
406
| if the abstract input cannot be represented exactly as a subnormal double-
407
| precision floating-point number.
408
|     The input significand `zSig' has its binary point between bits 62
409
| and 61, which is 10 bits to the left of the usual location.  This shifted
410
| significand must be normalized or smaller.  If `zSig' is not normalized,
411
| `zExp' must be 0; in that case, the result returned is a subnormal number,
412
| and it must not require rounding.  In the usual case that `zSig' is
413
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
414
| The handling of underflow and overflow follows the IEC/IEEE Standard for
415
| Binary Floating-Point Arithmetic.
416
*----------------------------------------------------------------------------*/
417
 
418
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
419
{
420
    int8 roundingMode;
421
    flag roundNearestEven;
422
    int16 roundIncrement, roundBits;
423
    flag isTiny;
424
 
425
    roundingMode = float_rounding_mode;
426
    roundNearestEven = ( roundingMode == float_round_nearest_even );
427
    roundIncrement = 0x200;
428
    if ( ! roundNearestEven ) {
429
        if ( roundingMode == float_round_to_zero ) {
430
            roundIncrement = 0;
431
        }
432
        else {
433
            roundIncrement = 0x3FF;
434
            if ( zSign ) {
435
                if ( roundingMode == float_round_up ) roundIncrement = 0;
436
            }
437
            else {
438
                if ( roundingMode == float_round_down ) roundIncrement = 0;
439
            }
440
        }
441
    }
442
    roundBits = zSig & 0x3FF;
443
    if ( 0x7FD <= (bits16) zExp ) {
444
        if (    ( 0x7FD < zExp )
445
             || (    ( zExp == 0x7FD )
446
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
447
           ) {
448
            float_raise( float_flag_overflow | float_flag_inexact );
449
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
450
        }
451
        if ( zExp < 0 ) {
452
            isTiny =
453
                   ( float_detect_tininess == float_tininess_before_rounding )
454
                || ( zExp < -1 )
455
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
456
            shift64RightJamming( zSig, - zExp, &zSig );
457
            zExp = 0;
458
            roundBits = zSig & 0x3FF;
459
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
460
        }
461
    }
462
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
463
    zSig = ( zSig + roundIncrement )>>10;
464
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
465
    if ( zSig == 0 ) zExp = 0;
466
    return packFloat64( zSign, zExp, zSig );
467
 
468
}
469
 
470
/*----------------------------------------------------------------------------
471
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472
| and significand `zSig', and returns the proper double-precision floating-
473
| point value corresponding to the abstract input.  This routine is just like
474
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
475
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
476
| floating-point exponent.
477
*----------------------------------------------------------------------------*/
478
 
479
static float64
480
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
481
{
482
    int8 shiftCount;
483
 
484
    shiftCount = countLeadingZeros64( zSig ) - 1;
485
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
486
 
487
}
488
 
489
#ifdef FLOATX80
490
 
491
/*----------------------------------------------------------------------------
492
| Returns the fraction bits of the extended double-precision floating-point
493
| value `a'.
494
*----------------------------------------------------------------------------*/
495
 
496
INLINE bits64 extractFloatx80Frac( floatx80 a )
497
{
498
 
499
    return a.low;
500
 
501
}
502
 
503
/*----------------------------------------------------------------------------
504
| Returns the exponent bits of the extended double-precision floating-point
505
| value `a'.
506
*----------------------------------------------------------------------------*/
507
 
508
INLINE int32 extractFloatx80Exp( floatx80 a )
509
{
510
 
511
    return a.high & 0x7FFF;
512
 
513
}
514
 
515
/*----------------------------------------------------------------------------
516
| Returns the sign bit of the extended double-precision floating-point value
517
| `a'.
518
*----------------------------------------------------------------------------*/
519
 
520
INLINE flag extractFloatx80Sign( floatx80 a )
521
{
522
 
523
    return a.high>>15;
524
 
525
}
526
 
527
/*----------------------------------------------------------------------------
528
| Normalizes the subnormal extended double-precision floating-point value
529
| represented by the denormalized significand `aSig'.  The normalized exponent
530
| and significand are stored at the locations pointed to by `zExpPtr' and
531
| `zSigPtr', respectively.
532
*----------------------------------------------------------------------------*/
533
 
534
static void
535
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
536
{
537
    int8 shiftCount;
538
 
539
    shiftCount = countLeadingZeros64( aSig );
540
    *zSigPtr = aSig<<shiftCount;
541
    *zExpPtr = 1 - shiftCount;
542
 
543
}
544
 
545
/*----------------------------------------------------------------------------
546
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
547
| extended double-precision floating-point value, returning the result.
548
*----------------------------------------------------------------------------*/
549
 
550
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
551
{
552
    floatx80 z;
553
 
554
    z.low = zSig;
555
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
556
    return z;
557
 
558
}
559
 
560
/*----------------------------------------------------------------------------
561
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
562
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
563
| and returns the proper extended double-precision floating-point value
564
| corresponding to the abstract input.  Ordinarily, the abstract value is
565
| rounded and packed into the extended double-precision format, with the
566
| inexact exception raised if the abstract input cannot be represented
567
| exactly.  However, if the abstract value is too large, the overflow and
568
| inexact exceptions are raised and an infinity or maximal finite value is
569
| returned.  If the abstract value is too small, the input value is rounded to
570
| a subnormal number, and the underflow and inexact exceptions are raised if
571
| the abstract input cannot be represented exactly as a subnormal extended
572
| double-precision floating-point number.
573
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
574
| number of bits as single or double precision, respectively.  Otherwise, the
575
| result is rounded to the full precision of the extended double-precision
576
| format.
577
|     The input significand must be normalized or smaller.  If the input
578
| significand is not normalized, `zExp' must be 0; in that case, the result
579
| returned is a subnormal number, and it must not require rounding.  The
580
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
581
| Floating-Point Arithmetic.
582
*----------------------------------------------------------------------------*/
583
 
584
static floatx80
585
 roundAndPackFloatx80(
586
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
587
 )
588
{
589
    int8 roundingMode;
590
    flag roundNearestEven, increment, isTiny;
591
    int64 roundIncrement, roundMask, roundBits;
592
 
593
    roundingMode = float_rounding_mode;
594
    roundNearestEven = ( roundingMode == float_round_nearest_even );
595
    if ( roundingPrecision == 80 ) goto precision80;
596
    if ( roundingPrecision == 64 ) {
597
        roundIncrement = LIT64( 0x0000000000000400 );
598
        roundMask = LIT64( 0x00000000000007FF );
599
    }
600
    else if ( roundingPrecision == 32 ) {
601
        roundIncrement = LIT64( 0x0000008000000000 );
602
        roundMask = LIT64( 0x000000FFFFFFFFFF );
603
    }
604
    else {
605
        goto precision80;
606
    }
607
    zSig0 |= ( zSig1 != 0 );
608
    if ( ! roundNearestEven ) {
609
        if ( roundingMode == float_round_to_zero ) {
610
            roundIncrement = 0;
611
        }
612
        else {
613
            roundIncrement = roundMask;
614
            if ( zSign ) {
615
                if ( roundingMode == float_round_up ) roundIncrement = 0;
616
            }
617
            else {
618
                if ( roundingMode == float_round_down ) roundIncrement = 0;
619
            }
620
        }
621
    }
622
    roundBits = zSig0 & roundMask;
623
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
624
        if (    ( 0x7FFE < zExp )
625
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
626
           ) {
627
            goto overflow;
628
        }
629
        if ( zExp <= 0 ) {
630
            isTiny =
631
                   ( float_detect_tininess == float_tininess_before_rounding )
632
                || ( zExp < 0 )
633
                || ( zSig0 <= zSig0 + roundIncrement );
634
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
635
            zExp = 0;
636
            roundBits = zSig0 & roundMask;
637
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
638
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
639
            zSig0 += roundIncrement;
640
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
641
            roundIncrement = roundMask + 1;
642
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
643
                roundMask |= roundIncrement;
644
            }
645
            zSig0 &= ~ roundMask;
646
            return packFloatx80( zSign, zExp, zSig0 );
647
        }
648
    }
649
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
650
    zSig0 += roundIncrement;
651
    if ( zSig0 < roundIncrement ) {
652
        ++zExp;
653
        zSig0 = LIT64( 0x8000000000000000 );
654
    }
655
    roundIncrement = roundMask + 1;
656
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
657
        roundMask |= roundIncrement;
658
    }
659
    zSig0 &= ~ roundMask;
660
    if ( zSig0 == 0 ) zExp = 0;
661
    return packFloatx80( zSign, zExp, zSig0 );
662
 precision80:
663
    increment = ( (sbits64) zSig1 < 0 );
664
    if ( ! roundNearestEven ) {
665
        if ( roundingMode == float_round_to_zero ) {
666
            increment = 0;
667
        }
668
        else {
669
            if ( zSign ) {
670
                increment = ( roundingMode == float_round_down ) && zSig1;
671
            }
672
            else {
673
                increment = ( roundingMode == float_round_up ) && zSig1;
674
            }
675
        }
676
    }
677
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
678
        if (    ( 0x7FFE < zExp )
679
             || (    ( zExp == 0x7FFE )
680
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
681
                  && increment
682
                )
683
           ) {
684
            roundMask = 0;
685
 overflow:
686
            float_raise( float_flag_overflow | float_flag_inexact );
687
            if (    ( roundingMode == float_round_to_zero )
688
                 || ( zSign && ( roundingMode == float_round_up ) )
689
                 || ( ! zSign && ( roundingMode == float_round_down ) )
690
               ) {
691
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
692
            }
693
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
694
        }
695
        if ( zExp <= 0 ) {
696
            isTiny =
697
                   ( float_detect_tininess == float_tininess_before_rounding )
698
                || ( zExp < 0 )
699
                || ! increment
700
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
701
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
702
            zExp = 0;
703
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
704
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
705
            if ( roundNearestEven ) {
706
                increment = ( (sbits64) zSig1 < 0 );
707
            }
708
            else {
709
                if ( zSign ) {
710
                    increment = ( roundingMode == float_round_down ) && zSig1;
711
                }
712
                else {
713
                    increment = ( roundingMode == float_round_up ) && zSig1;
714
                }
715
            }
716
            if ( increment ) {
717
                ++zSig0;
718
                zSig0 &=
719
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
720
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
721
            }
722
            return packFloatx80( zSign, zExp, zSig0 );
723
        }
724
    }
725
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
726
    if ( increment ) {
727
        ++zSig0;
728
        if ( zSig0 == 0 ) {
729
            ++zExp;
730
            zSig0 = LIT64( 0x8000000000000000 );
731
        }
732
        else {
733
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
734
        }
735
    }
736
    else {
737
        if ( zSig0 == 0 ) zExp = 0;
738
    }
739
    return packFloatx80( zSign, zExp, zSig0 );
740
 
741
}
742
 
743
/*----------------------------------------------------------------------------
744
| Takes an abstract floating-point value having sign `zSign', exponent
745
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
746
| and returns the proper extended double-precision floating-point value
747
| corresponding to the abstract input.  This routine is just like
748
| `roundAndPackFloatx80' except that the input significand does not have to be
749
| normalized.
750
*----------------------------------------------------------------------------*/
751
 
752
static floatx80
753
 normalizeRoundAndPackFloatx80(
754
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
755
 )
756
{
757
    int8 shiftCount;
758
 
759
    if ( zSig0 == 0 ) {
760
        zSig0 = zSig1;
761
        zSig1 = 0;
762
        zExp -= 64;
763
    }
764
    shiftCount = countLeadingZeros64( zSig0 );
765
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
766
    zExp -= shiftCount;
767
    return
768
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
769
 
770
}
771
 
772
#endif
773
 
774
#ifdef FLOAT128
775
 
776
/*----------------------------------------------------------------------------
777
| Returns the least-significant 64 fraction bits of the quadruple-precision
778
| floating-point value `a'.
779
*----------------------------------------------------------------------------*/
780
 
781
INLINE bits64 extractFloat128Frac1( float128 a )
782
{
783
 
784
    return a.low;
785
 
786
}
787
 
788
/*----------------------------------------------------------------------------
789
| Returns the most-significant 48 fraction bits of the quadruple-precision
790
| floating-point value `a'.
791
*----------------------------------------------------------------------------*/
792
 
793
INLINE bits64 extractFloat128Frac0( float128 a )
794
{
795
 
796
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
797
 
798
}
799
 
800
/*----------------------------------------------------------------------------
801
| Returns the exponent bits of the quadruple-precision floating-point value
802
| `a'.
803
*----------------------------------------------------------------------------*/
804
 
805
INLINE int32 extractFloat128Exp( float128 a )
806
{
807
 
808
    return ( a.high>>48 ) & 0x7FFF;
809
 
810
}
811
 
812
/*----------------------------------------------------------------------------
813
| Returns the sign bit of the quadruple-precision floating-point value `a'.
814
*----------------------------------------------------------------------------*/
815
 
816
INLINE flag extractFloat128Sign( float128 a )
817
{
818
 
819
    return a.high>>63;
820
 
821
}
822
 
823
/*----------------------------------------------------------------------------
824
| Normalizes the subnormal quadruple-precision floating-point value
825
| represented by the denormalized significand formed by the concatenation of
826
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
827
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
828
| significand are stored at the location pointed to by `zSig0Ptr', and the
829
| least significant 64 bits of the normalized significand are stored at the
830
| location pointed to by `zSig1Ptr'.
831
*----------------------------------------------------------------------------*/
832
 
833
static void
834
 normalizeFloat128Subnormal(
835
     bits64 aSig0,
836
     bits64 aSig1,
837
     int32 *zExpPtr,
838
     bits64 *zSig0Ptr,
839
     bits64 *zSig1Ptr
840
 )
841
{
842
    int8 shiftCount;
843
 
844
    if ( aSig0 == 0 ) {
845
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
846
        if ( shiftCount < 0 ) {
847
            *zSig0Ptr = aSig1>>( - shiftCount );
848
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
849
        }
850
        else {
851
            *zSig0Ptr = aSig1<<shiftCount;
852
            *zSig1Ptr = 0;
853
        }
854
        *zExpPtr = - shiftCount - 63;
855
    }
856
    else {
857
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
858
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
859
        *zExpPtr = 1 - shiftCount;
860
    }
861
 
862
}
863
 
864
/*----------------------------------------------------------------------------
865
| Packs the sign `zSign', the exponent `zExp', and the significand formed
866
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
867
| floating-point value, returning the result.  After being shifted into the
868
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
869
| added together to form the most significant 32 bits of the result.  This
870
| means that any integer portion of `zSig0' will be added into the exponent.
871
| Since a properly normalized significand will have an integer portion equal
872
| to 1, the `zExp' input should be 1 less than the desired result exponent
873
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
874
| significand.
875
*----------------------------------------------------------------------------*/
876
 
877
INLINE float128
878
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
879
{
880
    float128 z;
881
 
882
    z.low = zSig1;
883
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
884
    return z;
885
 
886
}
887
 
888
/*----------------------------------------------------------------------------
889
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
890
| and extended significand formed by the concatenation of `zSig0', `zSig1',
891
| and `zSig2', and returns the proper quadruple-precision floating-point value
892
| corresponding to the abstract input.  Ordinarily, the abstract value is
893
| simply rounded and packed into the quadruple-precision format, with the
894
| inexact exception raised if the abstract input cannot be represented
895
| exactly.  However, if the abstract value is too large, the overflow and
896
| inexact exceptions are raised and an infinity or maximal finite value is
897
| returned.  If the abstract value is too small, the input value is rounded to
898
| a subnormal number, and the underflow and inexact exceptions are raised if
899
| the abstract input cannot be represented exactly as a subnormal quadruple-
900
| precision floating-point number.
901
|     The input significand must be normalized or smaller.  If the input
902
| significand is not normalized, `zExp' must be 0; in that case, the result
903
| returned is a subnormal number, and it must not require rounding.  In the
904
| usual case that the input significand is normalized, `zExp' must be 1 less
905
| than the ``true'' floating-point exponent.  The handling of underflow and
906
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
907
*----------------------------------------------------------------------------*/
908
 
909
static float128
910
 roundAndPackFloat128(
911
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
912
{
913
    int8 roundingMode;
914
    flag roundNearestEven, increment, isTiny;
915
 
916
    roundingMode = float_rounding_mode;
917
    roundNearestEven = ( roundingMode == float_round_nearest_even );
918
    increment = ( (sbits64) zSig2 < 0 );
919
    if ( ! roundNearestEven ) {
920
        if ( roundingMode == float_round_to_zero ) {
921
            increment = 0;
922
        }
923
        else {
924
            if ( zSign ) {
925
                increment = ( roundingMode == float_round_down ) && zSig2;
926
            }
927
            else {
928
                increment = ( roundingMode == float_round_up ) && zSig2;
929
            }
930
        }
931
    }
932
    if ( 0x7FFD <= (bits32) zExp ) {
933
        if (    ( 0x7FFD < zExp )
934
             || (    ( zExp == 0x7FFD )
935
                  && eq128(
936
                         LIT64( 0x0001FFFFFFFFFFFF ),
937
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
938
                         zSig0,
939
                         zSig1
940
                     )
941
                  && increment
942
                )
943
           ) {
944
            float_raise( float_flag_overflow | float_flag_inexact );
945
            if (    ( roundingMode == float_round_to_zero )
946
                 || ( zSign && ( roundingMode == float_round_up ) )
947
                 || ( ! zSign && ( roundingMode == float_round_down ) )
948
               ) {
949
                return
950
                    packFloat128(
951
                        zSign,
952
                        0x7FFE,
953
                        LIT64( 0x0000FFFFFFFFFFFF ),
954
                        LIT64( 0xFFFFFFFFFFFFFFFF )
955
                    );
956
            }
957
            return packFloat128( zSign, 0x7FFF, 0, 0 );
958
        }
959
        if ( zExp < 0 ) {
960
            isTiny =
961
                   ( float_detect_tininess == float_tininess_before_rounding )
962
                || ( zExp < -1 )
963
                || ! increment
964
                || lt128(
965
                       zSig0,
966
                       zSig1,
967
                       LIT64( 0x0001FFFFFFFFFFFF ),
968
                       LIT64( 0xFFFFFFFFFFFFFFFF )
969
                   );
970
            shift128ExtraRightJamming(
971
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
972
            zExp = 0;
973
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
974
            if ( roundNearestEven ) {
975
                increment = ( (sbits64) zSig2 < 0 );
976
            }
977
            else {
978
                if ( zSign ) {
979
                    increment = ( roundingMode == float_round_down ) && zSig2;
980
                }
981
                else {
982
                    increment = ( roundingMode == float_round_up ) && zSig2;
983
                }
984
            }
985
        }
986
    }
987
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
988
    if ( increment ) {
989
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
990
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
991
    }
992
    else {
993
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
994
    }
995
    return packFloat128( zSign, zExp, zSig0, zSig1 );
996
 
997
}
998
 
999
/*----------------------------------------------------------------------------
1000
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1001
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1002
| returns the proper quadruple-precision floating-point value corresponding
1003
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1004
| except that the input significand has fewer bits and does not have to be
1005
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1006
| point exponent.
1007
*----------------------------------------------------------------------------*/
1008
 
1009
static float128
1010
 normalizeRoundAndPackFloat128(
1011
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1012
{
1013
    int8 shiftCount;
1014
    bits64 zSig2;
1015
 
1016
    if ( zSig0 == 0 ) {
1017
        zSig0 = zSig1;
1018
        zSig1 = 0;
1019
        zExp -= 64;
1020
    }
1021
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1022
    if ( 0 <= shiftCount ) {
1023
        zSig2 = 0;
1024
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1025
    }
1026
    else {
1027
        shift128ExtraRightJamming(
1028
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1029
    }
1030
    zExp -= shiftCount;
1031
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1032
 
1033
}
1034
 
1035
#endif
1036
 
1037
/*----------------------------------------------------------------------------
1038
| Returns the result of converting the 32-bit two's complement integer `a'
1039
| to the single-precision floating-point format.  The conversion is performed
1040
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1041
*----------------------------------------------------------------------------*/
1042
 
1043
float32 int32_to_float32( int32 a )
1044
{
1045
    flag zSign;
1046
 
1047
    if ( a == 0 ) return 0;
1048
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1049
    zSign = ( a < 0 );
1050
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1051
 
1052
}
1053
 
1054
/*----------------------------------------------------------------------------
1055
| Returns the result of converting the 32-bit two's complement integer `a'
1056
| to the double-precision floating-point format.  The conversion is performed
1057
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1058
*----------------------------------------------------------------------------*/
1059
 
1060
float64 int32_to_float64( int32 a )
1061
{
1062
    flag zSign;
1063
    uint32 absA;
1064
    int8 shiftCount;
1065
    bits64 zSig;
1066
 
1067
    if ( a == 0 ) return 0;
1068
    zSign = ( a < 0 );
1069
    absA = zSign ? - a : a;
1070
    shiftCount = countLeadingZeros32( absA ) + 21;
1071
    zSig = absA;
1072
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1073
 
1074
}
1075
 
1076
#ifdef FLOATX80
1077
 
1078
/*----------------------------------------------------------------------------
1079
| Returns the result of converting the 32-bit two's complement integer `a'
1080
| to the extended double-precision floating-point format.  The conversion
1081
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1082
| Arithmetic.
1083
*----------------------------------------------------------------------------*/
1084
 
1085
floatx80 int32_to_floatx80( int32 a )
1086
{
1087
    flag zSign;
1088
    uint32 absA;
1089
    int8 shiftCount;
1090
    bits64 zSig;
1091
 
1092
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1093
    zSign = ( a < 0 );
1094
    absA = zSign ? - a : a;
1095
    shiftCount = countLeadingZeros32( absA ) + 32;
1096
    zSig = absA;
1097
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1098
 
1099
}
1100
 
1101
#endif
1102
 
1103
#ifdef FLOAT128
1104
 
1105
/*----------------------------------------------------------------------------
1106
| Returns the result of converting the 32-bit two's complement integer `a' to
1107
| the quadruple-precision floating-point format.  The conversion is performed
1108
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1109
*----------------------------------------------------------------------------*/
1110
 
1111
float128 int32_to_float128( int32 a )
1112
{
1113
    flag zSign;
1114
    uint32 absA;
1115
    int8 shiftCount;
1116
    bits64 zSig0;
1117
 
1118
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1119
    zSign = ( a < 0 );
1120
    absA = zSign ? - a : a;
1121
    shiftCount = countLeadingZeros32( absA ) + 17;
1122
    zSig0 = absA;
1123
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1124
 
1125
}
1126
 
1127
#endif
1128
 
1129
/*----------------------------------------------------------------------------
1130
| Returns the result of converting the 64-bit two's complement integer `a'
1131
| to the single-precision floating-point format.  The conversion is performed
1132
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1133
*----------------------------------------------------------------------------*/
1134
 
1135
float32 int64_to_float32( int64 a )
1136
{
1137
    flag zSign;
1138
    uint64 absA;
1139
    int8 shiftCount;
1140
    bits32 zSig;
1141
 
1142
    if ( a == 0 ) return 0;
1143
    zSign = ( a < 0 );
1144
    absA = zSign ? - a : a;
1145
    shiftCount = countLeadingZeros64( absA ) - 40;
1146
    if ( 0 <= shiftCount ) {
1147
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1148
    }
1149
    else {
1150
        shiftCount += 7;
1151
        if ( shiftCount < 0 ) {
1152
            shift64RightJamming( absA, - shiftCount, &absA );
1153
        }
1154
        else {
1155
            absA <<= shiftCount;
1156
        }
1157
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1158
    }
1159
 
1160
}
1161
 
1162
/*----------------------------------------------------------------------------
1163
| Returns the result of converting the 64-bit two's complement integer `a'
1164
| to the double-precision floating-point format.  The conversion is performed
1165
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166
*----------------------------------------------------------------------------*/
1167
 
1168
float64 int64_to_float64( int64 a )
1169
{
1170
    flag zSign;
1171
 
1172
    if ( a == 0 ) return 0;
1173
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1174
        return packFloat64( 1, 0x43E, 0 );
1175
    }
1176
    zSign = ( a < 0 );
1177
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1178
 
1179
}
1180
 
1181
#ifdef FLOATX80
1182
 
1183
/*----------------------------------------------------------------------------
1184
| Returns the result of converting the 64-bit two's complement integer `a'
1185
| to the extended double-precision floating-point format.  The conversion
1186
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187
| Arithmetic.
1188
*----------------------------------------------------------------------------*/
1189
 
1190
floatx80 int64_to_floatx80( int64 a )
1191
{
1192
    flag zSign;
1193
    uint64 absA;
1194
    int8 shiftCount;
1195
 
1196
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1197
    zSign = ( a < 0 );
1198
    absA = zSign ? - a : a;
1199
    shiftCount = countLeadingZeros64( absA );
1200
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1201
 
1202
}
1203
 
1204
#endif
1205
 
1206
#ifdef FLOAT128
1207
 
1208
/*----------------------------------------------------------------------------
1209
| Returns the result of converting the 64-bit two's complement integer `a' to
1210
| the quadruple-precision floating-point format.  The conversion is performed
1211
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212
*----------------------------------------------------------------------------*/
1213
 
1214
float128 int64_to_float128( int64 a )
1215
{
1216
    flag zSign;
1217
    uint64 absA;
1218
    int8 shiftCount;
1219
    int32 zExp;
1220
    bits64 zSig0, zSig1;
1221
 
1222
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1223
    zSign = ( a < 0 );
1224
    absA = zSign ? - a : a;
1225
    shiftCount = countLeadingZeros64( absA ) + 49;
1226
    zExp = 0x406E - shiftCount;
1227
    if ( 64 <= shiftCount ) {
1228
        zSig1 = 0;
1229
        zSig0 = absA;
1230
        shiftCount -= 64;
1231
    }
1232
    else {
1233
        zSig1 = absA;
1234
        zSig0 = 0;
1235
    }
1236
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1237
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1238
 
1239
}
1240
 
1241
#endif
1242
 
1243
/*----------------------------------------------------------------------------
1244
| Returns the result of converting the single-precision floating-point value
1245
| `a' to the 32-bit two's complement integer format.  The conversion is
1246
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1247
| Arithmetic---which means in particular that the conversion is rounded
1248
| according to the current rounding mode.  If `a' is a NaN, the largest
1249
| positive integer is returned.  Otherwise, if the conversion overflows, the
1250
| largest integer with the same sign as `a' is returned.
1251
*----------------------------------------------------------------------------*/
1252
 
1253
int32 float32_to_int32( float32 a )
1254
{
1255
    flag aSign;
1256
    int16 aExp, shiftCount;
1257
    bits32 aSig;
1258
    bits64 aSig64;
1259
 
1260
    aSig = extractFloat32Frac( a );
1261
    aExp = extractFloat32Exp( a );
1262
    aSign = extractFloat32Sign( a );
1263
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1264
    if ( aExp ) aSig |= 0x00800000;
1265
    shiftCount = 0xAF - aExp;
1266
    aSig64 = aSig;
1267
    aSig64 <<= 32;
1268
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1269
    return roundAndPackInt32( aSign, aSig64 );
1270
 
1271
}
1272
 
1273
/*----------------------------------------------------------------------------
1274
| Returns the result of converting the single-precision floating-point value
1275
| `a' to the 32-bit two's complement integer format.  The conversion is
1276
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1277
| Arithmetic, except that the conversion is always rounded toward zero.
1278
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1279
| the conversion overflows, the largest integer with the same sign as `a' is
1280
| returned.
1281
*----------------------------------------------------------------------------*/
1282
 
1283
int32 float32_to_int32_round_to_zero( float32 a )
1284
{
1285
    flag aSign;
1286
    int16 aExp, shiftCount;
1287
    bits32 aSig;
1288
    int32 z;
1289
 
1290
    aSig = extractFloat32Frac( a );
1291
    aExp = extractFloat32Exp( a );
1292
    aSign = extractFloat32Sign( a );
1293
    shiftCount = aExp - 0x9E;
1294
    if ( 0 <= shiftCount ) {
1295
        if ( a != 0xCF000000 ) {
1296
            float_raise( float_flag_invalid );
1297
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1298
        }
1299
        return (sbits32) 0x80000000;
1300
    }
1301
    else if ( aExp <= 0x7E ) {
1302
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1303
        return 0;
1304
    }
1305
    aSig = ( aSig | 0x00800000 )<<8;
1306
    z = aSig>>( - shiftCount );
1307
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1308
        float_exception_flags |= float_flag_inexact;
1309
    }
1310
    if ( aSign ) z = - z;
1311
    return z;
1312
 
1313
}
1314
 
1315
/*----------------------------------------------------------------------------
1316
| Returns the result of converting the single-precision floating-point value
1317
| `a' to the 64-bit two's complement integer format.  The conversion is
1318
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1319
| Arithmetic---which means in particular that the conversion is rounded
1320
| according to the current rounding mode.  If `a' is a NaN, the largest
1321
| positive integer is returned.  Otherwise, if the conversion overflows, the
1322
| largest integer with the same sign as `a' is returned.
1323
*----------------------------------------------------------------------------*/
1324
 
1325
int64 float32_to_int64( float32 a )
1326
{
1327
    flag aSign;
1328
    int16 aExp, shiftCount;
1329
    bits32 aSig;
1330
    bits64 aSig64, aSigExtra;
1331
 
1332
    aSig = extractFloat32Frac( a );
1333
    aExp = extractFloat32Exp( a );
1334
    aSign = extractFloat32Sign( a );
1335
    shiftCount = 0xBE - aExp;
1336
    if ( shiftCount < 0 ) {
1337
        float_raise( float_flag_invalid );
1338
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1339
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1340
        }
1341
        return (sbits64) LIT64( 0x8000000000000000 );
1342
    }
1343
    if ( aExp ) aSig |= 0x00800000;
1344
    aSig64 = aSig;
1345
    aSig64 <<= 40;
1346
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1347
    return roundAndPackInt64( aSign, aSig64, aSigExtra );
1348
 
1349
}
1350
 
1351
/*----------------------------------------------------------------------------
1352
| Returns the result of converting the single-precision floating-point value
1353
| `a' to the 64-bit two's complement integer format.  The conversion is
1354
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1355
| Arithmetic, except that the conversion is always rounded toward zero.  If
1356
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1357
| conversion overflows, the largest integer with the same sign as `a' is
1358
| returned.
1359
*----------------------------------------------------------------------------*/
1360
 
1361
int64 float32_to_int64_round_to_zero( float32 a )
1362
{
1363
    flag aSign;
1364
    int16 aExp, shiftCount;
1365
    bits32 aSig;
1366
    bits64 aSig64;
1367
    int64 z;
1368
 
1369
    aSig = extractFloat32Frac( a );
1370
    aExp = extractFloat32Exp( a );
1371
    aSign = extractFloat32Sign( a );
1372
    shiftCount = aExp - 0xBE;
1373
    if ( 0 <= shiftCount ) {
1374
        if ( a != 0xDF000000 ) {
1375
            float_raise( float_flag_invalid );
1376
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1377
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1378
            }
1379
        }
1380
        return (sbits64) LIT64( 0x8000000000000000 );
1381
    }
1382
    else if ( aExp <= 0x7E ) {
1383
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1384
        return 0;
1385
    }
1386
    aSig64 = aSig | 0x00800000;
1387
    aSig64 <<= 40;
1388
    z = aSig64>>( - shiftCount );
1389
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1390
        float_exception_flags |= float_flag_inexact;
1391
    }
1392
    if ( aSign ) z = - z;
1393
    return z;
1394
 
1395
}
1396
 
1397
/*----------------------------------------------------------------------------
1398
| Returns the result of converting the single-precision floating-point value
1399
| `a' to the double-precision floating-point format.  The conversion is
1400
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1401
| Arithmetic.
1402
*----------------------------------------------------------------------------*/
1403
 
1404
float64 float32_to_float64( float32 a )
1405
{
1406
    flag aSign;
1407
    int16 aExp;
1408
    bits32 aSig;
1409
 
1410
    aSig = extractFloat32Frac( a );
1411
    aExp = extractFloat32Exp( a );
1412
    aSign = extractFloat32Sign( a );
1413
    if ( aExp == 0xFF ) {
1414
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1415
        return packFloat64( aSign, 0x7FF, 0 );
1416
    }
1417
    if ( aExp == 0 ) {
1418
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1419
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1420
        --aExp;
1421
    }
1422
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1423
 
1424
}
1425
 
1426
#ifdef FLOATX80
1427
 
1428
/*----------------------------------------------------------------------------
1429
| Returns the result of converting the single-precision floating-point value
1430
| `a' to the extended double-precision floating-point format.  The conversion
1431
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1432
| Arithmetic.
1433
*----------------------------------------------------------------------------*/
1434
 
1435
floatx80 float32_to_floatx80( float32 a )
1436
{
1437
    flag aSign;
1438
    int16 aExp;
1439
    bits32 aSig;
1440
 
1441
    aSig = extractFloat32Frac( a );
1442
    aExp = extractFloat32Exp( a );
1443
    aSign = extractFloat32Sign( a );
1444
    if ( aExp == 0xFF ) {
1445
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1446
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1447
    }
1448
    if ( aExp == 0 ) {
1449
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1450
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1451
    }
1452
    aSig |= 0x00800000;
1453
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1454
 
1455
}
1456
 
1457
#endif
1458
 
1459
#ifdef FLOAT128
1460
 
1461
/*----------------------------------------------------------------------------
1462
| Returns the result of converting the single-precision floating-point value
1463
| `a' to the double-precision floating-point format.  The conversion is
1464
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1465
| Arithmetic.
1466
*----------------------------------------------------------------------------*/
1467
 
1468
float128 float32_to_float128( float32 a )
1469
{
1470
    flag aSign;
1471
    int16 aExp;
1472
    bits32 aSig;
1473
 
1474
    aSig = extractFloat32Frac( a );
1475
    aExp = extractFloat32Exp( a );
1476
    aSign = extractFloat32Sign( a );
1477
    if ( aExp == 0xFF ) {
1478
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1479
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1480
    }
1481
    if ( aExp == 0 ) {
1482
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1483
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484
        --aExp;
1485
    }
1486
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1487
 
1488
}
1489
 
1490
#endif
1491
 
1492
/*----------------------------------------------------------------------------
1493
| Rounds the single-precision floating-point value `a' to an integer, and
1494
| returns the result as a single-precision floating-point value.  The
1495
| operation is performed according to the IEC/IEEE Standard for Binary
1496
| Floating-Point Arithmetic.
1497
*----------------------------------------------------------------------------*/
1498
 
1499
float32 float32_round_to_int( float32 a )
1500
{
1501
    flag aSign;
1502
    int16 aExp;
1503
    bits32 lastBitMask, roundBitsMask;
1504
    int8 roundingMode;
1505
    float32 z;
1506
 
1507
    aExp = extractFloat32Exp( a );
1508
    if ( 0x96 <= aExp ) {
1509
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1510
            return propagateFloat32NaN( a, a );
1511
        }
1512
        return a;
1513
    }
1514
    if ( aExp <= 0x7E ) {
1515
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1516
        float_exception_flags |= float_flag_inexact;
1517
        aSign = extractFloat32Sign( a );
1518
        switch ( float_rounding_mode ) {
1519
         case float_round_nearest_even:
1520
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1521
                return packFloat32( aSign, 0x7F, 0 );
1522
            }
1523
            break;
1524
         case float_round_down:
1525
            return aSign ? 0xBF800000 : 0;
1526
         case float_round_up:
1527
            return aSign ? 0x80000000 : 0x3F800000;
1528
        }
1529
        return packFloat32( aSign, 0, 0 );
1530
    }
1531
    lastBitMask = 1;
1532
    lastBitMask <<= 0x96 - aExp;
1533
    roundBitsMask = lastBitMask - 1;
1534
    z = a;
1535
    roundingMode = float_rounding_mode;
1536
    if ( roundingMode == float_round_nearest_even ) {
1537
        z += lastBitMask>>1;
1538
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1539
    }
1540
    else if ( roundingMode != float_round_to_zero ) {
1541
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1542
            z += roundBitsMask;
1543
        }
1544
    }
1545
    z &= ~ roundBitsMask;
1546
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1547
    return z;
1548
 
1549
}
1550
 
1551
/*----------------------------------------------------------------------------
1552
| Returns the result of adding the absolute values of the single-precision
1553
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1554
| before being returned.  `zSign' is ignored if the result is a NaN.
1555
| The addition is performed according to the IEC/IEEE Standard for Binary
1556
| Floating-Point Arithmetic.
1557
*----------------------------------------------------------------------------*/
1558
 
1559
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1560
{
1561
    int16 aExp, bExp, zExp;
1562
    bits32 aSig, bSig, zSig;
1563
    int16 expDiff;
1564
 
1565
    aSig = extractFloat32Frac( a );
1566
    aExp = extractFloat32Exp( a );
1567
    bSig = extractFloat32Frac( b );
1568
    bExp = extractFloat32Exp( b );
1569
    expDiff = aExp - bExp;
1570
    aSig <<= 6;
1571
    bSig <<= 6;
1572
    if ( 0 < expDiff ) {
1573
        if ( aExp == 0xFF ) {
1574
            if ( aSig ) return propagateFloat32NaN( a, b );
1575
            return a;
1576
        }
1577
        if ( bExp == 0 ) {
1578
            --expDiff;
1579
        }
1580
        else {
1581
            bSig |= 0x20000000;
1582
        }
1583
        shift32RightJamming( bSig, expDiff, &bSig );
1584
        zExp = aExp;
1585
    }
1586
    else if ( expDiff < 0 ) {
1587
        if ( bExp == 0xFF ) {
1588
            if ( bSig ) return propagateFloat32NaN( a, b );
1589
            return packFloat32( zSign, 0xFF, 0 );
1590
        }
1591
        if ( aExp == 0 ) {
1592
            ++expDiff;
1593
        }
1594
        else {
1595
            aSig |= 0x20000000;
1596
        }
1597
        shift32RightJamming( aSig, - expDiff, &aSig );
1598
        zExp = bExp;
1599
    }
1600
    else {
1601
        if ( aExp == 0xFF ) {
1602
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1603
            return a;
1604
        }
1605
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1606
        zSig = 0x40000000 + aSig + bSig;
1607
        zExp = aExp;
1608
        goto roundAndPack;
1609
    }
1610
    aSig |= 0x20000000;
1611
    zSig = ( aSig + bSig )<<1;
1612
    --zExp;
1613
    if ( (sbits32) zSig < 0 ) {
1614
        zSig = aSig + bSig;
1615
        ++zExp;
1616
    }
1617
 roundAndPack:
1618
    return roundAndPackFloat32( zSign, zExp, zSig );
1619
 
1620
}
1621
 
1622
/*----------------------------------------------------------------------------
1623
| Returns the result of subtracting the absolute values of the single-
1624
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1625
| difference is negated before being returned.  `zSign' is ignored if the
1626
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1627
| Standard for Binary Floating-Point Arithmetic.
1628
*----------------------------------------------------------------------------*/
1629
 
1630
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1631
{
1632
    int16 aExp, bExp, zExp;
1633
    bits32 aSig, bSig, zSig;
1634
    int16 expDiff;
1635
 
1636
    aSig = extractFloat32Frac( a );
1637
    aExp = extractFloat32Exp( a );
1638
    bSig = extractFloat32Frac( b );
1639
    bExp = extractFloat32Exp( b );
1640
    expDiff = aExp - bExp;
1641
    aSig <<= 7;
1642
    bSig <<= 7;
1643
    if ( 0 < expDiff ) goto aExpBigger;
1644
    if ( expDiff < 0 ) goto bExpBigger;
1645
    if ( aExp == 0xFF ) {
1646
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1647
        float_raise( float_flag_invalid );
1648
        return float32_default_nan;
1649
    }
1650
    if ( aExp == 0 ) {
1651
        aExp = 1;
1652
        bExp = 1;
1653
    }
1654
    if ( bSig < aSig ) goto aBigger;
1655
    if ( aSig < bSig ) goto bBigger;
1656
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1657
 bExpBigger:
1658
    if ( bExp == 0xFF ) {
1659
        if ( bSig ) return propagateFloat32NaN( a, b );
1660
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1661
    }
1662
    if ( aExp == 0 ) {
1663
        ++expDiff;
1664
    }
1665
    else {
1666
        aSig |= 0x40000000;
1667
    }
1668
    shift32RightJamming( aSig, - expDiff, &aSig );
1669
    bSig |= 0x40000000;
1670
 bBigger:
1671
    zSig = bSig - aSig;
1672
    zExp = bExp;
1673
    zSign ^= 1;
1674
    goto normalizeRoundAndPack;
1675
 aExpBigger:
1676
    if ( aExp == 0xFF ) {
1677
        if ( aSig ) return propagateFloat32NaN( a, b );
1678
        return a;
1679
    }
1680
    if ( bExp == 0 ) {
1681
        --expDiff;
1682
    }
1683
    else {
1684
        bSig |= 0x40000000;
1685
    }
1686
    shift32RightJamming( bSig, expDiff, &bSig );
1687
    aSig |= 0x40000000;
1688
 aBigger:
1689
    zSig = aSig - bSig;
1690
    zExp = aExp;
1691
 normalizeRoundAndPack:
1692
    --zExp;
1693
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1694
 
1695
}
1696
 
1697
/*----------------------------------------------------------------------------
1698
| Returns the result of adding the single-precision floating-point values `a'
1699
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1700
| Binary Floating-Point Arithmetic.
1701
*----------------------------------------------------------------------------*/
1702
 
1703
float32 float32_add( float32 a, float32 b )
1704
{
1705
    flag aSign, bSign;
1706
 
1707
    aSign = extractFloat32Sign( a );
1708
    bSign = extractFloat32Sign( b );
1709
    if ( aSign == bSign ) {
1710
        return addFloat32Sigs( a, b, aSign );
1711
    }
1712
    else {
1713
        return subFloat32Sigs( a, b, aSign );
1714
    }
1715
 
1716
}
1717
 
1718
/*----------------------------------------------------------------------------
1719
| Returns the result of subtracting the single-precision floating-point values
1720
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1721
| for Binary Floating-Point Arithmetic.
1722
*----------------------------------------------------------------------------*/
1723
 
1724
float32 float32_sub( float32 a, float32 b )
1725
{
1726
    flag aSign, bSign;
1727
 
1728
    aSign = extractFloat32Sign( a );
1729
    bSign = extractFloat32Sign( b );
1730
    if ( aSign == bSign ) {
1731
        return subFloat32Sigs( a, b, aSign );
1732
    }
1733
    else {
1734
        return addFloat32Sigs( a, b, aSign );
1735
    }
1736
 
1737
}
1738
 
1739
/*----------------------------------------------------------------------------
1740
| Returns the result of multiplying the single-precision floating-point values
1741
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1742
| for Binary Floating-Point Arithmetic.
1743
*----------------------------------------------------------------------------*/
1744
 
1745
float32 float32_mul( float32 a, float32 b )
1746
{
1747
    flag aSign, bSign, zSign;
1748
    int16 aExp, bExp, zExp;
1749
    bits32 aSig, bSig;
1750
    bits64 zSig64;
1751
    bits32 zSig;
1752
 
1753
    aSig = extractFloat32Frac( a );
1754
    aExp = extractFloat32Exp( a );
1755
    aSign = extractFloat32Sign( a );
1756
    bSig = extractFloat32Frac( b );
1757
    bExp = extractFloat32Exp( b );
1758
    bSign = extractFloat32Sign( b );
1759
    zSign = aSign ^ bSign;
1760
    if ( aExp == 0xFF ) {
1761
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1762
            return propagateFloat32NaN( a, b );
1763
        }
1764
        if ( ( bExp | bSig ) == 0 ) {
1765
            float_raise( float_flag_invalid );
1766
            return float32_default_nan;
1767
        }
1768
        return packFloat32( zSign, 0xFF, 0 );
1769
    }
1770
    if ( bExp == 0xFF ) {
1771
        if ( bSig ) return propagateFloat32NaN( a, b );
1772
        if ( ( aExp | aSig ) == 0 ) {
1773
            float_raise( float_flag_invalid );
1774
            return float32_default_nan;
1775
        }
1776
        return packFloat32( zSign, 0xFF, 0 );
1777
    }
1778
    if ( aExp == 0 ) {
1779
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1780
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1781
    }
1782
    if ( bExp == 0 ) {
1783
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1784
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1785
    }
1786
    zExp = aExp + bExp - 0x7F;
1787
    aSig = ( aSig | 0x00800000 )<<7;
1788
    bSig = ( bSig | 0x00800000 )<<8;
1789
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1790
    zSig = zSig64;
1791
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1792
        zSig <<= 1;
1793
        --zExp;
1794
    }
1795
    return roundAndPackFloat32( zSign, zExp, zSig );
1796
 
1797
}
1798
 
1799
/*----------------------------------------------------------------------------
1800
| Returns the result of dividing the single-precision floating-point value `a'
1801
| by the corresponding value `b'.  The operation is performed according to the
1802
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803
*----------------------------------------------------------------------------*/
1804
 
1805
float32 float32_div( float32 a, float32 b )
1806
{
1807
    flag aSign, bSign, zSign;
1808
    int16 aExp, bExp, zExp;
1809
    bits32 aSig, bSig, zSig;
1810
 
1811
    aSig = extractFloat32Frac( a );
1812
    aExp = extractFloat32Exp( a );
1813
    aSign = extractFloat32Sign( a );
1814
    bSig = extractFloat32Frac( b );
1815
    bExp = extractFloat32Exp( b );
1816
    bSign = extractFloat32Sign( b );
1817
    zSign = aSign ^ bSign;
1818
    if ( aExp == 0xFF ) {
1819
        if ( aSig ) return propagateFloat32NaN( a, b );
1820
        if ( bExp == 0xFF ) {
1821
            if ( bSig ) return propagateFloat32NaN( a, b );
1822
            float_raise( float_flag_invalid );
1823
            return float32_default_nan;
1824
        }
1825
        return packFloat32( zSign, 0xFF, 0 );
1826
    }
1827
    if ( bExp == 0xFF ) {
1828
        if ( bSig ) return propagateFloat32NaN( a, b );
1829
        return packFloat32( zSign, 0, 0 );
1830
    }
1831
    if ( bExp == 0 ) {
1832
        if ( bSig == 0 ) {
1833
            if ( ( aExp | aSig ) == 0 ) {
1834
                float_raise( float_flag_invalid );
1835
                return float32_default_nan;
1836
            }
1837
            float_raise( float_flag_divbyzero );
1838
            return packFloat32( zSign, 0xFF, 0 );
1839
        }
1840
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1841
    }
1842
    if ( aExp == 0 ) {
1843
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1844
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1845
    }
1846
    zExp = aExp - bExp + 0x7D;
1847
    aSig = ( aSig | 0x00800000 )<<7;
1848
    bSig = ( bSig | 0x00800000 )<<8;
1849
    if ( bSig <= ( aSig + aSig ) ) {
1850
        aSig >>= 1;
1851
        ++zExp;
1852
    }
1853
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1854
    if ( ( zSig & 0x3F ) == 0 ) {
1855
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1856
    }
1857
    return roundAndPackFloat32( zSign, zExp, zSig );
1858
 
1859
}
1860
 
1861
/*----------------------------------------------------------------------------
1862
| Returns the remainder of the single-precision floating-point value `a'
1863
| with respect to the corresponding value `b'.  The operation is performed
1864
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865
*----------------------------------------------------------------------------*/
1866
 
1867
float32 float32_rem( float32 a, float32 b )
1868
{
1869
    flag aSign, bSign, zSign;
1870
    int16 aExp, bExp, expDiff;
1871
    bits32 aSig, bSig;
1872
    bits32 q;
1873
    bits64 aSig64, bSig64, q64;
1874
    bits32 alternateASig;
1875
    sbits32 sigMean;
1876
 
1877
    aSig = extractFloat32Frac( a );
1878
    aExp = extractFloat32Exp( a );
1879
    aSign = extractFloat32Sign( a );
1880
    bSig = extractFloat32Frac( b );
1881
    bExp = extractFloat32Exp( b );
1882
    bSign = extractFloat32Sign( b );
1883
    if ( aExp == 0xFF ) {
1884
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1885
            return propagateFloat32NaN( a, b );
1886
        }
1887
        float_raise( float_flag_invalid );
1888
        return float32_default_nan;
1889
    }
1890
    if ( bExp == 0xFF ) {
1891
        if ( bSig ) return propagateFloat32NaN( a, b );
1892
        return a;
1893
    }
1894
    if ( bExp == 0 ) {
1895
        if ( bSig == 0 ) {
1896
            float_raise( float_flag_invalid );
1897
            return float32_default_nan;
1898
        }
1899
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1900
    }
1901
    if ( aExp == 0 ) {
1902
        if ( aSig == 0 ) return a;
1903
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1904
    }
1905
    expDiff = aExp - bExp;
1906
    aSig |= 0x00800000;
1907
    bSig |= 0x00800000;
1908
    if ( expDiff < 32 ) {
1909
        aSig <<= 8;
1910
        bSig <<= 8;
1911
        if ( expDiff < 0 ) {
1912
            if ( expDiff < -1 ) return a;
1913
            aSig >>= 1;
1914
        }
1915
        q = ( bSig <= aSig );
1916
        if ( q ) aSig -= bSig;
1917
        if ( 0 < expDiff ) {
1918
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1919
            q >>= 32 - expDiff;
1920
            bSig >>= 2;
1921
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1922
        }
1923
        else {
1924
            aSig >>= 2;
1925
            bSig >>= 2;
1926
        }
1927
    }
1928
    else {
1929
        if ( bSig <= aSig ) aSig -= bSig;
1930
        aSig64 = ( (bits64) aSig )<<40;
1931
        bSig64 = ( (bits64) bSig )<<40;
1932
        expDiff -= 64;
1933
        while ( 0 < expDiff ) {
1934
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1935
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1936
            aSig64 = - ( ( bSig * q64 )<<38 );
1937
            expDiff -= 62;
1938
        }
1939
        expDiff += 64;
1940
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1941
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1942
        q = q64>>( 64 - expDiff );
1943
        bSig <<= 6;
1944
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1945
    }
1946
    do {
1947
        alternateASig = aSig;
1948
        ++q;
1949
        aSig -= bSig;
1950
    } while ( 0 <= (sbits32) aSig );
1951
    sigMean = aSig + alternateASig;
1952
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1953
        aSig = alternateASig;
1954
    }
1955
    zSign = ( (sbits32) aSig < 0 );
1956
    if ( zSign ) aSig = - aSig;
1957
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1958
 
1959
}
1960
 
1961
/*----------------------------------------------------------------------------
1962
| Returns the square root of the single-precision floating-point value `a'.
1963
| The operation is performed according to the IEC/IEEE Standard for Binary
1964
| Floating-Point Arithmetic.
1965
*----------------------------------------------------------------------------*/
1966
 
1967
float32 float32_sqrt( float32 a )
1968
{
1969
    flag aSign;
1970
    int16 aExp, zExp;
1971
    bits32 aSig, zSig;
1972
    bits64 rem, term;
1973
 
1974
    aSig = extractFloat32Frac( a );
1975
    aExp = extractFloat32Exp( a );
1976
    aSign = extractFloat32Sign( a );
1977
    if ( aExp == 0xFF ) {
1978
        if ( aSig ) return propagateFloat32NaN( a, 0 );
1979
        if ( ! aSign ) return a;
1980
        float_raise( float_flag_invalid );
1981
        return float32_default_nan;
1982
    }
1983
    if ( aSign ) {
1984
        if ( ( aExp | aSig ) == 0 ) return a;
1985
        float_raise( float_flag_invalid );
1986
        return float32_default_nan;
1987
    }
1988
    if ( aExp == 0 ) {
1989
        if ( aSig == 0 ) return 0;
1990
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1991
    }
1992
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1993
    aSig = ( aSig | 0x00800000 )<<8;
1994
    zSig = estimateSqrt32( aExp, aSig ) + 2;
1995
    if ( ( zSig & 0x7F ) <= 5 ) {
1996
        if ( zSig < 2 ) {
1997
            zSig = 0x7FFFFFFF;
1998
            goto roundAndPack;
1999
        }
2000
        aSig >>= aExp & 1;
2001
        term = ( (bits64) zSig ) * zSig;
2002
        rem = ( ( (bits64) aSig )<<32 ) - term;
2003
        while ( (sbits64) rem < 0 ) {
2004
            --zSig;
2005
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2006
        }
2007
        zSig |= ( rem != 0 );
2008
    }
2009
    shift32RightJamming( zSig, 1, &zSig );
2010
 roundAndPack:
2011
    return roundAndPackFloat32( 0, zExp, zSig );
2012
 
2013
}
2014
 
2015
/*----------------------------------------------------------------------------
2016
| Returns 1 if the single-precision floating-point value `a' is equal to
2017
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2018
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019
*----------------------------------------------------------------------------*/
2020
 
2021
flag float32_eq( float32 a, float32 b )
2022
{
2023
 
2024
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2025
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2026
       ) {
2027
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2028
            float_raise( float_flag_invalid );
2029
        }
2030
        return 0;
2031
    }
2032
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2033
 
2034
}
2035
 
2036
/*----------------------------------------------------------------------------
2037
| Returns 1 if the single-precision floating-point value `a' is less than
2038
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2039
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040
| Arithmetic.
2041
*----------------------------------------------------------------------------*/
2042
 
2043
flag float32_le( float32 a, float32 b )
2044
{
2045
    flag aSign, bSign;
2046
 
2047
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2048
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2049
       ) {
2050
        float_raise( float_flag_invalid );
2051
        return 0;
2052
    }
2053
    aSign = extractFloat32Sign( a );
2054
    bSign = extractFloat32Sign( b );
2055
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2056
    return ( a == b ) || ( aSign ^ ( a < b ) );
2057
 
2058
}
2059
 
2060
/*----------------------------------------------------------------------------
2061
| Returns 1 if the single-precision floating-point value `a' is less than
2062
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2063
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064
*----------------------------------------------------------------------------*/
2065
 
2066
flag float32_lt( float32 a, float32 b )
2067
{
2068
    flag aSign, bSign;
2069
 
2070
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2071
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2072
       ) {
2073
        float_raise( float_flag_invalid );
2074
        return 0;
2075
    }
2076
    aSign = extractFloat32Sign( a );
2077
    bSign = extractFloat32Sign( b );
2078
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2079
    return ( a != b ) && ( aSign ^ ( a < b ) );
2080
 
2081
}
2082
 
2083
/*----------------------------------------------------------------------------
2084
| Returns 1 if the single-precision floating-point value `a' is equal to
2085
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2086
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2087
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088
*----------------------------------------------------------------------------*/
2089
 
2090
flag float32_eq_signaling( float32 a, float32 b )
2091
{
2092
 
2093
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2095
       ) {
2096
        float_raise( float_flag_invalid );
2097
        return 0;
2098
    }
2099
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2100
 
2101
}
2102
 
2103
/*----------------------------------------------------------------------------
2104
| Returns 1 if the single-precision floating-point value `a' is less than or
2105
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2106
| cause an exception.  Otherwise, the comparison is performed according to the
2107
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108
*----------------------------------------------------------------------------*/
2109
 
2110
flag float32_le_quiet( float32 a, float32 b )
2111
{
2112
    flag aSign, bSign;
2113
    int16 aExp, bExp;
2114
 
2115
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117
       ) {
2118
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119
            float_raise( float_flag_invalid );
2120
        }
2121
        return 0;
2122
    }
2123
    aSign = extractFloat32Sign( a );
2124
    bSign = extractFloat32Sign( b );
2125
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2126
    return ( a == b ) || ( aSign ^ ( a < b ) );
2127
 
2128
}
2129
 
2130
/*----------------------------------------------------------------------------
2131
| Returns 1 if the single-precision floating-point value `a' is less than
2132
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2133
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2134
| Standard for Binary Floating-Point Arithmetic.
2135
*----------------------------------------------------------------------------*/
2136
 
2137
flag float32_lt_quiet( float32 a, float32 b )
2138
{
2139
    flag aSign, bSign;
2140
 
2141
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2142
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2143
       ) {
2144
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2145
            float_raise( float_flag_invalid );
2146
        }
2147
        return 0;
2148
    }
2149
    aSign = extractFloat32Sign( a );
2150
    bSign = extractFloat32Sign( b );
2151
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2152
    return ( a != b ) && ( aSign ^ ( a < b ) );
2153
 
2154
}
2155
 
2156
/*----------------------------------------------------------------------------
2157
| Returns the result of converting the double-precision floating-point value
2158
| `a' to the 32-bit two's complement integer format.  The conversion is
2159
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2160
| Arithmetic---which means in particular that the conversion is rounded
2161
| according to the current rounding mode.  If `a' is a NaN, the largest
2162
| positive integer is returned.  Otherwise, if the conversion overflows, the
2163
| largest integer with the same sign as `a' is returned.
2164
*----------------------------------------------------------------------------*/
2165
 
2166
int32 float64_to_int32( float64 a )
2167
{
2168
    flag aSign;
2169
    int16 aExp, shiftCount;
2170
    bits64 aSig;
2171
 
2172
    aSig = extractFloat64Frac( a );
2173
    aExp = extractFloat64Exp( a );
2174
    aSign = extractFloat64Sign( a );
2175
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2176
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2177
    shiftCount = 0x42C - aExp;
2178
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2179
    return roundAndPackInt32( aSign, aSig );
2180
 
2181
}
2182
 
2183
/*----------------------------------------------------------------------------
2184
| Returns the result of converting the double-precision floating-point value
2185
| `a' to the 32-bit two's complement integer format.  The conversion is
2186
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2187
| Arithmetic, except that the conversion is always rounded toward zero.
2188
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2189
| the conversion overflows, the largest integer with the same sign as `a' is
2190
| returned.
2191
*----------------------------------------------------------------------------*/
2192
 
2193
int32 float64_to_int32_round_to_zero( float64 a )
2194
{
2195
    flag aSign;
2196
    int16 aExp, shiftCount;
2197
    bits64 aSig, savedASig;
2198
    int32 z;
2199
 
2200
    aSig = extractFloat64Frac( a );
2201
    aExp = extractFloat64Exp( a );
2202
    aSign = extractFloat64Sign( a );
2203
    if ( 0x41E < aExp ) {
2204
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2205
        goto invalid;
2206
    }
2207
    else if ( aExp < 0x3FF ) {
2208
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2209
        return 0;
2210
    }
2211
    aSig |= LIT64( 0x0010000000000000 );
2212
    shiftCount = 0x433 - aExp;
2213
    savedASig = aSig;
2214
    aSig >>= shiftCount;
2215
    z = aSig;
2216
    if ( aSign ) z = - z;
2217
    if ( ( z < 0 ) ^ aSign ) {
2218
 invalid:
2219
        float_raise( float_flag_invalid );
2220
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2221
    }
2222
    if ( ( aSig<<shiftCount ) != savedASig ) {
2223
        float_exception_flags |= float_flag_inexact;
2224
    }
2225
    return z;
2226
 
2227
}
2228
 
2229
/*----------------------------------------------------------------------------
2230
| Returns the result of converting the double-precision floating-point value
2231
| `a' to the 64-bit two's complement integer format.  The conversion is
2232
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2233
| Arithmetic---which means in particular that the conversion is rounded
2234
| according to the current rounding mode.  If `a' is a NaN, the largest
2235
| positive integer is returned.  Otherwise, if the conversion overflows, the
2236
| largest integer with the same sign as `a' is returned.
2237
*----------------------------------------------------------------------------*/
2238
 
2239
int64 float64_to_int64( float64 a )
2240
{
2241
    flag aSign;
2242
    int16 aExp, shiftCount;
2243
    bits64 aSig, aSigExtra;
2244
 
2245
    aSig = extractFloat64Frac( a );
2246
    aExp = extractFloat64Exp( a );
2247
    aSign = extractFloat64Sign( a );
2248
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2249
    shiftCount = 0x433 - aExp;
2250
    if ( shiftCount <= 0 ) {
2251
        if ( 0x43E < aExp ) {
2252
            float_raise( float_flag_invalid );
2253
            if (    ! aSign
2254
                 || (    ( aExp == 0x7FF )
2255
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2256
               ) {
2257
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2258
            }
2259
            return (sbits64) LIT64( 0x8000000000000000 );
2260
        }
2261
        aSigExtra = 0;
2262
        aSig <<= - shiftCount;
2263
    }
2264
    else {
2265
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2266
    }
2267
    return roundAndPackInt64( aSign, aSig, aSigExtra );
2268
 
2269
}
2270
 
2271
/*----------------------------------------------------------------------------
2272
| Returns the result of converting the double-precision floating-point value
2273
| `a' to the 64-bit two's complement integer format.  The conversion is
2274
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2275
| Arithmetic, except that the conversion is always rounded toward zero.
2276
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2277
| the conversion overflows, the largest integer with the same sign as `a' is
2278
| returned.
2279
*----------------------------------------------------------------------------*/
2280
 
2281
int64 float64_to_int64_round_to_zero( float64 a )
2282
{
2283
    flag aSign;
2284
    int16 aExp, shiftCount;
2285
    bits64 aSig;
2286
    int64 z;
2287
 
2288
    aSig = extractFloat64Frac( a );
2289
    aExp = extractFloat64Exp( a );
2290
    aSign = extractFloat64Sign( a );
2291
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2292
    shiftCount = aExp - 0x433;
2293
    if ( 0 <= shiftCount ) {
2294
        if ( 0x43E <= aExp ) {
2295
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2296
                float_raise( float_flag_invalid );
2297
                if (    ! aSign
2298
                     || (    ( aExp == 0x7FF )
2299
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2300
                   ) {
2301
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2302
                }
2303
            }
2304
            return (sbits64) LIT64( 0x8000000000000000 );
2305
        }
2306
        z = aSig<<shiftCount;
2307
    }
2308
    else {
2309
        if ( aExp < 0x3FE ) {
2310
            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2311
            return 0;
2312
        }
2313
        z = aSig>>( - shiftCount );
2314
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2315
            float_exception_flags |= float_flag_inexact;
2316
        }
2317
    }
2318
    if ( aSign ) z = - z;
2319
    return z;
2320
 
2321
}
2322
 
2323
/*----------------------------------------------------------------------------
2324
| Returns the result of converting the double-precision floating-point value
2325
| `a' to the single-precision floating-point format.  The conversion is
2326
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2327
| Arithmetic.
2328
*----------------------------------------------------------------------------*/
2329
 
2330
float32 float64_to_float32( float64 a )
2331
{
2332
    flag aSign;
2333
    int16 aExp;
2334
    bits64 aSig;
2335
    bits32 zSig;
2336
 
2337
    aSig = extractFloat64Frac( a );
2338
    aExp = extractFloat64Exp( a );
2339
    aSign = extractFloat64Sign( a );
2340
    if ( aExp == 0x7FF ) {
2341
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2342
        return packFloat32( aSign, 0xFF, 0 );
2343
    }
2344
    shift64RightJamming( aSig, 22, &aSig );
2345
    zSig = aSig;
2346
    if ( aExp || zSig ) {
2347
        zSig |= 0x40000000;
2348
        aExp -= 0x381;
2349
    }
2350
    return roundAndPackFloat32( aSign, aExp, zSig );
2351
 
2352
}
2353
 
2354
#ifdef FLOATX80
2355
 
2356
/*----------------------------------------------------------------------------
2357
| Returns the result of converting the double-precision floating-point value
2358
| `a' to the extended double-precision floating-point format.  The conversion
2359
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2360
| Arithmetic.
2361
*----------------------------------------------------------------------------*/
2362
 
2363
floatx80 float64_to_floatx80( float64 a )
2364
{
2365
    flag aSign;
2366
    int16 aExp;
2367
    bits64 aSig;
2368
 
2369
    aSig = extractFloat64Frac( a );
2370
    aExp = extractFloat64Exp( a );
2371
    aSign = extractFloat64Sign( a );
2372
    if ( aExp == 0x7FF ) {
2373
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2374
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2375
    }
2376
    if ( aExp == 0 ) {
2377
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2378
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2379
    }
2380
    return
2381
        packFloatx80(
2382
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2383
 
2384
}
2385
 
2386
#endif
2387
 
2388
#ifdef FLOAT128
2389
 
2390
/*----------------------------------------------------------------------------
2391
| Returns the result of converting the double-precision floating-point value
2392
| `a' to the quadruple-precision floating-point format.  The conversion is
2393
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2394
| Arithmetic.
2395
*----------------------------------------------------------------------------*/
2396
 
2397
float128 float64_to_float128( float64 a )
2398
{
2399
    flag aSign;
2400
    int16 aExp;
2401
    bits64 aSig, zSig0, zSig1;
2402
 
2403
    aSig = extractFloat64Frac( a );
2404
    aExp = extractFloat64Exp( a );
2405
    aSign = extractFloat64Sign( a );
2406
    if ( aExp == 0x7FF ) {
2407
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2408
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2409
    }
2410
    if ( aExp == 0 ) {
2411
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2412
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2413
        --aExp;
2414
    }
2415
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2416
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2417
 
2418
}
2419
 
2420
#endif
2421
 
2422
/*----------------------------------------------------------------------------
2423
| Rounds the double-precision floating-point value `a' to an integer, and
2424
| returns the result as a double-precision floating-point value.  The
2425
| operation is performed according to the IEC/IEEE Standard for Binary
2426
| Floating-Point Arithmetic.
2427
*----------------------------------------------------------------------------*/
2428
 
2429
float64 float64_round_to_int( float64 a )
2430
{
2431
    flag aSign;
2432
    int16 aExp;
2433
    bits64 lastBitMask, roundBitsMask;
2434
    int8 roundingMode;
2435
    float64 z;
2436
 
2437
    aExp = extractFloat64Exp( a );
2438
    if ( 0x433 <= aExp ) {
2439
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2440
            return propagateFloat64NaN( a, a );
2441
        }
2442
        return a;
2443
    }
2444
    if ( aExp < 0x3FF ) {
2445
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2446
        float_exception_flags |= float_flag_inexact;
2447
        aSign = extractFloat64Sign( a );
2448
        switch ( float_rounding_mode ) {
2449
         case float_round_nearest_even:
2450
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2451
                return packFloat64( aSign, 0x3FF, 0 );
2452
            }
2453
            break;
2454
         case float_round_down:
2455
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2456
         case float_round_up:
2457
            return
2458
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2459
        }
2460
        return packFloat64( aSign, 0, 0 );
2461
    }
2462
    lastBitMask = 1;
2463
    lastBitMask <<= 0x433 - aExp;
2464
    roundBitsMask = lastBitMask - 1;
2465
    z = a;
2466
    roundingMode = float_rounding_mode;
2467
    if ( roundingMode == float_round_nearest_even ) {
2468
        z += lastBitMask>>1;
2469
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2470
    }
2471
    else if ( roundingMode != float_round_to_zero ) {
2472
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2473
            z += roundBitsMask;
2474
        }
2475
    }
2476
    z &= ~ roundBitsMask;
2477
    if ( z != a ) float_exception_flags |= float_flag_inexact;
2478
    return z;
2479
 
2480
}
2481
 
2482
/*----------------------------------------------------------------------------
2483
| Returns the result of adding the absolute values of the double-precision
2484
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2485
| before being returned.  `zSign' is ignored if the result is a NaN.
2486
| The addition is performed according to the IEC/IEEE Standard for Binary
2487
| Floating-Point Arithmetic.
2488
*----------------------------------------------------------------------------*/
2489
 
2490
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2491
{
2492
    int16 aExp, bExp, zExp;
2493
    bits64 aSig, bSig, zSig;
2494
    int16 expDiff;
2495
 
2496
    aSig = extractFloat64Frac( a );
2497
    aExp = extractFloat64Exp( a );
2498
    bSig = extractFloat64Frac( b );
2499
    bExp = extractFloat64Exp( b );
2500
    expDiff = aExp - bExp;
2501
    aSig <<= 9;
2502
    bSig <<= 9;
2503
    if ( 0 < expDiff ) {
2504
        if ( aExp == 0x7FF ) {
2505
            if ( aSig ) return propagateFloat64NaN( a, b );
2506
            return a;
2507
        }
2508
        if ( bExp == 0 ) {
2509
            --expDiff;
2510
        }
2511
        else {
2512
            bSig |= LIT64( 0x2000000000000000 );
2513
        }
2514
        shift64RightJamming( bSig, expDiff, &bSig );
2515
        zExp = aExp;
2516
    }
2517
    else if ( expDiff < 0 ) {
2518
        if ( bExp == 0x7FF ) {
2519
            if ( bSig ) return propagateFloat64NaN( a, b );
2520
            return packFloat64( zSign, 0x7FF, 0 );
2521
        }
2522
        if ( aExp == 0 ) {
2523
            ++expDiff;
2524
        }
2525
        else {
2526
            aSig |= LIT64( 0x2000000000000000 );
2527
        }
2528
        shift64RightJamming( aSig, - expDiff, &aSig );
2529
        zExp = bExp;
2530
    }
2531
    else {
2532
        if ( aExp == 0x7FF ) {
2533
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2534
            return a;
2535
        }
2536
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2537
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2538
        zExp = aExp;
2539
        goto roundAndPack;
2540
    }
2541
    aSig |= LIT64( 0x2000000000000000 );
2542
    zSig = ( aSig + bSig )<<1;
2543
    --zExp;
2544
    if ( (sbits64) zSig < 0 ) {
2545
        zSig = aSig + bSig;
2546
        ++zExp;
2547
    }
2548
 roundAndPack:
2549
    return roundAndPackFloat64( zSign, zExp, zSig );
2550
 
2551
}
2552
 
2553
/*----------------------------------------------------------------------------
2554
| Returns the result of subtracting the absolute values of the double-
2555
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2556
| difference is negated before being returned.  `zSign' is ignored if the
2557
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2558
| Standard for Binary Floating-Point Arithmetic.
2559
*----------------------------------------------------------------------------*/
2560
 
2561
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2562
{
2563
    int16 aExp, bExp, zExp;
2564
    bits64 aSig, bSig, zSig;
2565
    int16 expDiff;
2566
 
2567
    aSig = extractFloat64Frac( a );
2568
    aExp = extractFloat64Exp( a );
2569
    bSig = extractFloat64Frac( b );
2570
    bExp = extractFloat64Exp( b );
2571
    expDiff = aExp - bExp;
2572
    aSig <<= 10;
2573
    bSig <<= 10;
2574
    if ( 0 < expDiff ) goto aExpBigger;
2575
    if ( expDiff < 0 ) goto bExpBigger;
2576
    if ( aExp == 0x7FF ) {
2577
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2578
        float_raise( float_flag_invalid );
2579
        return float64_default_nan;
2580
    }
2581
    if ( aExp == 0 ) {
2582
        aExp = 1;
2583
        bExp = 1;
2584
    }
2585
    if ( bSig < aSig ) goto aBigger;
2586
    if ( aSig < bSig ) goto bBigger;
2587
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2588
 bExpBigger:
2589
    if ( bExp == 0x7FF ) {
2590
        if ( bSig ) return propagateFloat64NaN( a, b );
2591
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2592
    }
2593
    if ( aExp == 0 ) {
2594
        ++expDiff;
2595
    }
2596
    else {
2597
        aSig |= LIT64( 0x4000000000000000 );
2598
    }
2599
    shift64RightJamming( aSig, - expDiff, &aSig );
2600
    bSig |= LIT64( 0x4000000000000000 );
2601
 bBigger:
2602
    zSig = bSig - aSig;
2603
    zExp = bExp;
2604
    zSign ^= 1;
2605
    goto normalizeRoundAndPack;
2606
 aExpBigger:
2607
    if ( aExp == 0x7FF ) {
2608
        if ( aSig ) return propagateFloat64NaN( a, b );
2609
        return a;
2610
    }
2611
    if ( bExp == 0 ) {
2612
        --expDiff;
2613
    }
2614
    else {
2615
        bSig |= LIT64( 0x4000000000000000 );
2616
    }
2617
    shift64RightJamming( bSig, expDiff, &bSig );
2618
    aSig |= LIT64( 0x4000000000000000 );
2619
 aBigger:
2620
    zSig = aSig - bSig;
2621
    zExp = aExp;
2622
 normalizeRoundAndPack:
2623
    --zExp;
2624
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2625
 
2626
}
2627
 
2628
/*----------------------------------------------------------------------------
2629
| Returns the result of adding the double-precision floating-point values `a'
2630
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2631
| Binary Floating-Point Arithmetic.
2632
*----------------------------------------------------------------------------*/
2633
 
2634
float64 float64_add( float64 a, float64 b )
2635
{
2636
    flag aSign, bSign;
2637
 
2638
    aSign = extractFloat64Sign( a );
2639
    bSign = extractFloat64Sign( b );
2640
    if ( aSign == bSign ) {
2641
        return addFloat64Sigs( a, b, aSign );
2642
    }
2643
    else {
2644
        return subFloat64Sigs( a, b, aSign );
2645
    }
2646
 
2647
}
2648
 
2649
/*----------------------------------------------------------------------------
2650
| Returns the result of subtracting the double-precision floating-point values
2651
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2652
| for Binary Floating-Point Arithmetic.
2653
*----------------------------------------------------------------------------*/
2654
 
2655
float64 float64_sub( float64 a, float64 b )
2656
{
2657
    flag aSign, bSign;
2658
 
2659
    aSign = extractFloat64Sign( a );
2660
    bSign = extractFloat64Sign( b );
2661
    if ( aSign == bSign ) {
2662
        return subFloat64Sigs( a, b, aSign );
2663
    }
2664
    else {
2665
        return addFloat64Sigs( a, b, aSign );
2666
    }
2667
 
2668
}
2669
 
2670
/*----------------------------------------------------------------------------
2671
| Returns the result of multiplying the double-precision floating-point values
2672
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2673
| for Binary Floating-Point Arithmetic.
2674
*----------------------------------------------------------------------------*/
2675
 
2676
float64 float64_mul( float64 a, float64 b )
2677
{
2678
    flag aSign, bSign, zSign;
2679
    int16 aExp, bExp, zExp;
2680
    bits64 aSig, bSig, zSig0, zSig1;
2681
 
2682
    aSig = extractFloat64Frac( a );
2683
    aExp = extractFloat64Exp( a );
2684
    aSign = extractFloat64Sign( a );
2685
    bSig = extractFloat64Frac( b );
2686
    bExp = extractFloat64Exp( b );
2687
    bSign = extractFloat64Sign( b );
2688
    zSign = aSign ^ bSign;
2689
    if ( aExp == 0x7FF ) {
2690
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2691
            return propagateFloat64NaN( a, b );
2692
        }
2693
        if ( ( bExp | bSig ) == 0 ) {
2694
            float_raise( float_flag_invalid );
2695
            return float64_default_nan;
2696
        }
2697
        return packFloat64( zSign, 0x7FF, 0 );
2698
    }
2699
    if ( bExp == 0x7FF ) {
2700
        if ( bSig ) return propagateFloat64NaN( a, b );
2701
        if ( ( aExp | aSig ) == 0 ) {
2702
            float_raise( float_flag_invalid );
2703
            return float64_default_nan;
2704
        }
2705
        return packFloat64( zSign, 0x7FF, 0 );
2706
    }
2707
    if ( aExp == 0 ) {
2708
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2709
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2710
    }
2711
    if ( bExp == 0 ) {
2712
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2713
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2714
    }
2715
    zExp = aExp + bExp - 0x3FF;
2716
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2717
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2718
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2719
    zSig0 |= ( zSig1 != 0 );
2720
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2721
        zSig0 <<= 1;
2722
        --zExp;
2723
    }
2724
    return roundAndPackFloat64( zSign, zExp, zSig0 );
2725
 
2726
}
2727
 
2728
/*----------------------------------------------------------------------------
2729
| Returns the result of dividing the double-precision floating-point value `a'
2730
| by the corresponding value `b'.  The operation is performed according to
2731
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2732
*----------------------------------------------------------------------------*/
2733
 
2734
float64 float64_div( float64 a, float64 b )
2735
{
2736
    flag aSign, bSign, zSign;
2737
    int16 aExp, bExp, zExp;
2738
    bits64 aSig, bSig, zSig;
2739
    bits64 rem0, rem1;
2740
    bits64 term0, term1;
2741
 
2742
    aSig = extractFloat64Frac( a );
2743
    aExp = extractFloat64Exp( a );
2744
    aSign = extractFloat64Sign( a );
2745
    bSig = extractFloat64Frac( b );
2746
    bExp = extractFloat64Exp( b );
2747
    bSign = extractFloat64Sign( b );
2748
    zSign = aSign ^ bSign;
2749
    if ( aExp == 0x7FF ) {
2750
        if ( aSig ) return propagateFloat64NaN( a, b );
2751
        if ( bExp == 0x7FF ) {
2752
            if ( bSig ) return propagateFloat64NaN( a, b );
2753
            float_raise( float_flag_invalid );
2754
            return float64_default_nan;
2755
        }
2756
        return packFloat64( zSign, 0x7FF, 0 );
2757
    }
2758
    if ( bExp == 0x7FF ) {
2759
        if ( bSig ) return propagateFloat64NaN( a, b );
2760
        return packFloat64( zSign, 0, 0 );
2761
    }
2762
    if ( bExp == 0 ) {
2763
        if ( bSig == 0 ) {
2764
            if ( ( aExp | aSig ) == 0 ) {
2765
                float_raise( float_flag_invalid );
2766
                return float64_default_nan;
2767
            }
2768
            float_raise( float_flag_divbyzero );
2769
            return packFloat64( zSign, 0x7FF, 0 );
2770
        }
2771
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2772
    }
2773
    if ( aExp == 0 ) {
2774
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2775
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2776
    }
2777
    zExp = aExp - bExp + 0x3FD;
2778
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780
    if ( bSig <= ( aSig + aSig ) ) {
2781
        aSig >>= 1;
2782
        ++zExp;
2783
    }
2784
    zSig = estimateDiv128To64( aSig, 0, bSig );
2785
    if ( ( zSig & 0x1FF ) <= 2 ) {
2786
        mul64To128( bSig, zSig, &term0, &term1 );
2787
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2788
        while ( (sbits64) rem0 < 0 ) {
2789
            --zSig;
2790
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2791
        }
2792
        zSig |= ( rem1 != 0 );
2793
    }
2794
    return roundAndPackFloat64( zSign, zExp, zSig );
2795
 
2796
}
2797
 
2798
/*----------------------------------------------------------------------------
2799
| Returns the remainder of the double-precision floating-point value `a'
2800
| with respect to the corresponding value `b'.  The operation is performed
2801
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2802
*----------------------------------------------------------------------------*/
2803
 
2804
float64 float64_rem( float64 a, float64 b )
2805
{
2806
    flag aSign, bSign, zSign;
2807
    int16 aExp, bExp, expDiff;
2808
    bits64 aSig, bSig;
2809
    bits64 q, alternateASig;
2810
    sbits64 sigMean;
2811
 
2812
    aSig = extractFloat64Frac( a );
2813
    aExp = extractFloat64Exp( a );
2814
    aSign = extractFloat64Sign( a );
2815
    bSig = extractFloat64Frac( b );
2816
    bExp = extractFloat64Exp( b );
2817
    bSign = extractFloat64Sign( b );
2818
    if ( aExp == 0x7FF ) {
2819
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2820
            return propagateFloat64NaN( a, b );
2821
        }
2822
        float_raise( float_flag_invalid );
2823
        return float64_default_nan;
2824
    }
2825
    if ( bExp == 0x7FF ) {
2826
        if ( bSig ) return propagateFloat64NaN( a, b );
2827
        return a;
2828
    }
2829
    if ( bExp == 0 ) {
2830
        if ( bSig == 0 ) {
2831
            float_raise( float_flag_invalid );
2832
            return float64_default_nan;
2833
        }
2834
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835
    }
2836
    if ( aExp == 0 ) {
2837
        if ( aSig == 0 ) return a;
2838
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2839
    }
2840
    expDiff = aExp - bExp;
2841
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2842
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2843
    if ( expDiff < 0 ) {
2844
        if ( expDiff < -1 ) return a;
2845
        aSig >>= 1;
2846
    }
2847
    q = ( bSig <= aSig );
2848
    if ( q ) aSig -= bSig;
2849
    expDiff -= 64;
2850
    while ( 0 < expDiff ) {
2851
        q = estimateDiv128To64( aSig, 0, bSig );
2852
        q = ( 2 < q ) ? q - 2 : 0;
2853
        aSig = - ( ( bSig>>2 ) * q );
2854
        expDiff -= 62;
2855
    }
2856
    expDiff += 64;
2857
    if ( 0 < expDiff ) {
2858
        q = estimateDiv128To64( aSig, 0, bSig );
2859
        q = ( 2 < q ) ? q - 2 : 0;
2860
        q >>= 64 - expDiff;
2861
        bSig >>= 2;
2862
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2863
    }
2864
    else {
2865
        aSig >>= 2;
2866
        bSig >>= 2;
2867
    }
2868
    do {
2869
        alternateASig = aSig;
2870
        ++q;
2871
        aSig -= bSig;
2872
    } while ( 0 <= (sbits64) aSig );
2873
    sigMean = aSig + alternateASig;
2874
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2875
        aSig = alternateASig;
2876
    }
2877
    zSign = ( (sbits64) aSig < 0 );
2878
    if ( zSign ) aSig = - aSig;
2879
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2880
 
2881
}
2882
 
2883
/*----------------------------------------------------------------------------
2884
| Returns the square root of the double-precision floating-point value `a'.
2885
| The operation is performed according to the IEC/IEEE Standard for Binary
2886
| Floating-Point Arithmetic.
2887
*----------------------------------------------------------------------------*/
2888
 
2889
float64 float64_sqrt( float64 a )
2890
{
2891
    flag aSign;
2892
    int16 aExp, zExp;
2893
    bits64 aSig, zSig, doubleZSig;
2894
    bits64 rem0, rem1, term0, term1;
2895
    float64 z;
2896
 
2897
    aSig = extractFloat64Frac( a );
2898
    aExp = extractFloat64Exp( a );
2899
    aSign = extractFloat64Sign( a );
2900
    if ( aExp == 0x7FF ) {
2901
        if ( aSig ) return propagateFloat64NaN( a, a );
2902
        if ( ! aSign ) return a;
2903
        float_raise( float_flag_invalid );
2904
        return float64_default_nan;
2905
    }
2906
    if ( aSign ) {
2907
        if ( ( aExp | aSig ) == 0 ) return a;
2908
        float_raise( float_flag_invalid );
2909
        return float64_default_nan;
2910
    }
2911
    if ( aExp == 0 ) {
2912
        if ( aSig == 0 ) return 0;
2913
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2914
    }
2915
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2916
    aSig |= LIT64( 0x0010000000000000 );
2917
    zSig = estimateSqrt32( aExp, aSig>>21 );
2918
    aSig <<= 9 - ( aExp & 1 );
2919
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2920
    if ( ( zSig & 0x1FF ) <= 5 ) {
2921
        doubleZSig = zSig<<1;
2922
        mul64To128( zSig, zSig, &term0, &term1 );
2923
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2924
        while ( (sbits64) rem0 < 0 ) {
2925
            --zSig;
2926
            doubleZSig -= 2;
2927
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2928
        }
2929
        zSig |= ( ( rem0 | rem1 ) != 0 );
2930
    }
2931
    return roundAndPackFloat64( 0, zExp, zSig );
2932
 
2933
}
2934
 
2935
/*----------------------------------------------------------------------------
2936
| Returns 1 if the double-precision floating-point value `a' is equal to the
2937
| corresponding value `b', and 0 otherwise.  The comparison is performed
2938
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2939
*----------------------------------------------------------------------------*/
2940
 
2941
flag float64_eq( float64 a, float64 b )
2942
{
2943
 
2944
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2945
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2946
       ) {
2947
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2948
            float_raise( float_flag_invalid );
2949
        }
2950
        return 0;
2951
    }
2952
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2953
 
2954
}
2955
 
2956
/*----------------------------------------------------------------------------
2957
| Returns 1 if the double-precision floating-point value `a' is less than or
2958
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
2959
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2960
| Arithmetic.
2961
*----------------------------------------------------------------------------*/
2962
 
2963
flag float64_le( float64 a, float64 b )
2964
{
2965
    flag aSign, bSign;
2966
 
2967
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2968
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2969
       ) {
2970
        float_raise( float_flag_invalid );
2971
        return 0;
2972
    }
2973
    aSign = extractFloat64Sign( a );
2974
    bSign = extractFloat64Sign( b );
2975
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2976
    return ( a == b ) || ( aSign ^ ( a < b ) );
2977
 
2978
}
2979
 
2980
/*----------------------------------------------------------------------------
2981
| Returns 1 if the double-precision floating-point value `a' is less than
2982
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2983
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2984
*----------------------------------------------------------------------------*/
2985
 
2986
flag float64_lt( float64 a, float64 b )
2987
{
2988
    flag aSign, bSign;
2989
 
2990
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2991
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2992
       ) {
2993
        float_raise( float_flag_invalid );
2994
        return 0;
2995
    }
2996
    aSign = extractFloat64Sign( a );
2997
    bSign = extractFloat64Sign( b );
2998
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2999
    return ( a != b ) && ( aSign ^ ( a < b ) );
3000
 
3001
}
3002
 
3003
/*----------------------------------------------------------------------------
3004
| Returns 1 if the double-precision floating-point value `a' is equal to the
3005
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3006
| if either operand is a NaN.  Otherwise, the comparison is performed
3007
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3008
*----------------------------------------------------------------------------*/
3009
 
3010
flag float64_eq_signaling( float64 a, float64 b )
3011
{
3012
 
3013
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3014
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3015
       ) {
3016
        float_raise( float_flag_invalid );
3017
        return 0;
3018
    }
3019
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3020
 
3021
}
3022
 
3023
/*----------------------------------------------------------------------------
3024
| Returns 1 if the double-precision floating-point value `a' is less than or
3025
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3026
| cause an exception.  Otherwise, the comparison is performed according to the
3027
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3028
*----------------------------------------------------------------------------*/
3029
 
3030
flag float64_le_quiet( float64 a, float64 b )
3031
{
3032
    flag aSign, bSign;
3033
    int16 aExp, bExp;
3034
 
3035
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3036
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3037
       ) {
3038
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3039
            float_raise( float_flag_invalid );
3040
        }
3041
        return 0;
3042
    }
3043
    aSign = extractFloat64Sign( a );
3044
    bSign = extractFloat64Sign( b );
3045
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3046
    return ( a == b ) || ( aSign ^ ( a < b ) );
3047
 
3048
}
3049
 
3050
/*----------------------------------------------------------------------------
3051
| Returns 1 if the double-precision floating-point value `a' is less than
3052
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3053
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3054
| Standard for Binary Floating-Point Arithmetic.
3055
*----------------------------------------------------------------------------*/
3056
 
3057
flag float64_lt_quiet( float64 a, float64 b )
3058
{
3059
    flag aSign, bSign;
3060
 
3061
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3062
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3063
       ) {
3064
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3065
            float_raise( float_flag_invalid );
3066
        }
3067
        return 0;
3068
    }
3069
    aSign = extractFloat64Sign( a );
3070
    bSign = extractFloat64Sign( b );
3071
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3072
    return ( a != b ) && ( aSign ^ ( a < b ) );
3073
 
3074
}
3075
 
3076
#ifdef FLOATX80
3077
 
3078
/*----------------------------------------------------------------------------
3079
| Returns the result of converting the extended double-precision floating-
3080
| point value `a' to the 32-bit two's complement integer format.  The
3081
| conversion is performed according to the IEC/IEEE Standard for Binary
3082
| Floating-Point Arithmetic---which means in particular that the conversion
3083
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3084
| largest positive integer is returned.  Otherwise, if the conversion
3085
| overflows, the largest integer with the same sign as `a' is returned.
3086
*----------------------------------------------------------------------------*/
3087
 
3088
int32 floatx80_to_int32( floatx80 a )
3089
{
3090
    flag aSign;
3091
    int32 aExp, shiftCount;
3092
    bits64 aSig;
3093
 
3094
    aSig = extractFloatx80Frac( a );
3095
    aExp = extractFloatx80Exp( a );
3096
    aSign = extractFloatx80Sign( a );
3097
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3098
    shiftCount = 0x4037 - aExp;
3099
    if ( shiftCount <= 0 ) shiftCount = 1;
3100
    shift64RightJamming( aSig, shiftCount, &aSig );
3101
    return roundAndPackInt32( aSign, aSig );
3102
 
3103
}
3104
 
3105
/*----------------------------------------------------------------------------
3106
| Returns the result of converting the extended double-precision floating-
3107
| point value `a' to the 32-bit two's complement integer format.  The
3108
| conversion is performed according to the IEC/IEEE Standard for Binary
3109
| Floating-Point Arithmetic, except that the conversion is always rounded
3110
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3111
| Otherwise, if the conversion overflows, the largest integer with the same
3112
| sign as `a' is returned.
3113
*----------------------------------------------------------------------------*/
3114
 
3115
int32 floatx80_to_int32_round_to_zero( floatx80 a )
3116
{
3117
    flag aSign;
3118
    int32 aExp, shiftCount;
3119
    bits64 aSig, savedASig;
3120
    int32 z;
3121
 
3122
    aSig = extractFloatx80Frac( a );
3123
    aExp = extractFloatx80Exp( a );
3124
    aSign = extractFloatx80Sign( a );
3125
    if ( 0x401E < aExp ) {
3126
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3127
        goto invalid;
3128
    }
3129
    else if ( aExp < 0x3FFF ) {
3130
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3131
        return 0;
3132
    }
3133
    shiftCount = 0x403E - aExp;
3134
    savedASig = aSig;
3135
    aSig >>= shiftCount;
3136
    z = aSig;
3137
    if ( aSign ) z = - z;
3138
    if ( ( z < 0 ) ^ aSign ) {
3139
 invalid:
3140
        float_raise( float_flag_invalid );
3141
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3142
    }
3143
    if ( ( aSig<<shiftCount ) != savedASig ) {
3144
        float_exception_flags |= float_flag_inexact;
3145
    }
3146
    return z;
3147
 
3148
}
3149
 
3150
/*----------------------------------------------------------------------------
3151
| Returns the result of converting the extended double-precision floating-
3152
| point value `a' to the 64-bit two's complement integer format.  The
3153
| conversion is performed according to the IEC/IEEE Standard for Binary
3154
| Floating-Point Arithmetic---which means in particular that the conversion
3155
| is rounded according to the current rounding mode.  If `a' is a NaN,
3156
| the largest positive integer is returned.  Otherwise, if the conversion
3157
| overflows, the largest integer with the same sign as `a' is returned.
3158
*----------------------------------------------------------------------------*/
3159
 
3160
int64 floatx80_to_int64( floatx80 a )
3161
{
3162
    flag aSign;
3163
    int32 aExp, shiftCount;
3164
    bits64 aSig, aSigExtra;
3165
 
3166
    aSig = extractFloatx80Frac( a );
3167
    aExp = extractFloatx80Exp( a );
3168
    aSign = extractFloatx80Sign( a );
3169
    shiftCount = 0x403E - aExp;
3170
    if ( shiftCount <= 0 ) {
3171
        if ( shiftCount ) {
3172
            float_raise( float_flag_invalid );
3173
            if (    ! aSign
3174
                 || (    ( aExp == 0x7FFF )
3175
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3176
               ) {
3177
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3178
            }
3179
            return (sbits64) LIT64( 0x8000000000000000 );
3180
        }
3181
        aSigExtra = 0;
3182
    }
3183
    else {
3184
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3185
    }
3186
    return roundAndPackInt64( aSign, aSig, aSigExtra );
3187
 
3188
}
3189
 
3190
/*----------------------------------------------------------------------------
3191
| Returns the result of converting the extended double-precision floating-
3192
| point value `a' to the 64-bit two's complement integer format.  The
3193
| conversion is performed according to the IEC/IEEE Standard for Binary
3194
| Floating-Point Arithmetic, except that the conversion is always rounded
3195
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3196
| Otherwise, if the conversion overflows, the largest integer with the same
3197
| sign as `a' is returned.
3198
*----------------------------------------------------------------------------*/
3199
 
3200
int64 floatx80_to_int64_round_to_zero( floatx80 a )
3201
{
3202
    flag aSign;
3203
    int32 aExp, shiftCount;
3204
    bits64 aSig;
3205
    int64 z;
3206
 
3207
    aSig = extractFloatx80Frac( a );
3208
    aExp = extractFloatx80Exp( a );
3209
    aSign = extractFloatx80Sign( a );
3210
    shiftCount = aExp - 0x403E;
3211
    if ( 0 <= shiftCount ) {
3212
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3213
        if ( ( a.high != 0xC03E ) || aSig ) {
3214
            float_raise( float_flag_invalid );
3215
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3216
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3217
            }
3218
        }
3219
        return (sbits64) LIT64( 0x8000000000000000 );
3220
    }
3221
    else if ( aExp < 0x3FFF ) {
3222
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3223
        return 0;
3224
    }
3225
    z = aSig>>( - shiftCount );
3226
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3227
        float_exception_flags |= float_flag_inexact;
3228
    }
3229
    if ( aSign ) z = - z;
3230
    return z;
3231
 
3232
}
3233
 
3234
/*----------------------------------------------------------------------------
3235
| Returns the result of converting the extended double-precision floating-
3236
| point value `a' to the single-precision floating-point format.  The
3237
| conversion is performed according to the IEC/IEEE Standard for Binary
3238
| Floating-Point Arithmetic.
3239
*----------------------------------------------------------------------------*/
3240
 
3241
float32 floatx80_to_float32( floatx80 a )
3242
{
3243
    flag aSign;
3244
    int32 aExp;
3245
    bits64 aSig;
3246
 
3247
    aSig = extractFloatx80Frac( a );
3248
    aExp = extractFloatx80Exp( a );
3249
    aSign = extractFloatx80Sign( a );
3250
    if ( aExp == 0x7FFF ) {
3251
        if ( (bits64) ( aSig<<1 ) ) {
3252
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3253
        }
3254
        return packFloat32( aSign, 0xFF, 0 );
3255
    }
3256
    shift64RightJamming( aSig, 33, &aSig );
3257
    if ( aExp || aSig ) aExp -= 0x3F81;
3258
    return roundAndPackFloat32( aSign, aExp, aSig );
3259
 
3260
}
3261
 
3262
/*----------------------------------------------------------------------------
3263
| Returns the result of converting the extended double-precision floating-
3264
| point value `a' to the double-precision floating-point format.  The
3265
| conversion is performed according to the IEC/IEEE Standard for Binary
3266
| Floating-Point Arithmetic.
3267
*----------------------------------------------------------------------------*/
3268
 
3269
float64 floatx80_to_float64( floatx80 a )
3270
{
3271
    flag aSign;
3272
    int32 aExp;
3273
    bits64 aSig, zSig;
3274
 
3275
    aSig = extractFloatx80Frac( a );
3276
    aExp = extractFloatx80Exp( a );
3277
    aSign = extractFloatx80Sign( a );
3278
    if ( aExp == 0x7FFF ) {
3279
        if ( (bits64) ( aSig<<1 ) ) {
3280
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3281
        }
3282
        return packFloat64( aSign, 0x7FF, 0 );
3283
    }
3284
    shift64RightJamming( aSig, 1, &zSig );
3285
    if ( aExp || aSig ) aExp -= 0x3C01;
3286
    return roundAndPackFloat64( aSign, aExp, zSig );
3287
 
3288
}
3289
 
3290
#ifdef FLOAT128
3291
 
3292
/*----------------------------------------------------------------------------
3293
| Returns the result of converting the extended double-precision floating-
3294
| point value `a' to the quadruple-precision floating-point format.  The
3295
| conversion is performed according to the IEC/IEEE Standard for Binary
3296
| Floating-Point Arithmetic.
3297
*----------------------------------------------------------------------------*/
3298
 
3299
float128 floatx80_to_float128( floatx80 a )
3300
{
3301
    flag aSign;
3302
    int16 aExp;
3303
    bits64 aSig, zSig0, zSig1;
3304
 
3305
    aSig = extractFloatx80Frac( a );
3306
    aExp = extractFloatx80Exp( a );
3307
    aSign = extractFloatx80Sign( a );
3308
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3309
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3310
    }
3311
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3312
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3313
 
3314
}
3315
 
3316
#endif
3317
 
3318
/*----------------------------------------------------------------------------
3319
| Rounds the extended double-precision floating-point value `a' to an integer,
3320
| and returns the result as an extended quadruple-precision floating-point
3321
| value.  The operation is performed according to the IEC/IEEE Standard for
3322
| Binary Floating-Point Arithmetic.
3323
*----------------------------------------------------------------------------*/
3324
 
3325
floatx80 floatx80_round_to_int( floatx80 a )
3326
{
3327
    flag aSign;
3328
    int32 aExp;
3329
    bits64 lastBitMask, roundBitsMask;
3330
    int8 roundingMode;
3331
    floatx80 z;
3332
 
3333
    aExp = extractFloatx80Exp( a );
3334
    if ( 0x403E <= aExp ) {
3335
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3336
            return propagateFloatx80NaN( a, a );
3337
        }
3338
        return a;
3339
    }
3340
    if ( aExp < 0x3FFF ) {
3341
        if (    ( aExp == 0 )
3342
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3343
            return a;
3344
        }
3345
        float_exception_flags |= float_flag_inexact;
3346
        aSign = extractFloatx80Sign( a );
3347
        switch ( float_rounding_mode ) {
3348
         case float_round_nearest_even:
3349
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3350
               ) {
3351
                return
3352
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3353
            }
3354
            break;
3355
         case float_round_down:
3356
            return
3357
                  aSign ?
3358
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3359
                : packFloatx80( 0, 0, 0 );
3360
         case float_round_up:
3361
            return
3362
                  aSign ? packFloatx80( 1, 0, 0 )
3363
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3364
        }
3365
        return packFloatx80( aSign, 0, 0 );
3366
    }
3367
    lastBitMask = 1;
3368
    lastBitMask <<= 0x403E - aExp;
3369
    roundBitsMask = lastBitMask - 1;
3370
    z = a;
3371
    roundingMode = float_rounding_mode;
3372
    if ( roundingMode == float_round_nearest_even ) {
3373
        z.low += lastBitMask>>1;
3374
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3375
    }
3376
    else if ( roundingMode != float_round_to_zero ) {
3377
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3378
            z.low += roundBitsMask;
3379
        }
3380
    }
3381
    z.low &= ~ roundBitsMask;
3382
    if ( z.low == 0 ) {
3383
        ++z.high;
3384
        z.low = LIT64( 0x8000000000000000 );
3385
    }
3386
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3387
    return z;
3388
 
3389
}
3390
 
3391
/*----------------------------------------------------------------------------
3392
| Returns the result of adding the absolute values of the extended double-
3393
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3394
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3395
| The addition is performed according to the IEC/IEEE Standard for Binary
3396
| Floating-Point Arithmetic.
3397
*----------------------------------------------------------------------------*/
3398
 
3399
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3400
{
3401
    int32 aExp, bExp, zExp;
3402
    bits64 aSig, bSig, zSig0, zSig1;
3403
    int32 expDiff;
3404
 
3405
    aSig = extractFloatx80Frac( a );
3406
    aExp = extractFloatx80Exp( a );
3407
    bSig = extractFloatx80Frac( b );
3408
    bExp = extractFloatx80Exp( b );
3409
    expDiff = aExp - bExp;
3410
    if ( 0 < expDiff ) {
3411
        if ( aExp == 0x7FFF ) {
3412
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3413
            return a;
3414
        }
3415
        if ( bExp == 0 ) --expDiff;
3416
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3417
        zExp = aExp;
3418
    }
3419
    else if ( expDiff < 0 ) {
3420
        if ( bExp == 0x7FFF ) {
3421
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3422
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3423
        }
3424
        if ( aExp == 0 ) ++expDiff;
3425
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3426
        zExp = bExp;
3427
    }
3428
    else {
3429
        if ( aExp == 0x7FFF ) {
3430
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3431
                return propagateFloatx80NaN( a, b );
3432
            }
3433
            return a;
3434
        }
3435
        zSig1 = 0;
3436
        zSig0 = aSig + bSig;
3437
        if ( aExp == 0 ) {
3438
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3439
            goto roundAndPack;
3440
        }
3441
        zExp = aExp;
3442
        goto shiftRight1;
3443
    }
3444
    zSig0 = aSig + bSig;
3445
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3446
 shiftRight1:
3447
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3448
    zSig0 |= LIT64( 0x8000000000000000 );
3449
    ++zExp;
3450
 roundAndPack:
3451
    return
3452
        roundAndPackFloatx80(
3453
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3454
 
3455
}
3456
 
3457
/*----------------------------------------------------------------------------
3458
| Returns the result of subtracting the absolute values of the extended
3459
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3460
| difference is negated before being returned.  `zSign' is ignored if the
3461
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3462
| Standard for Binary Floating-Point Arithmetic.
3463
*----------------------------------------------------------------------------*/
3464
 
3465
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3466
{
3467
    int32 aExp, bExp, zExp;
3468
    bits64 aSig, bSig, zSig0, zSig1;
3469
    int32 expDiff;
3470
    floatx80 z;
3471
 
3472
    aSig = extractFloatx80Frac( a );
3473
    aExp = extractFloatx80Exp( a );
3474
    bSig = extractFloatx80Frac( b );
3475
    bExp = extractFloatx80Exp( b );
3476
    expDiff = aExp - bExp;
3477
    if ( 0 < expDiff ) goto aExpBigger;
3478
    if ( expDiff < 0 ) goto bExpBigger;
3479
    if ( aExp == 0x7FFF ) {
3480
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3481
            return propagateFloatx80NaN( a, b );
3482
        }
3483
        float_raise( float_flag_invalid );
3484
        z.low = floatx80_default_nan_low;
3485
        z.high = floatx80_default_nan_high;
3486
        return z;
3487
    }
3488
    if ( aExp == 0 ) {
3489
        aExp = 1;
3490
        bExp = 1;
3491
    }
3492
    zSig1 = 0;
3493
    if ( bSig < aSig ) goto aBigger;
3494
    if ( aSig < bSig ) goto bBigger;
3495
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3496
 bExpBigger:
3497
    if ( bExp == 0x7FFF ) {
3498
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3499
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3500
    }
3501
    if ( aExp == 0 ) ++expDiff;
3502
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3503
 bBigger:
3504
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3505
    zExp = bExp;
3506
    zSign ^= 1;
3507
    goto normalizeRoundAndPack;
3508
 aExpBigger:
3509
    if ( aExp == 0x7FFF ) {
3510
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3511
        return a;
3512
    }
3513
    if ( bExp == 0 ) --expDiff;
3514
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3515
 aBigger:
3516
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3517
    zExp = aExp;
3518
 normalizeRoundAndPack:
3519
    return
3520
        normalizeRoundAndPackFloatx80(
3521
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3522
 
3523
}
3524
 
3525
/*----------------------------------------------------------------------------
3526
| Returns the result of adding the extended double-precision floating-point
3527
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3528
| Standard for Binary Floating-Point Arithmetic.
3529
*----------------------------------------------------------------------------*/
3530
 
3531
floatx80 floatx80_add( floatx80 a, floatx80 b )
3532
{
3533
    flag aSign, bSign;
3534
 
3535
    aSign = extractFloatx80Sign( a );
3536
    bSign = extractFloatx80Sign( b );
3537
    if ( aSign == bSign ) {
3538
        return addFloatx80Sigs( a, b, aSign );
3539
    }
3540
    else {
3541
        return subFloatx80Sigs( a, b, aSign );
3542
    }
3543
 
3544
}
3545
 
3546
/*----------------------------------------------------------------------------
3547
| Returns the result of subtracting the extended double-precision floating-
3548
| point values `a' and `b'.  The operation is performed according to the
3549
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3550
*----------------------------------------------------------------------------*/
3551
 
3552
floatx80 floatx80_sub( floatx80 a, floatx80 b )
3553
{
3554
    flag aSign, bSign;
3555
 
3556
    aSign = extractFloatx80Sign( a );
3557
    bSign = extractFloatx80Sign( b );
3558
    if ( aSign == bSign ) {
3559
        return subFloatx80Sigs( a, b, aSign );
3560
    }
3561
    else {
3562
        return addFloatx80Sigs( a, b, aSign );
3563
    }
3564
 
3565
}
3566
 
3567
/*----------------------------------------------------------------------------
3568
| Returns the result of multiplying the extended double-precision floating-
3569
| point values `a' and `b'.  The operation is performed according to the
3570
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571
*----------------------------------------------------------------------------*/
3572
 
3573
floatx80 floatx80_mul( floatx80 a, floatx80 b )
3574
{
3575
    flag aSign, bSign, zSign;
3576
    int32 aExp, bExp, zExp;
3577
    bits64 aSig, bSig, zSig0, zSig1;
3578
    floatx80 z;
3579
 
3580
    aSig = extractFloatx80Frac( a );
3581
    aExp = extractFloatx80Exp( a );
3582
    aSign = extractFloatx80Sign( a );
3583
    bSig = extractFloatx80Frac( b );
3584
    bExp = extractFloatx80Exp( b );
3585
    bSign = extractFloatx80Sign( b );
3586
    zSign = aSign ^ bSign;
3587
    if ( aExp == 0x7FFF ) {
3588
        if (    (bits64) ( aSig<<1 )
3589
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3590
            return propagateFloatx80NaN( a, b );
3591
        }
3592
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3593
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3594
    }
3595
    if ( bExp == 0x7FFF ) {
3596
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3597
        if ( ( aExp | aSig ) == 0 ) {
3598
 invalid:
3599
            float_raise( float_flag_invalid );
3600
            z.low = floatx80_default_nan_low;
3601
            z.high = floatx80_default_nan_high;
3602
            return z;
3603
        }
3604
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3605
    }
3606
    if ( aExp == 0 ) {
3607
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3608
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3609
    }
3610
    if ( bExp == 0 ) {
3611
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3612
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3613
    }
3614
    zExp = aExp + bExp - 0x3FFE;
3615
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3616
    if ( 0 < (sbits64) zSig0 ) {
3617
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618
        --zExp;
3619
    }
3620
    return
3621
        roundAndPackFloatx80(
3622
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3623
 
3624
}
3625
 
3626
/*----------------------------------------------------------------------------
3627
| Returns the result of dividing the extended double-precision floating-point
3628
| value `a' by the corresponding value `b'.  The operation is performed
3629
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3630
*----------------------------------------------------------------------------*/
3631
 
3632
floatx80 floatx80_div( floatx80 a, floatx80 b )
3633
{
3634
    flag aSign, bSign, zSign;
3635
    int32 aExp, bExp, zExp;
3636
    bits64 aSig, bSig, zSig0, zSig1;
3637
    bits64 rem0, rem1, rem2, term0, term1, term2;
3638
    floatx80 z;
3639
 
3640
    aSig = extractFloatx80Frac( a );
3641
    aExp = extractFloatx80Exp( a );
3642
    aSign = extractFloatx80Sign( a );
3643
    bSig = extractFloatx80Frac( b );
3644
    bExp = extractFloatx80Exp( b );
3645
    bSign = extractFloatx80Sign( b );
3646
    zSign = aSign ^ bSign;
3647
    if ( aExp == 0x7FFF ) {
3648
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3649
        if ( bExp == 0x7FFF ) {
3650
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3651
            goto invalid;
3652
        }
3653
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3654
    }
3655
    if ( bExp == 0x7FFF ) {
3656
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657
        return packFloatx80( zSign, 0, 0 );
3658
    }
3659
    if ( bExp == 0 ) {
3660
        if ( bSig == 0 ) {
3661
            if ( ( aExp | aSig ) == 0 ) {
3662
 invalid:
3663
                float_raise( float_flag_invalid );
3664
                z.low = floatx80_default_nan_low;
3665
                z.high = floatx80_default_nan_high;
3666
                return z;
3667
            }
3668
            float_raise( float_flag_divbyzero );
3669
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670
        }
3671
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3672
    }
3673
    if ( aExp == 0 ) {
3674
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3675
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3676
    }
3677
    zExp = aExp - bExp + 0x3FFE;
3678
    rem1 = 0;
3679
    if ( bSig <= aSig ) {
3680
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3681
        ++zExp;
3682
    }
3683
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3684
    mul64To128( bSig, zSig0, &term0, &term1 );
3685
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3686
    while ( (sbits64) rem0 < 0 ) {
3687
        --zSig0;
3688
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3689
    }
3690
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3691
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3692
        mul64To128( bSig, zSig1, &term1, &term2 );
3693
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3694
        while ( (sbits64) rem1 < 0 ) {
3695
            --zSig1;
3696
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3697
        }
3698
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3699
    }
3700
    return
3701
        roundAndPackFloatx80(
3702
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3703
 
3704
}
3705
 
3706
/*----------------------------------------------------------------------------
3707
| Returns the remainder of the extended double-precision floating-point value
3708
| `a' with respect to the corresponding value `b'.  The operation is performed
3709
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3710
*----------------------------------------------------------------------------*/
3711
 
3712
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3713
{
3714
    flag aSign, bSign, zSign;
3715
    int32 aExp, bExp, expDiff;
3716
    bits64 aSig0, aSig1, bSig;
3717
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3718
    floatx80 z;
3719
 
3720
    aSig0 = extractFloatx80Frac( a );
3721
    aExp = extractFloatx80Exp( a );
3722
    aSign = extractFloatx80Sign( a );
3723
    bSig = extractFloatx80Frac( b );
3724
    bExp = extractFloatx80Exp( b );
3725
    bSign = extractFloatx80Sign( b );
3726
    if ( aExp == 0x7FFF ) {
3727
        if (    (bits64) ( aSig0<<1 )
3728
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3729
            return propagateFloatx80NaN( a, b );
3730
        }
3731
        goto invalid;
3732
    }
3733
    if ( bExp == 0x7FFF ) {
3734
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3735
        return a;
3736
    }
3737
    if ( bExp == 0 ) {
3738
        if ( bSig == 0 ) {
3739
 invalid:
3740
            float_raise( float_flag_invalid );
3741
            z.low = floatx80_default_nan_low;
3742
            z.high = floatx80_default_nan_high;
3743
            return z;
3744
        }
3745
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3746
    }
3747
    if ( aExp == 0 ) {
3748
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3749
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3750
    }
3751
    bSig |= LIT64( 0x8000000000000000 );
3752
    zSign = aSign;
3753
    expDiff = aExp - bExp;
3754
    aSig1 = 0;
3755
    if ( expDiff < 0 ) {
3756
        if ( expDiff < -1 ) return a;
3757
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3758
        expDiff = 0;
3759
    }
3760
    q = ( bSig <= aSig0 );
3761
    if ( q ) aSig0 -= bSig;
3762
    expDiff -= 64;
3763
    while ( 0 < expDiff ) {
3764
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3765
        q = ( 2 < q ) ? q - 2 : 0;
3766
        mul64To128( bSig, q, &term0, &term1 );
3767
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3768
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3769
        expDiff -= 62;
3770
    }
3771
    expDiff += 64;
3772
    if ( 0 < expDiff ) {
3773
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3774
        q = ( 2 < q ) ? q - 2 : 0;
3775
        q >>= 64 - expDiff;
3776
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3777
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3778
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3779
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3780
            ++q;
3781
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3782
        }
3783
    }
3784
    else {
3785
        term1 = 0;
3786
        term0 = bSig;
3787
    }
3788
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3789
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3790
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3791
              && ( q & 1 ) )
3792
       ) {
3793
        aSig0 = alternateASig0;
3794
        aSig1 = alternateASig1;
3795
        zSign = ! zSign;
3796
    }
3797
    return
3798
        normalizeRoundAndPackFloatx80(
3799
            80, zSign, bExp + expDiff, aSig0, aSig1 );
3800
 
3801
}
3802
 
3803
/*----------------------------------------------------------------------------
3804
| Returns the square root of the extended double-precision floating-point
3805
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3806
| for Binary Floating-Point Arithmetic.
3807
*----------------------------------------------------------------------------*/
3808
 
3809
floatx80 floatx80_sqrt( floatx80 a )
3810
{
3811
    flag aSign;
3812
    int32 aExp, zExp;
3813
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3814
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3815
    floatx80 z;
3816
 
3817
    aSig0 = extractFloatx80Frac( a );
3818
    aExp = extractFloatx80Exp( a );
3819
    aSign = extractFloatx80Sign( a );
3820
    if ( aExp == 0x7FFF ) {
3821
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3822
        if ( ! aSign ) return a;
3823
        goto invalid;
3824
    }
3825
    if ( aSign ) {
3826
        if ( ( aExp | aSig0 ) == 0 ) return a;
3827
 invalid:
3828
        float_raise( float_flag_invalid );
3829
        z.low = floatx80_default_nan_low;
3830
        z.high = floatx80_default_nan_high;
3831
        return z;
3832
    }
3833
    if ( aExp == 0 ) {
3834
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3835
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3836
    }
3837
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3838
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3839
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3840
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3841
    doubleZSig0 = zSig0<<1;
3842
    mul64To128( zSig0, zSig0, &term0, &term1 );
3843
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3844
    while ( (sbits64) rem0 < 0 ) {
3845
        --zSig0;
3846
        doubleZSig0 -= 2;
3847
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3848
    }
3849
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3850
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3851
        if ( zSig1 == 0 ) zSig1 = 1;
3852
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3853
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3854
        mul64To128( zSig1, zSig1, &term2, &term3 );
3855
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3856
        while ( (sbits64) rem1 < 0 ) {
3857
            --zSig1;
3858
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3859
            term3 |= 1;
3860
            term2 |= doubleZSig0;
3861
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3862
        }
3863
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3864
    }
3865
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3866
    zSig0 |= doubleZSig0;
3867
    return
3868
        roundAndPackFloatx80(
3869
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3870
 
3871
}
3872
 
3873
/*----------------------------------------------------------------------------
3874
| Returns 1 if the extended double-precision floating-point value `a' is
3875
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3876
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3877
| Arithmetic.
3878
*----------------------------------------------------------------------------*/
3879
 
3880
flag floatx80_eq( floatx80 a, floatx80 b )
3881
{
3882
 
3883
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3884
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3885
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3886
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3887
       ) {
3888
        if (    floatx80_is_signaling_nan( a )
3889
             || floatx80_is_signaling_nan( b ) ) {
3890
            float_raise( float_flag_invalid );
3891
        }
3892
        return 0;
3893
    }
3894
    return
3895
           ( a.low == b.low )
3896
        && (    ( a.high == b.high )
3897
             || (    ( a.low == 0 )
3898
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3899
           );
3900
 
3901
}
3902
 
3903
/*----------------------------------------------------------------------------
3904
| Returns 1 if the extended double-precision floating-point value `a' is
3905
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3906
| comparison is performed according to the IEC/IEEE Standard for Binary
3907
| Floating-Point Arithmetic.
3908
*----------------------------------------------------------------------------*/
3909
 
3910
flag floatx80_le( floatx80 a, floatx80 b )
3911
{
3912
    flag aSign, bSign;
3913
 
3914
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3915
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3916
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3917
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3918
       ) {
3919
        float_raise( float_flag_invalid );
3920
        return 0;
3921
    }
3922
    aSign = extractFloatx80Sign( a );
3923
    bSign = extractFloatx80Sign( b );
3924
    if ( aSign != bSign ) {
3925
        return
3926
               aSign
3927
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3928
                 == 0 );
3929
    }
3930
    return
3931
          aSign ? le128( b.high, b.low, a.high, a.low )
3932
        : le128( a.high, a.low, b.high, b.low );
3933
 
3934
}
3935
 
3936
/*----------------------------------------------------------------------------
3937
| Returns 1 if the extended double-precision floating-point value `a' is
3938
| less than the corresponding value `b', and 0 otherwise.  The comparison
3939
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3940
| Arithmetic.
3941
*----------------------------------------------------------------------------*/
3942
 
3943
flag floatx80_lt( floatx80 a, floatx80 b )
3944
{
3945
    flag aSign, bSign;
3946
 
3947
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3948
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3949
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3950
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3951
       ) {
3952
        float_raise( float_flag_invalid );
3953
        return 0;
3954
    }
3955
    aSign = extractFloatx80Sign( a );
3956
    bSign = extractFloatx80Sign( b );
3957
    if ( aSign != bSign ) {
3958
        return
3959
               aSign
3960
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3961
                 != 0 );
3962
    }
3963
    return
3964
          aSign ? lt128( b.high, b.low, a.high, a.low )
3965
        : lt128( a.high, a.low, b.high, b.low );
3966
 
3967
}
3968
 
3969
/*----------------------------------------------------------------------------
3970
| Returns 1 if the extended double-precision floating-point value `a' is equal
3971
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
3972
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3973
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974
*----------------------------------------------------------------------------*/
3975
 
3976
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3977
{
3978
 
3979
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3980
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3981
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3982
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3983
       ) {
3984
        float_raise( float_flag_invalid );
3985
        return 0;
3986
    }
3987
    return
3988
           ( a.low == b.low )
3989
        && (    ( a.high == b.high )
3990
             || (    ( a.low == 0 )
3991
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3992
           );
3993
 
3994
}
3995
 
3996
/*----------------------------------------------------------------------------
3997
| Returns 1 if the extended double-precision floating-point value `a' is less
3998
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3999
| do not cause an exception.  Otherwise, the comparison is performed according
4000
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4001
*----------------------------------------------------------------------------*/
4002
 
4003
flag floatx80_le_quiet( floatx80 a, floatx80 b )
4004
{
4005
    flag aSign, bSign;
4006
 
4007
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4008
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4009
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4010
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4011
       ) {
4012
        if (    floatx80_is_signaling_nan( a )
4013
             || floatx80_is_signaling_nan( b ) ) {
4014
            float_raise( float_flag_invalid );
4015
        }
4016
        return 0;
4017
    }
4018
    aSign = extractFloatx80Sign( a );
4019
    bSign = extractFloatx80Sign( b );
4020
    if ( aSign != bSign ) {
4021
        return
4022
               aSign
4023
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4024
                 == 0 );
4025
    }
4026
    return
4027
          aSign ? le128( b.high, b.low, a.high, a.low )
4028
        : le128( a.high, a.low, b.high, b.low );
4029
 
4030
}
4031
 
4032
/*----------------------------------------------------------------------------
4033
| Returns 1 if the extended double-precision floating-point value `a' is less
4034
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4035
| an exception.  Otherwise, the comparison is performed according to the
4036
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4037
*----------------------------------------------------------------------------*/
4038
 
4039
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4040
{
4041
    flag aSign, bSign;
4042
 
4043
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4044
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4045
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4046
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4047
       ) {
4048
        if (    floatx80_is_signaling_nan( a )
4049
             || floatx80_is_signaling_nan( b ) ) {
4050
            float_raise( float_flag_invalid );
4051
        }
4052
        return 0;
4053
    }
4054
    aSign = extractFloatx80Sign( a );
4055
    bSign = extractFloatx80Sign( b );
4056
    if ( aSign != bSign ) {
4057
        return
4058
               aSign
4059
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4060
                 != 0 );
4061
    }
4062
    return
4063
          aSign ? lt128( b.high, b.low, a.high, a.low )
4064
        : lt128( a.high, a.low, b.high, b.low );
4065
 
4066
}
4067
 
4068
#endif
4069
 
4070
#ifdef FLOAT128
4071
 
4072
/*----------------------------------------------------------------------------
4073
| Returns the result of converting the quadruple-precision floating-point
4074
| value `a' to the 32-bit two's complement integer format.  The conversion
4075
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4076
| Arithmetic---which means in particular that the conversion is rounded
4077
| according to the current rounding mode.  If `a' is a NaN, the largest
4078
| positive integer is returned.  Otherwise, if the conversion overflows, the
4079
| largest integer with the same sign as `a' is returned.
4080
*----------------------------------------------------------------------------*/
4081
 
4082
int32 float128_to_int32( float128 a )
4083
{
4084
    flag aSign;
4085
    int32 aExp, shiftCount;
4086
    bits64 aSig0, aSig1;
4087
 
4088
    aSig1 = extractFloat128Frac1( a );
4089
    aSig0 = extractFloat128Frac0( a );
4090
    aExp = extractFloat128Exp( a );
4091
    aSign = extractFloat128Sign( a );
4092
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4093
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4094
    aSig0 |= ( aSig1 != 0 );
4095
    shiftCount = 0x4028 - aExp;
4096
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4097
    return roundAndPackInt32( aSign, aSig0 );
4098
 
4099
}
4100
 
4101
/*----------------------------------------------------------------------------
4102
| Returns the result of converting the quadruple-precision floating-point
4103
| value `a' to the 32-bit two's complement integer format.  The conversion
4104
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4105
| Arithmetic, except that the conversion is always rounded toward zero.  If
4106
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4107
| conversion overflows, the largest integer with the same sign as `a' is
4108
| returned.
4109
*----------------------------------------------------------------------------*/
4110
 
4111
int32 float128_to_int32_round_to_zero( float128 a )
4112
{
4113
    flag aSign;
4114
    int32 aExp, shiftCount;
4115
    bits64 aSig0, aSig1, savedASig;
4116
    int32 z;
4117
 
4118
    aSig1 = extractFloat128Frac1( a );
4119
    aSig0 = extractFloat128Frac0( a );
4120
    aExp = extractFloat128Exp( a );
4121
    aSign = extractFloat128Sign( a );
4122
    aSig0 |= ( aSig1 != 0 );
4123
    if ( 0x401E < aExp ) {
4124
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4125
        goto invalid;
4126
    }
4127
    else if ( aExp < 0x3FFF ) {
4128
        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4129
        return 0;
4130
    }
4131
    aSig0 |= LIT64( 0x0001000000000000 );
4132
    shiftCount = 0x402F - aExp;
4133
    savedASig = aSig0;
4134
    aSig0 >>= shiftCount;
4135
    z = aSig0;
4136
    if ( aSign ) z = - z;
4137
    if ( ( z < 0 ) ^ aSign ) {
4138
 invalid:
4139
        float_raise( float_flag_invalid );
4140
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4141
    }
4142
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4143
        float_exception_flags |= float_flag_inexact;
4144
    }
4145
    return z;
4146
 
4147
}
4148
 
4149
/*----------------------------------------------------------------------------
4150
| Returns the result of converting the quadruple-precision floating-point
4151
| value `a' to the 64-bit two's complement integer format.  The conversion
4152
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4153
| Arithmetic---which means in particular that the conversion is rounded
4154
| according to the current rounding mode.  If `a' is a NaN, the largest
4155
| positive integer is returned.  Otherwise, if the conversion overflows, the
4156
| largest integer with the same sign as `a' is returned.
4157
*----------------------------------------------------------------------------*/
4158
 
4159
int64 float128_to_int64( float128 a )
4160
{
4161
    flag aSign;
4162
    int32 aExp, shiftCount;
4163
    bits64 aSig0, aSig1;
4164
 
4165
    aSig1 = extractFloat128Frac1( a );
4166
    aSig0 = extractFloat128Frac0( a );
4167
    aExp = extractFloat128Exp( a );
4168
    aSign = extractFloat128Sign( a );
4169
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4170
    shiftCount = 0x402F - aExp;
4171
    if ( shiftCount <= 0 ) {
4172
        if ( 0x403E < aExp ) {
4173
            float_raise( float_flag_invalid );
4174
            if (    ! aSign
4175
                 || (    ( aExp == 0x7FFF )
4176
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4177
                    )
4178
               ) {
4179
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4180
            }
4181
            return (sbits64) LIT64( 0x8000000000000000 );
4182
        }
4183
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4184
    }
4185
    else {
4186
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4187
    }
4188
    return roundAndPackInt64( aSign, aSig0, aSig1 );
4189
 
4190
}
4191
 
4192
/*----------------------------------------------------------------------------
4193
| Returns the result of converting the quadruple-precision floating-point
4194
| value `a' to the 64-bit two's complement integer format.  The conversion
4195
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4196
| Arithmetic, except that the conversion is always rounded toward zero.
4197
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4198
| the conversion overflows, the largest integer with the same sign as `a' is
4199
| returned.
4200
*----------------------------------------------------------------------------*/
4201
 
4202
int64 float128_to_int64_round_to_zero( float128 a )
4203
{
4204
    flag aSign;
4205
    int32 aExp, shiftCount;
4206
    bits64 aSig0, aSig1;
4207
    int64 z;
4208
 
4209
    aSig1 = extractFloat128Frac1( a );
4210
    aSig0 = extractFloat128Frac0( a );
4211
    aExp = extractFloat128Exp( a );
4212
    aSign = extractFloat128Sign( a );
4213
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4214
    shiftCount = aExp - 0x402F;
4215
    if ( 0 < shiftCount ) {
4216
        if ( 0x403E <= aExp ) {
4217
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4218
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4219
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4220
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4221
            }
4222
            else {
4223
                float_raise( float_flag_invalid );
4224
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4225
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4226
                }
4227
            }
4228
            return (sbits64) LIT64( 0x8000000000000000 );
4229
        }
4230
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4231
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4232
            float_exception_flags |= float_flag_inexact;
4233
        }
4234
    }
4235
    else {
4236
        if ( aExp < 0x3FFF ) {
4237
            if ( aExp | aSig0 | aSig1 ) {
4238
                float_exception_flags |= float_flag_inexact;
4239
            }
4240
            return 0;
4241
        }
4242
        z = aSig0>>( - shiftCount );
4243
        if (    aSig1
4244
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4245
            float_exception_flags |= float_flag_inexact;
4246
        }
4247
    }
4248
    if ( aSign ) z = - z;
4249
    return z;
4250
 
4251
}
4252
 
4253
/*----------------------------------------------------------------------------
4254
| Returns the result of converting the quadruple-precision floating-point
4255
| value `a' to the single-precision floating-point format.  The conversion
4256
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4257
| Arithmetic.
4258
*----------------------------------------------------------------------------*/
4259
 
4260
float32 float128_to_float32( float128 a )
4261
{
4262
    flag aSign;
4263
    int32 aExp;
4264
    bits64 aSig0, aSig1;
4265
    bits32 zSig;
4266
 
4267
    aSig1 = extractFloat128Frac1( a );
4268
    aSig0 = extractFloat128Frac0( a );
4269
    aExp = extractFloat128Exp( a );
4270
    aSign = extractFloat128Sign( a );
4271
    if ( aExp == 0x7FFF ) {
4272
        if ( aSig0 | aSig1 ) {
4273
            return commonNaNToFloat32( float128ToCommonNaN( a ) );
4274
        }
4275
        return packFloat32( aSign, 0xFF, 0 );
4276
    }
4277
    aSig0 |= ( aSig1 != 0 );
4278
    shift64RightJamming( aSig0, 18, &aSig0 );
4279
    zSig = aSig0;
4280
    if ( aExp || zSig ) {
4281
        zSig |= 0x40000000;
4282
        aExp -= 0x3F81;
4283
    }
4284
    return roundAndPackFloat32( aSign, aExp, zSig );
4285
 
4286
}
4287
 
4288
/*----------------------------------------------------------------------------
4289
| Returns the result of converting the quadruple-precision floating-point
4290
| value `a' to the double-precision floating-point format.  The conversion
4291
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4292
| Arithmetic.
4293
*----------------------------------------------------------------------------*/
4294
 
4295
float64 float128_to_float64( float128 a )
4296
{
4297
    flag aSign;
4298
    int32 aExp;
4299
    bits64 aSig0, aSig1;
4300
 
4301
    aSig1 = extractFloat128Frac1( a );
4302
    aSig0 = extractFloat128Frac0( a );
4303
    aExp = extractFloat128Exp( a );
4304
    aSign = extractFloat128Sign( a );
4305
    if ( aExp == 0x7FFF ) {
4306
        if ( aSig0 | aSig1 ) {
4307
            return commonNaNToFloat64( float128ToCommonNaN( a ) );
4308
        }
4309
        return packFloat64( aSign, 0x7FF, 0 );
4310
    }
4311
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4312
    aSig0 |= ( aSig1 != 0 );
4313
    if ( aExp || aSig0 ) {
4314
        aSig0 |= LIT64( 0x4000000000000000 );
4315
        aExp -= 0x3C01;
4316
    }
4317
    return roundAndPackFloat64( aSign, aExp, aSig0 );
4318
 
4319
}
4320
 
4321
#ifdef FLOATX80
4322
 
4323
/*----------------------------------------------------------------------------
4324
| Returns the result of converting the quadruple-precision floating-point
4325
| value `a' to the extended double-precision floating-point format.  The
4326
| conversion is performed according to the IEC/IEEE Standard for Binary
4327
| Floating-Point Arithmetic.
4328
*----------------------------------------------------------------------------*/
4329
 
4330
floatx80 float128_to_floatx80( float128 a )
4331
{
4332
    flag aSign;
4333
    int32 aExp;
4334
    bits64 aSig0, aSig1;
4335
 
4336
    aSig1 = extractFloat128Frac1( a );
4337
    aSig0 = extractFloat128Frac0( a );
4338
    aExp = extractFloat128Exp( a );
4339
    aSign = extractFloat128Sign( a );
4340
    if ( aExp == 0x7FFF ) {
4341
        if ( aSig0 | aSig1 ) {
4342
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4343
        }
4344
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4345
    }
4346
    if ( aExp == 0 ) {
4347
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4348
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4349
    }
4350
    else {
4351
        aSig0 |= LIT64( 0x0001000000000000 );
4352
    }
4353
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4354
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4355
 
4356
}
4357
 
4358
#endif
4359
 
4360
/*----------------------------------------------------------------------------
4361
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4362
| returns the result as a quadruple-precision floating-point value.  The
4363
| operation is performed according to the IEC/IEEE Standard for Binary
4364
| Floating-Point Arithmetic.
4365
*----------------------------------------------------------------------------*/
4366
 
4367
float128 float128_round_to_int( float128 a )
4368
{
4369
    flag aSign;
4370
    int32 aExp;
4371
    bits64 lastBitMask, roundBitsMask;
4372
    int8 roundingMode;
4373
    float128 z;
4374
 
4375
    aExp = extractFloat128Exp( a );
4376
    if ( 0x402F <= aExp ) {
4377
        if ( 0x406F <= aExp ) {
4378
            if (    ( aExp == 0x7FFF )
4379
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4380
               ) {
4381
                return propagateFloat128NaN( a, a );
4382
            }
4383
            return a;
4384
        }
4385
        lastBitMask = 1;
4386
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4387
        roundBitsMask = lastBitMask - 1;
4388
        z = a;
4389
        roundingMode = float_rounding_mode;
4390
        if ( roundingMode == float_round_nearest_even ) {
4391
            if ( lastBitMask ) {
4392
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4393
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4394
            }
4395
            else {
4396
                if ( (sbits64) z.low < 0 ) {
4397
                    ++z.high;
4398
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4399
                }
4400
            }
4401
        }
4402
        else if ( roundingMode != float_round_to_zero ) {
4403
            if (   extractFloat128Sign( z )
4404
                 ^ ( roundingMode == float_round_up ) ) {
4405
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4406
            }
4407
        }
4408
        z.low &= ~ roundBitsMask;
4409
    }
4410
    else {
4411
        if ( aExp < 0x3FFF ) {
4412
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4413
            float_exception_flags |= float_flag_inexact;
4414
            aSign = extractFloat128Sign( a );
4415
            switch ( float_rounding_mode ) {
4416
             case float_round_nearest_even:
4417
                if (    ( aExp == 0x3FFE )
4418
                     && (   extractFloat128Frac0( a )
4419
                          | extractFloat128Frac1( a ) )
4420
                   ) {
4421
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4422
                }
4423
                break;
4424
             case float_round_down:
4425
                return
4426
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4427
                    : packFloat128( 0, 0, 0, 0 );
4428
             case float_round_up:
4429
                return
4430
                      aSign ? packFloat128( 1, 0, 0, 0 )
4431
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4432
            }
4433
            return packFloat128( aSign, 0, 0, 0 );
4434
        }
4435
        lastBitMask = 1;
4436
        lastBitMask <<= 0x402F - aExp;
4437
        roundBitsMask = lastBitMask - 1;
4438
        z.low = 0;
4439
        z.high = a.high;
4440
        roundingMode = float_rounding_mode;
4441
        if ( roundingMode == float_round_nearest_even ) {
4442
            z.high += lastBitMask>>1;
4443
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4444
                z.high &= ~ lastBitMask;
4445
            }
4446
        }
4447
        else if ( roundingMode != float_round_to_zero ) {
4448
            if (   extractFloat128Sign( z )
4449
                 ^ ( roundingMode == float_round_up ) ) {
4450
                z.high |= ( a.low != 0 );
4451
                z.high += roundBitsMask;
4452
            }
4453
        }
4454
        z.high &= ~ roundBitsMask;
4455
    }
4456
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4457
        float_exception_flags |= float_flag_inexact;
4458
    }
4459
    return z;
4460
 
4461
}
4462
 
4463
/*----------------------------------------------------------------------------
4464
| Returns the result of adding the absolute values of the quadruple-precision
4465
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4466
| before being returned.  `zSign' is ignored if the result is a NaN.
4467
| The addition is performed according to the IEC/IEEE Standard for Binary
4468
| Floating-Point Arithmetic.
4469
*----------------------------------------------------------------------------*/
4470
 
4471
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4472
{
4473
    int32 aExp, bExp, zExp;
4474
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4475
    int32 expDiff;
4476
 
4477
    aSig1 = extractFloat128Frac1( a );
4478
    aSig0 = extractFloat128Frac0( a );
4479
    aExp = extractFloat128Exp( a );
4480
    bSig1 = extractFloat128Frac1( b );
4481
    bSig0 = extractFloat128Frac0( b );
4482
    bExp = extractFloat128Exp( b );
4483
    expDiff = aExp - bExp;
4484
    if ( 0 < expDiff ) {
4485
        if ( aExp == 0x7FFF ) {
4486
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4487
            return a;
4488
        }
4489
        if ( bExp == 0 ) {
4490
            --expDiff;
4491
        }
4492
        else {
4493
            bSig0 |= LIT64( 0x0001000000000000 );
4494
        }
4495
        shift128ExtraRightJamming(
4496
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4497
        zExp = aExp;
4498
    }
4499
    else if ( expDiff < 0 ) {
4500
        if ( bExp == 0x7FFF ) {
4501
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4502
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4503
        }
4504
        if ( aExp == 0 ) {
4505
            ++expDiff;
4506
        }
4507
        else {
4508
            aSig0 |= LIT64( 0x0001000000000000 );
4509
        }
4510
        shift128ExtraRightJamming(
4511
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4512
        zExp = bExp;
4513
    }
4514
    else {
4515
        if ( aExp == 0x7FFF ) {
4516
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4517
                return propagateFloat128NaN( a, b );
4518
            }
4519
            return a;
4520
        }
4521
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4522
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4523
        zSig2 = 0;
4524
        zSig0 |= LIT64( 0x0002000000000000 );
4525
        zExp = aExp;
4526
        goto shiftRight1;
4527
    }
4528
    aSig0 |= LIT64( 0x0001000000000000 );
4529
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4530
    --zExp;
4531
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4532
    ++zExp;
4533
 shiftRight1:
4534
    shift128ExtraRightJamming(
4535
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4536
 roundAndPack:
4537
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4538
 
4539
}
4540
 
4541
/*----------------------------------------------------------------------------
4542
| Returns the result of subtracting the absolute values of the quadruple-
4543
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4544
| difference is negated before being returned.  `zSign' is ignored if the
4545
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4546
| Standard for Binary Floating-Point Arithmetic.
4547
*----------------------------------------------------------------------------*/
4548
 
4549
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4550
{
4551
    int32 aExp, bExp, zExp;
4552
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4553
    int32 expDiff;
4554
    float128 z;
4555
 
4556
    aSig1 = extractFloat128Frac1( a );
4557
    aSig0 = extractFloat128Frac0( a );
4558
    aExp = extractFloat128Exp( a );
4559
    bSig1 = extractFloat128Frac1( b );
4560
    bSig0 = extractFloat128Frac0( b );
4561
    bExp = extractFloat128Exp( b );
4562
    expDiff = aExp - bExp;
4563
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4564
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4565
    if ( 0 < expDiff ) goto aExpBigger;
4566
    if ( expDiff < 0 ) goto bExpBigger;
4567
    if ( aExp == 0x7FFF ) {
4568
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4569
            return propagateFloat128NaN( a, b );
4570
        }
4571
        float_raise( float_flag_invalid );
4572
        z.low = float128_default_nan_low;
4573
        z.high = float128_default_nan_high;
4574
        return z;
4575
    }
4576
    if ( aExp == 0 ) {
4577
        aExp = 1;
4578
        bExp = 1;
4579
    }
4580
    if ( bSig0 < aSig0 ) goto aBigger;
4581
    if ( aSig0 < bSig0 ) goto bBigger;
4582
    if ( bSig1 < aSig1 ) goto aBigger;
4583
    if ( aSig1 < bSig1 ) goto bBigger;
4584
    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4585
 bExpBigger:
4586
    if ( bExp == 0x7FFF ) {
4587
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4588
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4589
    }
4590
    if ( aExp == 0 ) {
4591
        ++expDiff;
4592
    }
4593
    else {
4594
        aSig0 |= LIT64( 0x4000000000000000 );
4595
    }
4596
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4597
    bSig0 |= LIT64( 0x4000000000000000 );
4598
 bBigger:
4599
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4600
    zExp = bExp;
4601
    zSign ^= 1;
4602
    goto normalizeRoundAndPack;
4603
 aExpBigger:
4604
    if ( aExp == 0x7FFF ) {
4605
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4606
        return a;
4607
    }
4608
    if ( bExp == 0 ) {
4609
        --expDiff;
4610
    }
4611
    else {
4612
        bSig0 |= LIT64( 0x4000000000000000 );
4613
    }
4614
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4615
    aSig0 |= LIT64( 0x4000000000000000 );
4616
 aBigger:
4617
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4618
    zExp = aExp;
4619
 normalizeRoundAndPack:
4620
    --zExp;
4621
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4622
 
4623
}
4624
 
4625
/*----------------------------------------------------------------------------
4626
| Returns the result of adding the quadruple-precision floating-point values
4627
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4628
| for Binary Floating-Point Arithmetic.
4629
*----------------------------------------------------------------------------*/
4630
 
4631
float128 float128_add( float128 a, float128 b )
4632
{
4633
    flag aSign, bSign;
4634
 
4635
    aSign = extractFloat128Sign( a );
4636
    bSign = extractFloat128Sign( b );
4637
    if ( aSign == bSign ) {
4638
        return addFloat128Sigs( a, b, aSign );
4639
    }
4640
    else {
4641
        return subFloat128Sigs( a, b, aSign );
4642
    }
4643
 
4644
}
4645
 
4646
/*----------------------------------------------------------------------------
4647
| Returns the result of subtracting the quadruple-precision floating-point
4648
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4649
| Standard for Binary Floating-Point Arithmetic.
4650
*----------------------------------------------------------------------------*/
4651
 
4652
float128 float128_sub( float128 a, float128 b )
4653
{
4654
    flag aSign, bSign;
4655
 
4656
    aSign = extractFloat128Sign( a );
4657
    bSign = extractFloat128Sign( b );
4658
    if ( aSign == bSign ) {
4659
        return subFloat128Sigs( a, b, aSign );
4660
    }
4661
    else {
4662
        return addFloat128Sigs( a, b, aSign );
4663
    }
4664
 
4665
}
4666
 
4667
/*----------------------------------------------------------------------------
4668
| Returns the result of multiplying the quadruple-precision floating-point
4669
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4670
| Standard for Binary Floating-Point Arithmetic.
4671
*----------------------------------------------------------------------------*/
4672
 
4673
float128 float128_mul( float128 a, float128 b )
4674
{
4675
    flag aSign, bSign, zSign;
4676
    int32 aExp, bExp, zExp;
4677
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4678
    float128 z;
4679
 
4680
    aSig1 = extractFloat128Frac1( a );
4681
    aSig0 = extractFloat128Frac0( a );
4682
    aExp = extractFloat128Exp( a );
4683
    aSign = extractFloat128Sign( a );
4684
    bSig1 = extractFloat128Frac1( b );
4685
    bSig0 = extractFloat128Frac0( b );
4686
    bExp = extractFloat128Exp( b );
4687
    bSign = extractFloat128Sign( b );
4688
    zSign = aSign ^ bSign;
4689
    if ( aExp == 0x7FFF ) {
4690
        if (    ( aSig0 | aSig1 )
4691
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4692
            return propagateFloat128NaN( a, b );
4693
        }
4694
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4695
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4696
    }
4697
    if ( bExp == 0x7FFF ) {
4698
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4700
 invalid:
4701
            float_raise( float_flag_invalid );
4702
            z.low = float128_default_nan_low;
4703
            z.high = float128_default_nan_high;
4704
            return z;
4705
        }
4706
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4707
    }
4708
    if ( aExp == 0 ) {
4709
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4710
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4711
    }
4712
    if ( bExp == 0 ) {
4713
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4714
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4715
    }
4716
    zExp = aExp + bExp - 0x4000;
4717
    aSig0 |= LIT64( 0x0001000000000000 );
4718
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4719
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4720
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4721
    zSig2 |= ( zSig3 != 0 );
4722
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4723
        shift128ExtraRightJamming(
4724
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4725
        ++zExp;
4726
    }
4727
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4728
 
4729
}
4730
 
4731
/*----------------------------------------------------------------------------
4732
| Returns the result of dividing the quadruple-precision floating-point value
4733
| `a' by the corresponding value `b'.  The operation is performed according to
4734
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4735
*----------------------------------------------------------------------------*/
4736
 
4737
float128 float128_div( float128 a, float128 b )
4738
{
4739
    flag aSign, bSign, zSign;
4740
    int32 aExp, bExp, zExp;
4741
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4742
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4743
    float128 z;
4744
 
4745
    aSig1 = extractFloat128Frac1( a );
4746
    aSig0 = extractFloat128Frac0( a );
4747
    aExp = extractFloat128Exp( a );
4748
    aSign = extractFloat128Sign( a );
4749
    bSig1 = extractFloat128Frac1( b );
4750
    bSig0 = extractFloat128Frac0( b );
4751
    bExp = extractFloat128Exp( b );
4752
    bSign = extractFloat128Sign( b );
4753
    zSign = aSign ^ bSign;
4754
    if ( aExp == 0x7FFF ) {
4755
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4756
        if ( bExp == 0x7FFF ) {
4757
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4758
            goto invalid;
4759
        }
4760
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4761
    }
4762
    if ( bExp == 0x7FFF ) {
4763
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4764
        return packFloat128( zSign, 0, 0, 0 );
4765
    }
4766
    if ( bExp == 0 ) {
4767
        if ( ( bSig0 | bSig1 ) == 0 ) {
4768
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4769
 invalid:
4770
                float_raise( float_flag_invalid );
4771
                z.low = float128_default_nan_low;
4772
                z.high = float128_default_nan_high;
4773
                return z;
4774
            }
4775
            float_raise( float_flag_divbyzero );
4776
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4777
        }
4778
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4779
    }
4780
    if ( aExp == 0 ) {
4781
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4782
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4783
    }
4784
    zExp = aExp - bExp + 0x3FFD;
4785
    shortShift128Left(
4786
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4787
    shortShift128Left(
4788
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4789
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4790
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4791
        ++zExp;
4792
    }
4793
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4794
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4795
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4796
    while ( (sbits64) rem0 < 0 ) {
4797
        --zSig0;
4798
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4799
    }
4800
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4801
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4802
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4803
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4804
        while ( (sbits64) rem1 < 0 ) {
4805
            --zSig1;
4806
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4807
        }
4808
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4809
    }
4810
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4811
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4812
 
4813
}
4814
 
4815
/*----------------------------------------------------------------------------
4816
| Returns the remainder of the quadruple-precision floating-point value `a'
4817
| with respect to the corresponding value `b'.  The operation is performed
4818
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4819
*----------------------------------------------------------------------------*/
4820
 
4821
float128 float128_rem( float128 a, float128 b )
4822
{
4823
    flag aSign, bSign, zSign;
4824
    int32 aExp, bExp, expDiff;
4825
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4826
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4827
    sbits64 sigMean0;
4828
    float128 z;
4829
 
4830
    aSig1 = extractFloat128Frac1( a );
4831
    aSig0 = extractFloat128Frac0( a );
4832
    aExp = extractFloat128Exp( a );
4833
    aSign = extractFloat128Sign( a );
4834
    bSig1 = extractFloat128Frac1( b );
4835
    bSig0 = extractFloat128Frac0( b );
4836
    bExp = extractFloat128Exp( b );
4837
    bSign = extractFloat128Sign( b );
4838
    if ( aExp == 0x7FFF ) {
4839
        if (    ( aSig0 | aSig1 )
4840
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4841
            return propagateFloat128NaN( a, b );
4842
        }
4843
        goto invalid;
4844
    }
4845
    if ( bExp == 0x7FFF ) {
4846
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4847
        return a;
4848
    }
4849
    if ( bExp == 0 ) {
4850
        if ( ( bSig0 | bSig1 ) == 0 ) {
4851
 invalid:
4852
            float_raise( float_flag_invalid );
4853
            z.low = float128_default_nan_low;
4854
            z.high = float128_default_nan_high;
4855
            return z;
4856
        }
4857
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4858
    }
4859
    if ( aExp == 0 ) {
4860
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4861
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4862
    }
4863
    expDiff = aExp - bExp;
4864
    if ( expDiff < -1 ) return a;
4865
    shortShift128Left(
4866
        aSig0 | LIT64( 0x0001000000000000 ),
4867
        aSig1,
4868
        15 - ( expDiff < 0 ),
4869
        &aSig0,
4870
        &aSig1
4871
    );
4872
    shortShift128Left(
4873
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4874
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4875
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4876
    expDiff -= 64;
4877
    while ( 0 < expDiff ) {
4878
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4879
        q = ( 4 < q ) ? q - 4 : 0;
4880
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4881
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4882
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4883
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4884
        expDiff -= 61;
4885
    }
4886
    if ( -64 < expDiff ) {
4887
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4888
        q = ( 4 < q ) ? q - 4 : 0;
4889
        q >>= - expDiff;
4890
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4891
        expDiff += 52;
4892
        if ( expDiff < 0 ) {
4893
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4894
        }
4895
        else {
4896
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4897
        }
4898
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4899
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4900
    }
4901
    else {
4902
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4903
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904
    }
4905
    do {
4906
        alternateASig0 = aSig0;
4907
        alternateASig1 = aSig1;
4908
        ++q;
4909
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4910
    } while ( 0 <= (sbits64) aSig0 );
4911
    add128(
4912
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4913
    if (    ( sigMean0 < 0 )
4914
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4915
        aSig0 = alternateASig0;
4916
        aSig1 = alternateASig1;
4917
    }
4918
    zSign = ( (sbits64) aSig0 < 0 );
4919
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4920
    return
4921
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4922
 
4923
}
4924
 
4925
/*----------------------------------------------------------------------------
4926
| Returns the square root of the quadruple-precision floating-point value `a'.
4927
| The operation is performed according to the IEC/IEEE Standard for Binary
4928
| Floating-Point Arithmetic.
4929
*----------------------------------------------------------------------------*/
4930
 
4931
float128 float128_sqrt( float128 a )
4932
{
4933
    flag aSign;
4934
    int32 aExp, zExp;
4935
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4936
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4937
    float128 z;
4938
 
4939
    aSig1 = extractFloat128Frac1( a );
4940
    aSig0 = extractFloat128Frac0( a );
4941
    aExp = extractFloat128Exp( a );
4942
    aSign = extractFloat128Sign( a );
4943
    if ( aExp == 0x7FFF ) {
4944
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4945
        if ( ! aSign ) return a;
4946
        goto invalid;
4947
    }
4948
    if ( aSign ) {
4949
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4950
 invalid:
4951
        float_raise( float_flag_invalid );
4952
        z.low = float128_default_nan_low;
4953
        z.high = float128_default_nan_high;
4954
        return z;
4955
    }
4956
    if ( aExp == 0 ) {
4957
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4958
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4959
    }
4960
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4961
    aSig0 |= LIT64( 0x0001000000000000 );
4962
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4963
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4964
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4965
    doubleZSig0 = zSig0<<1;
4966
    mul64To128( zSig0, zSig0, &term0, &term1 );
4967
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4968
    while ( (sbits64) rem0 < 0 ) {
4969
        --zSig0;
4970
        doubleZSig0 -= 2;
4971
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4972
    }
4973
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4974
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4975
        if ( zSig1 == 0 ) zSig1 = 1;
4976
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4977
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4978
        mul64To128( zSig1, zSig1, &term2, &term3 );
4979
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4980
        while ( (sbits64) rem1 < 0 ) {
4981
            --zSig1;
4982
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4983
            term3 |= 1;
4984
            term2 |= doubleZSig0;
4985
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4986
        }
4987
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4988
    }
4989
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4990
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4991
 
4992
}
4993
 
4994
/*----------------------------------------------------------------------------
4995
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
4996
| the corresponding value `b', and 0 otherwise.  The comparison is performed
4997
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4998
*----------------------------------------------------------------------------*/
4999
 
5000
flag float128_eq( float128 a, float128 b )
5001
{
5002
 
5003
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5004
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5005
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5006
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5007
       ) {
5008
        if (    float128_is_signaling_nan( a )
5009
             || float128_is_signaling_nan( b ) ) {
5010
            float_raise( float_flag_invalid );
5011
        }
5012
        return 0;
5013
    }
5014
    return
5015
           ( a.low == b.low )
5016
        && (    ( a.high == b.high )
5017
             || (    ( a.low == 0 )
5018
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5019
           );
5020
 
5021
}
5022
 
5023
/*----------------------------------------------------------------------------
5024
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5025
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5026
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5027
| Arithmetic.
5028
*----------------------------------------------------------------------------*/
5029
 
5030
flag float128_le( float128 a, float128 b )
5031
{
5032
    flag aSign, bSign;
5033
 
5034
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5035
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5036
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5037
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5038
       ) {
5039
        float_raise( float_flag_invalid );
5040
        return 0;
5041
    }
5042
    aSign = extractFloat128Sign( a );
5043
    bSign = extractFloat128Sign( b );
5044
    if ( aSign != bSign ) {
5045
        return
5046
               aSign
5047
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5048
                 == 0 );
5049
    }
5050
    return
5051
          aSign ? le128( b.high, b.low, a.high, a.low )
5052
        : le128( a.high, a.low, b.high, b.low );
5053
 
5054
}
5055
 
5056
/*----------------------------------------------------------------------------
5057
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5058
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5059
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5060
*----------------------------------------------------------------------------*/
5061
 
5062
flag float128_lt( float128 a, float128 b )
5063
{
5064
    flag aSign, bSign;
5065
 
5066
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5067
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5068
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5069
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5070
       ) {
5071
        float_raise( float_flag_invalid );
5072
        return 0;
5073
    }
5074
    aSign = extractFloat128Sign( a );
5075
    bSign = extractFloat128Sign( b );
5076
    if ( aSign != bSign ) {
5077
        return
5078
               aSign
5079
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5080
                 != 0 );
5081
    }
5082
    return
5083
          aSign ? lt128( b.high, b.low, a.high, a.low )
5084
        : lt128( a.high, a.low, b.high, b.low );
5085
 
5086
}
5087
 
5088
/*----------------------------------------------------------------------------
5089
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5090
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5091
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5092
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5093
*----------------------------------------------------------------------------*/
5094
 
5095
flag float128_eq_signaling( float128 a, float128 b )
5096
{
5097
 
5098
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5099
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5100
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5101
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5102
       ) {
5103
        float_raise( float_flag_invalid );
5104
        return 0;
5105
    }
5106
    return
5107
           ( a.low == b.low )
5108
        && (    ( a.high == b.high )
5109
             || (    ( a.low == 0 )
5110
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5111
           );
5112
 
5113
}
5114
 
5115
/*----------------------------------------------------------------------------
5116
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5117
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5118
| cause an exception.  Otherwise, the comparison is performed according to the
5119
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120
*----------------------------------------------------------------------------*/
5121
 
5122
flag float128_le_quiet( float128 a, float128 b )
5123
{
5124
    flag aSign, bSign;
5125
 
5126
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5127
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5128
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5129
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5130
       ) {
5131
        if (    float128_is_signaling_nan( a )
5132
             || float128_is_signaling_nan( b ) ) {
5133
            float_raise( float_flag_invalid );
5134
        }
5135
        return 0;
5136
    }
5137
    aSign = extractFloat128Sign( a );
5138
    bSign = extractFloat128Sign( b );
5139
    if ( aSign != bSign ) {
5140
        return
5141
               aSign
5142
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5143
                 == 0 );
5144
    }
5145
    return
5146
          aSign ? le128( b.high, b.low, a.high, a.low )
5147
        : le128( a.high, a.low, b.high, b.low );
5148
 
5149
}
5150
 
5151
/*----------------------------------------------------------------------------
5152
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5153
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5154
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5155
| Standard for Binary Floating-Point Arithmetic.
5156
*----------------------------------------------------------------------------*/
5157
 
5158
flag float128_lt_quiet( float128 a, float128 b )
5159
{
5160
    flag aSign, bSign;
5161
 
5162
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5163
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5164
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5165
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5166
       ) {
5167
        if (    float128_is_signaling_nan( a )
5168
             || float128_is_signaling_nan( b ) ) {
5169
            float_raise( float_flag_invalid );
5170
        }
5171
        return 0;
5172
    }
5173
    aSign = extractFloat128Sign( a );
5174
    bSign = extractFloat128Sign( b );
5175
    if ( aSign != bSign ) {
5176
        return
5177
               aSign
5178
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5179
                 != 0 );
5180
    }
5181
    return
5182
          aSign ? lt128( b.high, b.low, a.high, a.low )
5183
        : lt128( a.high, a.low, b.high, b.low );
5184
 
5185
}
5186
 
5187
#endif
5188
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.