OpenCores
URL https://opencores.org/ocsvn/openrisc_2011-10-31/openrisc_2011-10-31/trunk

Subversion Repositories openrisc_2011-10-31

[/] [openrisc/] [trunk/] [or1ksim/] [softfloat/] [softfloat.c] - Blame information for rev 342

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 234 jeremybenn
 
2
/*============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
 
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30
 
31
=============================================================================*/
32
 
33
#include "milieu.h"
34
#include "softfloat.h"
35
#include <stdio.h>
36
/*----------------------------------------------------------------------------
37
| Floating-point rounding mode, extended double-precision rounding precision,
38
| and exception flags.
39
*----------------------------------------------------------------------------*/
40
int8 float_rounding_mode = float_round_nearest_even;
41
int8 float_exception_flags = 0;
42
#ifdef FLOATX80
43
int8 floatx80_rounding_precision = 80;
44
#endif
45
 
46
/*----------------------------------------------------------------------------
47
| Primitive arithmetic functions, including multi-word arithmetic, and
48
| division and square root approximations.  (Can be specialized to target if
49
| desired.)
50
*----------------------------------------------------------------------------*/
51
#include "softfloat-macros"
52
 
53
/*----------------------------------------------------------------------------
54
| Functions and definitions to determine:  (1) whether tininess for underflow
55
| is detected before or after rounding by default, (2) what (if anything)
56
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
57
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
58
| are propagated from function inputs to output.  These details are target-
59
| specific.
60
*----------------------------------------------------------------------------*/
61
#include "softfloat-specialize"
62
 
63
/*----------------------------------------------------------------------------
64
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
65
| and 7, and returns the properly rounded 32-bit integer corresponding to the
66
| input.  If `zSign' is 1, the input is negated before being converted to an
67
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
68
| is simply rounded to an integer, with the inexact exception raised if the
69
| input cannot be represented exactly as an integer.  However, if the fixed-
70
| point input is too large, the invalid exception is raised and the largest
71
| positive or negative integer is returned.
72
*----------------------------------------------------------------------------*/
73
 
74
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
75
{
76
    int8 roundingMode;
77
    flag roundNearestEven;
78
    int8 roundIncrement, roundBits;
79
    int32 z;
80
 
81
    roundingMode = float_rounding_mode;
82
    roundNearestEven = ( roundingMode == float_round_nearest_even );
83
    roundIncrement = 0x40;
84
    if ( ! roundNearestEven ) {
85
        if ( roundingMode == float_round_to_zero ) {
86
            roundIncrement = 0;
87
        }
88
        else {
89
            roundIncrement = 0x7F;
90
            if ( zSign ) {
91
                if ( roundingMode == float_round_up ) roundIncrement = 0;
92
            }
93
            else {
94
                if ( roundingMode == float_round_down ) roundIncrement = 0;
95
            }
96
        }
97
    }
98
    roundBits = absZ & 0x7F;
99
    absZ = ( absZ + roundIncrement )>>7;
100
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
101
    z = absZ;
102
    if ( zSign ) z = - z;
103
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
104
        float_raise( float_flag_invalid );
105
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
106
    }
107
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
108
    return z;
109
 
110
}
111
 
112
/*----------------------------------------------------------------------------
113
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
114
| `absZ1', with binary point between bits 63 and 64 (between the input words),
115
| and returns the properly rounded 64-bit integer corresponding to the input.
116
| If `zSign' is 1, the input is negated before being converted to an integer.
117
| Ordinarily, the fixed-point input is simply rounded to an integer, with
118
| the inexact exception raised if the input cannot be represented exactly as
119
| an integer.  However, if the fixed-point input is too large, the invalid
120
| exception is raised and the largest positive or negative integer is
121
| returned.
122
*----------------------------------------------------------------------------*/
123
 
124
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
125
{
126
    int8 roundingMode;
127
    flag roundNearestEven, increment;
128
    int64 z;
129
 
130
    roundingMode = float_rounding_mode;
131
    roundNearestEven = ( roundingMode == float_round_nearest_even );
132
    increment = ( (sbits64) absZ1 < 0 );
133
    if ( ! roundNearestEven ) {
134
        if ( roundingMode == float_round_to_zero ) {
135
            increment = 0;
136
        }
137
        else {
138
            if ( zSign ) {
139
                increment = ( roundingMode == float_round_down ) && absZ1;
140
            }
141
            else {
142
                increment = ( roundingMode == float_round_up ) && absZ1;
143
            }
144
        }
145
    }
146
    if ( increment ) {
147
        ++absZ0;
148
        if ( absZ0 == 0 ) goto overflow;
149
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
150
    }
151
    z = absZ0;
152
    if ( zSign ) z = - z;
153
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
154
 overflow:
155
        float_raise( float_flag_invalid );
156
        return
157
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
158
            : LIT64( 0x7FFFFFFFFFFFFFFF );
159
    }
160
    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
161
    return z;
162
 
163
}
164
 
165
/*----------------------------------------------------------------------------
166
| Returns the fraction bits of the single-precision floating-point value `a'.
167
*----------------------------------------------------------------------------*/
168
 
169
INLINE bits32 extractFloat32Frac( float32 a )
170
{
171
 
172
    return a & 0x007FFFFF;
173
 
174
}
175
 
176
/*----------------------------------------------------------------------------
177
| Returns the exponent bits of the single-precision floating-point value `a'.
178
*----------------------------------------------------------------------------*/
179
 
180
INLINE int16 extractFloat32Exp( float32 a )
181
{
182
 
183
    return ( a>>23 ) & 0xFF;
184
 
185
}
186
 
187
/*----------------------------------------------------------------------------
188
| Returns the sign bit of the single-precision floating-point value `a'.
189
*----------------------------------------------------------------------------*/
190
 
191
INLINE flag extractFloat32Sign( float32 a )
192
{
193
 
194
    return a>>31;
195
 
196
}
197
 
198
/*----------------------------------------------------------------------------
199
| Normalizes the subnormal single-precision floating-point value represented
200
| by the denormalized significand `aSig'.  The normalized exponent and
201
| significand are stored at the locations pointed to by `zExpPtr' and
202
| `zSigPtr', respectively.
203
*----------------------------------------------------------------------------*/
204
 
205
static void
206
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
207
{
208
    int8 shiftCount;
209
 
210
    shiftCount = countLeadingZeros32( aSig ) - 8;
211
    *zSigPtr = aSig<<shiftCount;
212
    *zExpPtr = 1 - shiftCount;
213
 
214
}
215
 
216
/*----------------------------------------------------------------------------
217
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
218
| single-precision floating-point value, returning the result.  After being
219
| shifted into the proper positions, the three fields are simply added
220
| together to form the result.  This means that any integer portion of `zSig'
221
| will be added into the exponent.  Since a properly normalized significand
222
| will have an integer portion equal to 1, the `zExp' input should be 1 less
223
| than the desired result exponent whenever `zSig' is a complete, normalized
224
| significand.
225
*----------------------------------------------------------------------------*/
226
 
227
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
228
{
229
 
230
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
231
 
232
}
233
 
234
/*----------------------------------------------------------------------------
235
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
236
| and significand `zSig', and returns the proper single-precision floating-
237
| point value corresponding to the abstract input.  Ordinarily, the abstract
238
| value is simply rounded and packed into the single-precision format, with
239
| the inexact exception raised if the abstract input cannot be represented
240
| exactly.  However, if the abstract value is too large, the overflow and
241
| inexact exceptions are raised and an infinity or maximal finite value is
242
| returned.  If the abstract value is too small, the input value is rounded to
243
| a subnormal number, and the underflow and inexact exceptions are raised if
244
| the abstract input cannot be represented exactly as a subnormal single-
245
| precision floating-point number.
246
|     The input significand `zSig' has its binary point between bits 30
247
| and 29, which is 7 bits to the left of the usual location.  This shifted
248
| significand must be normalized or smaller.  If `zSig' is not normalized,
249
| `zExp' must be 0; in that case, the result returned is a subnormal number,
250
| and it must not require rounding.  In the usual case that `zSig' is
251
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
252
| The handling of underflow and overflow follows the IEC/IEEE Standard for
253
| Binary Floating-Point Arithmetic.
254
*----------------------------------------------------------------------------*/
255
 
256
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
257
{
258
    int8 roundingMode;
259
    flag roundNearestEven;
260
    int8 roundIncrement, roundBits;
261
    flag isTiny;
262
 
263
    roundingMode = float_rounding_mode;
264
    roundNearestEven = ( roundingMode == float_round_nearest_even );
265
    roundIncrement = 0x40;
266
    if ( ! roundNearestEven ) {
267
        if ( roundingMode == float_round_to_zero ) {
268
            roundIncrement = 0;
269
        }
270
        else {
271
            roundIncrement = 0x7F;
272
            if ( zSign ) {
273
                if ( roundingMode == float_round_up ) roundIncrement = 0;
274
            }
275
            else {
276
                if ( roundingMode == float_round_down ) roundIncrement = 0;
277
            }
278
        }
279
    }
280
    roundBits = zSig & 0x7F;
281
    if ( 0xFD <= (bits16) zExp ) {
282
        if (    ( 0xFD < zExp )
283
             || (    ( zExp == 0xFD )
284
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
285
           ) {
286
            float_raise( float_flag_overflow | float_flag_inexact );
287
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
288
        }
289
        if ( zExp < 0 ) {
290
            isTiny =
291
                   ( float_detect_tininess == float_tininess_before_rounding )
292
                || ( zExp < -1 )
293
                || ( zSig + roundIncrement < 0x80000000 );
294
            shift32RightJamming( zSig, - zExp, &zSig );
295
            zExp = 0;
296
            roundBits = zSig & 0x7F;
297
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
298
        }
299
    }
300
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
301
    zSig = ( zSig + roundIncrement )>>7;
302
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
303
    if ( zSig == 0 ) zExp = 0;
304
    return packFloat32( zSign, zExp, zSig );
305
 
306
}
307
 
308
/*----------------------------------------------------------------------------
309
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
310
| and significand `zSig', and returns the proper single-precision floating-
311
| point value corresponding to the abstract input.  This routine is just like
312
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
313
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
314
| floating-point exponent.
315
*----------------------------------------------------------------------------*/
316
 
317
static float32
318
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
319
{
320
    int8 shiftCount;
321
 
322
    shiftCount = countLeadingZeros32( zSig ) - 1;
323
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
324
 
325
}
326
 
327
/*----------------------------------------------------------------------------
328
| Returns the fraction bits of the double-precision floating-point value `a'.
329
*----------------------------------------------------------------------------*/
330
 
331
INLINE bits64 extractFloat64Frac( float64 a )
332
{
333
 
334
    return a & LIT64( 0x000FFFFFFFFFFFFF );
335
 
336
}
337
 
338
/*----------------------------------------------------------------------------
339
| Returns the exponent bits of the double-precision floating-point value `a'.
340
*----------------------------------------------------------------------------*/
341
 
342
INLINE int16 extractFloat64Exp( float64 a )
343
{
344
 
345
    return ( a>>52 ) & 0x7FF;
346
 
347
}
348
 
349
/*----------------------------------------------------------------------------
350
| Returns the sign bit of the double-precision floating-point value `a'.
351
*----------------------------------------------------------------------------*/
352
 
353
INLINE flag extractFloat64Sign( float64 a )
354
{
355
 
356
    return a>>63;
357
 
358
}
359
 
360
/*----------------------------------------------------------------------------
361
| Normalizes the subnormal double-precision floating-point value represented
362
| by the denormalized significand `aSig'.  The normalized exponent and
363
| significand are stored at the locations pointed to by `zExpPtr' and
364
| `zSigPtr', respectively.
365
*----------------------------------------------------------------------------*/
366
 
367
static void
368
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
369
{
370
    int8 shiftCount;
371
 
372
    shiftCount = countLeadingZeros64( aSig ) - 11;
373
    *zSigPtr = aSig<<shiftCount;
374
    *zExpPtr = 1 - shiftCount;
375
 
376
}
377
 
378
/*----------------------------------------------------------------------------
379
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
380
| double-precision floating-point value, returning the result.  After being
381
| shifted into the proper positions, the three fields are simply added
382
| together to form the result.  This means that any integer portion of `zSig'
383
| will be added into the exponent.  Since a properly normalized significand
384
| will have an integer portion equal to 1, the `zExp' input should be 1 less
385
| than the desired result exponent whenever `zSig' is a complete, normalized
386
| significand.
387
*----------------------------------------------------------------------------*/
388
 
389
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
390
{
391
 
392
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
393
 
394
}
395
 
396
/*----------------------------------------------------------------------------
397
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
398
| and significand `zSig', and returns the proper double-precision floating-
399
| point value corresponding to the abstract input.  Ordinarily, the abstract
400
| value is simply rounded and packed into the double-precision format, with
401
| the inexact exception raised if the abstract input cannot be represented
402
| exactly.  However, if the abstract value is too large, the overflow and
403
| inexact exceptions are raised and an infinity or maximal finite value is
404
| returned.  If the abstract value is too small, the input value is rounded
405
| to a subnormal number, and the underflow and inexact exceptions are raised
406
| if the abstract input cannot be represented exactly as a subnormal double-
407
| precision floating-point number.
408
|     The input significand `zSig' has its binary point between bits 62
409
| and 61, which is 10 bits to the left of the usual location.  This shifted
410
| significand must be normalized or smaller.  If `zSig' is not normalized,
411
| `zExp' must be 0; in that case, the result returned is a subnormal number,
412
| and it must not require rounding.  In the usual case that `zSig' is
413
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
414
| The handling of underflow and overflow follows the IEC/IEEE Standard for
415
| Binary Floating-Point Arithmetic.
416
*----------------------------------------------------------------------------*/
417
 
418
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
419
{
420
    int8 roundingMode;
421
    flag roundNearestEven;
422
    int16 roundIncrement, roundBits;
423
    flag isTiny;
424
 
425
    roundingMode = float_rounding_mode;
426
    roundNearestEven = ( roundingMode == float_round_nearest_even );
427
    roundIncrement = 0x200;
428
    if ( ! roundNearestEven ) {
429
        if ( roundingMode == float_round_to_zero ) {
430
            roundIncrement = 0;
431
        }
432
        else {
433
            roundIncrement = 0x3FF;
434
            if ( zSign ) {
435
                if ( roundingMode == float_round_up ) roundIncrement = 0;
436
            }
437
            else {
438
                if ( roundingMode == float_round_down ) roundIncrement = 0;
439
            }
440
        }
441
    }
442
    roundBits = zSig & 0x3FF;
443
    if ( 0x7FD <= (bits16) zExp ) {
444
        if (    ( 0x7FD < zExp )
445
             || (    ( zExp == 0x7FD )
446
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
447
           ) {
448
            float_raise( float_flag_overflow | float_flag_inexact );
449
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
450
        }
451
        if ( zExp < 0 ) {
452
            isTiny =
453
                   ( float_detect_tininess == float_tininess_before_rounding )
454
                || ( zExp < -1 )
455
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
456
            shift64RightJamming( zSig, - zExp, &zSig );
457
            zExp = 0;
458
            roundBits = zSig & 0x3FF;
459
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
460
        }
461
    }
462
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
463
    zSig = ( zSig + roundIncrement )>>10;
464
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
465
    if ( zSig == 0 ) zExp = 0;
466
    return packFloat64( zSign, zExp, zSig );
467
 
468
}
469
 
470
/*----------------------------------------------------------------------------
471
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472
| and significand `zSig', and returns the proper double-precision floating-
473
| point value corresponding to the abstract input.  This routine is just like
474
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
475
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
476
| floating-point exponent.
477
*----------------------------------------------------------------------------*/
478
 
479
static float64
480
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
481
{
482
    int8 shiftCount;
483
 
484
    shiftCount = countLeadingZeros64( zSig ) - 1;
485
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
486
 
487
}
488
 
489
#ifdef FLOATX80
490
 
491
/*----------------------------------------------------------------------------
492
| Returns the fraction bits of the extended double-precision floating-point
493
| value `a'.
494
*----------------------------------------------------------------------------*/
495
 
496
INLINE bits64 extractFloatx80Frac( floatx80 a )
497
{
498
 
499
    return a.low;
500
 
501
}
502
 
503
/*----------------------------------------------------------------------------
504
| Returns the exponent bits of the extended double-precision floating-point
505
| value `a'.
506
*----------------------------------------------------------------------------*/
507
 
508
INLINE int32 extractFloatx80Exp( floatx80 a )
509
{
510
 
511
    return a.high & 0x7FFF;
512
 
513
}
514
 
515
/*----------------------------------------------------------------------------
516
| Returns the sign bit of the extended double-precision floating-point value
517
| `a'.
518
*----------------------------------------------------------------------------*/
519
 
520
INLINE flag extractFloatx80Sign( floatx80 a )
521
{
522
 
523
    return a.high>>15;
524
 
525
}
526
 
527
/*----------------------------------------------------------------------------
528
| Normalizes the subnormal extended double-precision floating-point value
529
| represented by the denormalized significand `aSig'.  The normalized exponent
530
| and significand are stored at the locations pointed to by `zExpPtr' and
531
| `zSigPtr', respectively.
532
*----------------------------------------------------------------------------*/
533
 
534
static void
535
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
536
{
537
    int8 shiftCount;
538
 
539
    shiftCount = countLeadingZeros64( aSig );
540
    *zSigPtr = aSig<<shiftCount;
541
    *zExpPtr = 1 - shiftCount;
542
 
543
}
544
 
545
/*----------------------------------------------------------------------------
546
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
547
| extended double-precision floating-point value, returning the result.
548
*----------------------------------------------------------------------------*/
549
 
550
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
551
{
552
    floatx80 z;
553
 
554
    z.low = zSig;
555
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
556
    return z;
557
 
558
}
559
 
560
/*----------------------------------------------------------------------------
561
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
562
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
563
| and returns the proper extended double-precision floating-point value
564
| corresponding to the abstract input.  Ordinarily, the abstract value is
565
| rounded and packed into the extended double-precision format, with the
566
| inexact exception raised if the abstract input cannot be represented
567
| exactly.  However, if the abstract value is too large, the overflow and
568
| inexact exceptions are raised and an infinity or maximal finite value is
569
| returned.  If the abstract value is too small, the input value is rounded to
570
| a subnormal number, and the underflow and inexact exceptions are raised if
571
| the abstract input cannot be represented exactly as a subnormal extended
572
| double-precision floating-point number.
573
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
574
| number of bits as single or double precision, respectively.  Otherwise, the
575
| result is rounded to the full precision of the extended double-precision
576
| format.
577
|     The input significand must be normalized or smaller.  If the input
578
| significand is not normalized, `zExp' must be 0; in that case, the result
579
| returned is a subnormal number, and it must not require rounding.  The
580
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
581
| Floating-Point Arithmetic.
582
*----------------------------------------------------------------------------*/
583
 
584
static floatx80
585
 roundAndPackFloatx80(
586
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
587
 )
588
{
589
    int8 roundingMode;
590
    flag roundNearestEven, increment, isTiny;
591
    int64 roundIncrement, roundMask, roundBits;
592
 
593
    roundingMode = float_rounding_mode;
594
    roundNearestEven = ( roundingMode == float_round_nearest_even );
595
    if ( roundingPrecision == 80 ) goto precision80;
596
    if ( roundingPrecision == 64 ) {
597
        roundIncrement = LIT64( 0x0000000000000400 );
598
        roundMask = LIT64( 0x00000000000007FF );
599
    }
600
    else if ( roundingPrecision == 32 ) {
601
        roundIncrement = LIT64( 0x0000008000000000 );
602
        roundMask = LIT64( 0x000000FFFFFFFFFF );
603
    }
604
    else {
605
        goto precision80;
606
    }
607
    zSig0 |= ( zSig1 != 0 );
608
    if ( ! roundNearestEven ) {
609
        if ( roundingMode == float_round_to_zero ) {
610
            roundIncrement = 0;
611
        }
612
        else {
613
            roundIncrement = roundMask;
614
            if ( zSign ) {
615
                if ( roundingMode == float_round_up ) roundIncrement = 0;
616
            }
617
            else {
618
                if ( roundingMode == float_round_down ) roundIncrement = 0;
619
            }
620
        }
621
    }
622
    roundBits = zSig0 & roundMask;
623
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
624
        if (    ( 0x7FFE < zExp )
625
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
626
           ) {
627
            goto overflow;
628
        }
629
        if ( zExp <= 0 ) {
630
            isTiny =
631
                   ( float_detect_tininess == float_tininess_before_rounding )
632
                || ( zExp < 0 )
633
                || ( zSig0 <= zSig0 + roundIncrement );
634
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
635
            zExp = 0;
636
            roundBits = zSig0 & roundMask;
637
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
638
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
639
            zSig0 += roundIncrement;
640
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
641
            roundIncrement = roundMask + 1;
642
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
643
                roundMask |= roundIncrement;
644
            }
645
            zSig0 &= ~ roundMask;
646
            return packFloatx80( zSign, zExp, zSig0 );
647
        }
648
    }
649
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
650
    zSig0 += roundIncrement;
651
    if ( zSig0 < roundIncrement ) {
652
        ++zExp;
653
        zSig0 = LIT64( 0x8000000000000000 );
654
    }
655
    roundIncrement = roundMask + 1;
656
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
657
        roundMask |= roundIncrement;
658
    }
659
    zSig0 &= ~ roundMask;
660
    if ( zSig0 == 0 ) zExp = 0;
661
    return packFloatx80( zSign, zExp, zSig0 );
662
 precision80:
663
    increment = ( (sbits64) zSig1 < 0 );
664
    if ( ! roundNearestEven ) {
665
        if ( roundingMode == float_round_to_zero ) {
666
            increment = 0;
667
        }
668
        else {
669
            if ( zSign ) {
670
                increment = ( roundingMode == float_round_down ) && zSig1;
671
            }
672
            else {
673
                increment = ( roundingMode == float_round_up ) && zSig1;
674
            }
675
        }
676
    }
677
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
678
        if (    ( 0x7FFE < zExp )
679
             || (    ( zExp == 0x7FFE )
680
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
681
                  && increment
682
                )
683
           ) {
684
            roundMask = 0;
685
 overflow:
686
            float_raise( float_flag_overflow | float_flag_inexact );
687
            if (    ( roundingMode == float_round_to_zero )
688
                 || ( zSign && ( roundingMode == float_round_up ) )
689
                 || ( ! zSign && ( roundingMode == float_round_down ) )
690
               ) {
691
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
692
            }
693
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
694
        }
695
        if ( zExp <= 0 ) {
696
            isTiny =
697
                   ( float_detect_tininess == float_tininess_before_rounding )
698
                || ( zExp < 0 )
699
                || ! increment
700
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
701
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
702
            zExp = 0;
703
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
704
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
705
            if ( roundNearestEven ) {
706
                increment = ( (sbits64) zSig1 < 0 );
707
            }
708
            else {
709
                if ( zSign ) {
710
                    increment = ( roundingMode == float_round_down ) && zSig1;
711
                }
712
                else {
713
                    increment = ( roundingMode == float_round_up ) && zSig1;
714
                }
715
            }
716
            if ( increment ) {
717
                ++zSig0;
718
                zSig0 &=
719
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
720
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
721
            }
722
            return packFloatx80( zSign, zExp, zSig0 );
723
        }
724
    }
725
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
726
    if ( increment ) {
727
        ++zSig0;
728
        if ( zSig0 == 0 ) {
729
            ++zExp;
730
            zSig0 = LIT64( 0x8000000000000000 );
731
        }
732
        else {
733
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
734
        }
735
    }
736
    else {
737
        if ( zSig0 == 0 ) zExp = 0;
738
    }
739
    return packFloatx80( zSign, zExp, zSig0 );
740
 
741
}
742
 
743
/*----------------------------------------------------------------------------
744
| Takes an abstract floating-point value having sign `zSign', exponent
745
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
746
| and returns the proper extended double-precision floating-point value
747
| corresponding to the abstract input.  This routine is just like
748
| `roundAndPackFloatx80' except that the input significand does not have to be
749
| normalized.
750
*----------------------------------------------------------------------------*/
751
 
752
static floatx80
753
 normalizeRoundAndPackFloatx80(
754
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
755
 )
756
{
757
    int8 shiftCount;
758
 
759
    if ( zSig0 == 0 ) {
760
        zSig0 = zSig1;
761
        zSig1 = 0;
762
        zExp -= 64;
763
    }
764
    shiftCount = countLeadingZeros64( zSig0 );
765
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
766
    zExp -= shiftCount;
767
    return
768
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
769
 
770
}
771
 
772
#endif
773
 
774
#ifdef FLOAT128
775
 
776
/*----------------------------------------------------------------------------
777
| Returns the least-significant 64 fraction bits of the quadruple-precision
778
| floating-point value `a'.
779
*----------------------------------------------------------------------------*/
780
 
781
INLINE bits64 extractFloat128Frac1( float128 a )
782
{
783
 
784
    return a.low;
785
 
786
}
787
 
788
/*----------------------------------------------------------------------------
789
| Returns the most-significant 48 fraction bits of the quadruple-precision
790
| floating-point value `a'.
791
*----------------------------------------------------------------------------*/
792
 
793
INLINE bits64 extractFloat128Frac0( float128 a )
794
{
795
 
796
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
797
 
798
}
799
 
800
/*----------------------------------------------------------------------------
801
| Returns the exponent bits of the quadruple-precision floating-point value
802
| `a'.
803
*----------------------------------------------------------------------------*/
804
 
805
INLINE int32 extractFloat128Exp( float128 a )
806
{
807
 
808
    return ( a.high>>48 ) & 0x7FFF;
809
 
810
}
811
 
812
/*----------------------------------------------------------------------------
813
| Returns the sign bit of the quadruple-precision floating-point value `a'.
814
*----------------------------------------------------------------------------*/
815
 
816
INLINE flag extractFloat128Sign( float128 a )
817
{
818
 
819
    return a.high>>63;
820
 
821
}
822
 
823
/*----------------------------------------------------------------------------
824
| Normalizes the subnormal quadruple-precision floating-point value
825
| represented by the denormalized significand formed by the concatenation of
826
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
827
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
828
| significand are stored at the location pointed to by `zSig0Ptr', and the
829
| least significant 64 bits of the normalized significand are stored at the
830
| location pointed to by `zSig1Ptr'.
831
*----------------------------------------------------------------------------*/
832
 
833
static void
834
 normalizeFloat128Subnormal(
835
     bits64 aSig0,
836
     bits64 aSig1,
837
     int32 *zExpPtr,
838
     bits64 *zSig0Ptr,
839
     bits64 *zSig1Ptr
840
 )
841
{
842
    int8 shiftCount;
843
 
844
    if ( aSig0 == 0 ) {
845
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
846
        if ( shiftCount < 0 ) {
847
            *zSig0Ptr = aSig1>>( - shiftCount );
848
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
849
        }
850
        else {
851
            *zSig0Ptr = aSig1<<shiftCount;
852
            *zSig1Ptr = 0;
853
        }
854
        *zExpPtr = - shiftCount - 63;
855
    }
856
    else {
857
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
858
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
859
        *zExpPtr = 1 - shiftCount;
860
    }
861
 
862
}
863
 
864
/*----------------------------------------------------------------------------
865
| Packs the sign `zSign', the exponent `zExp', and the significand formed
866
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
867
| floating-point value, returning the result.  After being shifted into the
868
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
869
| added together to form the most significant 32 bits of the result.  This
870
| means that any integer portion of `zSig0' will be added into the exponent.
871
| Since a properly normalized significand will have an integer portion equal
872
| to 1, the `zExp' input should be 1 less than the desired result exponent
873
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
874
| significand.
875
*----------------------------------------------------------------------------*/
876
 
877
INLINE float128
878
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
879
{
880
    float128 z;
881
 
882
    z.low = zSig1;
883
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
884
    return z;
885
 
886
}
887
 
888
/*----------------------------------------------------------------------------
889
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
890
| and extended significand formed by the concatenation of `zSig0', `zSig1',
891
| and `zSig2', and returns the proper quadruple-precision floating-point value
892
| corresponding to the abstract input.  Ordinarily, the abstract value is
893
| simply rounded and packed into the quadruple-precision format, with the
894
| inexact exception raised if the abstract input cannot be represented
895
| exactly.  However, if the abstract value is too large, the overflow and
896
| inexact exceptions are raised and an infinity or maximal finite value is
897
| returned.  If the abstract value is too small, the input value is rounded to
898
| a subnormal number, and the underflow and inexact exceptions are raised if
899
| the abstract input cannot be represented exactly as a subnormal quadruple-
900
| precision floating-point number.
901
|     The input significand must be normalized or smaller.  If the input
902
| significand is not normalized, `zExp' must be 0; in that case, the result
903
| returned is a subnormal number, and it must not require rounding.  In the
904
| usual case that the input significand is normalized, `zExp' must be 1 less
905
| than the ``true'' floating-point exponent.  The handling of underflow and
906
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
907
*----------------------------------------------------------------------------*/
908
 
909
static float128
910
 roundAndPackFloat128(
911
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
912
{
913
    int8 roundingMode;
914
    flag roundNearestEven, increment, isTiny;
915
 
916
    roundingMode = float_rounding_mode;
917
    roundNearestEven = ( roundingMode == float_round_nearest_even );
918
    increment = ( (sbits64) zSig2 < 0 );
919
    if ( ! roundNearestEven ) {
920
        if ( roundingMode == float_round_to_zero ) {
921
            increment = 0;
922
        }
923
        else {
924
            if ( zSign ) {
925
                increment = ( roundingMode == float_round_down ) && zSig2;
926
            }
927
            else {
928
                increment = ( roundingMode == float_round_up ) && zSig2;
929
            }
930
        }
931
    }
932
    if ( 0x7FFD <= (bits32) zExp ) {
933
        if (    ( 0x7FFD < zExp )
934
             || (    ( zExp == 0x7FFD )
935
                  && eq128(
936
                         LIT64( 0x0001FFFFFFFFFFFF ),
937
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
938
                         zSig0,
939
                         zSig1
940
                     )
941
                  && increment
942
                )
943
           ) {
944
            float_raise( float_flag_overflow | float_flag_inexact );
945
            if (    ( roundingMode == float_round_to_zero )
946
                 || ( zSign && ( roundingMode == float_round_up ) )
947
                 || ( ! zSign && ( roundingMode == float_round_down ) )
948
               ) {
949
                return
950
                    packFloat128(
951
                        zSign,
952
                        0x7FFE,
953
                        LIT64( 0x0000FFFFFFFFFFFF ),
954
                        LIT64( 0xFFFFFFFFFFFFFFFF )
955
                    );
956
            }
957
            return packFloat128( zSign, 0x7FFF, 0, 0 );
958
        }
959
        if ( zExp < 0 ) {
960
            isTiny =
961
                   ( float_detect_tininess == float_tininess_before_rounding )
962
                || ( zExp < -1 )
963
                || ! increment
964
                || lt128(
965
                       zSig0,
966
                       zSig1,
967
                       LIT64( 0x0001FFFFFFFFFFFF ),
968
                       LIT64( 0xFFFFFFFFFFFFFFFF )
969
                   );
970
            shift128ExtraRightJamming(
971
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
972
            zExp = 0;
973
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
974
            if ( roundNearestEven ) {
975
                increment = ( (sbits64) zSig2 < 0 );
976
            }
977
            else {
978
                if ( zSign ) {
979
                    increment = ( roundingMode == float_round_down ) && zSig2;
980
                }
981
                else {
982
                    increment = ( roundingMode == float_round_up ) && zSig2;
983
                }
984
            }
985
        }
986
    }
987
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
988
    if ( increment ) {
989
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
990
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
991
    }
992
    else {
993
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
994
    }
995
    return packFloat128( zSign, zExp, zSig0, zSig1 );
996
 
997
}
998
 
999
/*----------------------------------------------------------------------------
1000
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1001
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1002
| returns the proper quadruple-precision floating-point value corresponding
1003
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1004
| except that the input significand has fewer bits and does not have to be
1005
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1006
| point exponent.
1007
*----------------------------------------------------------------------------*/
1008
 
1009
static float128
1010
 normalizeRoundAndPackFloat128(
1011
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1012
{
1013
    int8 shiftCount;
1014
    bits64 zSig2;
1015
 
1016
    if ( zSig0 == 0 ) {
1017
        zSig0 = zSig1;
1018
        zSig1 = 0;
1019
        zExp -= 64;
1020
    }
1021
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1022
    if ( 0 <= shiftCount ) {
1023
        zSig2 = 0;
1024
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1025
    }
1026
    else {
1027
        shift128ExtraRightJamming(
1028
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1029
    }
1030
    zExp -= shiftCount;
1031
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1032
 
1033
}
1034
 
1035
#endif
1036
 
1037
/*----------------------------------------------------------------------------
1038
| Returns the result of converting the 32-bit two's complement integer `a'
1039
| to the single-precision floating-point format.  The conversion is performed
1040
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1041
*----------------------------------------------------------------------------*/
1042
 
1043
float32 int32_to_float32( int32 a )
1044
{
1045
    flag zSign;
1046
    if ( a == 0 ) return 0;
1047
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1048
    zSign = ( a < 0 );
1049
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1050
 
1051
}
1052
 
1053
/*----------------------------------------------------------------------------
1054
| Returns the result of converting the 32-bit two's complement integer `a'
1055
| to the double-precision floating-point format.  The conversion is performed
1056
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1057
*----------------------------------------------------------------------------*/
1058
 
1059
float64 int32_to_float64( int32 a )
1060
{
1061
    flag zSign;
1062
    uint32 absA;
1063
    int8 shiftCount;
1064
    bits64 zSig;
1065
 
1066
    if ( a == 0 ) return 0;
1067
    zSign = ( a < 0 );
1068
    absA = zSign ? - a : a;
1069
    shiftCount = countLeadingZeros32( absA ) + 21;
1070
    zSig = absA;
1071
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1072
 
1073
}
1074
 
1075
#ifdef FLOATX80
1076
 
1077
/*----------------------------------------------------------------------------
1078
| Returns the result of converting the 32-bit two's complement integer `a'
1079
| to the extended double-precision floating-point format.  The conversion
1080
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1081
| Arithmetic.
1082
*----------------------------------------------------------------------------*/
1083
 
1084
floatx80 int32_to_floatx80( int32 a )
1085
{
1086
    flag zSign;
1087
    uint32 absA;
1088
    int8 shiftCount;
1089
    bits64 zSig;
1090
 
1091
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1092
    zSign = ( a < 0 );
1093
    absA = zSign ? - a : a;
1094
    shiftCount = countLeadingZeros32( absA ) + 32;
1095
    zSig = absA;
1096
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1097
 
1098
}
1099
 
1100
#endif
1101
 
1102
#ifdef FLOAT128
1103
 
1104
/*----------------------------------------------------------------------------
1105
| Returns the result of converting the 32-bit two's complement integer `a' to
1106
| the quadruple-precision floating-point format.  The conversion is performed
1107
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108
*----------------------------------------------------------------------------*/
1109
 
1110
float128 int32_to_float128( int32 a )
1111
{
1112
    flag zSign;
1113
    uint32 absA;
1114
    int8 shiftCount;
1115
    bits64 zSig0;
1116
 
1117
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1118
    zSign = ( a < 0 );
1119
    absA = zSign ? - a : a;
1120
    shiftCount = countLeadingZeros32( absA ) + 17;
1121
    zSig0 = absA;
1122
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1123
 
1124
}
1125
 
1126
#endif
1127
 
1128
/*----------------------------------------------------------------------------
1129
| Returns the result of converting the 64-bit two's complement integer `a'
1130
| to the single-precision floating-point format.  The conversion is performed
1131
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132
*----------------------------------------------------------------------------*/
1133
 
1134
float32 int64_to_float32( int64 a )
1135
{
1136
    flag zSign;
1137
    uint64 absA;
1138
    int8 shiftCount;
1139
    //bits32 zSig; // Unused variable - JB
1140
 
1141
    if ( a == 0 ) return 0;
1142
    zSign = ( a < 0 );
1143
    absA = zSign ? - a : a;
1144
    shiftCount = countLeadingZeros64( absA ) - 40;
1145
    if ( 0 <= shiftCount ) {
1146
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1147
    }
1148
    else {
1149
        shiftCount += 7;
1150
        if ( shiftCount < 0 ) {
1151
            shift64RightJamming( absA, - shiftCount, &absA );
1152
        }
1153
        else {
1154
            absA <<= shiftCount;
1155
        }
1156
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1157
    }
1158
 
1159
}
1160
 
1161
/*----------------------------------------------------------------------------
1162
| Returns the result of converting the 64-bit two's complement integer `a'
1163
| to the double-precision floating-point format.  The conversion is performed
1164
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1165
*----------------------------------------------------------------------------*/
1166
 
1167
float64 int64_to_float64( int64 a )
1168
{
1169
    flag zSign;
1170
 
1171
    if ( a == 0 ) return 0;
1172
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1173
        return packFloat64( 1, 0x43E, 0 );
1174
    }
1175
    zSign = ( a < 0 );
1176
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1177
 
1178
}
1179
 
1180
#ifdef FLOATX80
1181
 
1182
/*----------------------------------------------------------------------------
1183
| Returns the result of converting the 64-bit two's complement integer `a'
1184
| to the extended double-precision floating-point format.  The conversion
1185
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1186
| Arithmetic.
1187
*----------------------------------------------------------------------------*/
1188
 
1189
floatx80 int64_to_floatx80( int64 a )
1190
{
1191
    flag zSign;
1192
    uint64 absA;
1193
    int8 shiftCount;
1194
 
1195
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1196
    zSign = ( a < 0 );
1197
    absA = zSign ? - a : a;
1198
    shiftCount = countLeadingZeros64( absA );
1199
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1200
 
1201
}
1202
 
1203
#endif
1204
 
1205
#ifdef FLOAT128
1206
 
1207
/*----------------------------------------------------------------------------
1208
| Returns the result of converting the 64-bit two's complement integer `a' to
1209
| the quadruple-precision floating-point format.  The conversion is performed
1210
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1211
*----------------------------------------------------------------------------*/
1212
 
1213
float128 int64_to_float128( int64 a )
1214
{
1215
    flag zSign;
1216
    uint64 absA;
1217
    int8 shiftCount;
1218
    int32 zExp;
1219
    bits64 zSig0, zSig1;
1220
 
1221
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1222
    zSign = ( a < 0 );
1223
    absA = zSign ? - a : a;
1224
    shiftCount = countLeadingZeros64( absA ) + 49;
1225
    zExp = 0x406E - shiftCount;
1226
    if ( 64 <= shiftCount ) {
1227
        zSig1 = 0;
1228
        zSig0 = absA;
1229
        shiftCount -= 64;
1230
    }
1231
    else {
1232
        zSig1 = absA;
1233
        zSig0 = 0;
1234
    }
1235
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1236
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1237
 
1238
}
1239
 
1240
#endif
1241
 
1242
/*----------------------------------------------------------------------------
1243
| Returns the result of converting the single-precision floating-point value
1244
| `a' to the 32-bit two's complement integer format.  The conversion is
1245
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1246
| Arithmetic---which means in particular that the conversion is rounded
1247
| according to the current rounding mode.  If `a' is a NaN, the largest
1248
| positive integer is returned.  Otherwise, if the conversion overflows, the
1249
| largest integer with the same sign as `a' is returned.
1250
*----------------------------------------------------------------------------*/
1251
 
1252
int32 float32_to_int32( float32 a )
1253
{
1254
    flag aSign;
1255
    int16 aExp, shiftCount;
1256
    bits32 aSig;
1257
    bits64 aSig64;
1258
 
1259
    aSig = extractFloat32Frac( a );
1260
    aExp = extractFloat32Exp( a );
1261
    aSign = extractFloat32Sign( a );
1262
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1263
    if ( aExp ) aSig |= 0x00800000;
1264
    shiftCount = 0xAF - aExp;
1265
    aSig64 = aSig;
1266
    aSig64 <<= 32;
1267
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1268
    return roundAndPackInt32( aSign, aSig64 );
1269
 
1270
}
1271
 
1272
/*----------------------------------------------------------------------------
1273
| Returns the result of converting the single-precision floating-point value
1274
| `a' to the 32-bit two's complement integer format.  The conversion is
1275
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1276
| Arithmetic, except that the conversion is always rounded toward zero.
1277
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1278
| the conversion overflows, the largest integer with the same sign as `a' is
1279
| returned.
1280
*----------------------------------------------------------------------------*/
1281
 
1282
int32 float32_to_int32_round_to_zero( float32 a )
1283
{
1284
    flag aSign;
1285
    int16 aExp, shiftCount;
1286
    bits32 aSig;
1287
    int32 z;
1288
 
1289
    aSig = extractFloat32Frac( a );
1290
    aExp = extractFloat32Exp( a );
1291
    aSign = extractFloat32Sign( a );
1292
    shiftCount = aExp - 0x9E;
1293
    if ( 0 <= shiftCount ) {
1294
        if ( a != 0xCF000000 ) {
1295
            float_raise( float_flag_invalid );
1296
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1297
        }
1298
        return (sbits32) 0x80000000;
1299
    }
1300
    else if ( aExp <= 0x7E ) {
1301
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1302
        return 0;
1303
    }
1304
    aSig = ( aSig | 0x00800000 )<<8;
1305
    z = aSig>>( - shiftCount );
1306
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1307
        float_exception_flags |= float_flag_inexact;
1308
    }
1309
    if ( aSign ) z = - z;
1310
    return z;
1311
 
1312
}
1313
 
1314
/*----------------------------------------------------------------------------
1315
| Returns the result of converting the single-precision floating-point value
1316
| `a' to the 64-bit two's complement integer format.  The conversion is
1317
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318
| Arithmetic---which means in particular that the conversion is rounded
1319
| according to the current rounding mode.  If `a' is a NaN, the largest
1320
| positive integer is returned.  Otherwise, if the conversion overflows, the
1321
| largest integer with the same sign as `a' is returned.
1322
*----------------------------------------------------------------------------*/
1323
 
1324
int64 float32_to_int64( float32 a )
1325
{
1326
    flag aSign;
1327
    int16 aExp, shiftCount;
1328
    bits32 aSig;
1329
    bits64 aSig64, aSigExtra;
1330
 
1331
    aSig = extractFloat32Frac( a );
1332
    aExp = extractFloat32Exp( a );
1333
    aSign = extractFloat32Sign( a );
1334
    shiftCount = 0xBE - aExp;
1335
    if ( shiftCount < 0 ) {
1336
        float_raise( float_flag_invalid );
1337
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1338
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1339
        }
1340
        return (sbits64) LIT64( 0x8000000000000000 );
1341
    }
1342
    if ( aExp ) aSig |= 0x00800000;
1343
    aSig64 = aSig;
1344
    aSig64 <<= 40;
1345
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1346
    return roundAndPackInt64( aSign, aSig64, aSigExtra );
1347
 
1348
}
1349
 
1350
/*----------------------------------------------------------------------------
1351
| Returns the result of converting the single-precision floating-point value
1352
| `a' to the 64-bit two's complement integer format.  The conversion is
1353
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354
| Arithmetic, except that the conversion is always rounded toward zero.  If
1355
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1356
| conversion overflows, the largest integer with the same sign as `a' is
1357
| returned.
1358
*----------------------------------------------------------------------------*/
1359
 
1360
int64 float32_to_int64_round_to_zero( float32 a )
1361
{
1362
    flag aSign;
1363
    int16 aExp, shiftCount;
1364
    bits32 aSig;
1365
    bits64 aSig64;
1366
    int64 z;
1367
 
1368
    aSig = extractFloat32Frac( a );
1369
    aExp = extractFloat32Exp( a );
1370
    aSign = extractFloat32Sign( a );
1371
    shiftCount = aExp - 0xBE;
1372
    if ( 0 <= shiftCount ) {
1373
        if ( a != 0xDF000000 ) {
1374
            float_raise( float_flag_invalid );
1375
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1376
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1377
            }
1378
        }
1379
        return (sbits64) LIT64( 0x8000000000000000 );
1380
    }
1381
    else if ( aExp <= 0x7E ) {
1382
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1383
        return 0;
1384
    }
1385
    aSig64 = aSig | 0x00800000;
1386
    aSig64 <<= 40;
1387
    z = aSig64>>( - shiftCount );
1388
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1389
        float_exception_flags |= float_flag_inexact;
1390
    }
1391
    if ( aSign ) z = - z;
1392
    return z;
1393
 
1394
}
1395
 
1396
/*----------------------------------------------------------------------------
1397
| Returns the result of converting the single-precision floating-point value
1398
| `a' to the double-precision floating-point format.  The conversion is
1399
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1400
| Arithmetic.
1401
*----------------------------------------------------------------------------*/
1402
 
1403
float64 float32_to_float64( float32 a )
1404
{
1405
    flag aSign;
1406
    int16 aExp;
1407
    bits32 aSig;
1408
 
1409
    aSig = extractFloat32Frac( a );
1410
    aExp = extractFloat32Exp( a );
1411
    aSign = extractFloat32Sign( a );
1412
    if ( aExp == 0xFF ) {
1413
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1414
        return packFloat64( aSign, 0x7FF, 0 );
1415
    }
1416
    if ( aExp == 0 ) {
1417
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1418
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1419
        --aExp;
1420
    }
1421
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1422
 
1423
}
1424
 
1425
#ifdef FLOATX80
1426
 
1427
/*----------------------------------------------------------------------------
1428
| Returns the result of converting the single-precision floating-point value
1429
| `a' to the extended double-precision floating-point format.  The conversion
1430
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1431
| Arithmetic.
1432
*----------------------------------------------------------------------------*/
1433
 
1434
floatx80 float32_to_floatx80( float32 a )
1435
{
1436
    flag aSign;
1437
    int16 aExp;
1438
    bits32 aSig;
1439
 
1440
    aSig = extractFloat32Frac( a );
1441
    aExp = extractFloat32Exp( a );
1442
    aSign = extractFloat32Sign( a );
1443
    if ( aExp == 0xFF ) {
1444
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1445
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1446
    }
1447
    if ( aExp == 0 ) {
1448
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1449
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1450
    }
1451
    aSig |= 0x00800000;
1452
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1453
 
1454
}
1455
 
1456
#endif
1457
 
1458
#ifdef FLOAT128
1459
 
1460
/*----------------------------------------------------------------------------
1461
| Returns the result of converting the single-precision floating-point value
1462
| `a' to the double-precision floating-point format.  The conversion is
1463
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1464
| Arithmetic.
1465
*----------------------------------------------------------------------------*/
1466
 
1467
float128 float32_to_float128( float32 a )
1468
{
1469
    flag aSign;
1470
    int16 aExp;
1471
    bits32 aSig;
1472
 
1473
    aSig = extractFloat32Frac( a );
1474
    aExp = extractFloat32Exp( a );
1475
    aSign = extractFloat32Sign( a );
1476
    if ( aExp == 0xFF ) {
1477
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1478
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1479
    }
1480
    if ( aExp == 0 ) {
1481
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1482
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1483
        --aExp;
1484
    }
1485
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1486
 
1487
}
1488
 
1489
#endif
1490
 
1491
/*----------------------------------------------------------------------------
1492
| Rounds the single-precision floating-point value `a' to an integer, and
1493
| returns the result as a single-precision floating-point value.  The
1494
| operation is performed according to the IEC/IEEE Standard for Binary
1495
| Floating-Point Arithmetic.
1496
*----------------------------------------------------------------------------*/
1497
 
1498
float32 float32_round_to_int( float32 a )
1499
{
1500
    flag aSign;
1501
    int16 aExp;
1502
    bits32 lastBitMask, roundBitsMask;
1503
    int8 roundingMode;
1504
    float32 z;
1505
 
1506
    aExp = extractFloat32Exp( a );
1507
    if ( 0x96 <= aExp ) {
1508
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1509
            return propagateFloat32NaN( a, a );
1510
        }
1511
        return a;
1512
    }
1513
    if ( aExp <= 0x7E ) {
1514
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1515
        float_exception_flags |= float_flag_inexact;
1516
        aSign = extractFloat32Sign( a );
1517
        switch ( float_rounding_mode ) {
1518
         case float_round_nearest_even:
1519
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1520
                return packFloat32( aSign, 0x7F, 0 );
1521
            }
1522
            break;
1523
         case float_round_down:
1524
            return aSign ? 0xBF800000 : 0;
1525
         case float_round_up:
1526
            return aSign ? 0x80000000 : 0x3F800000;
1527
        }
1528
        return packFloat32( aSign, 0, 0 );
1529
    }
1530
    lastBitMask = 1;
1531
    lastBitMask <<= 0x96 - aExp;
1532
    roundBitsMask = lastBitMask - 1;
1533
    z = a;
1534
    roundingMode = float_rounding_mode;
1535
    if ( roundingMode == float_round_nearest_even ) {
1536
        z += lastBitMask>>1;
1537
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1538
    }
1539
    else if ( roundingMode != float_round_to_zero ) {
1540
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1541
            z += roundBitsMask;
1542
        }
1543
    }
1544
    z &= ~ roundBitsMask;
1545
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1546
    return z;
1547
 
1548
}
1549
 
1550
/*----------------------------------------------------------------------------
1551
| Returns the result of adding the absolute values of the single-precision
1552
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1553
| before being returned.  `zSign' is ignored if the result is a NaN.
1554
| The addition is performed according to the IEC/IEEE Standard for Binary
1555
| Floating-Point Arithmetic.
1556
*----------------------------------------------------------------------------*/
1557
 
1558
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1559
{
1560
    int16 aExp, bExp, zExp;
1561
    bits32 aSig, bSig, zSig;
1562
    int16 expDiff;
1563
 
1564
    aSig = extractFloat32Frac( a );
1565
    aExp = extractFloat32Exp( a );
1566
    bSig = extractFloat32Frac( b );
1567
    bExp = extractFloat32Exp( b );
1568
    expDiff = aExp - bExp;
1569
    aSig <<= 6;
1570
    bSig <<= 6;
1571
    if ( 0 < expDiff ) {
1572
        if ( aExp == 0xFF ) {
1573
            if ( aSig ) return propagateFloat32NaN( a, b );
1574
            return a;
1575
        }
1576
        if ( bExp == 0 ) {
1577
            --expDiff;
1578
        }
1579
        else {
1580
            bSig |= 0x20000000;
1581
        }
1582
        shift32RightJamming( bSig, expDiff, &bSig );
1583
        zExp = aExp;
1584
    }
1585
    else if ( expDiff < 0 ) {
1586
        if ( bExp == 0xFF ) {
1587
            if ( bSig ) return propagateFloat32NaN( a, b );
1588
            return packFloat32( zSign, 0xFF, 0 );
1589
        }
1590
        if ( aExp == 0 ) {
1591
            ++expDiff;
1592
        }
1593
        else {
1594
            aSig |= 0x20000000;
1595
        }
1596
        shift32RightJamming( aSig, - expDiff, &aSig );
1597
        zExp = bExp;
1598
    }
1599
    else {
1600
        if ( aExp == 0xFF ) {
1601
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1602
            return a;
1603
        }
1604
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1605
        zSig = 0x40000000 + aSig + bSig;
1606
        zExp = aExp;
1607
        goto roundAndPack;
1608
    }
1609
    aSig |= 0x20000000;
1610
    zSig = ( aSig + bSig )<<1;
1611
    --zExp;
1612
    if ( (sbits32) zSig < 0 ) {
1613
        zSig = aSig + bSig;
1614
        ++zExp;
1615
    }
1616
 roundAndPack:
1617
    return roundAndPackFloat32( zSign, zExp, zSig );
1618
 
1619
}
1620
 
1621
/*----------------------------------------------------------------------------
1622
| Returns the result of subtracting the absolute values of the single-
1623
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1624
| difference is negated before being returned.  `zSign' is ignored if the
1625
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1626
| Standard for Binary Floating-Point Arithmetic.
1627
*----------------------------------------------------------------------------*/
1628
 
1629
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1630
{
1631
    int16 aExp, bExp, zExp;
1632
    bits32 aSig, bSig, zSig;
1633
    int16 expDiff;
1634
 
1635
    aSig = extractFloat32Frac( a );
1636
    aExp = extractFloat32Exp( a );
1637
    bSig = extractFloat32Frac( b );
1638
    bExp = extractFloat32Exp( b );
1639
    expDiff = aExp - bExp;
1640
    aSig <<= 7;
1641
    bSig <<= 7;
1642
    if ( 0 < expDiff ) goto aExpBigger;
1643
    if ( expDiff < 0 ) goto bExpBigger;
1644
    if ( aExp == 0xFF ) {
1645
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1646
        float_raise( float_flag_invalid );
1647
        return float32_default_nan;
1648
    }
1649
    if ( aExp == 0 ) {
1650
        aExp = 1;
1651
        bExp = 1;
1652
    }
1653
    if ( bSig < aSig ) goto aBigger;
1654
    if ( aSig < bSig ) goto bBigger;
1655
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1656
 bExpBigger:
1657
    if ( bExp == 0xFF ) {
1658
        if ( bSig ) return propagateFloat32NaN( a, b );
1659
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1660
    }
1661
    if ( aExp == 0 ) {
1662
        ++expDiff;
1663
    }
1664
    else {
1665
        aSig |= 0x40000000;
1666
    }
1667
    shift32RightJamming( aSig, - expDiff, &aSig );
1668
    bSig |= 0x40000000;
1669
 bBigger:
1670
    zSig = bSig - aSig;
1671
    zExp = bExp;
1672
    zSign ^= 1;
1673
    goto normalizeRoundAndPack;
1674
 aExpBigger:
1675
    if ( aExp == 0xFF ) {
1676
        if ( aSig ) return propagateFloat32NaN( a, b );
1677
        return a;
1678
    }
1679
    if ( bExp == 0 ) {
1680
        --expDiff;
1681
    }
1682
    else {
1683
        bSig |= 0x40000000;
1684
    }
1685
    shift32RightJamming( bSig, expDiff, &bSig );
1686
    aSig |= 0x40000000;
1687
 aBigger:
1688
    zSig = aSig - bSig;
1689
    zExp = aExp;
1690
 normalizeRoundAndPack:
1691
    --zExp;
1692
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1693
 
1694
}
1695
 
1696
/*----------------------------------------------------------------------------
1697
| Returns the result of adding the single-precision floating-point values `a'
1698
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1699
| Binary Floating-Point Arithmetic.
1700
*----------------------------------------------------------------------------*/
1701
 
1702
float32 float32_add( float32 a, float32 b )
1703
{
1704
    flag aSign, bSign;
1705
 
1706
    aSign = extractFloat32Sign( a );
1707
    bSign = extractFloat32Sign( b );
1708
    if ( aSign == bSign ) {
1709
        return addFloat32Sigs( a, b, aSign );
1710
    }
1711
    else {
1712
        return subFloat32Sigs( a, b, aSign );
1713
    }
1714
 
1715
}
1716
 
1717
/*----------------------------------------------------------------------------
1718
| Returns the result of subtracting the single-precision floating-point values
1719
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1720
| for Binary Floating-Point Arithmetic.
1721
*----------------------------------------------------------------------------*/
1722
 
1723
float32 float32_sub( float32 a, float32 b )
1724
{
1725
    flag aSign, bSign;
1726
 
1727
    aSign = extractFloat32Sign( a );
1728
    bSign = extractFloat32Sign( b );
1729
    if ( aSign == bSign ) {
1730
        return subFloat32Sigs( a, b, aSign );
1731
    }
1732
    else {
1733
        return addFloat32Sigs( a, b, aSign );
1734
    }
1735
 
1736
}
1737
 
1738
/*----------------------------------------------------------------------------
1739
| Returns the result of multiplying the single-precision floating-point values
1740
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1741
| for Binary Floating-Point Arithmetic.
1742
*----------------------------------------------------------------------------*/
1743
 
1744
float32 float32_mul( float32 a, float32 b )
1745
{
1746
    flag aSign, bSign, zSign;
1747
    int16 aExp, bExp, zExp;
1748
    bits32 aSig, bSig;
1749
    bits64 zSig64;
1750
    bits32 zSig;
1751
 
1752
    aSig = extractFloat32Frac( a );
1753
    aExp = extractFloat32Exp( a );
1754
    aSign = extractFloat32Sign( a );
1755
    bSig = extractFloat32Frac( b );
1756
    bExp = extractFloat32Exp( b );
1757
    bSign = extractFloat32Sign( b );
1758
    zSign = aSign ^ bSign;
1759
    if ( aExp == 0xFF ) {
1760
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1761
            return propagateFloat32NaN( a, b );
1762
        }
1763
        if ( ( bExp | bSig ) == 0 ) {
1764
            float_raise( float_flag_invalid );
1765
            return float32_default_nan;
1766
        }
1767
        return packFloat32( zSign, 0xFF, 0 );
1768
    }
1769
    if ( bExp == 0xFF ) {
1770
        if ( bSig ) return propagateFloat32NaN( a, b );
1771
        if ( ( aExp | aSig ) == 0 ) {
1772
            float_raise( float_flag_invalid );
1773
            return float32_default_nan;
1774
        }
1775
        return packFloat32( zSign, 0xFF, 0 );
1776
    }
1777
    if ( aExp == 0 ) {
1778
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1779
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1780
    }
1781
    if ( bExp == 0 ) {
1782
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1783
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1784
    }
1785
    zExp = aExp + bExp - 0x7F;
1786
    aSig = ( aSig | 0x00800000 )<<7;
1787
    bSig = ( bSig | 0x00800000 )<<8;
1788
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1789
    zSig = zSig64;
1790
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1791
        zSig <<= 1;
1792
        --zExp;
1793
    }
1794
    return roundAndPackFloat32( zSign, zExp, zSig );
1795
 
1796
}
1797
 
1798
/*----------------------------------------------------------------------------
1799
| Returns the result of dividing the single-precision floating-point value `a'
1800
| by the corresponding value `b'.  The operation is performed according to the
1801
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1802
*----------------------------------------------------------------------------*/
1803
 
1804
float32 float32_div( float32 a, float32 b )
1805
{
1806
    flag aSign, bSign, zSign;
1807
    int16 aExp, bExp, zExp;
1808
    bits32 aSig, bSig, zSig;
1809
 
1810
    aSig = extractFloat32Frac( a );
1811
    aExp = extractFloat32Exp( a );
1812
    aSign = extractFloat32Sign( a );
1813
    bSig = extractFloat32Frac( b );
1814
    bExp = extractFloat32Exp( b );
1815
    bSign = extractFloat32Sign( b );
1816
    zSign = aSign ^ bSign;
1817
    if ( aExp == 0xFF ) {
1818
        if ( aSig ) return propagateFloat32NaN( a, b );
1819
        if ( bExp == 0xFF ) {
1820
            if ( bSig ) return propagateFloat32NaN( a, b );
1821
            float_raise( float_flag_invalid );
1822
            return float32_default_nan;
1823
        }
1824
        return packFloat32( zSign, 0xFF, 0 );
1825
    }
1826
    if ( bExp == 0xFF ) {
1827
        if ( bSig ) return propagateFloat32NaN( a, b );
1828
        return packFloat32( zSign, 0, 0 );
1829
    }
1830
    if ( bExp == 0 ) {
1831
        if ( bSig == 0 ) {
1832
            if ( ( aExp | aSig ) == 0 ) {
1833
                float_raise( float_flag_invalid );
1834
                return float32_default_nan;
1835
            }
1836
            float_raise( float_flag_divbyzero );
1837
            return packFloat32( zSign, 0xFF, 0 );
1838
        }
1839
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1840
    }
1841
    if ( aExp == 0 ) {
1842
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1843
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1844
    }
1845
    zExp = aExp - bExp + 0x7D;
1846
    aSig = ( aSig | 0x00800000 )<<7;
1847
    bSig = ( bSig | 0x00800000 )<<8;
1848
    if ( bSig <= ( aSig + aSig ) ) {
1849
        aSig >>= 1;
1850
        ++zExp;
1851
    }
1852
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1853
    if ( ( zSig & 0x3F ) == 0 ) {
1854
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1855
    }
1856
    return roundAndPackFloat32( zSign, zExp, zSig );
1857
 
1858
}
1859
 
1860
/*----------------------------------------------------------------------------
1861
| Returns the remainder of the single-precision floating-point value `a'
1862
| with respect to the corresponding value `b'.  The operation is performed
1863
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1864
*----------------------------------------------------------------------------*/
1865
 
1866
float32 float32_rem( float32 a, float32 b )
1867
{
1868
    flag aSign, bSign, zSign;
1869
    int16 aExp, bExp, expDiff;
1870
    bits32 aSig, bSig;
1871
    bits32 q;
1872
    bits64 aSig64, bSig64, q64;
1873
    bits32 alternateASig;
1874
    sbits32 sigMean;
1875
 
1876
    aSig = extractFloat32Frac( a );
1877
    aExp = extractFloat32Exp( a );
1878
    aSign = extractFloat32Sign( a );
1879
    bSig = extractFloat32Frac( b );
1880
    bExp = extractFloat32Exp( b );
1881
    bSign = extractFloat32Sign( b );
1882
    if ( aExp == 0xFF ) {
1883
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1884
            return propagateFloat32NaN( a, b );
1885
        }
1886
        float_raise( float_flag_invalid );
1887
        return float32_default_nan;
1888
    }
1889
    if ( bExp == 0xFF ) {
1890
        if ( bSig ) return propagateFloat32NaN( a, b );
1891
        return a;
1892
    }
1893
    if ( bExp == 0 ) {
1894
        if ( bSig == 0 ) {
1895
            float_raise( float_flag_invalid );
1896
            return float32_default_nan;
1897
        }
1898
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1899
    }
1900
    if ( aExp == 0 ) {
1901
        if ( aSig == 0 ) return a;
1902
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1903
    }
1904
    expDiff = aExp - bExp;
1905
    aSig |= 0x00800000;
1906
    bSig |= 0x00800000;
1907
    if ( expDiff < 32 ) {
1908
        aSig <<= 8;
1909
        bSig <<= 8;
1910
        if ( expDiff < 0 ) {
1911
            if ( expDiff < -1 ) return a;
1912
            aSig >>= 1;
1913
        }
1914
        q = ( bSig <= aSig );
1915
        if ( q ) aSig -= bSig;
1916
        if ( 0 < expDiff ) {
1917
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1918
            q >>= 32 - expDiff;
1919
            bSig >>= 2;
1920
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1921
        }
1922
        else {
1923
            aSig >>= 2;
1924
            bSig >>= 2;
1925
        }
1926
    }
1927
    else {
1928
        if ( bSig <= aSig ) aSig -= bSig;
1929
        aSig64 = ( (bits64) aSig )<<40;
1930
        bSig64 = ( (bits64) bSig )<<40;
1931
        expDiff -= 64;
1932
        while ( 0 < expDiff ) {
1933
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1934
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1935
            aSig64 = - ( ( bSig * q64 )<<38 );
1936
            expDiff -= 62;
1937
        }
1938
        expDiff += 64;
1939
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941
        q = q64>>( 64 - expDiff );
1942
        bSig <<= 6;
1943
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1944
    }
1945
    do {
1946
        alternateASig = aSig;
1947
        ++q;
1948
        aSig -= bSig;
1949
    } while ( 0 <= (sbits32) aSig );
1950
    sigMean = aSig + alternateASig;
1951
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1952
        aSig = alternateASig;
1953
    }
1954
    zSign = ( (sbits32) aSig < 0 );
1955
    if ( zSign ) aSig = - aSig;
1956
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1957
 
1958
}
1959
 
1960
/*----------------------------------------------------------------------------
1961
| Returns the square root of the single-precision floating-point value `a'.
1962
| The operation is performed according to the IEC/IEEE Standard for Binary
1963
| Floating-Point Arithmetic.
1964
*----------------------------------------------------------------------------*/
1965
 
1966
float32 float32_sqrt( float32 a )
1967
{
1968
    flag aSign;
1969
    int16 aExp, zExp;
1970
    bits32 aSig, zSig;
1971
    bits64 rem, term;
1972
 
1973
    aSig = extractFloat32Frac( a );
1974
    aExp = extractFloat32Exp( a );
1975
    aSign = extractFloat32Sign( a );
1976
    if ( aExp == 0xFF ) {
1977
        if ( aSig ) return propagateFloat32NaN( a, 0 );
1978
        if ( ! aSign ) return a;
1979
        float_raise( float_flag_invalid );
1980
        return float32_default_nan;
1981
    }
1982
    if ( aSign ) {
1983
        if ( ( aExp | aSig ) == 0 ) return a;
1984
        float_raise( float_flag_invalid );
1985
        return float32_default_nan;
1986
    }
1987
    if ( aExp == 0 ) {
1988
        if ( aSig == 0 ) return 0;
1989
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1990
    }
1991
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1992
    aSig = ( aSig | 0x00800000 )<<8;
1993
    zSig = estimateSqrt32( aExp, aSig ) + 2;
1994
    if ( ( zSig & 0x7F ) <= 5 ) {
1995
        if ( zSig < 2 ) {
1996
            zSig = 0x7FFFFFFF;
1997
            goto roundAndPack;
1998
        }
1999
        aSig >>= aExp & 1;
2000
        term = ( (bits64) zSig ) * zSig;
2001
        rem = ( ( (bits64) aSig )<<32 ) - term;
2002
        while ( (sbits64) rem < 0 ) {
2003
            --zSig;
2004
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2005
        }
2006
        zSig |= ( rem != 0 );
2007
    }
2008
    shift32RightJamming( zSig, 1, &zSig );
2009
 roundAndPack:
2010
    return roundAndPackFloat32( 0, zExp, zSig );
2011
 
2012
}
2013
 
2014
/*----------------------------------------------------------------------------
2015
| Returns 1 if the single-precision floating-point value `a' is equal to
2016
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2017
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018
*----------------------------------------------------------------------------*/
2019
 
2020
flag float32_eq( float32 a, float32 b )
2021
{
2022
 
2023
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2024
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2025
       ) {
2026
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2027
            float_raise( float_flag_invalid );
2028
        }
2029
        return 0;
2030
    }
2031
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2032
 
2033
}
2034
 
2035
/*----------------------------------------------------------------------------
2036
| Returns 1 if the single-precision floating-point value `a' is less than
2037
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2038
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2039
| Arithmetic.
2040
*----------------------------------------------------------------------------*/
2041
 
2042
flag float32_le( float32 a, float32 b )
2043
{
2044
    flag aSign, bSign;
2045
 
2046
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2047
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2048
       ) {
2049
        float_raise( float_flag_invalid );
2050
        return 0;
2051
    }
2052
    aSign = extractFloat32Sign( a );
2053
    bSign = extractFloat32Sign( b );
2054
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2055
    return ( a == b ) || ( aSign ^ ( a < b ) );
2056
 
2057
}
2058
 
2059
/*----------------------------------------------------------------------------
2060
| Returns 1 if the single-precision floating-point value `a' is less than
2061
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2062
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2063
*----------------------------------------------------------------------------*/
2064
 
2065
flag float32_lt( float32 a, float32 b )
2066
{
2067
    flag aSign, bSign;
2068
 
2069
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2070
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2071
       ) {
2072
        float_raise( float_flag_invalid );
2073
        return 0;
2074
    }
2075
    aSign = extractFloat32Sign( a );
2076
    bSign = extractFloat32Sign( b );
2077
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2078
    return ( a != b ) && ( aSign ^ ( a < b ) );
2079
 
2080
}
2081
 
2082
/*----------------------------------------------------------------------------
2083
| Returns 1 if the single-precision floating-point value `a' is equal to
2084
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2085
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2086
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2087
*----------------------------------------------------------------------------*/
2088
 
2089
flag float32_eq_signaling( float32 a, float32 b )
2090
{
2091
 
2092
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2093
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2094
       ) {
2095
        float_raise( float_flag_invalid );
2096
        return 0;
2097
    }
2098
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2099
 
2100
}
2101
 
2102
/*----------------------------------------------------------------------------
2103
| Returns 1 if the single-precision floating-point value `a' is less than or
2104
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2105
| cause an exception.  Otherwise, the comparison is performed according to the
2106
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2107
*----------------------------------------------------------------------------*/
2108
 
2109
flag float32_le_quiet( float32 a, float32 b )
2110
{
2111
    flag aSign, bSign;
2112
    //int16 aExp, bExp; // Unused variables - JB
2113
 
2114
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116
       ) {
2117
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118
            float_raise( float_flag_invalid );
2119
        }
2120
        return 0;
2121
    }
2122
    aSign = extractFloat32Sign( a );
2123
    bSign = extractFloat32Sign( b );
2124
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2125
    return ( a == b ) || ( aSign ^ ( a < b ) );
2126
 
2127
}
2128
 
2129
/*----------------------------------------------------------------------------
2130
| Returns 1 if the single-precision floating-point value `a' is less than
2131
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2132
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2133
| Standard for Binary Floating-Point Arithmetic.
2134
*----------------------------------------------------------------------------*/
2135
 
2136
flag float32_lt_quiet( float32 a, float32 b )
2137
{
2138
    flag aSign, bSign;
2139
 
2140
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142
       ) {
2143
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2144
            float_raise( float_flag_invalid );
2145
        }
2146
        return 0;
2147
    }
2148
    aSign = extractFloat32Sign( a );
2149
    bSign = extractFloat32Sign( b );
2150
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2151
    return ( a != b ) && ( aSign ^ ( a < b ) );
2152
 
2153
}
2154
 
2155
/*----------------------------------------------------------------------------
2156
| Returns the result of converting the double-precision floating-point value
2157
| `a' to the 32-bit two's complement integer format.  The conversion is
2158
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2159
| Arithmetic---which means in particular that the conversion is rounded
2160
| according to the current rounding mode.  If `a' is a NaN, the largest
2161
| positive integer is returned.  Otherwise, if the conversion overflows, the
2162
| largest integer with the same sign as `a' is returned.
2163
*----------------------------------------------------------------------------*/
2164
 
2165
int32 float64_to_int32( float64 a )
2166
{
2167
    flag aSign;
2168
    int16 aExp, shiftCount;
2169
    bits64 aSig;
2170
 
2171
    aSig = extractFloat64Frac( a );
2172
    aExp = extractFloat64Exp( a );
2173
    aSign = extractFloat64Sign( a );
2174
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2175
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2176
    shiftCount = 0x42C - aExp;
2177
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2178
    return roundAndPackInt32( aSign, aSig );
2179
 
2180
}
2181
 
2182
/*----------------------------------------------------------------------------
2183
| Returns the result of converting the double-precision floating-point value
2184
| `a' to the 32-bit two's complement integer format.  The conversion is
2185
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2186
| Arithmetic, except that the conversion is always rounded toward zero.
2187
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2188
| the conversion overflows, the largest integer with the same sign as `a' is
2189
| returned.
2190
*----------------------------------------------------------------------------*/
2191
 
2192
int32 float64_to_int32_round_to_zero( float64 a )
2193
{
2194
    flag aSign;
2195
    int16 aExp, shiftCount;
2196
    bits64 aSig, savedASig;
2197
    int32 z;
2198
 
2199
    aSig = extractFloat64Frac( a );
2200
    aExp = extractFloat64Exp( a );
2201
    aSign = extractFloat64Sign( a );
2202
    if ( 0x41E < aExp ) {
2203
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2204
        goto invalid;
2205
    }
2206
    else if ( aExp < 0x3FF ) {
2207
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2208
        return 0;
2209
    }
2210
    aSig |= LIT64( 0x0010000000000000 );
2211
    shiftCount = 0x433 - aExp;
2212
    savedASig = aSig;
2213
    aSig >>= shiftCount;
2214
    z = aSig;
2215
    if ( aSign ) z = - z;
2216
    if ( ( z < 0 ) ^ aSign ) {
2217
 invalid:
2218
        float_raise( float_flag_invalid );
2219
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2220
    }
2221
    if ( ( aSig<<shiftCount ) != savedASig ) {
2222
        float_exception_flags |= float_flag_inexact;
2223
    }
2224
    return z;
2225
 
2226
}
2227
 
2228
/*----------------------------------------------------------------------------
2229
| Returns the result of converting the double-precision floating-point value
2230
| `a' to the 64-bit two's complement integer format.  The conversion is
2231
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2232
| Arithmetic---which means in particular that the conversion is rounded
2233
| according to the current rounding mode.  If `a' is a NaN, the largest
2234
| positive integer is returned.  Otherwise, if the conversion overflows, the
2235
| largest integer with the same sign as `a' is returned.
2236
*----------------------------------------------------------------------------*/
2237
 
2238
int64 float64_to_int64( float64 a )
2239
{
2240
    flag aSign;
2241
    int16 aExp, shiftCount;
2242
    bits64 aSig, aSigExtra;
2243
 
2244
    aSig = extractFloat64Frac( a );
2245
    aExp = extractFloat64Exp( a );
2246
    aSign = extractFloat64Sign( a );
2247
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2248
    shiftCount = 0x433 - aExp;
2249
    if ( shiftCount <= 0 ) {
2250
        if ( 0x43E < aExp ) {
2251
            float_raise( float_flag_invalid );
2252
            if (    ! aSign
2253
                 || (    ( aExp == 0x7FF )
2254
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2255
               ) {
2256
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2257
            }
2258
            return (sbits64) LIT64( 0x8000000000000000 );
2259
        }
2260
        aSigExtra = 0;
2261
        aSig <<= - shiftCount;
2262
    }
2263
    else {
2264
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2265
    }
2266
    return roundAndPackInt64( aSign, aSig, aSigExtra );
2267
 
2268
}
2269
 
2270
/*----------------------------------------------------------------------------
2271
| Returns the result of converting the double-precision floating-point value
2272
| `a' to the 64-bit two's complement integer format.  The conversion is
2273
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2274
| Arithmetic, except that the conversion is always rounded toward zero.
2275
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2276
| the conversion overflows, the largest integer with the same sign as `a' is
2277
| returned.
2278
*----------------------------------------------------------------------------*/
2279
 
2280
int64 float64_to_int64_round_to_zero( float64 a )
2281
{
2282
    flag aSign;
2283
    int16 aExp, shiftCount;
2284
    bits64 aSig;
2285
    int64 z;
2286
 
2287
    aSig = extractFloat64Frac( a );
2288
    aExp = extractFloat64Exp( a );
2289
    aSign = extractFloat64Sign( a );
2290
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2291
    shiftCount = aExp - 0x433;
2292
    if ( 0 <= shiftCount ) {
2293
        if ( 0x43E <= aExp ) {
2294
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2295
                float_raise( float_flag_invalid );
2296
                if (    ! aSign
2297
                     || (    ( aExp == 0x7FF )
2298
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2299
                   ) {
2300
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2301
                }
2302
            }
2303
            return (sbits64) LIT64( 0x8000000000000000 );
2304
        }
2305
        z = aSig<<shiftCount;
2306
    }
2307
    else {
2308
        if ( aExp < 0x3FE ) {
2309
            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2310
            return 0;
2311
        }
2312
        z = aSig>>( - shiftCount );
2313
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2314
            float_exception_flags |= float_flag_inexact;
2315
        }
2316
    }
2317
    if ( aSign ) z = - z;
2318
    return z;
2319
 
2320
}
2321
 
2322
/*----------------------------------------------------------------------------
2323
| Returns the result of converting the double-precision floating-point value
2324
| `a' to the single-precision floating-point format.  The conversion is
2325
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2326
| Arithmetic.
2327
*----------------------------------------------------------------------------*/
2328
 
2329
float32 float64_to_float32( float64 a )
2330
{
2331
    flag aSign;
2332
    int16 aExp;
2333
    bits64 aSig;
2334
    bits32 zSig;
2335
 
2336
    aSig = extractFloat64Frac( a );
2337
    aExp = extractFloat64Exp( a );
2338
    aSign = extractFloat64Sign( a );
2339
    if ( aExp == 0x7FF ) {
2340
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2341
        return packFloat32( aSign, 0xFF, 0 );
2342
    }
2343
    shift64RightJamming( aSig, 22, &aSig );
2344
    zSig = aSig;
2345
    if ( aExp || zSig ) {
2346
        zSig |= 0x40000000;
2347
        aExp -= 0x381;
2348
    }
2349
    return roundAndPackFloat32( aSign, aExp, zSig );
2350
 
2351
}
2352
 
2353
#ifdef FLOATX80
2354
 
2355
/*----------------------------------------------------------------------------
2356
| Returns the result of converting the double-precision floating-point value
2357
| `a' to the extended double-precision floating-point format.  The conversion
2358
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2359
| Arithmetic.
2360
*----------------------------------------------------------------------------*/
2361
 
2362
floatx80 float64_to_floatx80( float64 a )
2363
{
2364
    flag aSign;
2365
    int16 aExp;
2366
    bits64 aSig;
2367
 
2368
    aSig = extractFloat64Frac( a );
2369
    aExp = extractFloat64Exp( a );
2370
    aSign = extractFloat64Sign( a );
2371
    if ( aExp == 0x7FF ) {
2372
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2373
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2374
    }
2375
    if ( aExp == 0 ) {
2376
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2377
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2378
    }
2379
    return
2380
        packFloatx80(
2381
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2382
 
2383
}
2384
 
2385
#endif
2386
 
2387
#ifdef FLOAT128
2388
 
2389
/*----------------------------------------------------------------------------
2390
| Returns the result of converting the double-precision floating-point value
2391
| `a' to the quadruple-precision floating-point format.  The conversion is
2392
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2393
| Arithmetic.
2394
*----------------------------------------------------------------------------*/
2395
 
2396
float128 float64_to_float128( float64 a )
2397
{
2398
    flag aSign;
2399
    int16 aExp;
2400
    bits64 aSig, zSig0, zSig1;
2401
 
2402
    aSig = extractFloat64Frac( a );
2403
    aExp = extractFloat64Exp( a );
2404
    aSign = extractFloat64Sign( a );
2405
    if ( aExp == 0x7FF ) {
2406
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2407
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2408
    }
2409
    if ( aExp == 0 ) {
2410
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2411
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2412
        --aExp;
2413
    }
2414
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2415
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2416
 
2417
}
2418
 
2419
#endif
2420
 
2421
/*----------------------------------------------------------------------------
2422
| Rounds the double-precision floating-point value `a' to an integer, and
2423
| returns the result as a double-precision floating-point value.  The
2424
| operation is performed according to the IEC/IEEE Standard for Binary
2425
| Floating-Point Arithmetic.
2426
*----------------------------------------------------------------------------*/
2427
 
2428
float64 float64_round_to_int( float64 a )
2429
{
2430
    flag aSign;
2431
    int16 aExp;
2432
    bits64 lastBitMask, roundBitsMask;
2433
    int8 roundingMode;
2434
    float64 z;
2435
 
2436
    aExp = extractFloat64Exp( a );
2437
    if ( 0x433 <= aExp ) {
2438
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2439
            return propagateFloat64NaN( a, a );
2440
        }
2441
        return a;
2442
    }
2443
    if ( aExp < 0x3FF ) {
2444
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2445
        float_exception_flags |= float_flag_inexact;
2446
        aSign = extractFloat64Sign( a );
2447
        switch ( float_rounding_mode ) {
2448
         case float_round_nearest_even:
2449
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2450
                return packFloat64( aSign, 0x3FF, 0 );
2451
            }
2452
            break;
2453
         case float_round_down:
2454
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2455
         case float_round_up:
2456
            return
2457
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2458
        }
2459
        return packFloat64( aSign, 0, 0 );
2460
    }
2461
    lastBitMask = 1;
2462
    lastBitMask <<= 0x433 - aExp;
2463
    roundBitsMask = lastBitMask - 1;
2464
    z = a;
2465
    roundingMode = float_rounding_mode;
2466
    if ( roundingMode == float_round_nearest_even ) {
2467
        z += lastBitMask>>1;
2468
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2469
    }
2470
    else if ( roundingMode != float_round_to_zero ) {
2471
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2472
            z += roundBitsMask;
2473
        }
2474
    }
2475
    z &= ~ roundBitsMask;
2476
    if ( z != a ) float_exception_flags |= float_flag_inexact;
2477
    return z;
2478
 
2479
}
2480
 
2481
/*----------------------------------------------------------------------------
2482
| Returns the result of adding the absolute values of the double-precision
2483
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2484
| before being returned.  `zSign' is ignored if the result is a NaN.
2485
| The addition is performed according to the IEC/IEEE Standard for Binary
2486
| Floating-Point Arithmetic.
2487
*----------------------------------------------------------------------------*/
2488
 
2489
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2490
{
2491
    int16 aExp, bExp, zExp;
2492
    bits64 aSig, bSig, zSig;
2493
    int16 expDiff;
2494
 
2495
    aSig = extractFloat64Frac( a );
2496
    aExp = extractFloat64Exp( a );
2497
    bSig = extractFloat64Frac( b );
2498
    bExp = extractFloat64Exp( b );
2499
    expDiff = aExp - bExp;
2500
    aSig <<= 9;
2501
    bSig <<= 9;
2502
    if ( 0 < expDiff ) {
2503
        if ( aExp == 0x7FF ) {
2504
            if ( aSig ) return propagateFloat64NaN( a, b );
2505
            return a;
2506
        }
2507
        if ( bExp == 0 ) {
2508
            --expDiff;
2509
        }
2510
        else {
2511
            bSig |= LIT64( 0x2000000000000000 );
2512
        }
2513
        shift64RightJamming( bSig, expDiff, &bSig );
2514
        zExp = aExp;
2515
    }
2516
    else if ( expDiff < 0 ) {
2517
        if ( bExp == 0x7FF ) {
2518
            if ( bSig ) return propagateFloat64NaN( a, b );
2519
            return packFloat64( zSign, 0x7FF, 0 );
2520
        }
2521
        if ( aExp == 0 ) {
2522
            ++expDiff;
2523
        }
2524
        else {
2525
            aSig |= LIT64( 0x2000000000000000 );
2526
        }
2527
        shift64RightJamming( aSig, - expDiff, &aSig );
2528
        zExp = bExp;
2529
    }
2530
    else {
2531
        if ( aExp == 0x7FF ) {
2532
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2533
            return a;
2534
        }
2535
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2536
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2537
        zExp = aExp;
2538
        goto roundAndPack;
2539
    }
2540
    aSig |= LIT64( 0x2000000000000000 );
2541
    zSig = ( aSig + bSig )<<1;
2542
    --zExp;
2543
    if ( (sbits64) zSig < 0 ) {
2544
        zSig = aSig + bSig;
2545
        ++zExp;
2546
    }
2547
 roundAndPack:
2548
    return roundAndPackFloat64( zSign, zExp, zSig );
2549
 
2550
}
2551
 
2552
/*----------------------------------------------------------------------------
2553
| Returns the result of subtracting the absolute values of the double-
2554
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2555
| difference is negated before being returned.  `zSign' is ignored if the
2556
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2557
| Standard for Binary Floating-Point Arithmetic.
2558
*----------------------------------------------------------------------------*/
2559
 
2560
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2561
{
2562
    int16 aExp, bExp, zExp;
2563
    bits64 aSig, bSig, zSig;
2564
    int16 expDiff;
2565
 
2566
    aSig = extractFloat64Frac( a );
2567
    aExp = extractFloat64Exp( a );
2568
    bSig = extractFloat64Frac( b );
2569
    bExp = extractFloat64Exp( b );
2570
    expDiff = aExp - bExp;
2571
    aSig <<= 10;
2572
    bSig <<= 10;
2573
    if ( 0 < expDiff ) goto aExpBigger;
2574
    if ( expDiff < 0 ) goto bExpBigger;
2575
    if ( aExp == 0x7FF ) {
2576
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2577
        float_raise( float_flag_invalid );
2578
        return float64_default_nan;
2579
    }
2580
    if ( aExp == 0 ) {
2581
        aExp = 1;
2582
        bExp = 1;
2583
    }
2584
    if ( bSig < aSig ) goto aBigger;
2585
    if ( aSig < bSig ) goto bBigger;
2586
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2587
 bExpBigger:
2588
    if ( bExp == 0x7FF ) {
2589
        if ( bSig ) return propagateFloat64NaN( a, b );
2590
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2591
    }
2592
    if ( aExp == 0 ) {
2593
        ++expDiff;
2594
    }
2595
    else {
2596
        aSig |= LIT64( 0x4000000000000000 );
2597
    }
2598
    shift64RightJamming( aSig, - expDiff, &aSig );
2599
    bSig |= LIT64( 0x4000000000000000 );
2600
 bBigger:
2601
    zSig = bSig - aSig;
2602
    zExp = bExp;
2603
    zSign ^= 1;
2604
    goto normalizeRoundAndPack;
2605
 aExpBigger:
2606
    if ( aExp == 0x7FF ) {
2607
        if ( aSig ) return propagateFloat64NaN( a, b );
2608
        return a;
2609
    }
2610
    if ( bExp == 0 ) {
2611
        --expDiff;
2612
    }
2613
    else {
2614
        bSig |= LIT64( 0x4000000000000000 );
2615
    }
2616
    shift64RightJamming( bSig, expDiff, &bSig );
2617
    aSig |= LIT64( 0x4000000000000000 );
2618
 aBigger:
2619
    zSig = aSig - bSig;
2620
    zExp = aExp;
2621
 normalizeRoundAndPack:
2622
    --zExp;
2623
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2624
 
2625
}
2626
 
2627
/*----------------------------------------------------------------------------
2628
| Returns the result of adding the double-precision floating-point values `a'
2629
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2630
| Binary Floating-Point Arithmetic.
2631
*----------------------------------------------------------------------------*/
2632
 
2633
float64 float64_add( float64 a, float64 b )
2634
{
2635
    flag aSign, bSign;
2636
 
2637
    aSign = extractFloat64Sign( a );
2638
    bSign = extractFloat64Sign( b );
2639
    if ( aSign == bSign ) {
2640
        return addFloat64Sigs( a, b, aSign );
2641
    }
2642
    else {
2643
        return subFloat64Sigs( a, b, aSign );
2644
    }
2645
 
2646
}
2647
 
2648
/*----------------------------------------------------------------------------
2649
| Returns the result of subtracting the double-precision floating-point values
2650
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2651
| for Binary Floating-Point Arithmetic.
2652
*----------------------------------------------------------------------------*/
2653
 
2654
float64 float64_sub( float64 a, float64 b )
2655
{
2656
    flag aSign, bSign;
2657
 
2658
    aSign = extractFloat64Sign( a );
2659
    bSign = extractFloat64Sign( b );
2660
    if ( aSign == bSign ) {
2661
        return subFloat64Sigs( a, b, aSign );
2662
    }
2663
    else {
2664
        return addFloat64Sigs( a, b, aSign );
2665
    }
2666
 
2667
}
2668
 
2669
/*----------------------------------------------------------------------------
2670
| Returns the result of multiplying the double-precision floating-point values
2671
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2672
| for Binary Floating-Point Arithmetic.
2673
*----------------------------------------------------------------------------*/
2674
 
2675
float64 float64_mul( float64 a, float64 b )
2676
{
2677
    flag aSign, bSign, zSign;
2678
    int16 aExp, bExp, zExp;
2679
    bits64 aSig, bSig, zSig0, zSig1;
2680
 
2681
    aSig = extractFloat64Frac( a );
2682
    aExp = extractFloat64Exp( a );
2683
    aSign = extractFloat64Sign( a );
2684
    bSig = extractFloat64Frac( b );
2685
    bExp = extractFloat64Exp( b );
2686
    bSign = extractFloat64Sign( b );
2687
    zSign = aSign ^ bSign;
2688
    if ( aExp == 0x7FF ) {
2689
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2690
            return propagateFloat64NaN( a, b );
2691
        }
2692
        if ( ( bExp | bSig ) == 0 ) {
2693
            float_raise( float_flag_invalid );
2694
            return float64_default_nan;
2695
        }
2696
        return packFloat64( zSign, 0x7FF, 0 );
2697
    }
2698
    if ( bExp == 0x7FF ) {
2699
        if ( bSig ) return propagateFloat64NaN( a, b );
2700
        if ( ( aExp | aSig ) == 0 ) {
2701
            float_raise( float_flag_invalid );
2702
            return float64_default_nan;
2703
        }
2704
        return packFloat64( zSign, 0x7FF, 0 );
2705
    }
2706
    if ( aExp == 0 ) {
2707
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2708
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2709
    }
2710
    if ( bExp == 0 ) {
2711
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2712
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2713
    }
2714
    zExp = aExp + bExp - 0x3FF;
2715
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2716
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2717
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2718
    zSig0 |= ( zSig1 != 0 );
2719
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2720
        zSig0 <<= 1;
2721
        --zExp;
2722
    }
2723
    return roundAndPackFloat64( zSign, zExp, zSig0 );
2724
 
2725
}
2726
 
2727
/*----------------------------------------------------------------------------
2728
| Returns the result of dividing the double-precision floating-point value `a'
2729
| by the corresponding value `b'.  The operation is performed according to
2730
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2731
*----------------------------------------------------------------------------*/
2732
 
2733
float64 float64_div( float64 a, float64 b )
2734
{
2735
    flag aSign, bSign, zSign;
2736
    int16 aExp, bExp, zExp;
2737
    bits64 aSig, bSig, zSig;
2738
    bits64 rem0, rem1;
2739
    bits64 term0, term1;
2740
 
2741
    aSig = extractFloat64Frac( a );
2742
    aExp = extractFloat64Exp( a );
2743
    aSign = extractFloat64Sign( a );
2744
    bSig = extractFloat64Frac( b );
2745
    bExp = extractFloat64Exp( b );
2746
    bSign = extractFloat64Sign( b );
2747
    zSign = aSign ^ bSign;
2748
    if ( aExp == 0x7FF ) {
2749
        if ( aSig ) return propagateFloat64NaN( a, b );
2750
        if ( bExp == 0x7FF ) {
2751
            if ( bSig ) return propagateFloat64NaN( a, b );
2752
            float_raise( float_flag_invalid );
2753
            return float64_default_nan;
2754
        }
2755
        return packFloat64( zSign, 0x7FF, 0 );
2756
    }
2757
    if ( bExp == 0x7FF ) {
2758
        if ( bSig ) return propagateFloat64NaN( a, b );
2759
        return packFloat64( zSign, 0, 0 );
2760
    }
2761
    if ( bExp == 0 ) {
2762
        if ( bSig == 0 ) {
2763
            if ( ( aExp | aSig ) == 0 ) {
2764
                float_raise( float_flag_invalid );
2765
                return float64_default_nan;
2766
            }
2767
            float_raise( float_flag_divbyzero );
2768
            return packFloat64( zSign, 0x7FF, 0 );
2769
        }
2770
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2771
    }
2772
    if ( aExp == 0 ) {
2773
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2774
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2775
    }
2776
    zExp = aExp - bExp + 0x3FD;
2777
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2778
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2779
    if ( bSig <= ( aSig + aSig ) ) {
2780
        aSig >>= 1;
2781
        ++zExp;
2782
    }
2783
    zSig = estimateDiv128To64( aSig, 0, bSig );
2784
    if ( ( zSig & 0x1FF ) <= 2 ) {
2785
        mul64To128( bSig, zSig, &term0, &term1 );
2786
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2787
        while ( (sbits64) rem0 < 0 ) {
2788
            --zSig;
2789
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2790
        }
2791
        zSig |= ( rem1 != 0 );
2792
    }
2793
    return roundAndPackFloat64( zSign, zExp, zSig );
2794
 
2795
}
2796
 
2797
/*----------------------------------------------------------------------------
2798
| Returns the remainder of the double-precision floating-point value `a'
2799
| with respect to the corresponding value `b'.  The operation is performed
2800
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2801
*----------------------------------------------------------------------------*/
2802
 
2803
float64 float64_rem( float64 a, float64 b )
2804
{
2805
    flag aSign, bSign, zSign;
2806
    int16 aExp, bExp, expDiff;
2807
    bits64 aSig, bSig;
2808
    bits64 q, alternateASig;
2809
    sbits64 sigMean;
2810
 
2811
    aSig = extractFloat64Frac( a );
2812
    aExp = extractFloat64Exp( a );
2813
    aSign = extractFloat64Sign( a );
2814
    bSig = extractFloat64Frac( b );
2815
    bExp = extractFloat64Exp( b );
2816
    bSign = extractFloat64Sign( b );
2817
    if ( aExp == 0x7FF ) {
2818
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2819
            return propagateFloat64NaN( a, b );
2820
        }
2821
        float_raise( float_flag_invalid );
2822
        return float64_default_nan;
2823
    }
2824
    if ( bExp == 0x7FF ) {
2825
        if ( bSig ) return propagateFloat64NaN( a, b );
2826
        return a;
2827
    }
2828
    if ( bExp == 0 ) {
2829
        if ( bSig == 0 ) {
2830
            float_raise( float_flag_invalid );
2831
            return float64_default_nan;
2832
        }
2833
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834
    }
2835
    if ( aExp == 0 ) {
2836
        if ( aSig == 0 ) return a;
2837
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838
    }
2839
    expDiff = aExp - bExp;
2840
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2841
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842
    if ( expDiff < 0 ) {
2843
        if ( expDiff < -1 ) return a;
2844
        aSig >>= 1;
2845
    }
2846
    q = ( bSig <= aSig );
2847
    if ( q ) aSig -= bSig;
2848
    expDiff -= 64;
2849
    while ( 0 < expDiff ) {
2850
        q = estimateDiv128To64( aSig, 0, bSig );
2851
        q = ( 2 < q ) ? q - 2 : 0;
2852
        aSig = - ( ( bSig>>2 ) * q );
2853
        expDiff -= 62;
2854
    }
2855
    expDiff += 64;
2856
    if ( 0 < expDiff ) {
2857
        q = estimateDiv128To64( aSig, 0, bSig );
2858
        q = ( 2 < q ) ? q - 2 : 0;
2859
        q >>= 64 - expDiff;
2860
        bSig >>= 2;
2861
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2862
    }
2863
    else {
2864
        aSig >>= 2;
2865
        bSig >>= 2;
2866
    }
2867
    do {
2868
        alternateASig = aSig;
2869
        ++q;
2870
        aSig -= bSig;
2871
    } while ( 0 <= (sbits64) aSig );
2872
    sigMean = aSig + alternateASig;
2873
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2874
        aSig = alternateASig;
2875
    }
2876
    zSign = ( (sbits64) aSig < 0 );
2877
    if ( zSign ) aSig = - aSig;
2878
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2879
 
2880
}
2881
 
2882
/*----------------------------------------------------------------------------
2883
| Returns the square root of the double-precision floating-point value `a'.
2884
| The operation is performed according to the IEC/IEEE Standard for Binary
2885
| Floating-Point Arithmetic.
2886
*----------------------------------------------------------------------------*/
2887
 
2888
float64 float64_sqrt( float64 a )
2889
{
2890
    flag aSign;
2891
    int16 aExp, zExp;
2892
    bits64 aSig, zSig, doubleZSig;
2893
    bits64 rem0, rem1, term0, term1;
2894
    //float64 z; // Unused variable - JB
2895
 
2896
    aSig = extractFloat64Frac( a );
2897
    aExp = extractFloat64Exp( a );
2898
    aSign = extractFloat64Sign( a );
2899
    if ( aExp == 0x7FF ) {
2900
        if ( aSig ) return propagateFloat64NaN( a, a );
2901
        if ( ! aSign ) return a;
2902
        float_raise( float_flag_invalid );
2903
        return float64_default_nan;
2904
    }
2905
    if ( aSign ) {
2906
        if ( ( aExp | aSig ) == 0 ) return a;
2907
        float_raise( float_flag_invalid );
2908
        return float64_default_nan;
2909
    }
2910
    if ( aExp == 0 ) {
2911
        if ( aSig == 0 ) return 0;
2912
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2913
    }
2914
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2915
    aSig |= LIT64( 0x0010000000000000 );
2916
    zSig = estimateSqrt32( aExp, aSig>>21 );
2917
    aSig <<= 9 - ( aExp & 1 );
2918
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2919
    if ( ( zSig & 0x1FF ) <= 5 ) {
2920
        doubleZSig = zSig<<1;
2921
        mul64To128( zSig, zSig, &term0, &term1 );
2922
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2923
        while ( (sbits64) rem0 < 0 ) {
2924
            --zSig;
2925
            doubleZSig -= 2;
2926
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2927
        }
2928
        zSig |= ( ( rem0 | rem1 ) != 0 );
2929
    }
2930
    return roundAndPackFloat64( 0, zExp, zSig );
2931
 
2932
}
2933
 
2934
/*----------------------------------------------------------------------------
2935
| Returns 1 if the double-precision floating-point value `a' is equal to the
2936
| corresponding value `b', and 0 otherwise.  The comparison is performed
2937
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938
*----------------------------------------------------------------------------*/
2939
 
2940
flag float64_eq( float64 a, float64 b )
2941
{
2942
 
2943
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2944
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2945
       ) {
2946
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2947
            float_raise( float_flag_invalid );
2948
        }
2949
        return 0;
2950
    }
2951
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2952
 
2953
}
2954
 
2955
/*----------------------------------------------------------------------------
2956
| Returns 1 if the double-precision floating-point value `a' is less than or
2957
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
2958
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2959
| Arithmetic.
2960
*----------------------------------------------------------------------------*/
2961
 
2962
flag float64_le( float64 a, float64 b )
2963
{
2964
    flag aSign, bSign;
2965
 
2966
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2967
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2968
       ) {
2969
        float_raise( float_flag_invalid );
2970
        return 0;
2971
    }
2972
    aSign = extractFloat64Sign( a );
2973
    bSign = extractFloat64Sign( b );
2974
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2975
    return ( a == b ) || ( aSign ^ ( a < b ) );
2976
 
2977
}
2978
 
2979
/*----------------------------------------------------------------------------
2980
| Returns 1 if the double-precision floating-point value `a' is less than
2981
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2982
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2983
*----------------------------------------------------------------------------*/
2984
 
2985
flag float64_lt( float64 a, float64 b )
2986
{
2987
    flag aSign, bSign;
2988
 
2989
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2990
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2991
       ) {
2992
        float_raise( float_flag_invalid );
2993
        return 0;
2994
    }
2995
    aSign = extractFloat64Sign( a );
2996
    bSign = extractFloat64Sign( b );
2997
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2998
    return ( a != b ) && ( aSign ^ ( a < b ) );
2999
 
3000
}
3001
 
3002
/*----------------------------------------------------------------------------
3003
| Returns 1 if the double-precision floating-point value `a' is equal to the
3004
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3005
| if either operand is a NaN.  Otherwise, the comparison is performed
3006
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3007
*----------------------------------------------------------------------------*/
3008
 
3009
flag float64_eq_signaling( float64 a, float64 b )
3010
{
3011
 
3012
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3013
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3014
       ) {
3015
        float_raise( float_flag_invalid );
3016
        return 0;
3017
    }
3018
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3019
 
3020
}
3021
 
3022
/*----------------------------------------------------------------------------
3023
| Returns 1 if the double-precision floating-point value `a' is less than or
3024
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3025
| cause an exception.  Otherwise, the comparison is performed according to the
3026
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3027
*----------------------------------------------------------------------------*/
3028
 
3029
flag float64_le_quiet( float64 a, float64 b )
3030
{
3031
    flag aSign, bSign;
3032
    //int16 aExp, bExp; // Unused variable - JB
3033
 
3034
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3035
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3036
       ) {
3037
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3038
            float_raise( float_flag_invalid );
3039
        }
3040
        return 0;
3041
    }
3042
    aSign = extractFloat64Sign( a );
3043
    bSign = extractFloat64Sign( b );
3044
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3045
    return ( a == b ) || ( aSign ^ ( a < b ) );
3046
 
3047
}
3048
 
3049
/*----------------------------------------------------------------------------
3050
| Returns 1 if the double-precision floating-point value `a' is less than
3051
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3052
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3053
| Standard for Binary Floating-Point Arithmetic.
3054
*----------------------------------------------------------------------------*/
3055
 
3056
flag float64_lt_quiet( float64 a, float64 b )
3057
{
3058
    flag aSign, bSign;
3059
 
3060
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3061
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3062
       ) {
3063
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3064
            float_raise( float_flag_invalid );
3065
        }
3066
        return 0;
3067
    }
3068
    aSign = extractFloat64Sign( a );
3069
    bSign = extractFloat64Sign( b );
3070
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3071
    return ( a != b ) && ( aSign ^ ( a < b ) );
3072
 
3073
}
3074
 
3075
#ifdef FLOATX80
3076
 
3077
/*----------------------------------------------------------------------------
3078
| Returns the result of converting the extended double-precision floating-
3079
| point value `a' to the 32-bit two's complement integer format.  The
3080
| conversion is performed according to the IEC/IEEE Standard for Binary
3081
| Floating-Point Arithmetic---which means in particular that the conversion
3082
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3083
| largest positive integer is returned.  Otherwise, if the conversion
3084
| overflows, the largest integer with the same sign as `a' is returned.
3085
*----------------------------------------------------------------------------*/
3086
 
3087
int32 floatx80_to_int32( floatx80 a )
3088
{
3089
    flag aSign;
3090
    int32 aExp, shiftCount;
3091
    bits64 aSig;
3092
 
3093
    aSig = extractFloatx80Frac( a );
3094
    aExp = extractFloatx80Exp( a );
3095
    aSign = extractFloatx80Sign( a );
3096
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3097
    shiftCount = 0x4037 - aExp;
3098
    if ( shiftCount <= 0 ) shiftCount = 1;
3099
    shift64RightJamming( aSig, shiftCount, &aSig );
3100
    return roundAndPackInt32( aSign, aSig );
3101
 
3102
}
3103
 
3104
/*----------------------------------------------------------------------------
3105
| Returns the result of converting the extended double-precision floating-
3106
| point value `a' to the 32-bit two's complement integer format.  The
3107
| conversion is performed according to the IEC/IEEE Standard for Binary
3108
| Floating-Point Arithmetic, except that the conversion is always rounded
3109
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3110
| Otherwise, if the conversion overflows, the largest integer with the same
3111
| sign as `a' is returned.
3112
*----------------------------------------------------------------------------*/
3113
 
3114
int32 floatx80_to_int32_round_to_zero( floatx80 a )
3115
{
3116
    flag aSign;
3117
    int32 aExp, shiftCount;
3118
    bits64 aSig, savedASig;
3119
    int32 z;
3120
 
3121
    aSig = extractFloatx80Frac( a );
3122
    aExp = extractFloatx80Exp( a );
3123
    aSign = extractFloatx80Sign( a );
3124
    if ( 0x401E < aExp ) {
3125
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3126
        goto invalid;
3127
    }
3128
    else if ( aExp < 0x3FFF ) {
3129
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3130
        return 0;
3131
    }
3132
    shiftCount = 0x403E - aExp;
3133
    savedASig = aSig;
3134
    aSig >>= shiftCount;
3135
    z = aSig;
3136
    if ( aSign ) z = - z;
3137
    if ( ( z < 0 ) ^ aSign ) {
3138
 invalid:
3139
        float_raise( float_flag_invalid );
3140
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3141
    }
3142
    if ( ( aSig<<shiftCount ) != savedASig ) {
3143
        float_exception_flags |= float_flag_inexact;
3144
    }
3145
    return z;
3146
 
3147
}
3148
 
3149
/*----------------------------------------------------------------------------
3150
| Returns the result of converting the extended double-precision floating-
3151
| point value `a' to the 64-bit two's complement integer format.  The
3152
| conversion is performed according to the IEC/IEEE Standard for Binary
3153
| Floating-Point Arithmetic---which means in particular that the conversion
3154
| is rounded according to the current rounding mode.  If `a' is a NaN,
3155
| the largest positive integer is returned.  Otherwise, if the conversion
3156
| overflows, the largest integer with the same sign as `a' is returned.
3157
*----------------------------------------------------------------------------*/
3158
 
3159
int64 floatx80_to_int64( floatx80 a )
3160
{
3161
    flag aSign;
3162
    int32 aExp, shiftCount;
3163
    bits64 aSig, aSigExtra;
3164
 
3165
    aSig = extractFloatx80Frac( a );
3166
    aExp = extractFloatx80Exp( a );
3167
    aSign = extractFloatx80Sign( a );
3168
    shiftCount = 0x403E - aExp;
3169
    if ( shiftCount <= 0 ) {
3170
        if ( shiftCount ) {
3171
            float_raise( float_flag_invalid );
3172
            if (    ! aSign
3173
                 || (    ( aExp == 0x7FFF )
3174
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3175
               ) {
3176
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3177
            }
3178
            return (sbits64) LIT64( 0x8000000000000000 );
3179
        }
3180
        aSigExtra = 0;
3181
    }
3182
    else {
3183
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3184
    }
3185
    return roundAndPackInt64( aSign, aSig, aSigExtra );
3186
 
3187
}
3188
 
3189
/*----------------------------------------------------------------------------
3190
| Returns the result of converting the extended double-precision floating-
3191
| point value `a' to the 64-bit two's complement integer format.  The
3192
| conversion is performed according to the IEC/IEEE Standard for Binary
3193
| Floating-Point Arithmetic, except that the conversion is always rounded
3194
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3195
| Otherwise, if the conversion overflows, the largest integer with the same
3196
| sign as `a' is returned.
3197
*----------------------------------------------------------------------------*/
3198
 
3199
int64 floatx80_to_int64_round_to_zero( floatx80 a )
3200
{
3201
    flag aSign;
3202
    int32 aExp, shiftCount;
3203
    bits64 aSig;
3204
    int64 z;
3205
 
3206
    aSig = extractFloatx80Frac( a );
3207
    aExp = extractFloatx80Exp( a );
3208
    aSign = extractFloatx80Sign( a );
3209
    shiftCount = aExp - 0x403E;
3210
    if ( 0 <= shiftCount ) {
3211
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3212
        if ( ( a.high != 0xC03E ) || aSig ) {
3213
            float_raise( float_flag_invalid );
3214
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3215
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3216
            }
3217
        }
3218
        return (sbits64) LIT64( 0x8000000000000000 );
3219
    }
3220
    else if ( aExp < 0x3FFF ) {
3221
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3222
        return 0;
3223
    }
3224
    z = aSig>>( - shiftCount );
3225
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3226
        float_exception_flags |= float_flag_inexact;
3227
    }
3228
    if ( aSign ) z = - z;
3229
    return z;
3230
 
3231
}
3232
 
3233
/*----------------------------------------------------------------------------
3234
| Returns the result of converting the extended double-precision floating-
3235
| point value `a' to the single-precision floating-point format.  The
3236
| conversion is performed according to the IEC/IEEE Standard for Binary
3237
| Floating-Point Arithmetic.
3238
*----------------------------------------------------------------------------*/
3239
 
3240
float32 floatx80_to_float32( floatx80 a )
3241
{
3242
    flag aSign;
3243
    int32 aExp;
3244
    bits64 aSig;
3245
 
3246
    aSig = extractFloatx80Frac( a );
3247
    aExp = extractFloatx80Exp( a );
3248
    aSign = extractFloatx80Sign( a );
3249
    if ( aExp == 0x7FFF ) {
3250
        if ( (bits64) ( aSig<<1 ) ) {
3251
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3252
        }
3253
        return packFloat32( aSign, 0xFF, 0 );
3254
    }
3255
    shift64RightJamming( aSig, 33, &aSig );
3256
    if ( aExp || aSig ) aExp -= 0x3F81;
3257
    return roundAndPackFloat32( aSign, aExp, aSig );
3258
 
3259
}
3260
 
3261
/*----------------------------------------------------------------------------
3262
| Returns the result of converting the extended double-precision floating-
3263
| point value `a' to the double-precision floating-point format.  The
3264
| conversion is performed according to the IEC/IEEE Standard for Binary
3265
| Floating-Point Arithmetic.
3266
*----------------------------------------------------------------------------*/
3267
 
3268
float64 floatx80_to_float64( floatx80 a )
3269
{
3270
    flag aSign;
3271
    int32 aExp;
3272
    bits64 aSig, zSig;
3273
 
3274
    aSig = extractFloatx80Frac( a );
3275
    aExp = extractFloatx80Exp( a );
3276
    aSign = extractFloatx80Sign( a );
3277
    if ( aExp == 0x7FFF ) {
3278
        if ( (bits64) ( aSig<<1 ) ) {
3279
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3280
        }
3281
        return packFloat64( aSign, 0x7FF, 0 );
3282
    }
3283
    shift64RightJamming( aSig, 1, &zSig );
3284
    if ( aExp || aSig ) aExp -= 0x3C01;
3285
    return roundAndPackFloat64( aSign, aExp, zSig );
3286
 
3287
}
3288
 
3289
#ifdef FLOAT128
3290
 
3291
/*----------------------------------------------------------------------------
3292
| Returns the result of converting the extended double-precision floating-
3293
| point value `a' to the quadruple-precision floating-point format.  The
3294
| conversion is performed according to the IEC/IEEE Standard for Binary
3295
| Floating-Point Arithmetic.
3296
*----------------------------------------------------------------------------*/
3297
 
3298
float128 floatx80_to_float128( floatx80 a )
3299
{
3300
    flag aSign;
3301
    int16 aExp;
3302
    bits64 aSig, zSig0, zSig1;
3303
 
3304
    aSig = extractFloatx80Frac( a );
3305
    aExp = extractFloatx80Exp( a );
3306
    aSign = extractFloatx80Sign( a );
3307
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3308
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3309
    }
3310
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3311
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3312
 
3313
}
3314
 
3315
#endif
3316
 
3317
/*----------------------------------------------------------------------------
3318
| Rounds the extended double-precision floating-point value `a' to an integer,
3319
| and returns the result as an extended quadruple-precision floating-point
3320
| value.  The operation is performed according to the IEC/IEEE Standard for
3321
| Binary Floating-Point Arithmetic.
3322
*----------------------------------------------------------------------------*/
3323
 
3324
floatx80 floatx80_round_to_int( floatx80 a )
3325
{
3326
    flag aSign;
3327
    int32 aExp;
3328
    bits64 lastBitMask, roundBitsMask;
3329
    int8 roundingMode;
3330
    floatx80 z;
3331
 
3332
    aExp = extractFloatx80Exp( a );
3333
    if ( 0x403E <= aExp ) {
3334
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3335
            return propagateFloatx80NaN( a, a );
3336
        }
3337
        return a;
3338
    }
3339
    if ( aExp < 0x3FFF ) {
3340
        if (    ( aExp == 0 )
3341
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3342
            return a;
3343
        }
3344
        float_exception_flags |= float_flag_inexact;
3345
        aSign = extractFloatx80Sign( a );
3346
        switch ( float_rounding_mode ) {
3347
         case float_round_nearest_even:
3348
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3349
               ) {
3350
                return
3351
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3352
            }
3353
            break;
3354
         case float_round_down:
3355
            return
3356
                  aSign ?
3357
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3358
                : packFloatx80( 0, 0, 0 );
3359
         case float_round_up:
3360
            return
3361
                  aSign ? packFloatx80( 1, 0, 0 )
3362
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3363
        }
3364
        return packFloatx80( aSign, 0, 0 );
3365
    }
3366
    lastBitMask = 1;
3367
    lastBitMask <<= 0x403E - aExp;
3368
    roundBitsMask = lastBitMask - 1;
3369
    z = a;
3370
    roundingMode = float_rounding_mode;
3371
    if ( roundingMode == float_round_nearest_even ) {
3372
        z.low += lastBitMask>>1;
3373
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3374
    }
3375
    else if ( roundingMode != float_round_to_zero ) {
3376
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3377
            z.low += roundBitsMask;
3378
        }
3379
    }
3380
    z.low &= ~ roundBitsMask;
3381
    if ( z.low == 0 ) {
3382
        ++z.high;
3383
        z.low = LIT64( 0x8000000000000000 );
3384
    }
3385
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3386
    return z;
3387
 
3388
}
3389
 
3390
/*----------------------------------------------------------------------------
3391
| Returns the result of adding the absolute values of the extended double-
3392
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3393
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3394
| The addition is performed according to the IEC/IEEE Standard for Binary
3395
| Floating-Point Arithmetic.
3396
*----------------------------------------------------------------------------*/
3397
 
3398
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3399
{
3400
    int32 aExp, bExp, zExp;
3401
    bits64 aSig, bSig, zSig0, zSig1;
3402
    int32 expDiff;
3403
 
3404
    aSig = extractFloatx80Frac( a );
3405
    aExp = extractFloatx80Exp( a );
3406
    bSig = extractFloatx80Frac( b );
3407
    bExp = extractFloatx80Exp( b );
3408
    expDiff = aExp - bExp;
3409
    if ( 0 < expDiff ) {
3410
        if ( aExp == 0x7FFF ) {
3411
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3412
            return a;
3413
        }
3414
        if ( bExp == 0 ) --expDiff;
3415
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3416
        zExp = aExp;
3417
    }
3418
    else if ( expDiff < 0 ) {
3419
        if ( bExp == 0x7FFF ) {
3420
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3421
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3422
        }
3423
        if ( aExp == 0 ) ++expDiff;
3424
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3425
        zExp = bExp;
3426
    }
3427
    else {
3428
        if ( aExp == 0x7FFF ) {
3429
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3430
                return propagateFloatx80NaN( a, b );
3431
            }
3432
            return a;
3433
        }
3434
        zSig1 = 0;
3435
        zSig0 = aSig + bSig;
3436
        if ( aExp == 0 ) {
3437
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3438
            goto roundAndPack;
3439
        }
3440
        zExp = aExp;
3441
        goto shiftRight1;
3442
    }
3443
    zSig0 = aSig + bSig;
3444
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3445
 shiftRight1:
3446
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3447
    zSig0 |= LIT64( 0x8000000000000000 );
3448
    ++zExp;
3449
 roundAndPack:
3450
    return
3451
        roundAndPackFloatx80(
3452
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3453
 
3454
}
3455
 
3456
/*----------------------------------------------------------------------------
3457
| Returns the result of subtracting the absolute values of the extended
3458
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3459
| difference is negated before being returned.  `zSign' is ignored if the
3460
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3461
| Standard for Binary Floating-Point Arithmetic.
3462
*----------------------------------------------------------------------------*/
3463
 
3464
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3465
{
3466
    int32 aExp, bExp, zExp;
3467
    bits64 aSig, bSig, zSig0, zSig1;
3468
    int32 expDiff;
3469
    floatx80 z;
3470
 
3471
    aSig = extractFloatx80Frac( a );
3472
    aExp = extractFloatx80Exp( a );
3473
    bSig = extractFloatx80Frac( b );
3474
    bExp = extractFloatx80Exp( b );
3475
    expDiff = aExp - bExp;
3476
    if ( 0 < expDiff ) goto aExpBigger;
3477
    if ( expDiff < 0 ) goto bExpBigger;
3478
    if ( aExp == 0x7FFF ) {
3479
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3480
            return propagateFloatx80NaN( a, b );
3481
        }
3482
        float_raise( float_flag_invalid );
3483
        z.low = floatx80_default_nan_low;
3484
        z.high = floatx80_default_nan_high;
3485
        return z;
3486
    }
3487
    if ( aExp == 0 ) {
3488
        aExp = 1;
3489
        bExp = 1;
3490
    }
3491
    zSig1 = 0;
3492
    if ( bSig < aSig ) goto aBigger;
3493
    if ( aSig < bSig ) goto bBigger;
3494
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3495
 bExpBigger:
3496
    if ( bExp == 0x7FFF ) {
3497
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3498
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3499
    }
3500
    if ( aExp == 0 ) ++expDiff;
3501
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3502
 bBigger:
3503
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3504
    zExp = bExp;
3505
    zSign ^= 1;
3506
    goto normalizeRoundAndPack;
3507
 aExpBigger:
3508
    if ( aExp == 0x7FFF ) {
3509
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3510
        return a;
3511
    }
3512
    if ( bExp == 0 ) --expDiff;
3513
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3514
 aBigger:
3515
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3516
    zExp = aExp;
3517
 normalizeRoundAndPack:
3518
    return
3519
        normalizeRoundAndPackFloatx80(
3520
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3521
 
3522
}
3523
 
3524
/*----------------------------------------------------------------------------
3525
| Returns the result of adding the extended double-precision floating-point
3526
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3527
| Standard for Binary Floating-Point Arithmetic.
3528
*----------------------------------------------------------------------------*/
3529
 
3530
floatx80 floatx80_add( floatx80 a, floatx80 b )
3531
{
3532
    flag aSign, bSign;
3533
 
3534
    aSign = extractFloatx80Sign( a );
3535
    bSign = extractFloatx80Sign( b );
3536
    if ( aSign == bSign ) {
3537
        return addFloatx80Sigs( a, b, aSign );
3538
    }
3539
    else {
3540
        return subFloatx80Sigs( a, b, aSign );
3541
    }
3542
 
3543
}
3544
 
3545
/*----------------------------------------------------------------------------
3546
| Returns the result of subtracting the extended double-precision floating-
3547
| point values `a' and `b'.  The operation is performed according to the
3548
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3549
*----------------------------------------------------------------------------*/
3550
 
3551
floatx80 floatx80_sub( floatx80 a, floatx80 b )
3552
{
3553
    flag aSign, bSign;
3554
 
3555
    aSign = extractFloatx80Sign( a );
3556
    bSign = extractFloatx80Sign( b );
3557
    if ( aSign == bSign ) {
3558
        return subFloatx80Sigs( a, b, aSign );
3559
    }
3560
    else {
3561
        return addFloatx80Sigs( a, b, aSign );
3562
    }
3563
 
3564
}
3565
 
3566
/*----------------------------------------------------------------------------
3567
| Returns the result of multiplying the extended double-precision floating-
3568
| point values `a' and `b'.  The operation is performed according to the
3569
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3570
*----------------------------------------------------------------------------*/
3571
 
3572
floatx80 floatx80_mul( floatx80 a, floatx80 b )
3573
{
3574
    flag aSign, bSign, zSign;
3575
    int32 aExp, bExp, zExp;
3576
    bits64 aSig, bSig, zSig0, zSig1;
3577
    floatx80 z;
3578
 
3579
    aSig = extractFloatx80Frac( a );
3580
    aExp = extractFloatx80Exp( a );
3581
    aSign = extractFloatx80Sign( a );
3582
    bSig = extractFloatx80Frac( b );
3583
    bExp = extractFloatx80Exp( b );
3584
    bSign = extractFloatx80Sign( b );
3585
    zSign = aSign ^ bSign;
3586
    if ( aExp == 0x7FFF ) {
3587
        if (    (bits64) ( aSig<<1 )
3588
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3589
            return propagateFloatx80NaN( a, b );
3590
        }
3591
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3592
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3593
    }
3594
    if ( bExp == 0x7FFF ) {
3595
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3596
        if ( ( aExp | aSig ) == 0 ) {
3597
 invalid:
3598
            float_raise( float_flag_invalid );
3599
            z.low = floatx80_default_nan_low;
3600
            z.high = floatx80_default_nan_high;
3601
            return z;
3602
        }
3603
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3604
    }
3605
    if ( aExp == 0 ) {
3606
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3607
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3608
    }
3609
    if ( bExp == 0 ) {
3610
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3611
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3612
    }
3613
    zExp = aExp + bExp - 0x3FFE;
3614
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3615
    if ( 0 < (sbits64) zSig0 ) {
3616
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3617
        --zExp;
3618
    }
3619
    return
3620
        roundAndPackFloatx80(
3621
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3622
 
3623
}
3624
 
3625
/*----------------------------------------------------------------------------
3626
| Returns the result of dividing the extended double-precision floating-point
3627
| value `a' by the corresponding value `b'.  The operation is performed
3628
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3629
*----------------------------------------------------------------------------*/
3630
 
3631
floatx80 floatx80_div( floatx80 a, floatx80 b )
3632
{
3633
    flag aSign, bSign, zSign;
3634
    int32 aExp, bExp, zExp;
3635
    bits64 aSig, bSig, zSig0, zSig1;
3636
    bits64 rem0, rem1, rem2, term0, term1, term2;
3637
    floatx80 z;
3638
 
3639
    aSig = extractFloatx80Frac( a );
3640
    aExp = extractFloatx80Exp( a );
3641
    aSign = extractFloatx80Sign( a );
3642
    bSig = extractFloatx80Frac( b );
3643
    bExp = extractFloatx80Exp( b );
3644
    bSign = extractFloatx80Sign( b );
3645
    zSign = aSign ^ bSign;
3646
    if ( aExp == 0x7FFF ) {
3647
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3648
        if ( bExp == 0x7FFF ) {
3649
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3650
            goto invalid;
3651
        }
3652
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3653
    }
3654
    if ( bExp == 0x7FFF ) {
3655
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3656
        return packFloatx80( zSign, 0, 0 );
3657
    }
3658
    if ( bExp == 0 ) {
3659
        if ( bSig == 0 ) {
3660
            if ( ( aExp | aSig ) == 0 ) {
3661
 invalid:
3662
                float_raise( float_flag_invalid );
3663
                z.low = floatx80_default_nan_low;
3664
                z.high = floatx80_default_nan_high;
3665
                return z;
3666
            }
3667
            float_raise( float_flag_divbyzero );
3668
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3669
        }
3670
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3671
    }
3672
    if ( aExp == 0 ) {
3673
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3674
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3675
    }
3676
    zExp = aExp - bExp + 0x3FFE;
3677
    rem1 = 0;
3678
    if ( bSig <= aSig ) {
3679
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3680
        ++zExp;
3681
    }
3682
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3683
    mul64To128( bSig, zSig0, &term0, &term1 );
3684
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3685
    while ( (sbits64) rem0 < 0 ) {
3686
        --zSig0;
3687
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3688
    }
3689
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3690
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3691
        mul64To128( bSig, zSig1, &term1, &term2 );
3692
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3693
        while ( (sbits64) rem1 < 0 ) {
3694
            --zSig1;
3695
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3696
        }
3697
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3698
    }
3699
    return
3700
        roundAndPackFloatx80(
3701
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3702
 
3703
}
3704
 
3705
/*----------------------------------------------------------------------------
3706
| Returns the remainder of the extended double-precision floating-point value
3707
| `a' with respect to the corresponding value `b'.  The operation is performed
3708
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3709
*----------------------------------------------------------------------------*/
3710
 
3711
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3712
{
3713
    flag aSign, bSign, zSign;
3714
    int32 aExp, bExp, expDiff;
3715
    bits64 aSig0, aSig1, bSig;
3716
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3717
    floatx80 z;
3718
 
3719
    aSig0 = extractFloatx80Frac( a );
3720
    aExp = extractFloatx80Exp( a );
3721
    aSign = extractFloatx80Sign( a );
3722
    bSig = extractFloatx80Frac( b );
3723
    bExp = extractFloatx80Exp( b );
3724
    bSign = extractFloatx80Sign( b );
3725
    if ( aExp == 0x7FFF ) {
3726
        if (    (bits64) ( aSig0<<1 )
3727
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3728
            return propagateFloatx80NaN( a, b );
3729
        }
3730
        goto invalid;
3731
    }
3732
    if ( bExp == 0x7FFF ) {
3733
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3734
        return a;
3735
    }
3736
    if ( bExp == 0 ) {
3737
        if ( bSig == 0 ) {
3738
 invalid:
3739
            float_raise( float_flag_invalid );
3740
            z.low = floatx80_default_nan_low;
3741
            z.high = floatx80_default_nan_high;
3742
            return z;
3743
        }
3744
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3745
    }
3746
    if ( aExp == 0 ) {
3747
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3748
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3749
    }
3750
    bSig |= LIT64( 0x8000000000000000 );
3751
    zSign = aSign;
3752
    expDiff = aExp - bExp;
3753
    aSig1 = 0;
3754
    if ( expDiff < 0 ) {
3755
        if ( expDiff < -1 ) return a;
3756
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3757
        expDiff = 0;
3758
    }
3759
    q = ( bSig <= aSig0 );
3760
    if ( q ) aSig0 -= bSig;
3761
    expDiff -= 64;
3762
    while ( 0 < expDiff ) {
3763
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3764
        q = ( 2 < q ) ? q - 2 : 0;
3765
        mul64To128( bSig, q, &term0, &term1 );
3766
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3767
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3768
        expDiff -= 62;
3769
    }
3770
    expDiff += 64;
3771
    if ( 0 < expDiff ) {
3772
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3773
        q = ( 2 < q ) ? q - 2 : 0;
3774
        q >>= 64 - expDiff;
3775
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3776
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3777
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3778
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3779
            ++q;
3780
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3781
        }
3782
    }
3783
    else {
3784
        term1 = 0;
3785
        term0 = bSig;
3786
    }
3787
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3788
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3789
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3790
              && ( q & 1 ) )
3791
       ) {
3792
        aSig0 = alternateASig0;
3793
        aSig1 = alternateASig1;
3794
        zSign = ! zSign;
3795
    }
3796
    return
3797
        normalizeRoundAndPackFloatx80(
3798
            80, zSign, bExp + expDiff, aSig0, aSig1 );
3799
 
3800
}
3801
 
3802
/*----------------------------------------------------------------------------
3803
| Returns the square root of the extended double-precision floating-point
3804
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3805
| for Binary Floating-Point Arithmetic.
3806
*----------------------------------------------------------------------------*/
3807
 
3808
floatx80 floatx80_sqrt( floatx80 a )
3809
{
3810
    flag aSign;
3811
    int32 aExp, zExp;
3812
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3813
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3814
    floatx80 z;
3815
 
3816
    aSig0 = extractFloatx80Frac( a );
3817
    aExp = extractFloatx80Exp( a );
3818
    aSign = extractFloatx80Sign( a );
3819
    if ( aExp == 0x7FFF ) {
3820
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3821
        if ( ! aSign ) return a;
3822
        goto invalid;
3823
    }
3824
    if ( aSign ) {
3825
        if ( ( aExp | aSig0 ) == 0 ) return a;
3826
 invalid:
3827
        float_raise( float_flag_invalid );
3828
        z.low = floatx80_default_nan_low;
3829
        z.high = floatx80_default_nan_high;
3830
        return z;
3831
    }
3832
    if ( aExp == 0 ) {
3833
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3834
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3835
    }
3836
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3837
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3838
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3839
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3840
    doubleZSig0 = zSig0<<1;
3841
    mul64To128( zSig0, zSig0, &term0, &term1 );
3842
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3843
    while ( (sbits64) rem0 < 0 ) {
3844
        --zSig0;
3845
        doubleZSig0 -= 2;
3846
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3847
    }
3848
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3849
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3850
        if ( zSig1 == 0 ) zSig1 = 1;
3851
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3852
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3853
        mul64To128( zSig1, zSig1, &term2, &term3 );
3854
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3855
        while ( (sbits64) rem1 < 0 ) {
3856
            --zSig1;
3857
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3858
            term3 |= 1;
3859
            term2 |= doubleZSig0;
3860
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3861
        }
3862
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3863
    }
3864
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3865
    zSig0 |= doubleZSig0;
3866
    return
3867
        roundAndPackFloatx80(
3868
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3869
 
3870
}
3871
 
3872
/*----------------------------------------------------------------------------
3873
| Returns 1 if the extended double-precision floating-point value `a' is
3874
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3875
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3876
| Arithmetic.
3877
*----------------------------------------------------------------------------*/
3878
 
3879
flag floatx80_eq( floatx80 a, floatx80 b )
3880
{
3881
 
3882
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3883
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3884
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3885
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3886
       ) {
3887
        if (    floatx80_is_signaling_nan( a )
3888
             || floatx80_is_signaling_nan( b ) ) {
3889
            float_raise( float_flag_invalid );
3890
        }
3891
        return 0;
3892
    }
3893
    return
3894
           ( a.low == b.low )
3895
        && (    ( a.high == b.high )
3896
             || (    ( a.low == 0 )
3897
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3898
           );
3899
 
3900
}
3901
 
3902
/*----------------------------------------------------------------------------
3903
| Returns 1 if the extended double-precision floating-point value `a' is
3904
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3905
| comparison is performed according to the IEC/IEEE Standard for Binary
3906
| Floating-Point Arithmetic.
3907
*----------------------------------------------------------------------------*/
3908
 
3909
flag floatx80_le( floatx80 a, floatx80 b )
3910
{
3911
    flag aSign, bSign;
3912
 
3913
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3914
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3915
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3916
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3917
       ) {
3918
        float_raise( float_flag_invalid );
3919
        return 0;
3920
    }
3921
    aSign = extractFloatx80Sign( a );
3922
    bSign = extractFloatx80Sign( b );
3923
    if ( aSign != bSign ) {
3924
        return
3925
               aSign
3926
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3927
                 == 0 );
3928
    }
3929
    return
3930
          aSign ? le128( b.high, b.low, a.high, a.low )
3931
        : le128( a.high, a.low, b.high, b.low );
3932
 
3933
}
3934
 
3935
/*----------------------------------------------------------------------------
3936
| Returns 1 if the extended double-precision floating-point value `a' is
3937
| less than the corresponding value `b', and 0 otherwise.  The comparison
3938
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3939
| Arithmetic.
3940
*----------------------------------------------------------------------------*/
3941
 
3942
flag floatx80_lt( floatx80 a, floatx80 b )
3943
{
3944
    flag aSign, bSign;
3945
 
3946
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3947
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3948
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3949
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3950
       ) {
3951
        float_raise( float_flag_invalid );
3952
        return 0;
3953
    }
3954
    aSign = extractFloatx80Sign( a );
3955
    bSign = extractFloatx80Sign( b );
3956
    if ( aSign != bSign ) {
3957
        return
3958
               aSign
3959
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3960
                 != 0 );
3961
    }
3962
    return
3963
          aSign ? lt128( b.high, b.low, a.high, a.low )
3964
        : lt128( a.high, a.low, b.high, b.low );
3965
 
3966
}
3967
 
3968
/*----------------------------------------------------------------------------
3969
| Returns 1 if the extended double-precision floating-point value `a' is equal
3970
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
3971
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3972
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3973
*----------------------------------------------------------------------------*/
3974
 
3975
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3976
{
3977
 
3978
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3979
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3980
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3981
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3982
       ) {
3983
        float_raise( float_flag_invalid );
3984
        return 0;
3985
    }
3986
    return
3987
           ( a.low == b.low )
3988
        && (    ( a.high == b.high )
3989
             || (    ( a.low == 0 )
3990
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3991
           );
3992
 
3993
}
3994
 
3995
/*----------------------------------------------------------------------------
3996
| Returns 1 if the extended double-precision floating-point value `a' is less
3997
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3998
| do not cause an exception.  Otherwise, the comparison is performed according
3999
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4000
*----------------------------------------------------------------------------*/
4001
 
4002
flag floatx80_le_quiet( floatx80 a, floatx80 b )
4003
{
4004
    flag aSign, bSign;
4005
 
4006
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4007
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4008
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4009
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4010
       ) {
4011
        if (    floatx80_is_signaling_nan( a )
4012
             || floatx80_is_signaling_nan( b ) ) {
4013
            float_raise( float_flag_invalid );
4014
        }
4015
        return 0;
4016
    }
4017
    aSign = extractFloatx80Sign( a );
4018
    bSign = extractFloatx80Sign( b );
4019
    if ( aSign != bSign ) {
4020
        return
4021
               aSign
4022
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4023
                 == 0 );
4024
    }
4025
    return
4026
          aSign ? le128( b.high, b.low, a.high, a.low )
4027
        : le128( a.high, a.low, b.high, b.low );
4028
 
4029
}
4030
 
4031
/*----------------------------------------------------------------------------
4032
| Returns 1 if the extended double-precision floating-point value `a' is less
4033
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4034
| an exception.  Otherwise, the comparison is performed according to the
4035
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4036
*----------------------------------------------------------------------------*/
4037
 
4038
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4039
{
4040
    flag aSign, bSign;
4041
 
4042
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4043
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4044
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4045
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4046
       ) {
4047
        if (    floatx80_is_signaling_nan( a )
4048
             || floatx80_is_signaling_nan( b ) ) {
4049
            float_raise( float_flag_invalid );
4050
        }
4051
        return 0;
4052
    }
4053
    aSign = extractFloatx80Sign( a );
4054
    bSign = extractFloatx80Sign( b );
4055
    if ( aSign != bSign ) {
4056
        return
4057
               aSign
4058
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4059
                 != 0 );
4060
    }
4061
    return
4062
          aSign ? lt128( b.high, b.low, a.high, a.low )
4063
        : lt128( a.high, a.low, b.high, b.low );
4064
 
4065
}
4066
 
4067
#endif
4068
 
4069
#ifdef FLOAT128
4070
 
4071
/*----------------------------------------------------------------------------
4072
| Returns the result of converting the quadruple-precision floating-point
4073
| value `a' to the 32-bit two's complement integer format.  The conversion
4074
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4075
| Arithmetic---which means in particular that the conversion is rounded
4076
| according to the current rounding mode.  If `a' is a NaN, the largest
4077
| positive integer is returned.  Otherwise, if the conversion overflows, the
4078
| largest integer with the same sign as `a' is returned.
4079
*----------------------------------------------------------------------------*/
4080
 
4081
int32 float128_to_int32( float128 a )
4082
{
4083
    flag aSign;
4084
    int32 aExp, shiftCount;
4085
    bits64 aSig0, aSig1;
4086
 
4087
    aSig1 = extractFloat128Frac1( a );
4088
    aSig0 = extractFloat128Frac0( a );
4089
    aExp = extractFloat128Exp( a );
4090
    aSign = extractFloat128Sign( a );
4091
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4092
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4093
    aSig0 |= ( aSig1 != 0 );
4094
    shiftCount = 0x4028 - aExp;
4095
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4096
    return roundAndPackInt32( aSign, aSig0 );
4097
 
4098
}
4099
 
4100
/*----------------------------------------------------------------------------
4101
| Returns the result of converting the quadruple-precision floating-point
4102
| value `a' to the 32-bit two's complement integer format.  The conversion
4103
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4104
| Arithmetic, except that the conversion is always rounded toward zero.  If
4105
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4106
| conversion overflows, the largest integer with the same sign as `a' is
4107
| returned.
4108
*----------------------------------------------------------------------------*/
4109
 
4110
int32 float128_to_int32_round_to_zero( float128 a )
4111
{
4112
    flag aSign;
4113
    int32 aExp, shiftCount;
4114
    bits64 aSig0, aSig1, savedASig;
4115
    int32 z;
4116
 
4117
    aSig1 = extractFloat128Frac1( a );
4118
    aSig0 = extractFloat128Frac0( a );
4119
    aExp = extractFloat128Exp( a );
4120
    aSign = extractFloat128Sign( a );
4121
    aSig0 |= ( aSig1 != 0 );
4122
    if ( 0x401E < aExp ) {
4123
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4124
        goto invalid;
4125
    }
4126
    else if ( aExp < 0x3FFF ) {
4127
        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4128
        return 0;
4129
    }
4130
    aSig0 |= LIT64( 0x0001000000000000 );
4131
    shiftCount = 0x402F - aExp;
4132
    savedASig = aSig0;
4133
    aSig0 >>= shiftCount;
4134
    z = aSig0;
4135
    if ( aSign ) z = - z;
4136
    if ( ( z < 0 ) ^ aSign ) {
4137
 invalid:
4138
        float_raise( float_flag_invalid );
4139
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4140
    }
4141
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4142
        float_exception_flags |= float_flag_inexact;
4143
    }
4144
    return z;
4145
 
4146
}
4147
 
4148
/*----------------------------------------------------------------------------
4149
| Returns the result of converting the quadruple-precision floating-point
4150
| value `a' to the 64-bit two's complement integer format.  The conversion
4151
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4152
| Arithmetic---which means in particular that the conversion is rounded
4153
| according to the current rounding mode.  If `a' is a NaN, the largest
4154
| positive integer is returned.  Otherwise, if the conversion overflows, the
4155
| largest integer with the same sign as `a' is returned.
4156
*----------------------------------------------------------------------------*/
4157
 
4158
int64 float128_to_int64( float128 a )
4159
{
4160
    flag aSign;
4161
    int32 aExp, shiftCount;
4162
    bits64 aSig0, aSig1;
4163
 
4164
    aSig1 = extractFloat128Frac1( a );
4165
    aSig0 = extractFloat128Frac0( a );
4166
    aExp = extractFloat128Exp( a );
4167
    aSign = extractFloat128Sign( a );
4168
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4169
    shiftCount = 0x402F - aExp;
4170
    if ( shiftCount <= 0 ) {
4171
        if ( 0x403E < aExp ) {
4172
            float_raise( float_flag_invalid );
4173
            if (    ! aSign
4174
                 || (    ( aExp == 0x7FFF )
4175
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4176
                    )
4177
               ) {
4178
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4179
            }
4180
            return (sbits64) LIT64( 0x8000000000000000 );
4181
        }
4182
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4183
    }
4184
    else {
4185
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4186
    }
4187
    return roundAndPackInt64( aSign, aSig0, aSig1 );
4188
 
4189
}
4190
 
4191
/*----------------------------------------------------------------------------
4192
| Returns the result of converting the quadruple-precision floating-point
4193
| value `a' to the 64-bit two's complement integer format.  The conversion
4194
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4195
| Arithmetic, except that the conversion is always rounded toward zero.
4196
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4197
| the conversion overflows, the largest integer with the same sign as `a' is
4198
| returned.
4199
*----------------------------------------------------------------------------*/
4200
 
4201
int64 float128_to_int64_round_to_zero( float128 a )
4202
{
4203
    flag aSign;
4204
    int32 aExp, shiftCount;
4205
    bits64 aSig0, aSig1;
4206
    int64 z;
4207
 
4208
    aSig1 = extractFloat128Frac1( a );
4209
    aSig0 = extractFloat128Frac0( a );
4210
    aExp = extractFloat128Exp( a );
4211
    aSign = extractFloat128Sign( a );
4212
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4213
    shiftCount = aExp - 0x402F;
4214
    if ( 0 < shiftCount ) {
4215
        if ( 0x403E <= aExp ) {
4216
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4217
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4218
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4219
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4220
            }
4221
            else {
4222
                float_raise( float_flag_invalid );
4223
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4224
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4225
                }
4226
            }
4227
            return (sbits64) LIT64( 0x8000000000000000 );
4228
        }
4229
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4230
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4231
            float_exception_flags |= float_flag_inexact;
4232
        }
4233
    }
4234
    else {
4235
        if ( aExp < 0x3FFF ) {
4236
            if ( aExp | aSig0 | aSig1 ) {
4237
                float_exception_flags |= float_flag_inexact;
4238
            }
4239
            return 0;
4240
        }
4241
        z = aSig0>>( - shiftCount );
4242
        if (    aSig1
4243
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4244
            float_exception_flags |= float_flag_inexact;
4245
        }
4246
    }
4247
    if ( aSign ) z = - z;
4248
    return z;
4249
 
4250
}
4251
 
4252
/*----------------------------------------------------------------------------
4253
| Returns the result of converting the quadruple-precision floating-point
4254
| value `a' to the single-precision floating-point format.  The conversion
4255
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4256
| Arithmetic.
4257
*----------------------------------------------------------------------------*/
4258
 
4259
float32 float128_to_float32( float128 a )
4260
{
4261
    flag aSign;
4262
    int32 aExp;
4263
    bits64 aSig0, aSig1;
4264
    bits32 zSig;
4265
 
4266
    aSig1 = extractFloat128Frac1( a );
4267
    aSig0 = extractFloat128Frac0( a );
4268
    aExp = extractFloat128Exp( a );
4269
    aSign = extractFloat128Sign( a );
4270
    if ( aExp == 0x7FFF ) {
4271
        if ( aSig0 | aSig1 ) {
4272
            return commonNaNToFloat32( float128ToCommonNaN( a ) );
4273
        }
4274
        return packFloat32( aSign, 0xFF, 0 );
4275
    }
4276
    aSig0 |= ( aSig1 != 0 );
4277
    shift64RightJamming( aSig0, 18, &aSig0 );
4278
    zSig = aSig0;
4279
    if ( aExp || zSig ) {
4280
        zSig |= 0x40000000;
4281
        aExp -= 0x3F81;
4282
    }
4283
    return roundAndPackFloat32( aSign, aExp, zSig );
4284
 
4285
}
4286
 
4287
/*----------------------------------------------------------------------------
4288
| Returns the result of converting the quadruple-precision floating-point
4289
| value `a' to the double-precision floating-point format.  The conversion
4290
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4291
| Arithmetic.
4292
*----------------------------------------------------------------------------*/
4293
 
4294
float64 float128_to_float64( float128 a )
4295
{
4296
    flag aSign;
4297
    int32 aExp;
4298
    bits64 aSig0, aSig1;
4299
 
4300
    aSig1 = extractFloat128Frac1( a );
4301
    aSig0 = extractFloat128Frac0( a );
4302
    aExp = extractFloat128Exp( a );
4303
    aSign = extractFloat128Sign( a );
4304
    if ( aExp == 0x7FFF ) {
4305
        if ( aSig0 | aSig1 ) {
4306
            return commonNaNToFloat64( float128ToCommonNaN( a ) );
4307
        }
4308
        return packFloat64( aSign, 0x7FF, 0 );
4309
    }
4310
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4311
    aSig0 |= ( aSig1 != 0 );
4312
    if ( aExp || aSig0 ) {
4313
        aSig0 |= LIT64( 0x4000000000000000 );
4314
        aExp -= 0x3C01;
4315
    }
4316
    return roundAndPackFloat64( aSign, aExp, aSig0 );
4317
 
4318
}
4319
 
4320
#ifdef FLOATX80
4321
 
4322
/*----------------------------------------------------------------------------
4323
| Returns the result of converting the quadruple-precision floating-point
4324
| value `a' to the extended double-precision floating-point format.  The
4325
| conversion is performed according to the IEC/IEEE Standard for Binary
4326
| Floating-Point Arithmetic.
4327
*----------------------------------------------------------------------------*/
4328
 
4329
floatx80 float128_to_floatx80( float128 a )
4330
{
4331
    flag aSign;
4332
    int32 aExp;
4333
    bits64 aSig0, aSig1;
4334
 
4335
    aSig1 = extractFloat128Frac1( a );
4336
    aSig0 = extractFloat128Frac0( a );
4337
    aExp = extractFloat128Exp( a );
4338
    aSign = extractFloat128Sign( a );
4339
    if ( aExp == 0x7FFF ) {
4340
        if ( aSig0 | aSig1 ) {
4341
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4342
        }
4343
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4344
    }
4345
    if ( aExp == 0 ) {
4346
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4347
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4348
    }
4349
    else {
4350
        aSig0 |= LIT64( 0x0001000000000000 );
4351
    }
4352
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4353
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4354
 
4355
}
4356
 
4357
#endif
4358
 
4359
/*----------------------------------------------------------------------------
4360
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4361
| returns the result as a quadruple-precision floating-point value.  The
4362
| operation is performed according to the IEC/IEEE Standard for Binary
4363
| Floating-Point Arithmetic.
4364
*----------------------------------------------------------------------------*/
4365
 
4366
float128 float128_round_to_int( float128 a )
4367
{
4368
    flag aSign;
4369
    int32 aExp;
4370
    bits64 lastBitMask, roundBitsMask;
4371
    int8 roundingMode;
4372
    float128 z;
4373
 
4374
    aExp = extractFloat128Exp( a );
4375
    if ( 0x402F <= aExp ) {
4376
        if ( 0x406F <= aExp ) {
4377
            if (    ( aExp == 0x7FFF )
4378
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4379
               ) {
4380
                return propagateFloat128NaN( a, a );
4381
            }
4382
            return a;
4383
        }
4384
        lastBitMask = 1;
4385
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4386
        roundBitsMask = lastBitMask - 1;
4387
        z = a;
4388
        roundingMode = float_rounding_mode;
4389
        if ( roundingMode == float_round_nearest_even ) {
4390
            if ( lastBitMask ) {
4391
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4392
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4393
            }
4394
            else {
4395
                if ( (sbits64) z.low < 0 ) {
4396
                    ++z.high;
4397
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4398
                }
4399
            }
4400
        }
4401
        else if ( roundingMode != float_round_to_zero ) {
4402
            if (   extractFloat128Sign( z )
4403
                 ^ ( roundingMode == float_round_up ) ) {
4404
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4405
            }
4406
        }
4407
        z.low &= ~ roundBitsMask;
4408
    }
4409
    else {
4410
        if ( aExp < 0x3FFF ) {
4411
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4412
            float_exception_flags |= float_flag_inexact;
4413
            aSign = extractFloat128Sign( a );
4414
            switch ( float_rounding_mode ) {
4415
             case float_round_nearest_even:
4416
                if (    ( aExp == 0x3FFE )
4417
                     && (   extractFloat128Frac0( a )
4418
                          | extractFloat128Frac1( a ) )
4419
                   ) {
4420
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4421
                }
4422
                break;
4423
             case float_round_down:
4424
                return
4425
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4426
                    : packFloat128( 0, 0, 0, 0 );
4427
             case float_round_up:
4428
                return
4429
                      aSign ? packFloat128( 1, 0, 0, 0 )
4430
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4431
            }
4432
            return packFloat128( aSign, 0, 0, 0 );
4433
        }
4434
        lastBitMask = 1;
4435
        lastBitMask <<= 0x402F - aExp;
4436
        roundBitsMask = lastBitMask - 1;
4437
        z.low = 0;
4438
        z.high = a.high;
4439
        roundingMode = float_rounding_mode;
4440
        if ( roundingMode == float_round_nearest_even ) {
4441
            z.high += lastBitMask>>1;
4442
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4443
                z.high &= ~ lastBitMask;
4444
            }
4445
        }
4446
        else if ( roundingMode != float_round_to_zero ) {
4447
            if (   extractFloat128Sign( z )
4448
                 ^ ( roundingMode == float_round_up ) ) {
4449
                z.high |= ( a.low != 0 );
4450
                z.high += roundBitsMask;
4451
            }
4452
        }
4453
        z.high &= ~ roundBitsMask;
4454
    }
4455
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4456
        float_exception_flags |= float_flag_inexact;
4457
    }
4458
    return z;
4459
 
4460
}
4461
 
4462
/*----------------------------------------------------------------------------
4463
| Returns the result of adding the absolute values of the quadruple-precision
4464
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4465
| before being returned.  `zSign' is ignored if the result is a NaN.
4466
| The addition is performed according to the IEC/IEEE Standard for Binary
4467
| Floating-Point Arithmetic.
4468
*----------------------------------------------------------------------------*/
4469
 
4470
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4471
{
4472
    int32 aExp, bExp, zExp;
4473
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4474
    int32 expDiff;
4475
 
4476
    aSig1 = extractFloat128Frac1( a );
4477
    aSig0 = extractFloat128Frac0( a );
4478
    aExp = extractFloat128Exp( a );
4479
    bSig1 = extractFloat128Frac1( b );
4480
    bSig0 = extractFloat128Frac0( b );
4481
    bExp = extractFloat128Exp( b );
4482
    expDiff = aExp - bExp;
4483
    if ( 0 < expDiff ) {
4484
        if ( aExp == 0x7FFF ) {
4485
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4486
            return a;
4487
        }
4488
        if ( bExp == 0 ) {
4489
            --expDiff;
4490
        }
4491
        else {
4492
            bSig0 |= LIT64( 0x0001000000000000 );
4493
        }
4494
        shift128ExtraRightJamming(
4495
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4496
        zExp = aExp;
4497
    }
4498
    else if ( expDiff < 0 ) {
4499
        if ( bExp == 0x7FFF ) {
4500
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4501
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4502
        }
4503
        if ( aExp == 0 ) {
4504
            ++expDiff;
4505
        }
4506
        else {
4507
            aSig0 |= LIT64( 0x0001000000000000 );
4508
        }
4509
        shift128ExtraRightJamming(
4510
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4511
        zExp = bExp;
4512
    }
4513
    else {
4514
        if ( aExp == 0x7FFF ) {
4515
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4516
                return propagateFloat128NaN( a, b );
4517
            }
4518
            return a;
4519
        }
4520
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4521
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4522
        zSig2 = 0;
4523
        zSig0 |= LIT64( 0x0002000000000000 );
4524
        zExp = aExp;
4525
        goto shiftRight1;
4526
    }
4527
    aSig0 |= LIT64( 0x0001000000000000 );
4528
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4529
    --zExp;
4530
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4531
    ++zExp;
4532
 shiftRight1:
4533
    shift128ExtraRightJamming(
4534
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4535
 roundAndPack:
4536
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4537
 
4538
}
4539
 
4540
/*----------------------------------------------------------------------------
4541
| Returns the result of subtracting the absolute values of the quadruple-
4542
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4543
| difference is negated before being returned.  `zSign' is ignored if the
4544
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4545
| Standard for Binary Floating-Point Arithmetic.
4546
*----------------------------------------------------------------------------*/
4547
 
4548
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4549
{
4550
    int32 aExp, bExp, zExp;
4551
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4552
    int32 expDiff;
4553
    float128 z;
4554
 
4555
    aSig1 = extractFloat128Frac1( a );
4556
    aSig0 = extractFloat128Frac0( a );
4557
    aExp = extractFloat128Exp( a );
4558
    bSig1 = extractFloat128Frac1( b );
4559
    bSig0 = extractFloat128Frac0( b );
4560
    bExp = extractFloat128Exp( b );
4561
    expDiff = aExp - bExp;
4562
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4563
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4564
    if ( 0 < expDiff ) goto aExpBigger;
4565
    if ( expDiff < 0 ) goto bExpBigger;
4566
    if ( aExp == 0x7FFF ) {
4567
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4568
            return propagateFloat128NaN( a, b );
4569
        }
4570
        float_raise( float_flag_invalid );
4571
        z.low = float128_default_nan_low;
4572
        z.high = float128_default_nan_high;
4573
        return z;
4574
    }
4575
    if ( aExp == 0 ) {
4576
        aExp = 1;
4577
        bExp = 1;
4578
    }
4579
    if ( bSig0 < aSig0 ) goto aBigger;
4580
    if ( aSig0 < bSig0 ) goto bBigger;
4581
    if ( bSig1 < aSig1 ) goto aBigger;
4582
    if ( aSig1 < bSig1 ) goto bBigger;
4583
    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4584
 bExpBigger:
4585
    if ( bExp == 0x7FFF ) {
4586
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4587
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4588
    }
4589
    if ( aExp == 0 ) {
4590
        ++expDiff;
4591
    }
4592
    else {
4593
        aSig0 |= LIT64( 0x4000000000000000 );
4594
    }
4595
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4596
    bSig0 |= LIT64( 0x4000000000000000 );
4597
 bBigger:
4598
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4599
    zExp = bExp;
4600
    zSign ^= 1;
4601
    goto normalizeRoundAndPack;
4602
 aExpBigger:
4603
    if ( aExp == 0x7FFF ) {
4604
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4605
        return a;
4606
    }
4607
    if ( bExp == 0 ) {
4608
        --expDiff;
4609
    }
4610
    else {
4611
        bSig0 |= LIT64( 0x4000000000000000 );
4612
    }
4613
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4614
    aSig0 |= LIT64( 0x4000000000000000 );
4615
 aBigger:
4616
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4617
    zExp = aExp;
4618
 normalizeRoundAndPack:
4619
    --zExp;
4620
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4621
 
4622
}
4623
 
4624
/*----------------------------------------------------------------------------
4625
| Returns the result of adding the quadruple-precision floating-point values
4626
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4627
| for Binary Floating-Point Arithmetic.
4628
*----------------------------------------------------------------------------*/
4629
 
4630
float128 float128_add( float128 a, float128 b )
4631
{
4632
    flag aSign, bSign;
4633
 
4634
    aSign = extractFloat128Sign( a );
4635
    bSign = extractFloat128Sign( b );
4636
    if ( aSign == bSign ) {
4637
        return addFloat128Sigs( a, b, aSign );
4638
    }
4639
    else {
4640
        return subFloat128Sigs( a, b, aSign );
4641
    }
4642
 
4643
}
4644
 
4645
/*----------------------------------------------------------------------------
4646
| Returns the result of subtracting the quadruple-precision floating-point
4647
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4648
| Standard for Binary Floating-Point Arithmetic.
4649
*----------------------------------------------------------------------------*/
4650
 
4651
float128 float128_sub( float128 a, float128 b )
4652
{
4653
    flag aSign, bSign;
4654
 
4655
    aSign = extractFloat128Sign( a );
4656
    bSign = extractFloat128Sign( b );
4657
    if ( aSign == bSign ) {
4658
        return subFloat128Sigs( a, b, aSign );
4659
    }
4660
    else {
4661
        return addFloat128Sigs( a, b, aSign );
4662
    }
4663
 
4664
}
4665
 
4666
/*----------------------------------------------------------------------------
4667
| Returns the result of multiplying the quadruple-precision floating-point
4668
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4669
| Standard for Binary Floating-Point Arithmetic.
4670
*----------------------------------------------------------------------------*/
4671
 
4672
float128 float128_mul( float128 a, float128 b )
4673
{
4674
    flag aSign, bSign, zSign;
4675
    int32 aExp, bExp, zExp;
4676
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4677
    float128 z;
4678
 
4679
    aSig1 = extractFloat128Frac1( a );
4680
    aSig0 = extractFloat128Frac0( a );
4681
    aExp = extractFloat128Exp( a );
4682
    aSign = extractFloat128Sign( a );
4683
    bSig1 = extractFloat128Frac1( b );
4684
    bSig0 = extractFloat128Frac0( b );
4685
    bExp = extractFloat128Exp( b );
4686
    bSign = extractFloat128Sign( b );
4687
    zSign = aSign ^ bSign;
4688
    if ( aExp == 0x7FFF ) {
4689
        if (    ( aSig0 | aSig1 )
4690
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4691
            return propagateFloat128NaN( a, b );
4692
        }
4693
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4694
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4695
    }
4696
    if ( bExp == 0x7FFF ) {
4697
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4698
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4699
 invalid:
4700
            float_raise( float_flag_invalid );
4701
            z.low = float128_default_nan_low;
4702
            z.high = float128_default_nan_high;
4703
            return z;
4704
        }
4705
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4706
    }
4707
    if ( aExp == 0 ) {
4708
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4709
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4710
    }
4711
    if ( bExp == 0 ) {
4712
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4713
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4714
    }
4715
    zExp = aExp + bExp - 0x4000;
4716
    aSig0 |= LIT64( 0x0001000000000000 );
4717
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4718
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4719
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4720
    zSig2 |= ( zSig3 != 0 );
4721
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4722
        shift128ExtraRightJamming(
4723
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4724
        ++zExp;
4725
    }
4726
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4727
 
4728
}
4729
 
4730
/*----------------------------------------------------------------------------
4731
| Returns the result of dividing the quadruple-precision floating-point value
4732
| `a' by the corresponding value `b'.  The operation is performed according to
4733
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4734
*----------------------------------------------------------------------------*/
4735
 
4736
float128 float128_div( float128 a, float128 b )
4737
{
4738
    flag aSign, bSign, zSign;
4739
    int32 aExp, bExp, zExp;
4740
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4741
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4742
    float128 z;
4743
 
4744
    aSig1 = extractFloat128Frac1( a );
4745
    aSig0 = extractFloat128Frac0( a );
4746
    aExp = extractFloat128Exp( a );
4747
    aSign = extractFloat128Sign( a );
4748
    bSig1 = extractFloat128Frac1( b );
4749
    bSig0 = extractFloat128Frac0( b );
4750
    bExp = extractFloat128Exp( b );
4751
    bSign = extractFloat128Sign( b );
4752
    zSign = aSign ^ bSign;
4753
    if ( aExp == 0x7FFF ) {
4754
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4755
        if ( bExp == 0x7FFF ) {
4756
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4757
            goto invalid;
4758
        }
4759
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4760
    }
4761
    if ( bExp == 0x7FFF ) {
4762
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4763
        return packFloat128( zSign, 0, 0, 0 );
4764
    }
4765
    if ( bExp == 0 ) {
4766
        if ( ( bSig0 | bSig1 ) == 0 ) {
4767
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4768
 invalid:
4769
                float_raise( float_flag_invalid );
4770
                z.low = float128_default_nan_low;
4771
                z.high = float128_default_nan_high;
4772
                return z;
4773
            }
4774
            float_raise( float_flag_divbyzero );
4775
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4776
        }
4777
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4778
    }
4779
    if ( aExp == 0 ) {
4780
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4781
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4782
    }
4783
    zExp = aExp - bExp + 0x3FFD;
4784
    shortShift128Left(
4785
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4786
    shortShift128Left(
4787
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4788
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4789
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4790
        ++zExp;
4791
    }
4792
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4793
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4794
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4795
    while ( (sbits64) rem0 < 0 ) {
4796
        --zSig0;
4797
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4798
    }
4799
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4800
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4801
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4802
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4803
        while ( (sbits64) rem1 < 0 ) {
4804
            --zSig1;
4805
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4806
        }
4807
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4808
    }
4809
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4810
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4811
 
4812
}
4813
 
4814
/*----------------------------------------------------------------------------
4815
| Returns the remainder of the quadruple-precision floating-point value `a'
4816
| with respect to the corresponding value `b'.  The operation is performed
4817
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4818
*----------------------------------------------------------------------------*/
4819
 
4820
float128 float128_rem( float128 a, float128 b )
4821
{
4822
    flag aSign, bSign, zSign;
4823
    int32 aExp, bExp, expDiff;
4824
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4825
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4826
    sbits64 sigMean0;
4827
    float128 z;
4828
 
4829
    aSig1 = extractFloat128Frac1( a );
4830
    aSig0 = extractFloat128Frac0( a );
4831
    aExp = extractFloat128Exp( a );
4832
    aSign = extractFloat128Sign( a );
4833
    bSig1 = extractFloat128Frac1( b );
4834
    bSig0 = extractFloat128Frac0( b );
4835
    bExp = extractFloat128Exp( b );
4836
    bSign = extractFloat128Sign( b );
4837
    if ( aExp == 0x7FFF ) {
4838
        if (    ( aSig0 | aSig1 )
4839
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4840
            return propagateFloat128NaN( a, b );
4841
        }
4842
        goto invalid;
4843
    }
4844
    if ( bExp == 0x7FFF ) {
4845
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4846
        return a;
4847
    }
4848
    if ( bExp == 0 ) {
4849
        if ( ( bSig0 | bSig1 ) == 0 ) {
4850
 invalid:
4851
            float_raise( float_flag_invalid );
4852
            z.low = float128_default_nan_low;
4853
            z.high = float128_default_nan_high;
4854
            return z;
4855
        }
4856
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857
    }
4858
    if ( aExp == 0 ) {
4859
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4860
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4861
    }
4862
    expDiff = aExp - bExp;
4863
    if ( expDiff < -1 ) return a;
4864
    shortShift128Left(
4865
        aSig0 | LIT64( 0x0001000000000000 ),
4866
        aSig1,
4867
        15 - ( expDiff < 0 ),
4868
        &aSig0,
4869
        &aSig1
4870
    );
4871
    shortShift128Left(
4872
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4873
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4874
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4875
    expDiff -= 64;
4876
    while ( 0 < expDiff ) {
4877
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4878
        q = ( 4 < q ) ? q - 4 : 0;
4879
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4880
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4881
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4882
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4883
        expDiff -= 61;
4884
    }
4885
    if ( -64 < expDiff ) {
4886
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4887
        q = ( 4 < q ) ? q - 4 : 0;
4888
        q >>= - expDiff;
4889
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4890
        expDiff += 52;
4891
        if ( expDiff < 0 ) {
4892
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4893
        }
4894
        else {
4895
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4896
        }
4897
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4898
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4899
    }
4900
    else {
4901
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4902
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4903
    }
4904
    do {
4905
        alternateASig0 = aSig0;
4906
        alternateASig1 = aSig1;
4907
        ++q;
4908
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4909
    } while ( 0 <= (sbits64) aSig0 );
4910
    add128(
4911
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4912
    if (    ( sigMean0 < 0 )
4913
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4914
        aSig0 = alternateASig0;
4915
        aSig1 = alternateASig1;
4916
    }
4917
    zSign = ( (sbits64) aSig0 < 0 );
4918
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4919
    return
4920
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4921
 
4922
}
4923
 
4924
/*----------------------------------------------------------------------------
4925
| Returns the square root of the quadruple-precision floating-point value `a'.
4926
| The operation is performed according to the IEC/IEEE Standard for Binary
4927
| Floating-Point Arithmetic.
4928
*----------------------------------------------------------------------------*/
4929
 
4930
float128 float128_sqrt( float128 a )
4931
{
4932
    flag aSign;
4933
    int32 aExp, zExp;
4934
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4935
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4936
    float128 z;
4937
 
4938
    aSig1 = extractFloat128Frac1( a );
4939
    aSig0 = extractFloat128Frac0( a );
4940
    aExp = extractFloat128Exp( a );
4941
    aSign = extractFloat128Sign( a );
4942
    if ( aExp == 0x7FFF ) {
4943
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4944
        if ( ! aSign ) return a;
4945
        goto invalid;
4946
    }
4947
    if ( aSign ) {
4948
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4949
 invalid:
4950
        float_raise( float_flag_invalid );
4951
        z.low = float128_default_nan_low;
4952
        z.high = float128_default_nan_high;
4953
        return z;
4954
    }
4955
    if ( aExp == 0 ) {
4956
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4957
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4958
    }
4959
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4960
    aSig0 |= LIT64( 0x0001000000000000 );
4961
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4962
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4963
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4964
    doubleZSig0 = zSig0<<1;
4965
    mul64To128( zSig0, zSig0, &term0, &term1 );
4966
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4967
    while ( (sbits64) rem0 < 0 ) {
4968
        --zSig0;
4969
        doubleZSig0 -= 2;
4970
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4971
    }
4972
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4973
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4974
        if ( zSig1 == 0 ) zSig1 = 1;
4975
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4976
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4977
        mul64To128( zSig1, zSig1, &term2, &term3 );
4978
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4979
        while ( (sbits64) rem1 < 0 ) {
4980
            --zSig1;
4981
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4982
            term3 |= 1;
4983
            term2 |= doubleZSig0;
4984
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4985
        }
4986
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4987
    }
4988
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4989
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4990
 
4991
}
4992
 
4993
/*----------------------------------------------------------------------------
4994
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
4995
| the corresponding value `b', and 0 otherwise.  The comparison is performed
4996
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997
*----------------------------------------------------------------------------*/
4998
 
4999
flag float128_eq( float128 a, float128 b )
5000
{
5001
 
5002
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5003
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5004
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5005
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5006
       ) {
5007
        if (    float128_is_signaling_nan( a )
5008
             || float128_is_signaling_nan( b ) ) {
5009
            float_raise( float_flag_invalid );
5010
        }
5011
        return 0;
5012
    }
5013
    return
5014
           ( a.low == b.low )
5015
        && (    ( a.high == b.high )
5016
             || (    ( a.low == 0 )
5017
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5018
           );
5019
 
5020
}
5021
 
5022
/*----------------------------------------------------------------------------
5023
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5024
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5025
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5026
| Arithmetic.
5027
*----------------------------------------------------------------------------*/
5028
 
5029
flag float128_le( float128 a, float128 b )
5030
{
5031
    flag aSign, bSign;
5032
 
5033
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5034
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5035
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5036
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5037
       ) {
5038
        float_raise( float_flag_invalid );
5039
        return 0;
5040
    }
5041
    aSign = extractFloat128Sign( a );
5042
    bSign = extractFloat128Sign( b );
5043
    if ( aSign != bSign ) {
5044
        return
5045
               aSign
5046
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5047
                 == 0 );
5048
    }
5049
    return
5050
          aSign ? le128( b.high, b.low, a.high, a.low )
5051
        : le128( a.high, a.low, b.high, b.low );
5052
 
5053
}
5054
 
5055
/*----------------------------------------------------------------------------
5056
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5057
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5058
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5059
*----------------------------------------------------------------------------*/
5060
 
5061
flag float128_lt( float128 a, float128 b )
5062
{
5063
    flag aSign, bSign;
5064
 
5065
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5066
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5067
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5068
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5069
       ) {
5070
        float_raise( float_flag_invalid );
5071
        return 0;
5072
    }
5073
    aSign = extractFloat128Sign( a );
5074
    bSign = extractFloat128Sign( b );
5075
    if ( aSign != bSign ) {
5076
        return
5077
               aSign
5078
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5079
                 != 0 );
5080
    }
5081
    return
5082
          aSign ? lt128( b.high, b.low, a.high, a.low )
5083
        : lt128( a.high, a.low, b.high, b.low );
5084
 
5085
}
5086
 
5087
/*----------------------------------------------------------------------------
5088
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5089
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5090
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5091
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5092
*----------------------------------------------------------------------------*/
5093
 
5094
flag float128_eq_signaling( float128 a, float128 b )
5095
{
5096
 
5097
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5098
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5099
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5100
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5101
       ) {
5102
        float_raise( float_flag_invalid );
5103
        return 0;
5104
    }
5105
    return
5106
           ( a.low == b.low )
5107
        && (    ( a.high == b.high )
5108
             || (    ( a.low == 0 )
5109
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5110
           );
5111
 
5112
}
5113
 
5114
/*----------------------------------------------------------------------------
5115
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5116
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5117
| cause an exception.  Otherwise, the comparison is performed according to the
5118
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5119
*----------------------------------------------------------------------------*/
5120
 
5121
flag float128_le_quiet( float128 a, float128 b )
5122
{
5123
    flag aSign, bSign;
5124
 
5125
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5126
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5127
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5128
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5129
       ) {
5130
        if (    float128_is_signaling_nan( a )
5131
             || float128_is_signaling_nan( b ) ) {
5132
            float_raise( float_flag_invalid );
5133
        }
5134
        return 0;
5135
    }
5136
    aSign = extractFloat128Sign( a );
5137
    bSign = extractFloat128Sign( b );
5138
    if ( aSign != bSign ) {
5139
        return
5140
               aSign
5141
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5142
                 == 0 );
5143
    }
5144
    return
5145
          aSign ? le128( b.high, b.low, a.high, a.low )
5146
        : le128( a.high, a.low, b.high, b.low );
5147
 
5148
}
5149
 
5150
/*----------------------------------------------------------------------------
5151
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5152
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5153
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5154
| Standard for Binary Floating-Point Arithmetic.
5155
*----------------------------------------------------------------------------*/
5156
 
5157
flag float128_lt_quiet( float128 a, float128 b )
5158
{
5159
    flag aSign, bSign;
5160
 
5161
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5162
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5163
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5164
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5165
       ) {
5166
        if (    float128_is_signaling_nan( a )
5167
             || float128_is_signaling_nan( b ) ) {
5168
            float_raise( float_flag_invalid );
5169
        }
5170
        return 0;
5171
    }
5172
    aSign = extractFloat128Sign( a );
5173
    bSign = extractFloat128Sign( b );
5174
    if ( aSign != bSign ) {
5175
        return
5176
               aSign
5177
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5178
                 != 0 );
5179
    }
5180
    return
5181
          aSign ? lt128( b.high, b.low, a.high, a.low )
5182
        : lt128( a.high, a.low, b.high, b.low );
5183
 
5184
}
5185
 
5186
#endif
5187
 

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.