OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [or1ksim/] [testsuite/] [test-code-or1k/] [testfloat/] [softfloat.c] - Blame information for rev 414

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 234 jeremybenn
 
2
/*============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
 
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30
 
31
Modified for use with or1ksim's testsuite.
32
 
33
Contributor Julius Baxter <julius.baxter@orsoc.se>
34
 
35
=============================================================================*/
36
 
37
#include "milieu.h"
38
#include "softfloat.h"
39
 
40
/*----------------------------------------------------------------------------
41
| Floating-point rounding mode, extended double-precision rounding precision,
42
| and exception flags.
43
*----------------------------------------------------------------------------*/
44
int8 float_rounding_mode = float_round_nearest_even;
45
int8 float_exception_flags = 0;
46
#ifdef FLOATX80
47
int8 floatx80_rounding_precision = 80;
48
#endif
49
 
50
/*----------------------------------------------------------------------------
51
| Primitive arithmetic functions, including multi-word arithmetic, and
52
| division and square root approximations.  (Can be specialized to target if
53
| desired.)
54
*----------------------------------------------------------------------------*/
55
//#include "softfloat-macros"
56
 
57
/*----------------------------------------------------------------------------
58
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
59
| bits are shifted off, they are ``jammed'' into the least significant bit of
60
| the result by setting the least significant bit to 1.  The value of `count'
61
| can be arbitrarily large; in particular, if `count' is greater than 32, the
62
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
63
| The result is stored in the location pointed to by `zPtr'.
64
*----------------------------------------------------------------------------*/
65
 
66
INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr )
67
{
68
    bits32 z;
69
 
70
    if ( count == 0 ) {
71
        z = a;
72
    }
73
    else if ( count < 32 ) {
74
        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
75
    }
76
    else {
77
        z = ( a != 0 );
78
    }
79
    *zPtr = z;
80
 
81
}
82
 
83
/*----------------------------------------------------------------------------
84
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
85
| bits are shifted off, they are ``jammed'' into the least significant bit of
86
| the result by setting the least significant bit to 1.  The value of `count'
87
| can be arbitrarily large; in particular, if `count' is greater than 64, the
88
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
89
| The result is stored in the location pointed to by `zPtr'.
90
*----------------------------------------------------------------------------*/
91
 
92
INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr )
93
{
94
    bits64 z;
95
 
96
    if ( count == 0 ) {
97
        z = a;
98
    }
99
    else if ( count < 64 ) {
100
        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
101
    }
102
    else {
103
        z = ( a != 0 );
104
    }
105
    *zPtr = z;
106
 
107
}
108
 
109
/*----------------------------------------------------------------------------
110
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
111
| _plus_ the number of bits given in `count'.  The shifted result is at most
112
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
113
| bits shifted off form a second 64-bit result as follows:  The _last_ bit
114
| shifted off is the most-significant bit of the extra result, and the other
115
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
116
| bits shifted off were all zero.  This extra result is stored in the location
117
| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
118
|     (This routine makes more sense if `a0' and `a1' are considered to form
119
| a fixed-point value with binary point between `a0' and `a1'.  This fixed-
120
| point value is shifted right by the number of bits given in `count', and
121
| the integer part of the result is returned at the location pointed to by
122
| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
123
| described above, and is returned at the location pointed to by `z1Ptr'.)
124
*----------------------------------------------------------------------------*/
125
 
126
INLINE void
127
 shift64ExtraRightJamming(
128
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
129
{
130
    bits64 z0, z1;
131
    int8 negCount = ( - count ) & 63;
132
 
133
    if ( count == 0 ) {
134
        z1 = a1;
135
        z0 = a0;
136
    }
137
    else if ( count < 64 ) {
138
        z1 = ( a0<<negCount ) | ( a1 != 0 );
139
        z0 = a0>>count;
140
    }
141
    else {
142
        if ( count == 64 ) {
143
            z1 = a0 | ( a1 != 0 );
144
        }
145
        else {
146
            z1 = ( ( a0 | a1 ) != 0 );
147
        }
148
        z0 = 0;
149
    }
150
    *z1Ptr = z1;
151
    *z0Ptr = z0;
152
 
153
}
154
 
155
/*----------------------------------------------------------------------------
156
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
157
| number of bits given in `count'.  Any bits shifted off are lost.  The value
158
| of `count' can be arbitrarily large; in particular, if `count' is greater
159
| than 128, the result will be 0.  The result is broken into two 64-bit pieces
160
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
161
*----------------------------------------------------------------------------*/
162
// Not used - commenting out to stop werrors during compile
163
/*
164
INLINE void
165
 shift128Right(
166
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
167
{
168
    bits64 z0, z1;
169
    int8 negCount = ( - count ) & 63;
170
 
171
    if ( count == 0 ) {
172
        z1 = a1;
173
        z0 = a0;
174
    }
175
    else if ( count < 64 ) {
176
        z1 = ( a0<<negCount ) | ( a1>>count );
177
        z0 = a0>>count;
178
    }
179
    else {
180
        z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
181
        z0 = 0;
182
    }
183
    *z1Ptr = z1;
184
    *z0Ptr = z0;
185
 
186
}
187
*/
188
/*----------------------------------------------------------------------------
189
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
190
| number of bits given in `count'.  If any nonzero bits are shifted off, they
191
| are ``jammed'' into the least significant bit of the result by setting the
192
| least significant bit to 1.  The value of `count' can be arbitrarily large;
193
| in particular, if `count' is greater than 128, the result will be either
194
| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
195
| nonzero.  The result is broken into two 64-bit pieces which are stored at
196
| the locations pointed to by `z0Ptr' and `z1Ptr'.
197
*----------------------------------------------------------------------------*/
198
 // Not used - commenting out to stop werrors during compile
199
 /*
200
INLINE void
201
 shift128RightJamming(
202
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
203
{
204
    bits64 z0, z1;
205
    int8 negCount = ( - count ) & 63;
206
 
207
    if ( count == 0 ) {
208
        z1 = a1;
209
        z0 = a0;
210
    }
211
    else if ( count < 64 ) {
212
        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
213
        z0 = a0>>count;
214
    }
215
    else {
216
        if ( count == 64 ) {
217
            z1 = a0 | ( a1 != 0 );
218
        }
219
        else if ( count < 128 ) {
220
            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
221
        }
222
        else {
223
            z1 = ( ( a0 | a1 ) != 0 );
224
        }
225
        z0 = 0;
226
    }
227
    *z1Ptr = z1;
228
    *z0Ptr = z0;
229
 
230
}
231
 */
232
/*----------------------------------------------------------------------------
233
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
234
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
235
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
236
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
237
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
238
| the most-significant bit of the extra result, and the other 63 bits of the
239
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
240
| were all zero.  This extra result is stored in the location pointed to by
241
| `z2Ptr'.  The value of `count' can be arbitrarily large.
242
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
243
| to form a fixed-point value with binary point between `a1' and `a2'.  This
244
| fixed-point value is shifted right by the number of bits given in `count',
245
| and the integer part of the result is returned at the locations pointed to
246
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
247
| corrupted as described above, and is returned at the location pointed to by
248
| `z2Ptr'.)
249
*----------------------------------------------------------------------------*/
250
  // Not used - commenting out to stop werrors during compile
251
  /*
252
INLINE void
253
 shift128ExtraRightJamming(
254
     bits64 a0,
255
     bits64 a1,
256
     bits64 a2,
257
     int16 count,
258
     bits64 *z0Ptr,
259
     bits64 *z1Ptr,
260
     bits64 *z2Ptr
261
 )
262
{
263
    bits64 z0, z1, z2;
264
    int8 negCount = ( - count ) & 63;
265
 
266
    if ( count == 0 ) {
267
        z2 = a2;
268
        z1 = a1;
269
        z0 = a0;
270
    }
271
    else {
272
        if ( count < 64 ) {
273
            z2 = a1<<negCount;
274
            z1 = ( a0<<negCount ) | ( a1>>count );
275
            z0 = a0>>count;
276
        }
277
        else {
278
            if ( count == 64 ) {
279
                z2 = a1;
280
                z1 = a0;
281
            }
282
            else {
283
                a2 |= a1;
284
                if ( count < 128 ) {
285
                    z2 = a0<<negCount;
286
                    z1 = a0>>( count & 63 );
287
                }
288
                else {
289
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
290
                    z1 = 0;
291
                }
292
            }
293
            z0 = 0;
294
        }
295
        z2 |= ( a2 != 0 );
296
    }
297
    *z2Ptr = z2;
298
    *z1Ptr = z1;
299
    *z0Ptr = z0;
300
 
301
}
302
*/
303
/*----------------------------------------------------------------------------
304
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
305
| number of bits given in `count'.  Any bits shifted off are lost.  The value
306
| of `count' must be less than 64.  The result is broken into two 64-bit
307
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
308
*----------------------------------------------------------------------------*/
309
   // Not used - commenting out to stop werrors during compile
310
   /*
311
INLINE void
312
 shortShift128Left(
313
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
314
{
315
 
316
    *z1Ptr = a1<<count;
317
    *z0Ptr =
318
        ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) );
319
 
320
}
321
*/
322
/*----------------------------------------------------------------------------
323
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
324
| by the number of bits given in `count'.  Any bits shifted off are lost.
325
| The value of `count' must be less than 64.  The result is broken into three
326
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
327
| `z1Ptr', and `z2Ptr'.
328
*----------------------------------------------------------------------------*/
329
    // Not used - commenting out to stop werrors during compile
330
    /*
331
INLINE void
332
 shortShift192Left(
333
     bits64 a0,
334
     bits64 a1,
335
     bits64 a2,
336
     int16 count,
337
     bits64 *z0Ptr,
338
     bits64 *z1Ptr,
339
     bits64 *z2Ptr
340
 )
341
{
342
    bits64 z0, z1, z2;
343
    int8 negCount;
344
 
345
    z2 = a2<<count;
346
    z1 = a1<<count;
347
    z0 = a0<<count;
348
    if ( 0 < count ) {
349
        negCount = ( ( - count ) & 63 );
350
        z1 |= a2>>negCount;
351
        z0 |= a1>>negCount;
352
    }
353
    *z2Ptr = z2;
354
    *z1Ptr = z1;
355
    *z0Ptr = z0;
356
 
357
}
358
*/
359
/*----------------------------------------------------------------------------
360
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
361
| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
362
| any carry out is lost.  The result is broken into two 64-bit pieces which
363
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
364
*----------------------------------------------------------------------------*/
365
 
366
INLINE void
367
 add128(
368
     bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
369
{
370
    bits64 z1;
371
 
372
    z1 = a1 + b1;
373
    *z1Ptr = z1;
374
    *z0Ptr = a0 + b0 + ( z1 < a1 );
375
 
376
}
377
 
378
/*----------------------------------------------------------------------------
379
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
380
| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
381
| modulo 2^192, so any carry out is lost.  The result is broken into three
382
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
383
| `z1Ptr', and `z2Ptr'.
384
*----------------------------------------------------------------------------*/
385
// Not used - commenting out to stop werrors during compile
386
/*
387
INLINE void
388
 add192(
389
     bits64 a0,
390
     bits64 a1,
391
     bits64 a2,
392
     bits64 b0,
393
     bits64 b1,
394
     bits64 b2,
395
     bits64 *z0Ptr,
396
     bits64 *z1Ptr,
397
     bits64 *z2Ptr
398
 )
399
{
400
    bits64 z0, z1, z2;
401
    int8 carry0, carry1;
402
 
403
    z2 = a2 + b2;
404
    carry1 = ( z2 < a2 );
405
    z1 = a1 + b1;
406
    carry0 = ( z1 < a1 );
407
    z0 = a0 + b0;
408
    z1 += carry1;
409
    z0 += ( z1 < carry1 );
410
    z0 += carry0;
411
    *z2Ptr = z2;
412
    *z1Ptr = z1;
413
    *z0Ptr = z0;
414
 
415
}
416
*/
417
/*----------------------------------------------------------------------------
418
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
419
| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
420
| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
421
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
422
| `z1Ptr'.
423
*----------------------------------------------------------------------------*/
424
 
425
INLINE void
426
 sub128(
427
     bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
428
{
429
 
430
    *z1Ptr = a1 - b1;
431
    *z0Ptr = a0 - b0 - ( a1 < b1 );
432
 
433
}
434
 
435
/*----------------------------------------------------------------------------
436
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
437
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
438
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
439
| result is broken into three 64-bit pieces which are stored at the locations
440
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
441
*----------------------------------------------------------------------------*/
442
// Not used - commenting out to stop werrors during compile
443
/*
444
INLINE void
445
 sub192(
446
     bits64 a0,
447
     bits64 a1,
448
     bits64 a2,
449
     bits64 b0,
450
     bits64 b1,
451
     bits64 b2,
452
     bits64 *z0Ptr,
453
     bits64 *z1Ptr,
454
     bits64 *z2Ptr
455
 )
456
{
457
    bits64 z0, z1, z2;
458
    int8 borrow0, borrow1;
459
 
460
    z2 = a2 - b2;
461
    borrow1 = ( a2 < b2 );
462
    z1 = a1 - b1;
463
    borrow0 = ( a1 < b1 );
464
    z0 = a0 - b0;
465
    z0 -= ( z1 < borrow1 );
466
    z1 -= borrow1;
467
    z0 -= borrow0;
468
    *z2Ptr = z2;
469
    *z1Ptr = z1;
470
    *z0Ptr = z0;
471
 
472
}
473
*/
474
/*----------------------------------------------------------------------------
475
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
476
| into two 64-bit pieces which are stored at the locations pointed to by
477
| `z0Ptr' and `z1Ptr'.
478
*----------------------------------------------------------------------------*/
479
 
480
INLINE void
481
mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr )
482
{
483
    bits32 aHigh, aLow, bHigh, bLow;
484
    bits64 z0, zMiddleA, zMiddleB, z1;
485
 
486
    aLow = a;
487
    aHigh = a>>32;
488
    bLow = b;
489
    bHigh = b>>32;
490
    z1 = ( (bits64) aLow ) * bLow;
491
    zMiddleA = ( (bits64) aLow ) * bHigh;
492
    zMiddleB = ( (bits64) aHigh ) * bLow;
493
    z0 = ( (bits64) aHigh ) * bHigh;
494
    zMiddleA += zMiddleB;
495
    z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
496
    zMiddleA <<= 32;
497
    z1 += zMiddleA;
498
    z0 += ( z1 < zMiddleA );
499
    *z1Ptr = z1;
500
    *z0Ptr = z0;
501
 
502
}
503
 
504
/*----------------------------------------------------------------------------
505
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
506
| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
507
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
508
| `z2Ptr'.
509
*----------------------------------------------------------------------------*/
510
// Not used - commenting out to stop werrors during compile
511
/*
512
INLINE void
513
 mul128By64To192(
514
     bits64 a0,
515
     bits64 a1,
516
     bits64 b,
517
     bits64 *z0Ptr,
518
     bits64 *z1Ptr,
519
     bits64 *z2Ptr
520
 )
521
{
522
    bits64 z0, z1, z2, more1;
523
 
524
    mul64To128( a1, b, &z1, &z2 );
525
    mul64To128( a0, b, &z0, &more1 );
526
    add128( z0, more1, 0, z1, &z0, &z1 );
527
    *z2Ptr = z2;
528
    *z1Ptr = z1;
529
    *z0Ptr = z0;
530
 
531
}
532
*/
533
/*----------------------------------------------------------------------------
534
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
535
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
536
| product.  The product is broken into four 64-bit pieces which are stored at
537
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
538
*----------------------------------------------------------------------------*/
539
 // Not used - commenting out to stop werrors during compile
540
 /*
541
INLINE void
542
 mul128To256(
543
     bits64 a0,
544
     bits64 a1,
545
     bits64 b0,
546
     bits64 b1,
547
     bits64 *z0Ptr,
548
     bits64 *z1Ptr,
549
     bits64 *z2Ptr,
550
     bits64 *z3Ptr
551
 )
552
{
553
    bits64 z0, z1, z2, z3;
554
    bits64 more1, more2;
555
 
556
    mul64To128( a1, b1, &z2, &z3 );
557
    mul64To128( a1, b0, &z1, &more2 );
558
    add128( z1, more2, 0, z2, &z1, &z2 );
559
    mul64To128( a0, b0, &z0, &more1 );
560
    add128( z0, more1, 0, z1, &z0, &z1 );
561
    mul64To128( a0, b1, &more1, &more2 );
562
    add128( more1, more2, 0, z2, &more1, &z2 );
563
    add128( z0, z1, 0, more1, &z0, &z1 );
564
    *z3Ptr = z3;
565
    *z2Ptr = z2;
566
    *z1Ptr = z1;
567
    *z0Ptr = z0;
568
 
569
}
570
 */
571
/*----------------------------------------------------------------------------
572
| Returns an approximation to the 64-bit integer quotient obtained by dividing
573
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
574
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
575
| toward zero, the approximation returned lies between q and q + 2 inclusive.
576
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
577
| unsigned integer is returned.
578
*----------------------------------------------------------------------------*/
579
 
580
static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b )
581
{
582
    bits64 b0, b1;
583
    bits64 rem0, rem1, term0, term1;
584
    bits64 z;
585
 
586
    if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
587
    b0 = b>>32;
588
    z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
589
    mul64To128( b, z, &term0, &term1 );
590
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
591
    while ( ( (sbits64) rem0 ) < 0 ) {
592
        z -= LIT64( 0x100000000 );
593
        b1 = b<<32;
594
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
595
    }
596
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
597
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
598
    return z;
599
 
600
}
601
 
602
/*----------------------------------------------------------------------------
603
| Returns an approximation to the square root of the 32-bit significand given
604
| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
605
| `aExp' (the least significant bit) is 1, the integer returned approximates
606
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
607
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
608
| case, the approximation returned lies strictly within +/-2 of the exact
609
| value.
610
*----------------------------------------------------------------------------*/
611
 
612
static bits32 estimateSqrt32( int16 aExp, bits32 a )
613
{
614
    static const bits16 sqrtOddAdjustments[] = {
615
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
616
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
617
    };
618
    static const bits16 sqrtEvenAdjustments[] = {
619
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
620
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
621
    };
622
    int8 index;
623
    bits32 z;
624
 
625
    index = ( a>>27 ) & 15;
626
    if ( aExp & 1 ) {
627
        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ];
628
        z = ( ( a / z )<<14 ) + ( z<<15 );
629
        a >>= 1;
630
    }
631
    else {
632
        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ];
633
        z = a / z + z;
634
        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
635
        if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 );
636
    }
637
    return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 );
638
 
639
}
640
 
641
/*----------------------------------------------------------------------------
642
| Returns the number of leading 0 bits before the most-significant 1 bit of
643
| `a'.  If `a' is zero, 32 is returned.
644
*----------------------------------------------------------------------------*/
645
 
646
static int8 countLeadingZeros32( bits32 a )
647
{
648
    static const int8 countLeadingZerosHigh[] = {
649
        8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
650
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
651
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
652
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
653
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
655
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
656
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
657
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
658
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
659
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
660
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
661
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
662
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
663
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
664
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
665
    };
666
    int8 shiftCount;
667
 
668
    shiftCount = 0;
669
    if ( a < 0x10000 ) {
670
        shiftCount += 16;
671
        a <<= 16;
672
    }
673
    if ( a < 0x1000000 ) {
674
        shiftCount += 8;
675
        a <<= 8;
676
    }
677
    shiftCount += countLeadingZerosHigh[ a>>24 ];
678
    return shiftCount;
679
 
680
}
681
 
682
/*----------------------------------------------------------------------------
683
| Returns the number of leading 0 bits before the most-significant 1 bit of
684
| `a'.  If `a' is zero, 64 is returned.
685
*----------------------------------------------------------------------------*/
686
 
687
static int8 countLeadingZeros64( bits64 a )
688
{
689
    int8 shiftCount;
690
 
691
    shiftCount = 0;
692
    if ( a < ( (bits64) 1 )<<32 ) {
693
        shiftCount += 32;
694
    }
695
    else {
696
        a >>= 32;
697
    }
698
    shiftCount += countLeadingZeros32( a );
699
    return shiftCount;
700
 
701
}
702
 
703
/*----------------------------------------------------------------------------
704
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
705
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
706
| Otherwise, returns 0.
707
*----------------------------------------------------------------------------*/
708
// Not used - commenting out to stop werrors during compile
709
/*
710
INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
711
{
712
 
713
    return ( a0 == b0 ) && ( a1 == b1 );
714
 
715
}
716
*/
717
/*----------------------------------------------------------------------------
718
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
719
| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
720
| Otherwise, returns 0.
721
*----------------------------------------------------------------------------*/
722
 // Not used - commenting out to stop werrors during compile
723
 /*
724
INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
725
{
726
 
727
    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
728
 
729
}
730
 */
731
/*----------------------------------------------------------------------------
732
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
733
| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
734
| returns 0.
735
*----------------------------------------------------------------------------*/
736
  // Not used - commenting out to stop werrors during compile
737
  /*
738
INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
739
{
740
 
741
    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
742
 
743
}
744
  */
745
/*----------------------------------------------------------------------------
746
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
747
| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
748
| Otherwise, returns 0.
749
*----------------------------------------------------------------------------*/
750
   // Not used - commenting out to stop werrors during compile
751
   /*
752
INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
753
{
754
 
755
    return ( a0 != b0 ) || ( a1 != b1 );
756
 
757
}
758
   */
759
/*----------------------------------------------------------------------------
760
| Functions and definitions to determine:  (1) whether tininess for underflow
761
| is detected before or after rounding by default, (2) what (if anything)
762
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
763
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
764
| are propagated from function inputs to output.  These details are target-
765
| specific.
766
*----------------------------------------------------------------------------*/
767
//#include "softfloat-specialize"
768
 
769
/*----------------------------------------------------------------------------
770
| Underflow tininess-detection mode, statically initialized to default value.
771
| (The declaration in `softfloat.h' must match the `int8' type here.)
772
*----------------------------------------------------------------------------*/
773
int8 float_detect_tininess = float_tininess_after_rounding;
774
 
775
/*----------------------------------------------------------------------------
776
| Raises the exceptions specified by `flags'.  Floating-point traps can be
777
| defined here if desired.  It is currently not possible for such a trap
778
| to substitute a result value.  If traps are not implemented, this routine
779
| should be simply `float_exception_flags |= flags;'.
780
*----------------------------------------------------------------------------*/
781
 
782
void float_raise( int8 flags )
783
{
784
 
785
    float_exception_flags |= flags;
786
 
787
}
788
 
789
/*----------------------------------------------------------------------------
790
| Internal canonical NaN format.
791
*----------------------------------------------------------------------------*/
792
typedef struct {
793
    flag sign;
794
    bits64 high, low;
795
} commonNaNT;
796
 
797
/*----------------------------------------------------------------------------
798
| The pattern for a default generated single-precision NaN.
799
*----------------------------------------------------------------------------*/
800
#define float32_default_nan 0xFFC00000
801
 
802
/*----------------------------------------------------------------------------
803
| Returns 1 if the single-precision floating-point value `a' is a NaN;
804
| otherwise returns 0.
805
*----------------------------------------------------------------------------*/
806
 
807
flag float32_is_nan( float32 a )
808
{
809
 
810
    return ( 0xFF000000 < (bits32) ( a<<1 ) );
811
 
812
}
813
 
814
/*----------------------------------------------------------------------------
815
| Returns 1 if the single-precision floating-point value `a' is a signaling
816
| NaN; otherwise returns 0.
817
*----------------------------------------------------------------------------*/
818
 
819
flag float32_is_signaling_nan( float32 a )
820
{
821
 
822
    return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
823
 
824
}
825
 
826
/*----------------------------------------------------------------------------
827
| Returns the result of converting the single-precision floating-point NaN
828
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
829
| exception is raised.
830
*----------------------------------------------------------------------------*/
831
 
832
static commonNaNT float32ToCommonNaN( float32 a )
833
{
834
    commonNaNT z;
835
 
836
    if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
837
    z.sign = a>>31;
838
    z.low = 0;
839
    z.high = ( (bits64) a )<<41;
840
    return z;
841
 
842
}
843
 
844
/*----------------------------------------------------------------------------
845
| Returns the result of converting the canonical NaN `a' to the single-
846
| precision floating-point format.
847
*----------------------------------------------------------------------------*/
848
 
849
static float32 commonNaNToFloat32( commonNaNT a )
850
{
851
 
852
    return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 );
853
 
854
}
855
 
856
/*----------------------------------------------------------------------------
857
| Takes two single-precision floating-point values `a' and `b', one of which
858
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
859
| signaling NaN, the invalid exception is raised.
860
*----------------------------------------------------------------------------*/
861
 
862
static float32 propagateFloat32NaN( float32 a, float32 b )
863
{
864
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
865
 
866
    aIsNaN = float32_is_nan( a );
867
    aIsSignalingNaN = float32_is_signaling_nan( a );
868
    bIsNaN = float32_is_nan( b );
869
    bIsSignalingNaN = float32_is_signaling_nan( b );
870
    a |= 0x00400000;
871
    b |= 0x00400000;
872
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
873
    if ( aIsSignalingNaN ) {
874
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
875
        return bIsNaN ? b : a;
876
    }
877
    else if ( aIsNaN ) {
878
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
879
 returnLargerSignificand:
880
        if ( (bits32) ( a<<1 ) < (bits32) ( b<<1 ) ) return b;
881
        if ( (bits32) ( b<<1 ) < (bits32) ( a<<1 ) ) return a;
882
        return ( a < b ) ? a : b;
883
    }
884
    else {
885
        return b;
886
    }
887
 
888
}
889
 
890
/*----------------------------------------------------------------------------
891
| The pattern for a default generated double-precision NaN.
892
*----------------------------------------------------------------------------*/
893
#define float64_default_nan LIT64( 0xFFF8000000000000 )
894
 
895
/*----------------------------------------------------------------------------
896
| Returns 1 if the double-precision floating-point value `a' is a NaN;
897
| otherwise returns 0.
898
*----------------------------------------------------------------------------*/
899
 
900
flag float64_is_nan( float64 a )
901
{
902
 
903
    return ( LIT64( 0xFFE0000000000000 ) < (bits64) ( a<<1 ) );
904
 
905
}
906
 
907
/*----------------------------------------------------------------------------
908
| Returns 1 if the double-precision floating-point value `a' is a signaling
909
| NaN; otherwise returns 0.
910
*----------------------------------------------------------------------------*/
911
 
912
flag float64_is_signaling_nan( float64 a )
913
{
914
 
915
    return
916
           ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
917
        && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
918
 
919
}
920
 
921
/*----------------------------------------------------------------------------
922
| Returns the result of converting the double-precision floating-point NaN
923
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
924
| exception is raised.
925
*----------------------------------------------------------------------------*/
926
 
927
static commonNaNT float64ToCommonNaN( float64 a )
928
{
929
    commonNaNT z;
930
 
931
    if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
932
    z.sign = a>>63;
933
    z.low = 0;
934
    z.high = a<<12;
935
    return z;
936
 
937
}
938
 
939
/*----------------------------------------------------------------------------
940
| Returns the result of converting the canonical NaN `a' to the double-
941
| precision floating-point format.
942
*----------------------------------------------------------------------------*/
943
 
944
static float64 commonNaNToFloat64( commonNaNT a )
945
{
946
 
947
    return
948
          ( ( (bits64) a.sign )<<63 )
949
        | LIT64( 0x7FF8000000000000 )
950
        | ( a.high>>12 );
951
 
952
}
953
 
954
/*----------------------------------------------------------------------------
955
| Takes two double-precision floating-point values `a' and `b', one of which
956
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
957
| signaling NaN, the invalid exception is raised.
958
*----------------------------------------------------------------------------*/
959
 
960
static float64 propagateFloat64NaN( float64 a, float64 b )
961
{
962
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
963
 
964
    aIsNaN = float64_is_nan( a );
965
    aIsSignalingNaN = float64_is_signaling_nan( a );
966
    bIsNaN = float64_is_nan( b );
967
    bIsSignalingNaN = float64_is_signaling_nan( b );
968
    a |= LIT64( 0x0008000000000000 );
969
    b |= LIT64( 0x0008000000000000 );
970
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
971
    if ( aIsSignalingNaN ) {
972
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
973
        return bIsNaN ? b : a;
974
    }
975
    else if ( aIsNaN ) {
976
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
977
 returnLargerSignificand:
978
        if ( (bits64) ( a<<1 ) < (bits64) ( b<<1 ) ) return b;
979
        if ( (bits64) ( b<<1 ) < (bits64) ( a<<1 ) ) return a;
980
        return ( a < b ) ? a : b;
981
    }
982
    else {
983
        return b;
984
    }
985
 
986
}
987
 
988
#ifdef FLOATX80
989
 
990
/*----------------------------------------------------------------------------
991
| The pattern for a default generated extended double-precision NaN.  The
992
| `high' and `low' values hold the most- and least-significant bits,
993
| respectively.
994
*----------------------------------------------------------------------------*/
995
#define floatx80_default_nan_high 0xFFFF
996
#define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
997
 
998
/*----------------------------------------------------------------------------
999
| Returns 1 if the extended double-precision floating-point value `a' is a
1000
| NaN; otherwise returns 0.
1001
*----------------------------------------------------------------------------*/
1002
 
1003
flag floatx80_is_nan( floatx80 a )
1004
{
1005
 
1006
    return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 );
1007
 
1008
}
1009
 
1010
/*----------------------------------------------------------------------------
1011
| Returns 1 if the extended double-precision floating-point value `a' is a
1012
| signaling NaN; otherwise returns 0.
1013
*----------------------------------------------------------------------------*/
1014
 
1015
flag floatx80_is_signaling_nan( floatx80 a )
1016
{
1017
    bits64 aLow;
1018
 
1019
    aLow = a.low & ~ LIT64( 0x4000000000000000 );
1020
    return
1021
           ( ( a.high & 0x7FFF ) == 0x7FFF )
1022
        && (bits64) ( aLow<<1 )
1023
        && ( a.low == aLow );
1024
 
1025
}
1026
 
1027
/*----------------------------------------------------------------------------
1028
| Returns the result of converting the extended double-precision floating-
1029
| point NaN `a' to the canonical NaN format.  If `a' is a signaling NaN, the
1030
| invalid exception is raised.
1031
*----------------------------------------------------------------------------*/
1032
 
1033
static commonNaNT floatx80ToCommonNaN( floatx80 a )
1034
{
1035
    commonNaNT z;
1036
 
1037
    if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
1038
    z.sign = a.high>>15;
1039
    z.low = 0;
1040
    z.high = a.low<<1;
1041
    return z;
1042
 
1043
}
1044
 
1045
/*----------------------------------------------------------------------------
1046
| Returns the result of converting the canonical NaN `a' to the extended
1047
| double-precision floating-point format.
1048
*----------------------------------------------------------------------------*/
1049
 
1050
static floatx80 commonNaNToFloatx80( commonNaNT a )
1051
{
1052
    floatx80 z;
1053
 
1054
    z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 );
1055
    z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF;
1056
    return z;
1057
 
1058
}
1059
 
1060
/*----------------------------------------------------------------------------
1061
| Takes two extended double-precision floating-point values `a' and `b', one
1062
| of which is a NaN, and returns the appropriate NaN result.  If either `a' or
1063
| `b' is a signaling NaN, the invalid exception is raised.
1064
*----------------------------------------------------------------------------*/
1065
 
1066
static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b )
1067
{
1068
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
1069
 
1070
    aIsNaN = floatx80_is_nan( a );
1071
    aIsSignalingNaN = floatx80_is_signaling_nan( a );
1072
    bIsNaN = floatx80_is_nan( b );
1073
    bIsSignalingNaN = floatx80_is_signaling_nan( b );
1074
    a.low |= LIT64( 0xC000000000000000 );
1075
    b.low |= LIT64( 0xC000000000000000 );
1076
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
1077
    if ( aIsSignalingNaN ) {
1078
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
1079
        return bIsNaN ? b : a;
1080
    }
1081
    else if ( aIsNaN ) {
1082
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
1083
 returnLargerSignificand:
1084
        if ( a.low < b.low ) return b;
1085
        if ( b.low < a.low ) return a;
1086
        return ( a.high < b.high ) ? a : b;
1087
    }
1088
    else {
1089
        return b;
1090
    }
1091
 
1092
}
1093
 
1094
#endif
1095
 
1096
#ifdef FLOAT128
1097
 
1098
/*----------------------------------------------------------------------------
1099
| The pattern for a default generated quadruple-precision NaN.  The `high' and
1100
| `low' values hold the most- and least-significant bits, respectively.
1101
*----------------------------------------------------------------------------*/
1102
#define float128_default_nan_high LIT64( 0xFFFF800000000000 )
1103
#define float128_default_nan_low  LIT64( 0x0000000000000000 )
1104
 
1105
/*----------------------------------------------------------------------------
1106
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
1107
| otherwise returns 0.
1108
*----------------------------------------------------------------------------*/
1109
 
1110
flag float128_is_nan( float128 a )
1111
{
1112
 
1113
    return
1114
           ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) )
1115
        && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
1116
 
1117
}
1118
 
1119
/*----------------------------------------------------------------------------
1120
| Returns 1 if the quadruple-precision floating-point value `a' is a
1121
| signaling NaN; otherwise returns 0.
1122
*----------------------------------------------------------------------------*/
1123
 
1124
flag float128_is_signaling_nan( float128 a )
1125
{
1126
 
1127
    return
1128
           ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
1129
        && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
1130
 
1131
}
1132
 
1133
/*----------------------------------------------------------------------------
1134
| Returns the result of converting the quadruple-precision floating-point NaN
1135
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
1136
| exception is raised.
1137
*----------------------------------------------------------------------------*/
1138
 
1139
static commonNaNT float128ToCommonNaN( float128 a )
1140
{
1141
    commonNaNT z;
1142
 
1143
    if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
1144
    z.sign = a.high>>63;
1145
    shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
1146
    return z;
1147
 
1148
}
1149
 
1150
/*----------------------------------------------------------------------------
1151
| Returns the result of converting the canonical NaN `a' to the quadruple-
1152
| precision floating-point format.
1153
*----------------------------------------------------------------------------*/
1154
 
1155
static float128 commonNaNToFloat128( commonNaNT a )
1156
{
1157
    float128 z;
1158
 
1159
    shift128Right( a.high, a.low, 16, &z.high, &z.low );
1160
    z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
1161
    return z;
1162
 
1163
}
1164
 
1165
/*----------------------------------------------------------------------------
1166
| Takes two quadruple-precision floating-point values `a' and `b', one of
1167
| which is a NaN, and returns the appropriate NaN result.  If either `a' or
1168
| `b' is a signaling NaN, the invalid exception is raised.
1169
*----------------------------------------------------------------------------*/
1170
 
1171
static float128 propagateFloat128NaN( float128 a, float128 b )
1172
{
1173
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
1174
 
1175
    aIsNaN = float128_is_nan( a );
1176
    aIsSignalingNaN = float128_is_signaling_nan( a );
1177
    bIsNaN = float128_is_nan( b );
1178
    bIsSignalingNaN = float128_is_signaling_nan( b );
1179
    a.high |= LIT64( 0x0000800000000000 );
1180
    b.high |= LIT64( 0x0000800000000000 );
1181
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
1182
    if ( aIsSignalingNaN ) {
1183
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
1184
        return bIsNaN ? b : a;
1185
    }
1186
    else if ( aIsNaN ) {
1187
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
1188
 returnLargerSignificand:
1189
        if ( lt128( a.high<<1, a.low, b.high<<1, b.low ) ) return b;
1190
        if ( lt128( b.high<<1, b.low, a.high<<1, a.low ) ) return a;
1191
        return ( a.high < b.high ) ? a : b;
1192
    }
1193
    else {
1194
        return b;
1195
    }
1196
 
1197
}
1198
 
1199
#endif
1200
 
1201
 
1202
 
1203
/*----------------------------------------------------------------------------
1204
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1205
| and 7, and returns the properly rounded 32-bit integer corresponding to the
1206
| input.  If `zSign' is 1, the input is negated before being converted to an
1207
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1208
| is simply rounded to an integer, with the inexact exception raised if the
1209
| input cannot be represented exactly as an integer.  However, if the fixed-
1210
| point input is too large, the invalid exception is raised and the largest
1211
| positive or negative integer is returned.
1212
*----------------------------------------------------------------------------*/
1213
 
1214
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
1215
{
1216
    int8 roundingMode;
1217
    flag roundNearestEven;
1218
    int8 roundIncrement, roundBits;
1219
    int32 z;
1220
 
1221
    roundingMode = float_rounding_mode;
1222
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1223
    roundIncrement = 0x40;
1224
    if ( ! roundNearestEven ) {
1225
        if ( roundingMode == float_round_to_zero ) {
1226
            roundIncrement = 0;
1227
        }
1228
        else {
1229
            roundIncrement = 0x7F;
1230
            if ( zSign ) {
1231
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1232
            }
1233
            else {
1234
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1235
            }
1236
        }
1237
    }
1238
    roundBits = absZ & 0x7F;
1239
    absZ = ( absZ + roundIncrement )>>7;
1240
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1241
    z = absZ;
1242
    if ( zSign ) z = - z;
1243
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1244
        float_raise( float_flag_invalid );
1245
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1246
    }
1247
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1248
    return z;
1249
 
1250
}
1251
 
1252
/*----------------------------------------------------------------------------
1253
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1254
| `absZ1', with binary point between bits 63 and 64 (between the input words),
1255
| and returns the properly rounded 64-bit integer corresponding to the input.
1256
| If `zSign' is 1, the input is negated before being converted to an integer.
1257
| Ordinarily, the fixed-point input is simply rounded to an integer, with
1258
| the inexact exception raised if the input cannot be represented exactly as
1259
| an integer.  However, if the fixed-point input is too large, the invalid
1260
| exception is raised and the largest positive or negative integer is
1261
| returned.
1262
*----------------------------------------------------------------------------*/
1263
 
1264
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
1265
{
1266
    int8 roundingMode;
1267
    flag roundNearestEven, increment;
1268
    int64 z;
1269
 
1270
    roundingMode = float_rounding_mode;
1271
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1272
    increment = ( (sbits64) absZ1 < 0 );
1273
    if ( ! roundNearestEven ) {
1274
        if ( roundingMode == float_round_to_zero ) {
1275
            increment = 0;
1276
        }
1277
        else {
1278
            if ( zSign ) {
1279
                increment = ( roundingMode == float_round_down ) && absZ1;
1280
            }
1281
            else {
1282
                increment = ( roundingMode == float_round_up ) && absZ1;
1283
            }
1284
        }
1285
    }
1286
    if ( increment ) {
1287
        ++absZ0;
1288
        if ( absZ0 == 0 ) goto overflow;
1289
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1290
    }
1291
    z = absZ0;
1292
    if ( zSign ) z = - z;
1293
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
1294
 overflow:
1295
        float_raise( float_flag_invalid );
1296
        return
1297
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
1298
            : LIT64( 0x7FFFFFFFFFFFFFFF );
1299
    }
1300
    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
1301
    return z;
1302
 
1303
}
1304
 
1305
/*----------------------------------------------------------------------------
1306
| Returns the fraction bits of the single-precision floating-point value `a'.
1307
*----------------------------------------------------------------------------*/
1308
 
1309
INLINE bits32 extractFloat32Frac( float32 a )
1310
{
1311
 
1312
    return a & 0x007FFFFF;
1313
 
1314
}
1315
 
1316
/*----------------------------------------------------------------------------
1317
| Returns the exponent bits of the single-precision floating-point value `a'.
1318
*----------------------------------------------------------------------------*/
1319
 
1320
INLINE int16 extractFloat32Exp( float32 a )
1321
{
1322
 
1323
    return ( a>>23 ) & 0xFF;
1324
 
1325
}
1326
 
1327
/*----------------------------------------------------------------------------
1328
| Returns the sign bit of the single-precision floating-point value `a'.
1329
*----------------------------------------------------------------------------*/
1330
 
1331
INLINE flag extractFloat32Sign( float32 a )
1332
{
1333
 
1334
    return a>>31;
1335
 
1336
}
1337
 
1338
/*----------------------------------------------------------------------------
1339
| Normalizes the subnormal single-precision floating-point value represented
1340
| by the denormalized significand `aSig'.  The normalized exponent and
1341
| significand are stored at the locations pointed to by `zExpPtr' and
1342
| `zSigPtr', respectively.
1343
*----------------------------------------------------------------------------*/
1344
 
1345
static void
1346
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
1347
{
1348
    int8 shiftCount;
1349
 
1350
    shiftCount = countLeadingZeros32( aSig ) - 8;
1351
    *zSigPtr = aSig<<shiftCount;
1352
    *zExpPtr = 1 - shiftCount;
1353
 
1354
}
1355
 
1356
/*----------------------------------------------------------------------------
1357
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1358
| single-precision floating-point value, returning the result.  After being
1359
| shifted into the proper positions, the three fields are simply added
1360
| together to form the result.  This means that any integer portion of `zSig'
1361
| will be added into the exponent.  Since a properly normalized significand
1362
| will have an integer portion equal to 1, the `zExp' input should be 1 less
1363
| than the desired result exponent whenever `zSig' is a complete, normalized
1364
| significand.
1365
*----------------------------------------------------------------------------*/
1366
 
1367
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
1368
{
1369
 
1370
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
1371
 
1372
}
1373
 
1374
/*----------------------------------------------------------------------------
1375
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1376
| and significand `zSig', and returns the proper single-precision floating-
1377
| point value corresponding to the abstract input.  Ordinarily, the abstract
1378
| value is simply rounded and packed into the single-precision format, with
1379
| the inexact exception raised if the abstract input cannot be represented
1380
| exactly.  However, if the abstract value is too large, the overflow and
1381
| inexact exceptions are raised and an infinity or maximal finite value is
1382
| returned.  If the abstract value is too small, the input value is rounded to
1383
| a subnormal number, and the underflow and inexact exceptions are raised if
1384
| the abstract input cannot be represented exactly as a subnormal single-
1385
| precision floating-point number.
1386
|     The input significand `zSig' has its binary point between bits 30
1387
| and 29, which is 7 bits to the left of the usual location.  This shifted
1388
| significand must be normalized or smaller.  If `zSig' is not normalized,
1389
| `zExp' must be 0; in that case, the result returned is a subnormal number,
1390
| and it must not require rounding.  In the usual case that `zSig' is
1391
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1392
| The handling of underflow and overflow follows the IEC/IEEE Standard for
1393
| Binary Floating-Point Arithmetic.
1394
*----------------------------------------------------------------------------*/
1395
 
1396
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
1397
{
1398
    int8 roundingMode;
1399
    flag roundNearestEven;
1400
    int8 roundIncrement, roundBits;
1401
    flag isTiny;
1402
 
1403
    roundingMode = float_rounding_mode;
1404
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1405
    roundIncrement = 0x40;
1406
    if ( ! roundNearestEven ) {
1407
        if ( roundingMode == float_round_to_zero ) {
1408
            roundIncrement = 0;
1409
        }
1410
        else {
1411
            roundIncrement = 0x7F;
1412
            if ( zSign ) {
1413
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1414
            }
1415
            else {
1416
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1417
            }
1418
        }
1419
    }
1420
    roundBits = zSig & 0x7F;
1421
    if ( 0xFD <= (bits16) zExp ) {
1422
        if (    ( 0xFD < zExp )
1423
             || (    ( zExp == 0xFD )
1424
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
1425
           ) {
1426
            float_raise( float_flag_overflow | float_flag_inexact );
1427
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
1428
        }
1429
        if ( zExp < 0 ) {
1430
            isTiny =
1431
                   ( float_detect_tininess == float_tininess_before_rounding )
1432
                || ( zExp < -1 )
1433
                || ( zSig + roundIncrement < 0x80000000 );
1434
            shift32RightJamming( zSig, - zExp, &zSig );
1435
            zExp = 0;
1436
            roundBits = zSig & 0x7F;
1437
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1438
        }
1439
    }
1440
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1441
    zSig = ( zSig + roundIncrement )>>7;
1442
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1443
    if ( zSig == 0 ) zExp = 0;
1444
    return packFloat32( zSign, zExp, zSig );
1445
 
1446
}
1447
 
1448
/*----------------------------------------------------------------------------
1449
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1450
| and significand `zSig', and returns the proper single-precision floating-
1451
| point value corresponding to the abstract input.  This routine is just like
1452
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1453
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1454
| floating-point exponent.
1455
*----------------------------------------------------------------------------*/
1456
 
1457
static float32
1458
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
1459
{
1460
    int8 shiftCount;
1461
 
1462
    shiftCount = countLeadingZeros32( zSig ) - 1;
1463
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
1464
 
1465
}
1466
 
1467
/*----------------------------------------------------------------------------
1468
| Returns the fraction bits of the double-precision floating-point value `a'.
1469
*----------------------------------------------------------------------------*/
1470
 
1471
INLINE bits64 extractFloat64Frac( float64 a )
1472
{
1473
 
1474
    return a & LIT64( 0x000FFFFFFFFFFFFF );
1475
 
1476
}
1477
 
1478
/*----------------------------------------------------------------------------
1479
| Returns the exponent bits of the double-precision floating-point value `a'.
1480
*----------------------------------------------------------------------------*/
1481
 
1482
INLINE int16 extractFloat64Exp( float64 a )
1483
{
1484
 
1485
    return ( a>>52 ) & 0x7FF;
1486
 
1487
}
1488
 
1489
/*----------------------------------------------------------------------------
1490
| Returns the sign bit of the double-precision floating-point value `a'.
1491
*----------------------------------------------------------------------------*/
1492
 
1493
INLINE flag extractFloat64Sign( float64 a )
1494
{
1495
 
1496
    return a>>63;
1497
 
1498
}
1499
 
1500
/*----------------------------------------------------------------------------
1501
| Normalizes the subnormal double-precision floating-point value represented
1502
| by the denormalized significand `aSig'.  The normalized exponent and
1503
| significand are stored at the locations pointed to by `zExpPtr' and
1504
| `zSigPtr', respectively.
1505
*----------------------------------------------------------------------------*/
1506
 
1507
static void
1508
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
1509
{
1510
    int8 shiftCount;
1511
 
1512
    shiftCount = countLeadingZeros64( aSig ) - 11;
1513
    *zSigPtr = aSig<<shiftCount;
1514
    *zExpPtr = 1 - shiftCount;
1515
 
1516
}
1517
 
1518
/*----------------------------------------------------------------------------
1519
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1520
| double-precision floating-point value, returning the result.  After being
1521
| shifted into the proper positions, the three fields are simply added
1522
| together to form the result.  This means that any integer portion of `zSig'
1523
| will be added into the exponent.  Since a properly normalized significand
1524
| will have an integer portion equal to 1, the `zExp' input should be 1 less
1525
| than the desired result exponent whenever `zSig' is a complete, normalized
1526
| significand.
1527
*----------------------------------------------------------------------------*/
1528
 
1529
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
1530
{
1531
 
1532
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
1533
 
1534
}
1535
 
1536
/*----------------------------------------------------------------------------
1537
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1538
| and significand `zSig', and returns the proper double-precision floating-
1539
| point value corresponding to the abstract input.  Ordinarily, the abstract
1540
| value is simply rounded and packed into the double-precision format, with
1541
| the inexact exception raised if the abstract input cannot be represented
1542
| exactly.  However, if the abstract value is too large, the overflow and
1543
| inexact exceptions are raised and an infinity or maximal finite value is
1544
| returned.  If the abstract value is too small, the input value is rounded
1545
| to a subnormal number, and the underflow and inexact exceptions are raised
1546
| if the abstract input cannot be represented exactly as a subnormal double-
1547
| precision floating-point number.
1548
|     The input significand `zSig' has its binary point between bits 62
1549
| and 61, which is 10 bits to the left of the usual location.  This shifted
1550
| significand must be normalized or smaller.  If `zSig' is not normalized,
1551
| `zExp' must be 0; in that case, the result returned is a subnormal number,
1552
| and it must not require rounding.  In the usual case that `zSig' is
1553
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1554
| The handling of underflow and overflow follows the IEC/IEEE Standard for
1555
| Binary Floating-Point Arithmetic.
1556
*----------------------------------------------------------------------------*/
1557
 
1558
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
1559
{
1560
    int8 roundingMode;
1561
    flag roundNearestEven;
1562
    int16 roundIncrement, roundBits;
1563
    flag isTiny;
1564
 
1565
    roundingMode = float_rounding_mode;
1566
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1567
    roundIncrement = 0x200;
1568
    if ( ! roundNearestEven ) {
1569
        if ( roundingMode == float_round_to_zero ) {
1570
            roundIncrement = 0;
1571
        }
1572
        else {
1573
            roundIncrement = 0x3FF;
1574
            if ( zSign ) {
1575
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1576
            }
1577
            else {
1578
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1579
            }
1580
        }
1581
    }
1582
    roundBits = zSig & 0x3FF;
1583
    if ( 0x7FD <= (bits16) zExp ) {
1584
        if (    ( 0x7FD < zExp )
1585
             || (    ( zExp == 0x7FD )
1586
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
1587
           ) {
1588
            float_raise( float_flag_overflow | float_flag_inexact );
1589
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
1590
        }
1591
        if ( zExp < 0 ) {
1592
            isTiny =
1593
                   ( float_detect_tininess == float_tininess_before_rounding )
1594
                || ( zExp < -1 )
1595
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1596
            shift64RightJamming( zSig, - zExp, &zSig );
1597
            zExp = 0;
1598
            roundBits = zSig & 0x3FF;
1599
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1600
        }
1601
    }
1602
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1603
    zSig = ( zSig + roundIncrement )>>10;
1604
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1605
    if ( zSig == 0 ) zExp = 0;
1606
    return packFloat64( zSign, zExp, zSig );
1607
 
1608
}
1609
 
1610
/*----------------------------------------------------------------------------
1611
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1612
| and significand `zSig', and returns the proper double-precision floating-
1613
| point value corresponding to the abstract input.  This routine is just like
1614
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1615
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1616
| floating-point exponent.
1617
*----------------------------------------------------------------------------*/
1618
 
1619
static float64
1620
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
1621
{
1622
    int8 shiftCount;
1623
 
1624
    shiftCount = countLeadingZeros64( zSig ) - 1;
1625
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
1626
 
1627
}
1628
 
1629
#ifdef FLOATX80
1630
 
1631
/*----------------------------------------------------------------------------
1632
| Returns the fraction bits of the extended double-precision floating-point
1633
| value `a'.
1634
*----------------------------------------------------------------------------*/
1635
 
1636
INLINE bits64 extractFloatx80Frac( floatx80 a )
1637
{
1638
 
1639
    return a.low;
1640
 
1641
}
1642
 
1643
/*----------------------------------------------------------------------------
1644
| Returns the exponent bits of the extended double-precision floating-point
1645
| value `a'.
1646
*----------------------------------------------------------------------------*/
1647
 
1648
INLINE int32 extractFloatx80Exp( floatx80 a )
1649
{
1650
 
1651
    return a.high & 0x7FFF;
1652
 
1653
}
1654
 
1655
/*----------------------------------------------------------------------------
1656
| Returns the sign bit of the extended double-precision floating-point value
1657
| `a'.
1658
*----------------------------------------------------------------------------*/
1659
 
1660
INLINE flag extractFloatx80Sign( floatx80 a )
1661
{
1662
 
1663
    return a.high>>15;
1664
 
1665
}
1666
 
1667
/*----------------------------------------------------------------------------
1668
| Normalizes the subnormal extended double-precision floating-point value
1669
| represented by the denormalized significand `aSig'.  The normalized exponent
1670
| and significand are stored at the locations pointed to by `zExpPtr' and
1671
| `zSigPtr', respectively.
1672
*----------------------------------------------------------------------------*/
1673
 
1674
static void
1675
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
1676
{
1677
    int8 shiftCount;
1678
 
1679
    shiftCount = countLeadingZeros64( aSig );
1680
    *zSigPtr = aSig<<shiftCount;
1681
    *zExpPtr = 1 - shiftCount;
1682
 
1683
}
1684
 
1685
/*----------------------------------------------------------------------------
1686
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1687
| extended double-precision floating-point value, returning the result.
1688
*----------------------------------------------------------------------------*/
1689
 
1690
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
1691
{
1692
    floatx80 z;
1693
 
1694
    z.low = zSig;
1695
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
1696
    return z;
1697
 
1698
}
1699
 
1700
/*----------------------------------------------------------------------------
1701
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1702
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
1703
| and returns the proper extended double-precision floating-point value
1704
| corresponding to the abstract input.  Ordinarily, the abstract value is
1705
| rounded and packed into the extended double-precision format, with the
1706
| inexact exception raised if the abstract input cannot be represented
1707
| exactly.  However, if the abstract value is too large, the overflow and
1708
| inexact exceptions are raised and an infinity or maximal finite value is
1709
| returned.  If the abstract value is too small, the input value is rounded to
1710
| a subnormal number, and the underflow and inexact exceptions are raised if
1711
| the abstract input cannot be represented exactly as a subnormal extended
1712
| double-precision floating-point number.
1713
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
1714
| number of bits as single or double precision, respectively.  Otherwise, the
1715
| result is rounded to the full precision of the extended double-precision
1716
| format.
1717
|     The input significand must be normalized or smaller.  If the input
1718
| significand is not normalized, `zExp' must be 0; in that case, the result
1719
| returned is a subnormal number, and it must not require rounding.  The
1720
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1721
| Floating-Point Arithmetic.
1722
*----------------------------------------------------------------------------*/
1723
 
1724
static floatx80
1725
 roundAndPackFloatx80(
1726
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
1727
 )
1728
{
1729
    int8 roundingMode;
1730
    flag roundNearestEven, increment, isTiny;
1731
    int64 roundIncrement, roundMask, roundBits;
1732
 
1733
    roundingMode = float_rounding_mode;
1734
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1735
    if ( roundingPrecision == 80 ) goto precision80;
1736
    if ( roundingPrecision == 64 ) {
1737
        roundIncrement = LIT64( 0x0000000000000400 );
1738
        roundMask = LIT64( 0x00000000000007FF );
1739
    }
1740
    else if ( roundingPrecision == 32 ) {
1741
        roundIncrement = LIT64( 0x0000008000000000 );
1742
        roundMask = LIT64( 0x000000FFFFFFFFFF );
1743
    }
1744
    else {
1745
        goto precision80;
1746
    }
1747
    zSig0 |= ( zSig1 != 0 );
1748
    if ( ! roundNearestEven ) {
1749
        if ( roundingMode == float_round_to_zero ) {
1750
            roundIncrement = 0;
1751
        }
1752
        else {
1753
            roundIncrement = roundMask;
1754
            if ( zSign ) {
1755
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1756
            }
1757
            else {
1758
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1759
            }
1760
        }
1761
    }
1762
    roundBits = zSig0 & roundMask;
1763
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
1764
        if (    ( 0x7FFE < zExp )
1765
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1766
           ) {
1767
            goto overflow;
1768
        }
1769
        if ( zExp <= 0 ) {
1770
            isTiny =
1771
                   ( float_detect_tininess == float_tininess_before_rounding )
1772
                || ( zExp < 0 )
1773
                || ( zSig0 <= zSig0 + roundIncrement );
1774
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1775
            zExp = 0;
1776
            roundBits = zSig0 & roundMask;
1777
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1778
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
1779
            zSig0 += roundIncrement;
1780
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
1781
            roundIncrement = roundMask + 1;
1782
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1783
                roundMask |= roundIncrement;
1784
            }
1785
            zSig0 &= ~ roundMask;
1786
            return packFloatx80( zSign, zExp, zSig0 );
1787
        }
1788
    }
1789
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1790
    zSig0 += roundIncrement;
1791
    if ( zSig0 < roundIncrement ) {
1792
        ++zExp;
1793
        zSig0 = LIT64( 0x8000000000000000 );
1794
    }
1795
    roundIncrement = roundMask + 1;
1796
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1797
        roundMask |= roundIncrement;
1798
    }
1799
    zSig0 &= ~ roundMask;
1800
    if ( zSig0 == 0 ) zExp = 0;
1801
    return packFloatx80( zSign, zExp, zSig0 );
1802
 precision80:
1803
    increment = ( (sbits64) zSig1 < 0 );
1804
    if ( ! roundNearestEven ) {
1805
        if ( roundingMode == float_round_to_zero ) {
1806
            increment = 0;
1807
        }
1808
        else {
1809
            if ( zSign ) {
1810
                increment = ( roundingMode == float_round_down ) && zSig1;
1811
            }
1812
            else {
1813
                increment = ( roundingMode == float_round_up ) && zSig1;
1814
            }
1815
        }
1816
    }
1817
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
1818
        if (    ( 0x7FFE < zExp )
1819
             || (    ( zExp == 0x7FFE )
1820
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
1821
                  && increment
1822
                )
1823
           ) {
1824
            roundMask = 0;
1825
 overflow:
1826
            float_raise( float_flag_overflow | float_flag_inexact );
1827
            if (    ( roundingMode == float_round_to_zero )
1828
                 || ( zSign && ( roundingMode == float_round_up ) )
1829
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1830
               ) {
1831
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
1832
            }
1833
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1834
        }
1835
        if ( zExp <= 0 ) {
1836
            isTiny =
1837
                   ( float_detect_tininess == float_tininess_before_rounding )
1838
                || ( zExp < 0 )
1839
                || ! increment
1840
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
1841
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
1842
            zExp = 0;
1843
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
1844
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
1845
            if ( roundNearestEven ) {
1846
                increment = ( (sbits64) zSig1 < 0 );
1847
            }
1848
            else {
1849
                if ( zSign ) {
1850
                    increment = ( roundingMode == float_round_down ) && zSig1;
1851
                }
1852
                else {
1853
                    increment = ( roundingMode == float_round_up ) && zSig1;
1854
                }
1855
            }
1856
            if ( increment ) {
1857
                ++zSig0;
1858
                zSig0 &=
1859
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1860
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
1861
            }
1862
            return packFloatx80( zSign, zExp, zSig0 );
1863
        }
1864
    }
1865
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
1866
    if ( increment ) {
1867
        ++zSig0;
1868
        if ( zSig0 == 0 ) {
1869
            ++zExp;
1870
            zSig0 = LIT64( 0x8000000000000000 );
1871
        }
1872
        else {
1873
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1874
        }
1875
    }
1876
    else {
1877
        if ( zSig0 == 0 ) zExp = 0;
1878
    }
1879
    return packFloatx80( zSign, zExp, zSig0 );
1880
 
1881
}
1882
 
1883
/*----------------------------------------------------------------------------
1884
| Takes an abstract floating-point value having sign `zSign', exponent
1885
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1886
| and returns the proper extended double-precision floating-point value
1887
| corresponding to the abstract input.  This routine is just like
1888
| `roundAndPackFloatx80' except that the input significand does not have to be
1889
| normalized.
1890
*----------------------------------------------------------------------------*/
1891
 
1892
static floatx80
1893
 normalizeRoundAndPackFloatx80(
1894
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
1895
 )
1896
{
1897
    int8 shiftCount;
1898
 
1899
    if ( zSig0 == 0 ) {
1900
        zSig0 = zSig1;
1901
        zSig1 = 0;
1902
        zExp -= 64;
1903
    }
1904
    shiftCount = countLeadingZeros64( zSig0 );
1905
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1906
    zExp -= shiftCount;
1907
    return
1908
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
1909
 
1910
}
1911
 
1912
#endif
1913
 
1914
#ifdef FLOAT128
1915
 
1916
/*----------------------------------------------------------------------------
1917
| Returns the least-significant 64 fraction bits of the quadruple-precision
1918
| floating-point value `a'.
1919
*----------------------------------------------------------------------------*/
1920
 
1921
INLINE bits64 extractFloat128Frac1( float128 a )
1922
{
1923
 
1924
    return a.low;
1925
 
1926
}
1927
 
1928
/*----------------------------------------------------------------------------
1929
| Returns the most-significant 48 fraction bits of the quadruple-precision
1930
| floating-point value `a'.
1931
*----------------------------------------------------------------------------*/
1932
 
1933
INLINE bits64 extractFloat128Frac0( float128 a )
1934
{
1935
 
1936
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1937
 
1938
}
1939
 
1940
/*----------------------------------------------------------------------------
1941
| Returns the exponent bits of the quadruple-precision floating-point value
1942
| `a'.
1943
*----------------------------------------------------------------------------*/
1944
 
1945
INLINE int32 extractFloat128Exp( float128 a )
1946
{
1947
 
1948
    return ( a.high>>48 ) & 0x7FFF;
1949
 
1950
}
1951
 
1952
/*----------------------------------------------------------------------------
1953
| Returns the sign bit of the quadruple-precision floating-point value `a'.
1954
*----------------------------------------------------------------------------*/
1955
 
1956
INLINE flag extractFloat128Sign( float128 a )
1957
{
1958
 
1959
    return a.high>>63;
1960
 
1961
}
1962
 
1963
/*----------------------------------------------------------------------------
1964
| Normalizes the subnormal quadruple-precision floating-point value
1965
| represented by the denormalized significand formed by the concatenation of
1966
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1967
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1968
| significand are stored at the location pointed to by `zSig0Ptr', and the
1969
| least significant 64 bits of the normalized significand are stored at the
1970
| location pointed to by `zSig1Ptr'.
1971
*----------------------------------------------------------------------------*/
1972
 
1973
static void
1974
 normalizeFloat128Subnormal(
1975
     bits64 aSig0,
1976
     bits64 aSig1,
1977
     int32 *zExpPtr,
1978
     bits64 *zSig0Ptr,
1979
     bits64 *zSig1Ptr
1980
 )
1981
{
1982
    int8 shiftCount;
1983
 
1984
    if ( aSig0 == 0 ) {
1985
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1986
        if ( shiftCount < 0 ) {
1987
            *zSig0Ptr = aSig1>>( - shiftCount );
1988
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1989
        }
1990
        else {
1991
            *zSig0Ptr = aSig1<<shiftCount;
1992
            *zSig1Ptr = 0;
1993
        }
1994
        *zExpPtr = - shiftCount - 63;
1995
    }
1996
    else {
1997
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1998
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1999
        *zExpPtr = 1 - shiftCount;
2000
    }
2001
 
2002
}
2003
 
2004
/*----------------------------------------------------------------------------
2005
| Packs the sign `zSign', the exponent `zExp', and the significand formed
2006
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2007
| floating-point value, returning the result.  After being shifted into the
2008
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2009
| added together to form the most significant 32 bits of the result.  This
2010
| means that any integer portion of `zSig0' will be added into the exponent.
2011
| Since a properly normalized significand will have an integer portion equal
2012
| to 1, the `zExp' input should be 1 less than the desired result exponent
2013
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2014
| significand.
2015
*----------------------------------------------------------------------------*/
2016
 
2017
INLINE float128
2018
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
2019
{
2020
    float128 z;
2021
 
2022
    z.low = zSig1;
2023
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
2024
    return z;
2025
 
2026
}
2027
 
2028
/*----------------------------------------------------------------------------
2029
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2030
| and extended significand formed by the concatenation of `zSig0', `zSig1',
2031
| and `zSig2', and returns the proper quadruple-precision floating-point value
2032
| corresponding to the abstract input.  Ordinarily, the abstract value is
2033
| simply rounded and packed into the quadruple-precision format, with the
2034
| inexact exception raised if the abstract input cannot be represented
2035
| exactly.  However, if the abstract value is too large, the overflow and
2036
| inexact exceptions are raised and an infinity or maximal finite value is
2037
| returned.  If the abstract value is too small, the input value is rounded to
2038
| a subnormal number, and the underflow and inexact exceptions are raised if
2039
| the abstract input cannot be represented exactly as a subnormal quadruple-
2040
| precision floating-point number.
2041
|     The input significand must be normalized or smaller.  If the input
2042
| significand is not normalized, `zExp' must be 0; in that case, the result
2043
| returned is a subnormal number, and it must not require rounding.  In the
2044
| usual case that the input significand is normalized, `zExp' must be 1 less
2045
| than the ``true'' floating-point exponent.  The handling of underflow and
2046
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2047
*----------------------------------------------------------------------------*/
2048
 
2049
static float128
2050
 roundAndPackFloat128(
2051
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
2052
{
2053
    int8 roundingMode;
2054
    flag roundNearestEven, increment, isTiny;
2055
 
2056
    roundingMode = float_rounding_mode;
2057
    roundNearestEven = ( roundingMode == float_round_nearest_even );
2058
    increment = ( (sbits64) zSig2 < 0 );
2059
    if ( ! roundNearestEven ) {
2060
        if ( roundingMode == float_round_to_zero ) {
2061
            increment = 0;
2062
        }
2063
        else {
2064
            if ( zSign ) {
2065
                increment = ( roundingMode == float_round_down ) && zSig2;
2066
            }
2067
            else {
2068
                increment = ( roundingMode == float_round_up ) && zSig2;
2069
            }
2070
        }
2071
    }
2072
    if ( 0x7FFD <= (bits32) zExp ) {
2073
        if (    ( 0x7FFD < zExp )
2074
             || (    ( zExp == 0x7FFD )
2075
                  && eq128(
2076
                         LIT64( 0x0001FFFFFFFFFFFF ),
2077
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
2078
                         zSig0,
2079
                         zSig1
2080
                     )
2081
                  && increment
2082
                )
2083
           ) {
2084
            float_raise( float_flag_overflow | float_flag_inexact );
2085
            if (    ( roundingMode == float_round_to_zero )
2086
                 || ( zSign && ( roundingMode == float_round_up ) )
2087
                 || ( ! zSign && ( roundingMode == float_round_down ) )
2088
               ) {
2089
                return
2090
                    packFloat128(
2091
                        zSign,
2092
                        0x7FFE,
2093
                        LIT64( 0x0000FFFFFFFFFFFF ),
2094
                        LIT64( 0xFFFFFFFFFFFFFFFF )
2095
                    );
2096
            }
2097
            return packFloat128( zSign, 0x7FFF, 0, 0 );
2098
        }
2099
        if ( zExp < 0 ) {
2100
            isTiny =
2101
                   ( float_detect_tininess == float_tininess_before_rounding )
2102
                || ( zExp < -1 )
2103
                || ! increment
2104
                || lt128(
2105
                       zSig0,
2106
                       zSig1,
2107
                       LIT64( 0x0001FFFFFFFFFFFF ),
2108
                       LIT64( 0xFFFFFFFFFFFFFFFF )
2109
                   );
2110
            shift128ExtraRightJamming(
2111
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2112
            zExp = 0;
2113
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
2114
            if ( roundNearestEven ) {
2115
                increment = ( (sbits64) zSig2 < 0 );
2116
            }
2117
            else {
2118
                if ( zSign ) {
2119
                    increment = ( roundingMode == float_round_down ) && zSig2;
2120
                }
2121
                else {
2122
                    increment = ( roundingMode == float_round_up ) && zSig2;
2123
                }
2124
            }
2125
        }
2126
    }
2127
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
2128
    if ( increment ) {
2129
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2130
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2131
    }
2132
    else {
2133
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2134
    }
2135
    return packFloat128( zSign, zExp, zSig0, zSig1 );
2136
 
2137
}
2138
 
2139
/*----------------------------------------------------------------------------
2140
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2141
| and significand formed by the concatenation of `zSig0' and `zSig1', and
2142
| returns the proper quadruple-precision floating-point value corresponding
2143
| to the abstract input.  This routine is just like `roundAndPackFloat128'
2144
| except that the input significand has fewer bits and does not have to be
2145
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2146
| point exponent.
2147
*----------------------------------------------------------------------------*/
2148
 
2149
static float128
2150
 normalizeRoundAndPackFloat128(
2151
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
2152
{
2153
    int8 shiftCount;
2154
    bits64 zSig2;
2155
 
2156
    if ( zSig0 == 0 ) {
2157
        zSig0 = zSig1;
2158
        zSig1 = 0;
2159
        zExp -= 64;
2160
    }
2161
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
2162
    if ( 0 <= shiftCount ) {
2163
        zSig2 = 0;
2164
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2165
    }
2166
    else {
2167
        shift128ExtraRightJamming(
2168
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2169
    }
2170
    zExp -= shiftCount;
2171
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2172
 
2173
}
2174
 
2175
#endif
2176
 
2177
/*----------------------------------------------------------------------------
2178
| Returns the result of converting the 32-bit two's complement integer `a'
2179
| to the single-precision floating-point format.  The conversion is performed
2180
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2181
*----------------------------------------------------------------------------*/
2182
 
2183
float32 int32_to_float32( int32 a )
2184
{
2185
    flag zSign;
2186
 
2187
    if ( a == 0 ) return 0;
2188
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
2189
    zSign = ( a < 0 );
2190
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
2191
 
2192
}
2193
 
2194
/*----------------------------------------------------------------------------
2195
| Returns the result of converting the 32-bit two's complement integer `a'
2196
| to the double-precision floating-point format.  The conversion is performed
2197
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2198
*----------------------------------------------------------------------------*/
2199
 
2200
float64 int32_to_float64( int32 a )
2201
{
2202
    flag zSign;
2203
    uint32 absA;
2204
    int8 shiftCount;
2205
    bits64 zSig;
2206
 
2207
    if ( a == 0 ) return 0;
2208
    zSign = ( a < 0 );
2209
    absA = zSign ? - a : a;
2210
    shiftCount = countLeadingZeros32( absA ) + 21;
2211
    zSig = absA;
2212
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2213
 
2214
}
2215
 
2216
#ifdef FLOATX80
2217
 
2218
/*----------------------------------------------------------------------------
2219
| Returns the result of converting the 32-bit two's complement integer `a'
2220
| to the extended double-precision floating-point format.  The conversion
2221
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2222
| Arithmetic.
2223
*----------------------------------------------------------------------------*/
2224
 
2225
floatx80 int32_to_floatx80( int32 a )
2226
{
2227
    flag zSign;
2228
    uint32 absA;
2229
    int8 shiftCount;
2230
    bits64 zSig;
2231
 
2232
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2233
    zSign = ( a < 0 );
2234
    absA = zSign ? - a : a;
2235
    shiftCount = countLeadingZeros32( absA ) + 32;
2236
    zSig = absA;
2237
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2238
 
2239
}
2240
 
2241
#endif
2242
 
2243
#ifdef FLOAT128
2244
 
2245
/*----------------------------------------------------------------------------
2246
| Returns the result of converting the 32-bit two's complement integer `a' to
2247
| the quadruple-precision floating-point format.  The conversion is performed
2248
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249
*----------------------------------------------------------------------------*/
2250
 
2251
float128 int32_to_float128( int32 a )
2252
{
2253
    flag zSign;
2254
    uint32 absA;
2255
    int8 shiftCount;
2256
    bits64 zSig0;
2257
 
2258
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2259
    zSign = ( a < 0 );
2260
    absA = zSign ? - a : a;
2261
    shiftCount = countLeadingZeros32( absA ) + 17;
2262
    zSig0 = absA;
2263
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2264
 
2265
}
2266
 
2267
#endif
2268
 
2269
/*----------------------------------------------------------------------------
2270
| Returns the result of converting the 64-bit two's complement integer `a'
2271
| to the single-precision floating-point format.  The conversion is performed
2272
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2273
*----------------------------------------------------------------------------*/
2274
 
2275
float32 int64_to_float32( int64 a )
2276
{
2277
    flag zSign;
2278
    uint64 absA;
2279
    int8 shiftCount;
2280
    //bits32 zSig; // Unused variable
2281
 
2282
    if ( a == 0 ) return 0;
2283
    zSign = ( a < 0 );
2284
    absA = zSign ? - a : a;
2285
    shiftCount = countLeadingZeros64( absA ) - 40;
2286
    if ( 0 <= shiftCount ) {
2287
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2288
    }
2289
    else {
2290
        shiftCount += 7;
2291
        if ( shiftCount < 0 ) {
2292
            shift64RightJamming( absA, - shiftCount, &absA );
2293
        }
2294
        else {
2295
            absA <<= shiftCount;
2296
        }
2297
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
2298
    }
2299
 
2300
}
2301
 
2302
/*----------------------------------------------------------------------------
2303
| Returns the result of converting the 64-bit two's complement integer `a'
2304
| to the double-precision floating-point format.  The conversion is performed
2305
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2306
*----------------------------------------------------------------------------*/
2307
 
2308
float64 int64_to_float64( int64 a )
2309
{
2310
    flag zSign;
2311
 
2312
    if ( a == 0 ) return 0;
2313
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
2314
        return packFloat64( 1, 0x43E, 0 );
2315
    }
2316
    zSign = ( a < 0 );
2317
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
2318
 
2319
}
2320
 
2321
#ifdef FLOATX80
2322
 
2323
/*----------------------------------------------------------------------------
2324
| Returns the result of converting the 64-bit two's complement integer `a'
2325
| to the extended double-precision floating-point format.  The conversion
2326
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2327
| Arithmetic.
2328
*----------------------------------------------------------------------------*/
2329
 
2330
floatx80 int64_to_floatx80( int64 a )
2331
{
2332
    flag zSign;
2333
    uint64 absA;
2334
    int8 shiftCount;
2335
 
2336
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2337
    zSign = ( a < 0 );
2338
    absA = zSign ? - a : a;
2339
    shiftCount = countLeadingZeros64( absA );
2340
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2341
 
2342
}
2343
 
2344
#endif
2345
 
2346
#ifdef FLOAT128
2347
 
2348
/*----------------------------------------------------------------------------
2349
| Returns the result of converting the 64-bit two's complement integer `a' to
2350
| the quadruple-precision floating-point format.  The conversion is performed
2351
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2352
*----------------------------------------------------------------------------*/
2353
 
2354
float128 int64_to_float128( int64 a )
2355
{
2356
    flag zSign;
2357
    uint64 absA;
2358
    int8 shiftCount;
2359
    int32 zExp;
2360
    bits64 zSig0, zSig1;
2361
 
2362
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2363
    zSign = ( a < 0 );
2364
    absA = zSign ? - a : a;
2365
    shiftCount = countLeadingZeros64( absA ) + 49;
2366
    zExp = 0x406E - shiftCount;
2367
    if ( 64 <= shiftCount ) {
2368
        zSig1 = 0;
2369
        zSig0 = absA;
2370
        shiftCount -= 64;
2371
    }
2372
    else {
2373
        zSig1 = absA;
2374
        zSig0 = 0;
2375
    }
2376
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2377
    return packFloat128( zSign, zExp, zSig0, zSig1 );
2378
 
2379
}
2380
 
2381
#endif
2382
 
2383
/*----------------------------------------------------------------------------
2384
| Returns the result of converting the single-precision floating-point value
2385
| `a' to the 32-bit two's complement integer format.  The conversion is
2386
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2387
| Arithmetic---which means in particular that the conversion is rounded
2388
| according to the current rounding mode.  If `a' is a NaN, the largest
2389
| positive integer is returned.  Otherwise, if the conversion overflows, the
2390
| largest integer with the same sign as `a' is returned.
2391
*----------------------------------------------------------------------------*/
2392
 
2393
int32 float32_to_int32( float32 a )
2394
{
2395
    flag aSign;
2396
    int16 aExp, shiftCount;
2397
    bits32 aSig;
2398
    bits64 aSig64;
2399
 
2400
    aSig = extractFloat32Frac( a );
2401
    aExp = extractFloat32Exp( a );
2402
    aSign = extractFloat32Sign( a );
2403
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2404
    if ( aExp ) aSig |= 0x00800000;
2405
    shiftCount = 0xAF - aExp;
2406
    aSig64 = aSig;
2407
    aSig64 <<= 32;
2408
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
2409
    return roundAndPackInt32( aSign, aSig64 );
2410
 
2411
}
2412
 
2413
/*----------------------------------------------------------------------------
2414
| Returns the result of converting the single-precision floating-point value
2415
| `a' to the 32-bit two's complement integer format.  The conversion is
2416
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2417
| Arithmetic, except that the conversion is always rounded toward zero.
2418
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2419
| the conversion overflows, the largest integer with the same sign as `a' is
2420
| returned.
2421
*----------------------------------------------------------------------------*/
2422
 
2423
int32 float32_to_int32_round_to_zero( float32 a )
2424
{
2425
    flag aSign;
2426
    int16 aExp, shiftCount;
2427
    bits32 aSig;
2428
    int32 z;
2429
 
2430
    aSig = extractFloat32Frac( a );
2431
    aExp = extractFloat32Exp( a );
2432
    aSign = extractFloat32Sign( a );
2433
    shiftCount = aExp - 0x9E;
2434
    if ( 0 <= shiftCount ) {
2435
        if ( a != 0xCF000000 ) {
2436
            float_raise( float_flag_invalid );
2437
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2438
        }
2439
        return (sbits32) 0x80000000;
2440
    }
2441
    else if ( aExp <= 0x7E ) {
2442
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2443
        return 0;
2444
    }
2445
    aSig = ( aSig | 0x00800000 )<<8;
2446
    z = aSig>>( - shiftCount );
2447
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
2448
        float_exception_flags |= float_flag_inexact;
2449
    }
2450
    if ( aSign ) z = - z;
2451
    return z;
2452
 
2453
}
2454
 
2455
/*----------------------------------------------------------------------------
2456
| Returns the result of converting the single-precision floating-point value
2457
| `a' to the 64-bit two's complement integer format.  The conversion is
2458
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2459
| Arithmetic---which means in particular that the conversion is rounded
2460
| according to the current rounding mode.  If `a' is a NaN, the largest
2461
| positive integer is returned.  Otherwise, if the conversion overflows, the
2462
| largest integer with the same sign as `a' is returned.
2463
*----------------------------------------------------------------------------*/
2464
 
2465
int64 float32_to_int64( float32 a )
2466
{
2467
    flag aSign;
2468
    int16 aExp, shiftCount;
2469
    bits32 aSig;
2470
    bits64 aSig64, aSigExtra;
2471
 
2472
    aSig = extractFloat32Frac( a );
2473
    aExp = extractFloat32Exp( a );
2474
    aSign = extractFloat32Sign( a );
2475
    shiftCount = 0xBE - aExp;
2476
    if ( shiftCount < 0 ) {
2477
        float_raise( float_flag_invalid );
2478
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2479
            return LIT64( 0x7FFFFFFFFFFFFFFF );
2480
        }
2481
        return (sbits64) LIT64( 0x8000000000000000 );
2482
    }
2483
    if ( aExp ) aSig |= 0x00800000;
2484
    aSig64 = aSig;
2485
    aSig64 <<= 40;
2486
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
2487
    return roundAndPackInt64( aSign, aSig64, aSigExtra );
2488
 
2489
}
2490
 
2491
/*----------------------------------------------------------------------------
2492
| Returns the result of converting the single-precision floating-point value
2493
| `a' to the 64-bit two's complement integer format.  The conversion is
2494
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2495
| Arithmetic, except that the conversion is always rounded toward zero.  If
2496
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2497
| conversion overflows, the largest integer with the same sign as `a' is
2498
| returned.
2499
*----------------------------------------------------------------------------*/
2500
 
2501
int64 float32_to_int64_round_to_zero( float32 a )
2502
{
2503
    flag aSign;
2504
    int16 aExp, shiftCount;
2505
    bits32 aSig;
2506
    bits64 aSig64;
2507
    int64 z;
2508
 
2509
    aSig = extractFloat32Frac( a );
2510
    aExp = extractFloat32Exp( a );
2511
    aSign = extractFloat32Sign( a );
2512
    shiftCount = aExp - 0xBE;
2513
    if ( 0 <= shiftCount ) {
2514
        if ( a != 0xDF000000 ) {
2515
            float_raise( float_flag_invalid );
2516
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2517
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2518
            }
2519
        }
2520
        return (sbits64) LIT64( 0x8000000000000000 );
2521
    }
2522
    else if ( aExp <= 0x7E ) {
2523
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2524
        return 0;
2525
    }
2526
    aSig64 = aSig | 0x00800000;
2527
    aSig64 <<= 40;
2528
    z = aSig64>>( - shiftCount );
2529
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
2530
        float_exception_flags |= float_flag_inexact;
2531
    }
2532
    if ( aSign ) z = - z;
2533
    return z;
2534
 
2535
}
2536
 
2537
/*----------------------------------------------------------------------------
2538
| Returns the result of converting the single-precision floating-point value
2539
| `a' to the double-precision floating-point format.  The conversion is
2540
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2541
| Arithmetic.
2542
*----------------------------------------------------------------------------*/
2543
 
2544
float64 float32_to_float64( float32 a )
2545
{
2546
    flag aSign;
2547
    int16 aExp;
2548
    bits32 aSig;
2549
 
2550
    aSig = extractFloat32Frac( a );
2551
    aExp = extractFloat32Exp( a );
2552
    aSign = extractFloat32Sign( a );
2553
    if ( aExp == 0xFF ) {
2554
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
2555
        return packFloat64( aSign, 0x7FF, 0 );
2556
    }
2557
    if ( aExp == 0 ) {
2558
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2559
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2560
        --aExp;
2561
    }
2562
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
2563
 
2564
}
2565
 
2566
#ifdef FLOATX80
2567
 
2568
/*----------------------------------------------------------------------------
2569
| Returns the result of converting the single-precision floating-point value
2570
| `a' to the extended double-precision floating-point format.  The conversion
2571
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2572
| Arithmetic.
2573
*----------------------------------------------------------------------------*/
2574
 
2575
floatx80 float32_to_floatx80( float32 a )
2576
{
2577
    flag aSign;
2578
    int16 aExp;
2579
    bits32 aSig;
2580
 
2581
    aSig = extractFloat32Frac( a );
2582
    aExp = extractFloat32Exp( a );
2583
    aSign = extractFloat32Sign( a );
2584
    if ( aExp == 0xFF ) {
2585
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
2586
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2587
    }
2588
    if ( aExp == 0 ) {
2589
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2590
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2591
    }
2592
    aSig |= 0x00800000;
2593
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
2594
 
2595
}
2596
 
2597
#endif
2598
 
2599
#ifdef FLOAT128
2600
 
2601
/*----------------------------------------------------------------------------
2602
| Returns the result of converting the single-precision floating-point value
2603
| `a' to the double-precision floating-point format.  The conversion is
2604
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2605
| Arithmetic.
2606
*----------------------------------------------------------------------------*/
2607
 
2608
float128 float32_to_float128( float32 a )
2609
{
2610
    flag aSign;
2611
    int16 aExp;
2612
    bits32 aSig;
2613
 
2614
    aSig = extractFloat32Frac( a );
2615
    aExp = extractFloat32Exp( a );
2616
    aSign = extractFloat32Sign( a );
2617
    if ( aExp == 0xFF ) {
2618
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
2619
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2620
    }
2621
    if ( aExp == 0 ) {
2622
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2623
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2624
        --aExp;
2625
    }
2626
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
2627
 
2628
}
2629
 
2630
#endif
2631
 
2632
/*----------------------------------------------------------------------------
2633
| Rounds the single-precision floating-point value `a' to an integer, and
2634
| returns the result as a single-precision floating-point value.  The
2635
| operation is performed according to the IEC/IEEE Standard for Binary
2636
| Floating-Point Arithmetic.
2637
*----------------------------------------------------------------------------*/
2638
 
2639
float32 float32_round_to_int( float32 a )
2640
{
2641
    flag aSign;
2642
    int16 aExp;
2643
    bits32 lastBitMask, roundBitsMask;
2644
    int8 roundingMode;
2645
    float32 z;
2646
 
2647
    aExp = extractFloat32Exp( a );
2648
    if ( 0x96 <= aExp ) {
2649
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
2650
            return propagateFloat32NaN( a, a );
2651
        }
2652
        return a;
2653
    }
2654
    if ( aExp <= 0x7E ) {
2655
        if ( (bits32) ( a<<1 ) == 0 ) return a;
2656
        float_exception_flags |= float_flag_inexact;
2657
        aSign = extractFloat32Sign( a );
2658
        switch ( float_rounding_mode ) {
2659
         case float_round_nearest_even:
2660
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2661
                return packFloat32( aSign, 0x7F, 0 );
2662
            }
2663
            break;
2664
         case float_round_down:
2665
            return aSign ? 0xBF800000 : 0;
2666
         case float_round_up:
2667
            return aSign ? 0x80000000 : 0x3F800000;
2668
        }
2669
        return packFloat32( aSign, 0, 0 );
2670
    }
2671
    lastBitMask = 1;
2672
    lastBitMask <<= 0x96 - aExp;
2673
    roundBitsMask = lastBitMask - 1;
2674
    z = a;
2675
    roundingMode = float_rounding_mode;
2676
    if ( roundingMode == float_round_nearest_even ) {
2677
        z += lastBitMask>>1;
2678
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2679
    }
2680
    else if ( roundingMode != float_round_to_zero ) {
2681
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2682
            z += roundBitsMask;
2683
        }
2684
    }
2685
    z &= ~ roundBitsMask;
2686
    if ( z != a ) float_exception_flags |= float_flag_inexact;
2687
    return z;
2688
 
2689
}
2690
 
2691
/*----------------------------------------------------------------------------
2692
| Returns the result of adding the absolute values of the single-precision
2693
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2694
| before being returned.  `zSign' is ignored if the result is a NaN.
2695
| The addition is performed according to the IEC/IEEE Standard for Binary
2696
| Floating-Point Arithmetic.
2697
*----------------------------------------------------------------------------*/
2698
 
2699
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
2700
{
2701
    int16 aExp, bExp, zExp;
2702
    bits32 aSig, bSig, zSig;
2703
    int16 expDiff;
2704
 
2705
    aSig = extractFloat32Frac( a );
2706
    aExp = extractFloat32Exp( a );
2707
    bSig = extractFloat32Frac( b );
2708
    bExp = extractFloat32Exp( b );
2709
    expDiff = aExp - bExp;
2710
    aSig <<= 6;
2711
    bSig <<= 6;
2712
    if ( 0 < expDiff ) {
2713
        if ( aExp == 0xFF ) {
2714
            if ( aSig ) return propagateFloat32NaN( a, b );
2715
            return a;
2716
        }
2717
        if ( bExp == 0 ) {
2718
            --expDiff;
2719
        }
2720
        else {
2721
            bSig |= 0x20000000;
2722
        }
2723
        shift32RightJamming( bSig, expDiff, &bSig );
2724
        zExp = aExp;
2725
    }
2726
    else if ( expDiff < 0 ) {
2727
        if ( bExp == 0xFF ) {
2728
            if ( bSig ) return propagateFloat32NaN( a, b );
2729
            return packFloat32( zSign, 0xFF, 0 );
2730
        }
2731
        if ( aExp == 0 ) {
2732
            ++expDiff;
2733
        }
2734
        else {
2735
            aSig |= 0x20000000;
2736
        }
2737
        shift32RightJamming( aSig, - expDiff, &aSig );
2738
        zExp = bExp;
2739
    }
2740
    else {
2741
        if ( aExp == 0xFF ) {
2742
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
2743
            return a;
2744
        }
2745
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2746
        zSig = 0x40000000 + aSig + bSig;
2747
        zExp = aExp;
2748
        goto roundAndPack;
2749
    }
2750
    aSig |= 0x20000000;
2751
    zSig = ( aSig + bSig )<<1;
2752
    --zExp;
2753
    if ( (sbits32) zSig < 0 ) {
2754
        zSig = aSig + bSig;
2755
        ++zExp;
2756
    }
2757
 roundAndPack:
2758
    return roundAndPackFloat32( zSign, zExp, zSig );
2759
 
2760
}
2761
 
2762
/*----------------------------------------------------------------------------
2763
| Returns the result of subtracting the absolute values of the single-
2764
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2765
| difference is negated before being returned.  `zSign' is ignored if the
2766
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2767
| Standard for Binary Floating-Point Arithmetic.
2768
*----------------------------------------------------------------------------*/
2769
 
2770
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
2771
{
2772
    int16 aExp, bExp, zExp;
2773
    bits32 aSig, bSig, zSig;
2774
    int16 expDiff;
2775
 
2776
    aSig = extractFloat32Frac( a );
2777
    aExp = extractFloat32Exp( a );
2778
    bSig = extractFloat32Frac( b );
2779
    bExp = extractFloat32Exp( b );
2780
    expDiff = aExp - bExp;
2781
    aSig <<= 7;
2782
    bSig <<= 7;
2783
    if ( 0 < expDiff ) goto aExpBigger;
2784
    if ( expDiff < 0 ) goto bExpBigger;
2785
    if ( aExp == 0xFF ) {
2786
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
2787
        float_raise( float_flag_invalid );
2788
        return float32_default_nan;
2789
    }
2790
    if ( aExp == 0 ) {
2791
        aExp = 1;
2792
        bExp = 1;
2793
    }
2794
    if ( bSig < aSig ) goto aBigger;
2795
    if ( aSig < bSig ) goto bBigger;
2796
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
2797
 bExpBigger:
2798
    if ( bExp == 0xFF ) {
2799
        if ( bSig ) return propagateFloat32NaN( a, b );
2800
        return packFloat32( zSign ^ 1, 0xFF, 0 );
2801
    }
2802
    if ( aExp == 0 ) {
2803
        ++expDiff;
2804
    }
2805
    else {
2806
        aSig |= 0x40000000;
2807
    }
2808
    shift32RightJamming( aSig, - expDiff, &aSig );
2809
    bSig |= 0x40000000;
2810
 bBigger:
2811
    zSig = bSig - aSig;
2812
    zExp = bExp;
2813
    zSign ^= 1;
2814
    goto normalizeRoundAndPack;
2815
 aExpBigger:
2816
    if ( aExp == 0xFF ) {
2817
        if ( aSig ) return propagateFloat32NaN( a, b );
2818
        return a;
2819
    }
2820
    if ( bExp == 0 ) {
2821
        --expDiff;
2822
    }
2823
    else {
2824
        bSig |= 0x40000000;
2825
    }
2826
    shift32RightJamming( bSig, expDiff, &bSig );
2827
    aSig |= 0x40000000;
2828
 aBigger:
2829
    zSig = aSig - bSig;
2830
    zExp = aExp;
2831
 normalizeRoundAndPack:
2832
    --zExp;
2833
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
2834
 
2835
}
2836
 
2837
/*----------------------------------------------------------------------------
2838
| Returns the result of adding the single-precision floating-point values `a'
2839
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2840
| Binary Floating-Point Arithmetic.
2841
*----------------------------------------------------------------------------*/
2842
 
2843
float32 float32_add( float32 a, float32 b )
2844
{
2845
    flag aSign, bSign;
2846
 
2847
    aSign = extractFloat32Sign( a );
2848
    bSign = extractFloat32Sign( b );
2849
    if ( aSign == bSign ) {
2850
        return addFloat32Sigs( a, b, aSign );
2851
    }
2852
    else {
2853
        return subFloat32Sigs( a, b, aSign );
2854
    }
2855
 
2856
}
2857
 
2858
/*----------------------------------------------------------------------------
2859
| Returns the result of subtracting the single-precision floating-point values
2860
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2861
| for Binary Floating-Point Arithmetic.
2862
*----------------------------------------------------------------------------*/
2863
 
2864
float32 float32_sub( float32 a, float32 b )
2865
{
2866
    flag aSign, bSign;
2867
 
2868
    aSign = extractFloat32Sign( a );
2869
    bSign = extractFloat32Sign( b );
2870
    if ( aSign == bSign ) {
2871
        return subFloat32Sigs( a, b, aSign );
2872
    }
2873
    else {
2874
        return addFloat32Sigs( a, b, aSign );
2875
    }
2876
 
2877
}
2878
 
2879
/*----------------------------------------------------------------------------
2880
| Returns the result of multiplying the single-precision floating-point values
2881
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2882
| for Binary Floating-Point Arithmetic.
2883
*----------------------------------------------------------------------------*/
2884
 
2885
float32 float32_mul( float32 a, float32 b )
2886
{
2887
    flag aSign, bSign, zSign;
2888
    int16 aExp, bExp, zExp;
2889
    bits32 aSig, bSig;
2890
    bits64 zSig64;
2891
    bits32 zSig;
2892
 
2893
    aSig = extractFloat32Frac( a );
2894
    aExp = extractFloat32Exp( a );
2895
    aSign = extractFloat32Sign( a );
2896
    bSig = extractFloat32Frac( b );
2897
    bExp = extractFloat32Exp( b );
2898
    bSign = extractFloat32Sign( b );
2899
    zSign = aSign ^ bSign;
2900
    if ( aExp == 0xFF ) {
2901
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2902
            return propagateFloat32NaN( a, b );
2903
        }
2904
        if ( ( bExp | bSig ) == 0 ) {
2905
            float_raise( float_flag_invalid );
2906
            return float32_default_nan;
2907
        }
2908
        return packFloat32( zSign, 0xFF, 0 );
2909
    }
2910
    if ( bExp == 0xFF ) {
2911
        if ( bSig ) return propagateFloat32NaN( a, b );
2912
        if ( ( aExp | aSig ) == 0 ) {
2913
            float_raise( float_flag_invalid );
2914
            return float32_default_nan;
2915
        }
2916
        return packFloat32( zSign, 0xFF, 0 );
2917
    }
2918
    if ( aExp == 0 ) {
2919
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2920
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2921
    }
2922
    if ( bExp == 0 ) {
2923
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2924
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2925
    }
2926
    zExp = aExp + bExp - 0x7F;
2927
    aSig = ( aSig | 0x00800000 )<<7;
2928
    bSig = ( bSig | 0x00800000 )<<8;
2929
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
2930
    zSig = zSig64;
2931
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
2932
        zSig <<= 1;
2933
        --zExp;
2934
    }
2935
    return roundAndPackFloat32( zSign, zExp, zSig );
2936
 
2937
}
2938
 
2939
/*----------------------------------------------------------------------------
2940
| Returns the result of dividing the single-precision floating-point value `a'
2941
| by the corresponding value `b'.  The operation is performed according to the
2942
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2943
*----------------------------------------------------------------------------*/
2944
 
2945
float32 float32_div( float32 a, float32 b )
2946
{
2947
    flag aSign, bSign, zSign;
2948
    int16 aExp, bExp, zExp;
2949
    bits32 aSig, bSig, zSig;
2950
 
2951
    aSig = extractFloat32Frac( a );
2952
    aExp = extractFloat32Exp( a );
2953
    aSign = extractFloat32Sign( a );
2954
    bSig = extractFloat32Frac( b );
2955
    bExp = extractFloat32Exp( b );
2956
    bSign = extractFloat32Sign( b );
2957
    zSign = aSign ^ bSign;
2958
    if ( aExp == 0xFF ) {
2959
        if ( aSig ) return propagateFloat32NaN( a, b );
2960
        if ( bExp == 0xFF ) {
2961
            if ( bSig ) return propagateFloat32NaN( a, b );
2962
            float_raise( float_flag_invalid );
2963
            return float32_default_nan;
2964
        }
2965
        return packFloat32( zSign, 0xFF, 0 );
2966
    }
2967
    if ( bExp == 0xFF ) {
2968
        if ( bSig ) return propagateFloat32NaN( a, b );
2969
        return packFloat32( zSign, 0, 0 );
2970
    }
2971
    if ( bExp == 0 ) {
2972
        if ( bSig == 0 ) {
2973
            if ( ( aExp | aSig ) == 0 ) {
2974
                float_raise( float_flag_invalid );
2975
                return float32_default_nan;
2976
            }
2977
            float_raise( float_flag_divbyzero );
2978
            return packFloat32( zSign, 0xFF, 0 );
2979
        }
2980
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2981
    }
2982
    if ( aExp == 0 ) {
2983
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2984
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2985
    }
2986
    zExp = aExp - bExp + 0x7D;
2987
    aSig = ( aSig | 0x00800000 )<<7;
2988
    bSig = ( bSig | 0x00800000 )<<8;
2989
    if ( bSig <= ( aSig + aSig ) ) {
2990
        aSig >>= 1;
2991
        ++zExp;
2992
    }
2993
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2994
    if ( ( zSig & 0x3F ) == 0 ) {
2995
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2996
    }
2997
    return roundAndPackFloat32( zSign, zExp, zSig );
2998
 
2999
}
3000
 
3001
/*----------------------------------------------------------------------------
3002
| Returns the remainder of the single-precision floating-point value `a'
3003
| with respect to the corresponding value `b'.  The operation is performed
3004
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3005
*----------------------------------------------------------------------------*/
3006
 
3007
float32 float32_rem( float32 a, float32 b )
3008
{
3009
    flag aSign, bSign, zSign;
3010
    int16 aExp, bExp, expDiff;
3011
    bits32 aSig, bSig;
3012
    bits32 q;
3013
    bits64 aSig64, bSig64, q64;
3014
    bits32 alternateASig;
3015
    sbits32 sigMean;
3016
 
3017
    aSig = extractFloat32Frac( a );
3018
    aExp = extractFloat32Exp( a );
3019
    aSign = extractFloat32Sign( a );
3020
    bSig = extractFloat32Frac( b );
3021
    bExp = extractFloat32Exp( b );
3022
    bSign = extractFloat32Sign( b );
3023
    if ( aExp == 0xFF ) {
3024
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3025
            return propagateFloat32NaN( a, b );
3026
        }
3027
        float_raise( float_flag_invalid );
3028
        return float32_default_nan;
3029
    }
3030
    if ( bExp == 0xFF ) {
3031
        if ( bSig ) return propagateFloat32NaN( a, b );
3032
        return a;
3033
    }
3034
    if ( bExp == 0 ) {
3035
        if ( bSig == 0 ) {
3036
            float_raise( float_flag_invalid );
3037
            return float32_default_nan;
3038
        }
3039
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3040
    }
3041
    if ( aExp == 0 ) {
3042
        if ( aSig == 0 ) return a;
3043
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3044
    }
3045
    expDiff = aExp - bExp;
3046
    aSig |= 0x00800000;
3047
    bSig |= 0x00800000;
3048
    if ( expDiff < 32 ) {
3049
        aSig <<= 8;
3050
        bSig <<= 8;
3051
        if ( expDiff < 0 ) {
3052
            if ( expDiff < -1 ) return a;
3053
            aSig >>= 1;
3054
        }
3055
        q = ( bSig <= aSig );
3056
        if ( q ) aSig -= bSig;
3057
        if ( 0 < expDiff ) {
3058
            q = ( ( (bits64) aSig )<<32 ) / bSig;
3059
            q >>= 32 - expDiff;
3060
            bSig >>= 2;
3061
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3062
        }
3063
        else {
3064
            aSig >>= 2;
3065
            bSig >>= 2;
3066
        }
3067
    }
3068
    else {
3069
        if ( bSig <= aSig ) aSig -= bSig;
3070
        aSig64 = ( (bits64) aSig )<<40;
3071
        bSig64 = ( (bits64) bSig )<<40;
3072
        expDiff -= 64;
3073
        while ( 0 < expDiff ) {
3074
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3075
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3076
            aSig64 = - ( ( bSig * q64 )<<38 );
3077
            expDiff -= 62;
3078
        }
3079
        expDiff += 64;
3080
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3081
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3082
        q = q64>>( 64 - expDiff );
3083
        bSig <<= 6;
3084
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3085
    }
3086
    do {
3087
        alternateASig = aSig;
3088
        ++q;
3089
        aSig -= bSig;
3090
    } while ( 0 <= (sbits32) aSig );
3091
    sigMean = aSig + alternateASig;
3092
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3093
        aSig = alternateASig;
3094
    }
3095
    zSign = ( (sbits32) aSig < 0 );
3096
    if ( zSign ) aSig = - aSig;
3097
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
3098
 
3099
}
3100
 
3101
/*----------------------------------------------------------------------------
3102
| Returns the square root of the single-precision floating-point value `a'.
3103
| The operation is performed according to the IEC/IEEE Standard for Binary
3104
| Floating-Point Arithmetic.
3105
*----------------------------------------------------------------------------*/
3106
 
3107
float32 float32_sqrt( float32 a )
3108
{
3109
    flag aSign;
3110
    int16 aExp, zExp;
3111
    bits32 aSig, zSig;
3112
    bits64 rem, term;
3113
 
3114
    aSig = extractFloat32Frac( a );
3115
    aExp = extractFloat32Exp( a );
3116
    aSign = extractFloat32Sign( a );
3117
    if ( aExp == 0xFF ) {
3118
        if ( aSig ) return propagateFloat32NaN( a, 0 );
3119
        if ( ! aSign ) return a;
3120
        float_raise( float_flag_invalid );
3121
        return float32_default_nan;
3122
    }
3123
    if ( aSign ) {
3124
        if ( ( aExp | aSig ) == 0 ) return a;
3125
        float_raise( float_flag_invalid );
3126
        return float32_default_nan;
3127
    }
3128
    if ( aExp == 0 ) {
3129
        if ( aSig == 0 ) return 0;
3130
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3131
    }
3132
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3133
    aSig = ( aSig | 0x00800000 )<<8;
3134
    zSig = estimateSqrt32( aExp, aSig ) + 2;
3135
    if ( ( zSig & 0x7F ) <= 5 ) {
3136
        if ( zSig < 2 ) {
3137
            zSig = 0x7FFFFFFF;
3138
            goto roundAndPack;
3139
        }
3140
        aSig >>= aExp & 1;
3141
        term = ( (bits64) zSig ) * zSig;
3142
        rem = ( ( (bits64) aSig )<<32 ) - term;
3143
        while ( (sbits64) rem < 0 ) {
3144
            --zSig;
3145
            rem += ( ( (bits64) zSig )<<1 ) | 1;
3146
        }
3147
        zSig |= ( rem != 0 );
3148
    }
3149
    shift32RightJamming( zSig, 1, &zSig );
3150
 roundAndPack:
3151
    return roundAndPackFloat32( 0, zExp, zSig );
3152
 
3153
}
3154
 
3155
/*----------------------------------------------------------------------------
3156
| Returns 1 if the single-precision floating-point value `a' is equal to
3157
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3158
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3159
*----------------------------------------------------------------------------*/
3160
 
3161
flag float32_eq( float32 a, float32 b )
3162
{
3163
 
3164
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3165
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3166
       ) {
3167
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3168
            float_raise( float_flag_invalid );
3169
        }
3170
        return 0;
3171
    }
3172
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
3173
 
3174
}
3175
 
3176
/*----------------------------------------------------------------------------
3177
| Returns 1 if the single-precision floating-point value `a' is less than
3178
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
3179
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3180
| Arithmetic.
3181
*----------------------------------------------------------------------------*/
3182
 
3183
flag float32_le( float32 a, float32 b )
3184
{
3185
    flag aSign, bSign;
3186
 
3187
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3188
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3189
       ) {
3190
        float_raise( float_flag_invalid );
3191
        return 0;
3192
    }
3193
    aSign = extractFloat32Sign( a );
3194
    bSign = extractFloat32Sign( b );
3195
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
3196
    return ( a == b ) || ( aSign ^ ( a < b ) );
3197
 
3198
}
3199
 
3200
/*----------------------------------------------------------------------------
3201
| Returns 1 if the single-precision floating-point value `a' is less than
3202
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3203
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3204
*----------------------------------------------------------------------------*/
3205
 
3206
flag float32_lt( float32 a, float32 b )
3207
{
3208
    flag aSign, bSign;
3209
 
3210
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3211
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3212
       ) {
3213
        float_raise( float_flag_invalid );
3214
        return 0;
3215
    }
3216
    aSign = extractFloat32Sign( a );
3217
    bSign = extractFloat32Sign( b );
3218
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
3219
    return ( a != b ) && ( aSign ^ ( a < b ) );
3220
 
3221
}
3222
 
3223
/*----------------------------------------------------------------------------
3224
| Returns 1 if the single-precision floating-point value `a' is equal to
3225
| the corresponding value `b', and 0 otherwise.  The invalid exception is
3226
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3227
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3228
*----------------------------------------------------------------------------*/
3229
 
3230
flag float32_eq_signaling( float32 a, float32 b )
3231
{
3232
 
3233
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3234
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3235
       ) {
3236
        float_raise( float_flag_invalid );
3237
        return 0;
3238
    }
3239
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
3240
 
3241
}
3242
 
3243
/*----------------------------------------------------------------------------
3244
| Returns 1 if the single-precision floating-point value `a' is less than or
3245
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3246
| cause an exception.  Otherwise, the comparison is performed according to the
3247
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3248
*----------------------------------------------------------------------------*/
3249
 
3250
flag float32_le_quiet( float32 a, float32 b )
3251
{
3252
    flag aSign, bSign;
3253
    //int16 aExp, bExp; // Unused variable
3254
 
3255
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3256
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3257
       ) {
3258
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3259
            float_raise( float_flag_invalid );
3260
        }
3261
        return 0;
3262
    }
3263
    aSign = extractFloat32Sign( a );
3264
    bSign = extractFloat32Sign( b );
3265
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
3266
    return ( a == b ) || ( aSign ^ ( a < b ) );
3267
 
3268
}
3269
 
3270
/*----------------------------------------------------------------------------
3271
| Returns 1 if the single-precision floating-point value `a' is less than
3272
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3273
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3274
| Standard for Binary Floating-Point Arithmetic.
3275
*----------------------------------------------------------------------------*/
3276
 
3277
flag float32_lt_quiet( float32 a, float32 b )
3278
{
3279
    flag aSign, bSign;
3280
 
3281
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3282
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3283
       ) {
3284
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3285
            float_raise( float_flag_invalid );
3286
        }
3287
        return 0;
3288
    }
3289
    aSign = extractFloat32Sign( a );
3290
    bSign = extractFloat32Sign( b );
3291
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
3292
    return ( a != b ) && ( aSign ^ ( a < b ) );
3293
 
3294
}
3295
 
3296
/*----------------------------------------------------------------------------
3297
| Returns the result of converting the double-precision floating-point value
3298
| `a' to the 32-bit two's complement integer format.  The conversion is
3299
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3300
| Arithmetic---which means in particular that the conversion is rounded
3301
| according to the current rounding mode.  If `a' is a NaN, the largest
3302
| positive integer is returned.  Otherwise, if the conversion overflows, the
3303
| largest integer with the same sign as `a' is returned.
3304
*----------------------------------------------------------------------------*/
3305
 
3306
int32 float64_to_int32( float64 a )
3307
{
3308
    flag aSign;
3309
    int16 aExp, shiftCount;
3310
    bits64 aSig;
3311
 
3312
    aSig = extractFloat64Frac( a );
3313
    aExp = extractFloat64Exp( a );
3314
    aSign = extractFloat64Sign( a );
3315
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3316
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3317
    shiftCount = 0x42C - aExp;
3318
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3319
    return roundAndPackInt32( aSign, aSig );
3320
 
3321
}
3322
 
3323
/*----------------------------------------------------------------------------
3324
| Returns the result of converting the double-precision floating-point value
3325
| `a' to the 32-bit two's complement integer format.  The conversion is
3326
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3327
| Arithmetic, except that the conversion is always rounded toward zero.
3328
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3329
| the conversion overflows, the largest integer with the same sign as `a' is
3330
| returned.
3331
*----------------------------------------------------------------------------*/
3332
 
3333
int32 float64_to_int32_round_to_zero( float64 a )
3334
{
3335
    flag aSign;
3336
    int16 aExp, shiftCount;
3337
    bits64 aSig, savedASig;
3338
    int32 z;
3339
 
3340
    aSig = extractFloat64Frac( a );
3341
    aExp = extractFloat64Exp( a );
3342
    aSign = extractFloat64Sign( a );
3343
    if ( 0x41E < aExp ) {
3344
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3345
        goto invalid;
3346
    }
3347
    else if ( aExp < 0x3FF ) {
3348
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3349
        return 0;
3350
    }
3351
    aSig |= LIT64( 0x0010000000000000 );
3352
    shiftCount = 0x433 - aExp;
3353
    savedASig = aSig;
3354
    aSig >>= shiftCount;
3355
    z = aSig;
3356
    if ( aSign ) z = - z;
3357
    if ( ( z < 0 ) ^ aSign ) {
3358
 invalid:
3359
        float_raise( float_flag_invalid );
3360
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3361
    }
3362
    if ( ( aSig<<shiftCount ) != savedASig ) {
3363
        float_exception_flags |= float_flag_inexact;
3364
    }
3365
    return z;
3366
 
3367
}
3368
 
3369
/*----------------------------------------------------------------------------
3370
| Returns the result of converting the double-precision floating-point value
3371
| `a' to the 64-bit two's complement integer format.  The conversion is
3372
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3373
| Arithmetic---which means in particular that the conversion is rounded
3374
| according to the current rounding mode.  If `a' is a NaN, the largest
3375
| positive integer is returned.  Otherwise, if the conversion overflows, the
3376
| largest integer with the same sign as `a' is returned.
3377
*----------------------------------------------------------------------------*/
3378
 
3379
int64 float64_to_int64( float64 a )
3380
{
3381
    flag aSign;
3382
    int16 aExp, shiftCount;
3383
    bits64 aSig, aSigExtra;
3384
 
3385
    aSig = extractFloat64Frac( a );
3386
    aExp = extractFloat64Exp( a );
3387
    aSign = extractFloat64Sign( a );
3388
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3389
    shiftCount = 0x433 - aExp;
3390
    if ( shiftCount <= 0 ) {
3391
        if ( 0x43E < aExp ) {
3392
            float_raise( float_flag_invalid );
3393
            if (    ! aSign
3394
                 || (    ( aExp == 0x7FF )
3395
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3396
               ) {
3397
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3398
            }
3399
            return (sbits64) LIT64( 0x8000000000000000 );
3400
        }
3401
        aSigExtra = 0;
3402
        aSig <<= - shiftCount;
3403
    }
3404
    else {
3405
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3406
    }
3407
    return roundAndPackInt64( aSign, aSig, aSigExtra );
3408
 
3409
}
3410
 
3411
/*----------------------------------------------------------------------------
3412
| Returns the result of converting the double-precision floating-point value
3413
| `a' to the 64-bit two's complement integer format.  The conversion is
3414
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3415
| Arithmetic, except that the conversion is always rounded toward zero.
3416
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3417
| the conversion overflows, the largest integer with the same sign as `a' is
3418
| returned.
3419
*----------------------------------------------------------------------------*/
3420
 
3421
int64 float64_to_int64_round_to_zero( float64 a )
3422
{
3423
    flag aSign;
3424
    int16 aExp, shiftCount;
3425
    bits64 aSig;
3426
    int64 z;
3427
 
3428
    aSig = extractFloat64Frac( a );
3429
    aExp = extractFloat64Exp( a );
3430
    aSign = extractFloat64Sign( a );
3431
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3432
    shiftCount = aExp - 0x433;
3433
    if ( 0 <= shiftCount ) {
3434
        if ( 0x43E <= aExp ) {
3435
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
3436
                float_raise( float_flag_invalid );
3437
                if (    ! aSign
3438
                     || (    ( aExp == 0x7FF )
3439
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3440
                   ) {
3441
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3442
                }
3443
            }
3444
            return (sbits64) LIT64( 0x8000000000000000 );
3445
        }
3446
        z = aSig<<shiftCount;
3447
    }
3448
    else {
3449
        if ( aExp < 0x3FE ) {
3450
            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3451
            return 0;
3452
        }
3453
        z = aSig>>( - shiftCount );
3454
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3455
            float_exception_flags |= float_flag_inexact;
3456
        }
3457
    }
3458
    if ( aSign ) z = - z;
3459
    return z;
3460
 
3461
}
3462
 
3463
/*----------------------------------------------------------------------------
3464
| Returns the result of converting the double-precision floating-point value
3465
| `a' to the single-precision floating-point format.  The conversion is
3466
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3467
| Arithmetic.
3468
*----------------------------------------------------------------------------*/
3469
 
3470
float32 float64_to_float32( float64 a )
3471
{
3472
    flag aSign;
3473
    int16 aExp;
3474
    bits64 aSig;
3475
    bits32 zSig;
3476
 
3477
    aSig = extractFloat64Frac( a );
3478
    aExp = extractFloat64Exp( a );
3479
    aSign = extractFloat64Sign( a );
3480
    if ( aExp == 0x7FF ) {
3481
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
3482
        return packFloat32( aSign, 0xFF, 0 );
3483
    }
3484
    shift64RightJamming( aSig, 22, &aSig );
3485
    zSig = aSig;
3486
    if ( aExp || zSig ) {
3487
        zSig |= 0x40000000;
3488
        aExp -= 0x381;
3489
    }
3490
    return roundAndPackFloat32( aSign, aExp, zSig );
3491
 
3492
}
3493
 
3494
#ifdef FLOATX80
3495
 
3496
/*----------------------------------------------------------------------------
3497
| Returns the result of converting the double-precision floating-point value
3498
| `a' to the extended double-precision floating-point format.  The conversion
3499
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3500
| Arithmetic.
3501
*----------------------------------------------------------------------------*/
3502
 
3503
floatx80 float64_to_floatx80( float64 a )
3504
{
3505
    flag aSign;
3506
    int16 aExp;
3507
    bits64 aSig;
3508
 
3509
    aSig = extractFloat64Frac( a );
3510
    aExp = extractFloat64Exp( a );
3511
    aSign = extractFloat64Sign( a );
3512
    if ( aExp == 0x7FF ) {
3513
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
3514
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3515
    }
3516
    if ( aExp == 0 ) {
3517
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3518
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3519
    }
3520
    return
3521
        packFloatx80(
3522
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3523
 
3524
}
3525
 
3526
#endif
3527
 
3528
#ifdef FLOAT128
3529
 
3530
/*----------------------------------------------------------------------------
3531
| Returns the result of converting the double-precision floating-point value
3532
| `a' to the quadruple-precision floating-point format.  The conversion is
3533
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3534
| Arithmetic.
3535
*----------------------------------------------------------------------------*/
3536
 
3537
float128 float64_to_float128( float64 a )
3538
{
3539
    flag aSign;
3540
    int16 aExp;
3541
    bits64 aSig, zSig0, zSig1;
3542
 
3543
    aSig = extractFloat64Frac( a );
3544
    aExp = extractFloat64Exp( a );
3545
    aSign = extractFloat64Sign( a );
3546
    if ( aExp == 0x7FF ) {
3547
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
3548
        return packFloat128( aSign, 0x7FFF, 0, 0 );
3549
    }
3550
    if ( aExp == 0 ) {
3551
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3552
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3553
        --aExp;
3554
    }
3555
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3556
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3557
 
3558
}
3559
 
3560
#endif
3561
 
3562
/*----------------------------------------------------------------------------
3563
| Rounds the double-precision floating-point value `a' to an integer, and
3564
| returns the result as a double-precision floating-point value.  The
3565
| operation is performed according to the IEC/IEEE Standard for Binary
3566
| Floating-Point Arithmetic.
3567
*----------------------------------------------------------------------------*/
3568
 
3569
float64 float64_round_to_int( float64 a )
3570
{
3571
    flag aSign;
3572
    int16 aExp;
3573
    bits64 lastBitMask, roundBitsMask;
3574
    int8 roundingMode;
3575
    float64 z;
3576
 
3577
    aExp = extractFloat64Exp( a );
3578
    if ( 0x433 <= aExp ) {
3579
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3580
            return propagateFloat64NaN( a, a );
3581
        }
3582
        return a;
3583
    }
3584
    if ( aExp < 0x3FF ) {
3585
        if ( (bits64) ( a<<1 ) == 0 ) return a;
3586
        float_exception_flags |= float_flag_inexact;
3587
        aSign = extractFloat64Sign( a );
3588
        switch ( float_rounding_mode ) {
3589
         case float_round_nearest_even:
3590
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3591
                return packFloat64( aSign, 0x3FF, 0 );
3592
            }
3593
            break;
3594
         case float_round_down:
3595
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
3596
         case float_round_up:
3597
            return
3598
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
3599
        }
3600
        return packFloat64( aSign, 0, 0 );
3601
    }
3602
    lastBitMask = 1;
3603
    lastBitMask <<= 0x433 - aExp;
3604
    roundBitsMask = lastBitMask - 1;
3605
    z = a;
3606
    roundingMode = float_rounding_mode;
3607
    if ( roundingMode == float_round_nearest_even ) {
3608
        z += lastBitMask>>1;
3609
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3610
    }
3611
    else if ( roundingMode != float_round_to_zero ) {
3612
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3613
            z += roundBitsMask;
3614
        }
3615
    }
3616
    z &= ~ roundBitsMask;
3617
    if ( z != a ) float_exception_flags |= float_flag_inexact;
3618
    return z;
3619
 
3620
}
3621
 
3622
/*----------------------------------------------------------------------------
3623
| Returns the result of adding the absolute values of the double-precision
3624
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3625
| before being returned.  `zSign' is ignored if the result is a NaN.
3626
| The addition is performed according to the IEC/IEEE Standard for Binary
3627
| Floating-Point Arithmetic.
3628
*----------------------------------------------------------------------------*/
3629
 
3630
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
3631
{
3632
    int16 aExp, bExp, zExp;
3633
    bits64 aSig, bSig, zSig;
3634
    int16 expDiff;
3635
 
3636
    aSig = extractFloat64Frac( a );
3637
    aExp = extractFloat64Exp( a );
3638
    bSig = extractFloat64Frac( b );
3639
    bExp = extractFloat64Exp( b );
3640
    expDiff = aExp - bExp;
3641
    aSig <<= 9;
3642
    bSig <<= 9;
3643
    if ( 0 < expDiff ) {
3644
        if ( aExp == 0x7FF ) {
3645
            if ( aSig ) return propagateFloat64NaN( a, b );
3646
            return a;
3647
        }
3648
        if ( bExp == 0 ) {
3649
            --expDiff;
3650
        }
3651
        else {
3652
            bSig |= LIT64( 0x2000000000000000 );
3653
        }
3654
        shift64RightJamming( bSig, expDiff, &bSig );
3655
        zExp = aExp;
3656
    }
3657
    else if ( expDiff < 0 ) {
3658
        if ( bExp == 0x7FF ) {
3659
            if ( bSig ) return propagateFloat64NaN( a, b );
3660
            return packFloat64( zSign, 0x7FF, 0 );
3661
        }
3662
        if ( aExp == 0 ) {
3663
            ++expDiff;
3664
        }
3665
        else {
3666
            aSig |= LIT64( 0x2000000000000000 );
3667
        }
3668
        shift64RightJamming( aSig, - expDiff, &aSig );
3669
        zExp = bExp;
3670
    }
3671
    else {
3672
        if ( aExp == 0x7FF ) {
3673
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
3674
            return a;
3675
        }
3676
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3677
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3678
        zExp = aExp;
3679
        goto roundAndPack;
3680
    }
3681
    aSig |= LIT64( 0x2000000000000000 );
3682
    zSig = ( aSig + bSig )<<1;
3683
    --zExp;
3684
    if ( (sbits64) zSig < 0 ) {
3685
        zSig = aSig + bSig;
3686
        ++zExp;
3687
    }
3688
 roundAndPack:
3689
    return roundAndPackFloat64( zSign, zExp, zSig );
3690
 
3691
}
3692
 
3693
/*----------------------------------------------------------------------------
3694
| Returns the result of subtracting the absolute values of the double-
3695
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3696
| difference is negated before being returned.  `zSign' is ignored if the
3697
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3698
| Standard for Binary Floating-Point Arithmetic.
3699
*----------------------------------------------------------------------------*/
3700
 
3701
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
3702
{
3703
    int16 aExp, bExp, zExp;
3704
    bits64 aSig, bSig, zSig;
3705
    int16 expDiff;
3706
 
3707
    aSig = extractFloat64Frac( a );
3708
    aExp = extractFloat64Exp( a );
3709
    bSig = extractFloat64Frac( b );
3710
    bExp = extractFloat64Exp( b );
3711
    expDiff = aExp - bExp;
3712
    aSig <<= 10;
3713
    bSig <<= 10;
3714
    if ( 0 < expDiff ) goto aExpBigger;
3715
    if ( expDiff < 0 ) goto bExpBigger;
3716
    if ( aExp == 0x7FF ) {
3717
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
3718
        float_raise( float_flag_invalid );
3719
        return float64_default_nan;
3720
    }
3721
    if ( aExp == 0 ) {
3722
        aExp = 1;
3723
        bExp = 1;
3724
    }
3725
    if ( bSig < aSig ) goto aBigger;
3726
    if ( aSig < bSig ) goto bBigger;
3727
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
3728
 bExpBigger:
3729
    if ( bExp == 0x7FF ) {
3730
        if ( bSig ) return propagateFloat64NaN( a, b );
3731
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3732
    }
3733
    if ( aExp == 0 ) {
3734
        ++expDiff;
3735
    }
3736
    else {
3737
        aSig |= LIT64( 0x4000000000000000 );
3738
    }
3739
    shift64RightJamming( aSig, - expDiff, &aSig );
3740
    bSig |= LIT64( 0x4000000000000000 );
3741
 bBigger:
3742
    zSig = bSig - aSig;
3743
    zExp = bExp;
3744
    zSign ^= 1;
3745
    goto normalizeRoundAndPack;
3746
 aExpBigger:
3747
    if ( aExp == 0x7FF ) {
3748
        if ( aSig ) return propagateFloat64NaN( a, b );
3749
        return a;
3750
    }
3751
    if ( bExp == 0 ) {
3752
        --expDiff;
3753
    }
3754
    else {
3755
        bSig |= LIT64( 0x4000000000000000 );
3756
    }
3757
    shift64RightJamming( bSig, expDiff, &bSig );
3758
    aSig |= LIT64( 0x4000000000000000 );
3759
 aBigger:
3760
    zSig = aSig - bSig;
3761
    zExp = aExp;
3762
 normalizeRoundAndPack:
3763
    --zExp;
3764
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
3765
 
3766
}
3767
 
3768
/*----------------------------------------------------------------------------
3769
| Returns the result of adding the double-precision floating-point values `a'
3770
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3771
| Binary Floating-Point Arithmetic.
3772
*----------------------------------------------------------------------------*/
3773
 
3774
float64 float64_add( float64 a, float64 b )
3775
{
3776
    flag aSign, bSign;
3777
 
3778
    aSign = extractFloat64Sign( a );
3779
    bSign = extractFloat64Sign( b );
3780
    if ( aSign == bSign ) {
3781
        return addFloat64Sigs( a, b, aSign );
3782
    }
3783
    else {
3784
        return subFloat64Sigs( a, b, aSign );
3785
    }
3786
 
3787
}
3788
 
3789
/*----------------------------------------------------------------------------
3790
| Returns the result of subtracting the double-precision floating-point values
3791
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3792
| for Binary Floating-Point Arithmetic.
3793
*----------------------------------------------------------------------------*/
3794
 
3795
float64 float64_sub( float64 a, float64 b )
3796
{
3797
    flag aSign, bSign;
3798
 
3799
    aSign = extractFloat64Sign( a );
3800
    bSign = extractFloat64Sign( b );
3801
    if ( aSign == bSign ) {
3802
        return subFloat64Sigs( a, b, aSign );
3803
    }
3804
    else {
3805
        return addFloat64Sigs( a, b, aSign );
3806
    }
3807
 
3808
}
3809
 
3810
/*----------------------------------------------------------------------------
3811
| Returns the result of multiplying the double-precision floating-point values
3812
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3813
| for Binary Floating-Point Arithmetic.
3814
*----------------------------------------------------------------------------*/
3815
 
3816
float64 float64_mul( float64 a, float64 b )
3817
{
3818
    flag aSign, bSign, zSign;
3819
    int16 aExp, bExp, zExp;
3820
    bits64 aSig, bSig, zSig0, zSig1;
3821
 
3822
    aSig = extractFloat64Frac( a );
3823
    aExp = extractFloat64Exp( a );
3824
    aSign = extractFloat64Sign( a );
3825
    bSig = extractFloat64Frac( b );
3826
    bExp = extractFloat64Exp( b );
3827
    bSign = extractFloat64Sign( b );
3828
    zSign = aSign ^ bSign;
3829
    if ( aExp == 0x7FF ) {
3830
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3831
            return propagateFloat64NaN( a, b );
3832
        }
3833
        if ( ( bExp | bSig ) == 0 ) {
3834
            float_raise( float_flag_invalid );
3835
            return float64_default_nan;
3836
        }
3837
        return packFloat64( zSign, 0x7FF, 0 );
3838
    }
3839
    if ( bExp == 0x7FF ) {
3840
        if ( bSig ) return propagateFloat64NaN( a, b );
3841
        if ( ( aExp | aSig ) == 0 ) {
3842
            float_raise( float_flag_invalid );
3843
            return float64_default_nan;
3844
        }
3845
        return packFloat64( zSign, 0x7FF, 0 );
3846
    }
3847
    if ( aExp == 0 ) {
3848
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3849
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3850
    }
3851
    if ( bExp == 0 ) {
3852
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3853
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3854
    }
3855
    zExp = aExp + bExp - 0x3FF;
3856
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3857
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3858
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3859
    zSig0 |= ( zSig1 != 0 );
3860
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3861
        zSig0 <<= 1;
3862
        --zExp;
3863
    }
3864
    return roundAndPackFloat64( zSign, zExp, zSig0 );
3865
 
3866
}
3867
 
3868
/*----------------------------------------------------------------------------
3869
| Returns the result of dividing the double-precision floating-point value `a'
3870
| by the corresponding value `b'.  The operation is performed according to
3871
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3872
*----------------------------------------------------------------------------*/
3873
 
3874
float64 float64_div( float64 a, float64 b )
3875
{
3876
    flag aSign, bSign, zSign;
3877
    int16 aExp, bExp, zExp;
3878
    bits64 aSig, bSig, zSig;
3879
    bits64 rem0, rem1;
3880
    bits64 term0, term1;
3881
 
3882
    aSig = extractFloat64Frac( a );
3883
    aExp = extractFloat64Exp( a );
3884
    aSign = extractFloat64Sign( a );
3885
    bSig = extractFloat64Frac( b );
3886
    bExp = extractFloat64Exp( b );
3887
    bSign = extractFloat64Sign( b );
3888
    zSign = aSign ^ bSign;
3889
    if ( aExp == 0x7FF ) {
3890
        if ( aSig ) return propagateFloat64NaN( a, b );
3891
        if ( bExp == 0x7FF ) {
3892
            if ( bSig ) return propagateFloat64NaN( a, b );
3893
            float_raise( float_flag_invalid );
3894
            return float64_default_nan;
3895
        }
3896
        return packFloat64( zSign, 0x7FF, 0 );
3897
    }
3898
    if ( bExp == 0x7FF ) {
3899
        if ( bSig ) return propagateFloat64NaN( a, b );
3900
        return packFloat64( zSign, 0, 0 );
3901
    }
3902
    if ( bExp == 0 ) {
3903
        if ( bSig == 0 ) {
3904
            if ( ( aExp | aSig ) == 0 ) {
3905
                float_raise( float_flag_invalid );
3906
                return float64_default_nan;
3907
            }
3908
            float_raise( float_flag_divbyzero );
3909
            return packFloat64( zSign, 0x7FF, 0 );
3910
        }
3911
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3912
    }
3913
    if ( aExp == 0 ) {
3914
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3915
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3916
    }
3917
    zExp = aExp - bExp + 0x3FD;
3918
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3919
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3920
    if ( bSig <= ( aSig + aSig ) ) {
3921
        aSig >>= 1;
3922
        ++zExp;
3923
    }
3924
    zSig = estimateDiv128To64( aSig, 0, bSig );
3925
    if ( ( zSig & 0x1FF ) <= 2 ) {
3926
        mul64To128( bSig, zSig, &term0, &term1 );
3927
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3928
        while ( (sbits64) rem0 < 0 ) {
3929
            --zSig;
3930
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3931
        }
3932
        zSig |= ( rem1 != 0 );
3933
    }
3934
    return roundAndPackFloat64( zSign, zExp, zSig );
3935
 
3936
}
3937
 
3938
/*----------------------------------------------------------------------------
3939
| Returns the remainder of the double-precision floating-point value `a'
3940
| with respect to the corresponding value `b'.  The operation is performed
3941
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3942
*----------------------------------------------------------------------------*/
3943
 
3944
float64 float64_rem( float64 a, float64 b )
3945
{
3946
    flag aSign, bSign, zSign;
3947
    int16 aExp, bExp, expDiff;
3948
    bits64 aSig, bSig;
3949
    bits64 q, alternateASig;
3950
    sbits64 sigMean;
3951
 
3952
    aSig = extractFloat64Frac( a );
3953
    aExp = extractFloat64Exp( a );
3954
    aSign = extractFloat64Sign( a );
3955
    bSig = extractFloat64Frac( b );
3956
    bExp = extractFloat64Exp( b );
3957
    bSign = extractFloat64Sign( b );
3958
    if ( aExp == 0x7FF ) {
3959
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3960
            return propagateFloat64NaN( a, b );
3961
        }
3962
        float_raise( float_flag_invalid );
3963
        return float64_default_nan;
3964
    }
3965
    if ( bExp == 0x7FF ) {
3966
        if ( bSig ) return propagateFloat64NaN( a, b );
3967
        return a;
3968
    }
3969
    if ( bExp == 0 ) {
3970
        if ( bSig == 0 ) {
3971
            float_raise( float_flag_invalid );
3972
            return float64_default_nan;
3973
        }
3974
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3975
    }
3976
    if ( aExp == 0 ) {
3977
        if ( aSig == 0 ) return a;
3978
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3979
    }
3980
    expDiff = aExp - bExp;
3981
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3982
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3983
    if ( expDiff < 0 ) {
3984
        if ( expDiff < -1 ) return a;
3985
        aSig >>= 1;
3986
    }
3987
    q = ( bSig <= aSig );
3988
    if ( q ) aSig -= bSig;
3989
    expDiff -= 64;
3990
    while ( 0 < expDiff ) {
3991
        q = estimateDiv128To64( aSig, 0, bSig );
3992
        q = ( 2 < q ) ? q - 2 : 0;
3993
        aSig = - ( ( bSig>>2 ) * q );
3994
        expDiff -= 62;
3995
    }
3996
    expDiff += 64;
3997
    if ( 0 < expDiff ) {
3998
        q = estimateDiv128To64( aSig, 0, bSig );
3999
        q = ( 2 < q ) ? q - 2 : 0;
4000
        q >>= 64 - expDiff;
4001
        bSig >>= 2;
4002
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4003
    }
4004
    else {
4005
        aSig >>= 2;
4006
        bSig >>= 2;
4007
    }
4008
    do {
4009
        alternateASig = aSig;
4010
        ++q;
4011
        aSig -= bSig;
4012
    } while ( 0 <= (sbits64) aSig );
4013
    sigMean = aSig + alternateASig;
4014
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4015
        aSig = alternateASig;
4016
    }
4017
    zSign = ( (sbits64) aSig < 0 );
4018
    if ( zSign ) aSig = - aSig;
4019
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
4020
 
4021
}
4022
 
4023
/*----------------------------------------------------------------------------
4024
| Returns the square root of the double-precision floating-point value `a'.
4025
| The operation is performed according to the IEC/IEEE Standard for Binary
4026
| Floating-Point Arithmetic.
4027
*----------------------------------------------------------------------------*/
4028
 
4029
float64 float64_sqrt( float64 a )
4030
{
4031
    flag aSign;
4032
    int16 aExp, zExp;
4033
    bits64 aSig, zSig, doubleZSig;
4034
    bits64 rem0, rem1, term0, term1;
4035
    //float64 z; // Unused variable
4036
 
4037
    aSig = extractFloat64Frac( a );
4038
    aExp = extractFloat64Exp( a );
4039
    aSign = extractFloat64Sign( a );
4040
    if ( aExp == 0x7FF ) {
4041
        if ( aSig ) return propagateFloat64NaN( a, a );
4042
        if ( ! aSign ) return a;
4043
        float_raise( float_flag_invalid );
4044
        return float64_default_nan;
4045
    }
4046
    if ( aSign ) {
4047
        if ( ( aExp | aSig ) == 0 ) return a;
4048
        float_raise( float_flag_invalid );
4049
        return float64_default_nan;
4050
    }
4051
    if ( aExp == 0 ) {
4052
        if ( aSig == 0 ) return 0;
4053
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4054
    }
4055
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4056
    aSig |= LIT64( 0x0010000000000000 );
4057
    zSig = estimateSqrt32( aExp, aSig>>21 );
4058
    aSig <<= 9 - ( aExp & 1 );
4059
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4060
    if ( ( zSig & 0x1FF ) <= 5 ) {
4061
        doubleZSig = zSig<<1;
4062
        mul64To128( zSig, zSig, &term0, &term1 );
4063
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4064
        while ( (sbits64) rem0 < 0 ) {
4065
            --zSig;
4066
            doubleZSig -= 2;
4067
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4068
        }
4069
        zSig |= ( ( rem0 | rem1 ) != 0 );
4070
    }
4071
    return roundAndPackFloat64( 0, zExp, zSig );
4072
 
4073
}
4074
 
4075
/*----------------------------------------------------------------------------
4076
| Returns 1 if the double-precision floating-point value `a' is equal to the
4077
| corresponding value `b', and 0 otherwise.  The comparison is performed
4078
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079
*----------------------------------------------------------------------------*/
4080
 
4081
flag float64_eq( float64 a, float64 b )
4082
{
4083
 
4084
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4085
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4086
       ) {
4087
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4088
            float_raise( float_flag_invalid );
4089
        }
4090
        return 0;
4091
    }
4092
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
4093
 
4094
}
4095
 
4096
/*----------------------------------------------------------------------------
4097
| Returns 1 if the double-precision floating-point value `a' is less than or
4098
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4099
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4100
| Arithmetic.
4101
*----------------------------------------------------------------------------*/
4102
 
4103
flag float64_le( float64 a, float64 b )
4104
{
4105
    flag aSign, bSign;
4106
 
4107
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4108
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4109
       ) {
4110
        float_raise( float_flag_invalid );
4111
        return 0;
4112
    }
4113
    aSign = extractFloat64Sign( a );
4114
    bSign = extractFloat64Sign( b );
4115
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
4116
    return ( a == b ) || ( aSign ^ ( a < b ) );
4117
 
4118
}
4119
 
4120
/*----------------------------------------------------------------------------
4121
| Returns 1 if the double-precision floating-point value `a' is less than
4122
| the corresponding value `b', and 0 otherwise.  The comparison is performed
4123
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4124
*----------------------------------------------------------------------------*/
4125
 
4126
flag float64_lt( float64 a, float64 b )
4127
{
4128
    flag aSign, bSign;
4129
 
4130
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4131
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4132
       ) {
4133
        float_raise( float_flag_invalid );
4134
        return 0;
4135
    }
4136
    aSign = extractFloat64Sign( a );
4137
    bSign = extractFloat64Sign( b );
4138
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
4139
    return ( a != b ) && ( aSign ^ ( a < b ) );
4140
 
4141
}
4142
 
4143
/*----------------------------------------------------------------------------
4144
| Returns 1 if the double-precision floating-point value `a' is equal to the
4145
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4146
| if either operand is a NaN.  Otherwise, the comparison is performed
4147
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4148
*----------------------------------------------------------------------------*/
4149
 
4150
flag float64_eq_signaling( float64 a, float64 b )
4151
{
4152
 
4153
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4154
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4155
       ) {
4156
        float_raise( float_flag_invalid );
4157
        return 0;
4158
    }
4159
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
4160
 
4161
}
4162
 
4163
/*----------------------------------------------------------------------------
4164
| Returns 1 if the double-precision floating-point value `a' is less than or
4165
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4166
| cause an exception.  Otherwise, the comparison is performed according to the
4167
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4168
*----------------------------------------------------------------------------*/
4169
 
4170
flag float64_le_quiet( float64 a, float64 b )
4171
{
4172
    flag aSign, bSign;
4173
    //int16 aExp, bExp; // Unused variable
4174
 
4175
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4176
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4177
       ) {
4178
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4179
            float_raise( float_flag_invalid );
4180
        }
4181
        return 0;
4182
    }
4183
    aSign = extractFloat64Sign( a );
4184
    bSign = extractFloat64Sign( b );
4185
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
4186
    return ( a == b ) || ( aSign ^ ( a < b ) );
4187
 
4188
}
4189
 
4190
/*----------------------------------------------------------------------------
4191
| Returns 1 if the double-precision floating-point value `a' is less than
4192
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4193
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4194
| Standard for Binary Floating-Point Arithmetic.
4195
*----------------------------------------------------------------------------*/
4196
 
4197
flag float64_lt_quiet( float64 a, float64 b )
4198
{
4199
    flag aSign, bSign;
4200
 
4201
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4202
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4203
       ) {
4204
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4205
            float_raise( float_flag_invalid );
4206
        }
4207
        return 0;
4208
    }
4209
    aSign = extractFloat64Sign( a );
4210
    bSign = extractFloat64Sign( b );
4211
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
4212
    return ( a != b ) && ( aSign ^ ( a < b ) );
4213
 
4214
}
4215
 
4216
#ifdef FLOATX80
4217
 
4218
/*----------------------------------------------------------------------------
4219
| Returns the result of converting the extended double-precision floating-
4220
| point value `a' to the 32-bit two's complement integer format.  The
4221
| conversion is performed according to the IEC/IEEE Standard for Binary
4222
| Floating-Point Arithmetic---which means in particular that the conversion
4223
| is rounded according to the current rounding mode.  If `a' is a NaN, the
4224
| largest positive integer is returned.  Otherwise, if the conversion
4225
| overflows, the largest integer with the same sign as `a' is returned.
4226
*----------------------------------------------------------------------------*/
4227
 
4228
int32 floatx80_to_int32( floatx80 a )
4229
{
4230
    flag aSign;
4231
    int32 aExp, shiftCount;
4232
    bits64 aSig;
4233
 
4234
    aSig = extractFloatx80Frac( a );
4235
    aExp = extractFloatx80Exp( a );
4236
    aSign = extractFloatx80Sign( a );
4237
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
4238
    shiftCount = 0x4037 - aExp;
4239
    if ( shiftCount <= 0 ) shiftCount = 1;
4240
    shift64RightJamming( aSig, shiftCount, &aSig );
4241
    return roundAndPackInt32( aSign, aSig );
4242
 
4243
}
4244
 
4245
/*----------------------------------------------------------------------------
4246
| Returns the result of converting the extended double-precision floating-
4247
| point value `a' to the 32-bit two's complement integer format.  The
4248
| conversion is performed according to the IEC/IEEE Standard for Binary
4249
| Floating-Point Arithmetic, except that the conversion is always rounded
4250
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4251
| Otherwise, if the conversion overflows, the largest integer with the same
4252
| sign as `a' is returned.
4253
*----------------------------------------------------------------------------*/
4254
 
4255
int32 floatx80_to_int32_round_to_zero( floatx80 a )
4256
{
4257
    flag aSign;
4258
    int32 aExp, shiftCount;
4259
    bits64 aSig, savedASig;
4260
    int32 z;
4261
 
4262
    aSig = extractFloatx80Frac( a );
4263
    aExp = extractFloatx80Exp( a );
4264
    aSign = extractFloatx80Sign( a );
4265
    if ( 0x401E < aExp ) {
4266
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
4267
        goto invalid;
4268
    }
4269
    else if ( aExp < 0x3FFF ) {
4270
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
4271
        return 0;
4272
    }
4273
    shiftCount = 0x403E - aExp;
4274
    savedASig = aSig;
4275
    aSig >>= shiftCount;
4276
    z = aSig;
4277
    if ( aSign ) z = - z;
4278
    if ( ( z < 0 ) ^ aSign ) {
4279
 invalid:
4280
        float_raise( float_flag_invalid );
4281
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4282
    }
4283
    if ( ( aSig<<shiftCount ) != savedASig ) {
4284
        float_exception_flags |= float_flag_inexact;
4285
    }
4286
    return z;
4287
 
4288
}
4289
 
4290
/*----------------------------------------------------------------------------
4291
| Returns the result of converting the extended double-precision floating-
4292
| point value `a' to the 64-bit two's complement integer format.  The
4293
| conversion is performed according to the IEC/IEEE Standard for Binary
4294
| Floating-Point Arithmetic---which means in particular that the conversion
4295
| is rounded according to the current rounding mode.  If `a' is a NaN,
4296
| the largest positive integer is returned.  Otherwise, if the conversion
4297
| overflows, the largest integer with the same sign as `a' is returned.
4298
*----------------------------------------------------------------------------*/
4299
 
4300
int64 floatx80_to_int64( floatx80 a )
4301
{
4302
    flag aSign;
4303
    int32 aExp, shiftCount;
4304
    bits64 aSig, aSigExtra;
4305
 
4306
    aSig = extractFloatx80Frac( a );
4307
    aExp = extractFloatx80Exp( a );
4308
    aSign = extractFloatx80Sign( a );
4309
    shiftCount = 0x403E - aExp;
4310
    if ( shiftCount <= 0 ) {
4311
        if ( shiftCount ) {
4312
            float_raise( float_flag_invalid );
4313
            if (    ! aSign
4314
                 || (    ( aExp == 0x7FFF )
4315
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4316
               ) {
4317
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4318
            }
4319
            return (sbits64) LIT64( 0x8000000000000000 );
4320
        }
4321
        aSigExtra = 0;
4322
    }
4323
    else {
4324
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4325
    }
4326
    return roundAndPackInt64( aSign, aSig, aSigExtra );
4327
 
4328
}
4329
 
4330
/*----------------------------------------------------------------------------
4331
| Returns the result of converting the extended double-precision floating-
4332
| point value `a' to the 64-bit two's complement integer format.  The
4333
| conversion is performed according to the IEC/IEEE Standard for Binary
4334
| Floating-Point Arithmetic, except that the conversion is always rounded
4335
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4336
| Otherwise, if the conversion overflows, the largest integer with the same
4337
| sign as `a' is returned.
4338
*----------------------------------------------------------------------------*/
4339
 
4340
int64 floatx80_to_int64_round_to_zero( floatx80 a )
4341
{
4342
    flag aSign;
4343
    int32 aExp, shiftCount;
4344
    bits64 aSig;
4345
    int64 z;
4346
 
4347
    aSig = extractFloatx80Frac( a );
4348
    aExp = extractFloatx80Exp( a );
4349
    aSign = extractFloatx80Sign( a );
4350
    shiftCount = aExp - 0x403E;
4351
    if ( 0 <= shiftCount ) {
4352
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4353
        if ( ( a.high != 0xC03E ) || aSig ) {
4354
            float_raise( float_flag_invalid );
4355
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4356
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4357
            }
4358
        }
4359
        return (sbits64) LIT64( 0x8000000000000000 );
4360
    }
4361
    else if ( aExp < 0x3FFF ) {
4362
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
4363
        return 0;
4364
    }
4365
    z = aSig>>( - shiftCount );
4366
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
4367
        float_exception_flags |= float_flag_inexact;
4368
    }
4369
    if ( aSign ) z = - z;
4370
    return z;
4371
 
4372
}
4373
 
4374
/*----------------------------------------------------------------------------
4375
| Returns the result of converting the extended double-precision floating-
4376
| point value `a' to the single-precision floating-point format.  The
4377
| conversion is performed according to the IEC/IEEE Standard for Binary
4378
| Floating-Point Arithmetic.
4379
*----------------------------------------------------------------------------*/
4380
 
4381
float32 floatx80_to_float32( floatx80 a )
4382
{
4383
    flag aSign;
4384
    int32 aExp;
4385
    bits64 aSig;
4386
 
4387
    aSig = extractFloatx80Frac( a );
4388
    aExp = extractFloatx80Exp( a );
4389
    aSign = extractFloatx80Sign( a );
4390
    if ( aExp == 0x7FFF ) {
4391
        if ( (bits64) ( aSig<<1 ) ) {
4392
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
4393
        }
4394
        return packFloat32( aSign, 0xFF, 0 );
4395
    }
4396
    shift64RightJamming( aSig, 33, &aSig );
4397
    if ( aExp || aSig ) aExp -= 0x3F81;
4398
    return roundAndPackFloat32( aSign, aExp, aSig );
4399
 
4400
}
4401
 
4402
/*----------------------------------------------------------------------------
4403
| Returns the result of converting the extended double-precision floating-
4404
| point value `a' to the double-precision floating-point format.  The
4405
| conversion is performed according to the IEC/IEEE Standard for Binary
4406
| Floating-Point Arithmetic.
4407
*----------------------------------------------------------------------------*/
4408
 
4409
float64 floatx80_to_float64( floatx80 a )
4410
{
4411
    flag aSign;
4412
    int32 aExp;
4413
    bits64 aSig, zSig;
4414
 
4415
    aSig = extractFloatx80Frac( a );
4416
    aExp = extractFloatx80Exp( a );
4417
    aSign = extractFloatx80Sign( a );
4418
    if ( aExp == 0x7FFF ) {
4419
        if ( (bits64) ( aSig<<1 ) ) {
4420
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
4421
        }
4422
        return packFloat64( aSign, 0x7FF, 0 );
4423
    }
4424
    shift64RightJamming( aSig, 1, &zSig );
4425
    if ( aExp || aSig ) aExp -= 0x3C01;
4426
    return roundAndPackFloat64( aSign, aExp, zSig );
4427
 
4428
}
4429
 
4430
#ifdef FLOAT128
4431
 
4432
/*----------------------------------------------------------------------------
4433
| Returns the result of converting the extended double-precision floating-
4434
| point value `a' to the quadruple-precision floating-point format.  The
4435
| conversion is performed according to the IEC/IEEE Standard for Binary
4436
| Floating-Point Arithmetic.
4437
*----------------------------------------------------------------------------*/
4438
 
4439
float128 floatx80_to_float128( floatx80 a )
4440
{
4441
    flag aSign;
4442
    int16 aExp;
4443
    bits64 aSig, zSig0, zSig1;
4444
 
4445
    aSig = extractFloatx80Frac( a );
4446
    aExp = extractFloatx80Exp( a );
4447
    aSign = extractFloatx80Sign( a );
4448
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
4449
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
4450
    }
4451
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4452
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4453
 
4454
}
4455
 
4456
#endif
4457
 
4458
/*----------------------------------------------------------------------------
4459
| Rounds the extended double-precision floating-point value `a' to an integer,
4460
| and returns the result as an extended quadruple-precision floating-point
4461
| value.  The operation is performed according to the IEC/IEEE Standard for
4462
| Binary Floating-Point Arithmetic.
4463
*----------------------------------------------------------------------------*/
4464
 
4465
floatx80 floatx80_round_to_int( floatx80 a )
4466
{
4467
    flag aSign;
4468
    int32 aExp;
4469
    bits64 lastBitMask, roundBitsMask;
4470
    int8 roundingMode;
4471
    floatx80 z;
4472
 
4473
    aExp = extractFloatx80Exp( a );
4474
    if ( 0x403E <= aExp ) {
4475
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
4476
            return propagateFloatx80NaN( a, a );
4477
        }
4478
        return a;
4479
    }
4480
    if ( aExp < 0x3FFF ) {
4481
        if (    ( aExp == 0 )
4482
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4483
            return a;
4484
        }
4485
        float_exception_flags |= float_flag_inexact;
4486
        aSign = extractFloatx80Sign( a );
4487
        switch ( float_rounding_mode ) {
4488
         case float_round_nearest_even:
4489
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
4490
               ) {
4491
                return
4492
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4493
            }
4494
            break;
4495
         case float_round_down:
4496
            return
4497
                  aSign ?
4498
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4499
                : packFloatx80( 0, 0, 0 );
4500
         case float_round_up:
4501
            return
4502
                  aSign ? packFloatx80( 1, 0, 0 )
4503
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4504
        }
4505
        return packFloatx80( aSign, 0, 0 );
4506
    }
4507
    lastBitMask = 1;
4508
    lastBitMask <<= 0x403E - aExp;
4509
    roundBitsMask = lastBitMask - 1;
4510
    z = a;
4511
    roundingMode = float_rounding_mode;
4512
    if ( roundingMode == float_round_nearest_even ) {
4513
        z.low += lastBitMask>>1;
4514
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4515
    }
4516
    else if ( roundingMode != float_round_to_zero ) {
4517
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4518
            z.low += roundBitsMask;
4519
        }
4520
    }
4521
    z.low &= ~ roundBitsMask;
4522
    if ( z.low == 0 ) {
4523
        ++z.high;
4524
        z.low = LIT64( 0x8000000000000000 );
4525
    }
4526
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
4527
    return z;
4528
 
4529
}
4530
 
4531
/*----------------------------------------------------------------------------
4532
| Returns the result of adding the absolute values of the extended double-
4533
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4534
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4535
| The addition is performed according to the IEC/IEEE Standard for Binary
4536
| Floating-Point Arithmetic.
4537
*----------------------------------------------------------------------------*/
4538
 
4539
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
4540
{
4541
    int32 aExp, bExp, zExp;
4542
    bits64 aSig, bSig, zSig0, zSig1;
4543
    int32 expDiff;
4544
 
4545
    aSig = extractFloatx80Frac( a );
4546
    aExp = extractFloatx80Exp( a );
4547
    bSig = extractFloatx80Frac( b );
4548
    bExp = extractFloatx80Exp( b );
4549
    expDiff = aExp - bExp;
4550
    if ( 0 < expDiff ) {
4551
        if ( aExp == 0x7FFF ) {
4552
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4553
            return a;
4554
        }
4555
        if ( bExp == 0 ) --expDiff;
4556
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4557
        zExp = aExp;
4558
    }
4559
    else if ( expDiff < 0 ) {
4560
        if ( bExp == 0x7FFF ) {
4561
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4562
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4563
        }
4564
        if ( aExp == 0 ) ++expDiff;
4565
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4566
        zExp = bExp;
4567
    }
4568
    else {
4569
        if ( aExp == 0x7FFF ) {
4570
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4571
                return propagateFloatx80NaN( a, b );
4572
            }
4573
            return a;
4574
        }
4575
        zSig1 = 0;
4576
        zSig0 = aSig + bSig;
4577
        if ( aExp == 0 ) {
4578
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4579
            goto roundAndPack;
4580
        }
4581
        zExp = aExp;
4582
        goto shiftRight1;
4583
    }
4584
    zSig0 = aSig + bSig;
4585
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4586
 shiftRight1:
4587
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4588
    zSig0 |= LIT64( 0x8000000000000000 );
4589
    ++zExp;
4590
 roundAndPack:
4591
    return
4592
        roundAndPackFloatx80(
4593
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4594
 
4595
}
4596
 
4597
/*----------------------------------------------------------------------------
4598
| Returns the result of subtracting the absolute values of the extended
4599
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4600
| difference is negated before being returned.  `zSign' is ignored if the
4601
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4602
| Standard for Binary Floating-Point Arithmetic.
4603
*----------------------------------------------------------------------------*/
4604
 
4605
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
4606
{
4607
    int32 aExp, bExp, zExp;
4608
    bits64 aSig, bSig, zSig0, zSig1;
4609
    int32 expDiff;
4610
    floatx80 z;
4611
 
4612
    aSig = extractFloatx80Frac( a );
4613
    aExp = extractFloatx80Exp( a );
4614
    bSig = extractFloatx80Frac( b );
4615
    bExp = extractFloatx80Exp( b );
4616
    expDiff = aExp - bExp;
4617
    if ( 0 < expDiff ) goto aExpBigger;
4618
    if ( expDiff < 0 ) goto bExpBigger;
4619
    if ( aExp == 0x7FFF ) {
4620
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4621
            return propagateFloatx80NaN( a, b );
4622
        }
4623
        float_raise( float_flag_invalid );
4624
        z.low = floatx80_default_nan_low;
4625
        z.high = floatx80_default_nan_high;
4626
        return z;
4627
    }
4628
    if ( aExp == 0 ) {
4629
        aExp = 1;
4630
        bExp = 1;
4631
    }
4632
    zSig1 = 0;
4633
    if ( bSig < aSig ) goto aBigger;
4634
    if ( aSig < bSig ) goto bBigger;
4635
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
4636
 bExpBigger:
4637
    if ( bExp == 0x7FFF ) {
4638
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4639
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4640
    }
4641
    if ( aExp == 0 ) ++expDiff;
4642
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4643
 bBigger:
4644
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4645
    zExp = bExp;
4646
    zSign ^= 1;
4647
    goto normalizeRoundAndPack;
4648
 aExpBigger:
4649
    if ( aExp == 0x7FFF ) {
4650
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4651
        return a;
4652
    }
4653
    if ( bExp == 0 ) --expDiff;
4654
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4655
 aBigger:
4656
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4657
    zExp = aExp;
4658
 normalizeRoundAndPack:
4659
    return
4660
        normalizeRoundAndPackFloatx80(
4661
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4662
 
4663
}
4664
 
4665
/*----------------------------------------------------------------------------
4666
| Returns the result of adding the extended double-precision floating-point
4667
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4668
| Standard for Binary Floating-Point Arithmetic.
4669
*----------------------------------------------------------------------------*/
4670
 
4671
floatx80 floatx80_add( floatx80 a, floatx80 b )
4672
{
4673
    flag aSign, bSign;
4674
 
4675
    aSign = extractFloatx80Sign( a );
4676
    bSign = extractFloatx80Sign( b );
4677
    if ( aSign == bSign ) {
4678
        return addFloatx80Sigs( a, b, aSign );
4679
    }
4680
    else {
4681
        return subFloatx80Sigs( a, b, aSign );
4682
    }
4683
 
4684
}
4685
 
4686
/*----------------------------------------------------------------------------
4687
| Returns the result of subtracting the extended double-precision floating-
4688
| point values `a' and `b'.  The operation is performed according to the
4689
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4690
*----------------------------------------------------------------------------*/
4691
 
4692
floatx80 floatx80_sub( floatx80 a, floatx80 b )
4693
{
4694
    flag aSign, bSign;
4695
 
4696
    aSign = extractFloatx80Sign( a );
4697
    bSign = extractFloatx80Sign( b );
4698
    if ( aSign == bSign ) {
4699
        return subFloatx80Sigs( a, b, aSign );
4700
    }
4701
    else {
4702
        return addFloatx80Sigs( a, b, aSign );
4703
    }
4704
 
4705
}
4706
 
4707
/*----------------------------------------------------------------------------
4708
| Returns the result of multiplying the extended double-precision floating-
4709
| point values `a' and `b'.  The operation is performed according to the
4710
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4711
*----------------------------------------------------------------------------*/
4712
 
4713
floatx80 floatx80_mul( floatx80 a, floatx80 b )
4714
{
4715
    flag aSign, bSign, zSign;
4716
    int32 aExp, bExp, zExp;
4717
    bits64 aSig, bSig, zSig0, zSig1;
4718
    floatx80 z;
4719
 
4720
    aSig = extractFloatx80Frac( a );
4721
    aExp = extractFloatx80Exp( a );
4722
    aSign = extractFloatx80Sign( a );
4723
    bSig = extractFloatx80Frac( b );
4724
    bExp = extractFloatx80Exp( b );
4725
    bSign = extractFloatx80Sign( b );
4726
    zSign = aSign ^ bSign;
4727
    if ( aExp == 0x7FFF ) {
4728
        if (    (bits64) ( aSig<<1 )
4729
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4730
            return propagateFloatx80NaN( a, b );
4731
        }
4732
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4733
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4734
    }
4735
    if ( bExp == 0x7FFF ) {
4736
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4737
        if ( ( aExp | aSig ) == 0 ) {
4738
 invalid:
4739
            float_raise( float_flag_invalid );
4740
            z.low = floatx80_default_nan_low;
4741
            z.high = floatx80_default_nan_high;
4742
            return z;
4743
        }
4744
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4745
    }
4746
    if ( aExp == 0 ) {
4747
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4748
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4749
    }
4750
    if ( bExp == 0 ) {
4751
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4752
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4753
    }
4754
    zExp = aExp + bExp - 0x3FFE;
4755
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4756
    if ( 0 < (sbits64) zSig0 ) {
4757
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4758
        --zExp;
4759
    }
4760
    return
4761
        roundAndPackFloatx80(
4762
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4763
 
4764
}
4765
 
4766
/*----------------------------------------------------------------------------
4767
| Returns the result of dividing the extended double-precision floating-point
4768
| value `a' by the corresponding value `b'.  The operation is performed
4769
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4770
*----------------------------------------------------------------------------*/
4771
 
4772
floatx80 floatx80_div( floatx80 a, floatx80 b )
4773
{
4774
    flag aSign, bSign, zSign;
4775
    int32 aExp, bExp, zExp;
4776
    bits64 aSig, bSig, zSig0, zSig1;
4777
    bits64 rem0, rem1, rem2, term0, term1, term2;
4778
    floatx80 z;
4779
 
4780
    aSig = extractFloatx80Frac( a );
4781
    aExp = extractFloatx80Exp( a );
4782
    aSign = extractFloatx80Sign( a );
4783
    bSig = extractFloatx80Frac( b );
4784
    bExp = extractFloatx80Exp( b );
4785
    bSign = extractFloatx80Sign( b );
4786
    zSign = aSign ^ bSign;
4787
    if ( aExp == 0x7FFF ) {
4788
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4789
        if ( bExp == 0x7FFF ) {
4790
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4791
            goto invalid;
4792
        }
4793
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4794
    }
4795
    if ( bExp == 0x7FFF ) {
4796
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4797
        return packFloatx80( zSign, 0, 0 );
4798
    }
4799
    if ( bExp == 0 ) {
4800
        if ( bSig == 0 ) {
4801
            if ( ( aExp | aSig ) == 0 ) {
4802
 invalid:
4803
                float_raise( float_flag_invalid );
4804
                z.low = floatx80_default_nan_low;
4805
                z.high = floatx80_default_nan_high;
4806
                return z;
4807
            }
4808
            float_raise( float_flag_divbyzero );
4809
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4810
        }
4811
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4812
    }
4813
    if ( aExp == 0 ) {
4814
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4815
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4816
    }
4817
    zExp = aExp - bExp + 0x3FFE;
4818
    rem1 = 0;
4819
    if ( bSig <= aSig ) {
4820
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4821
        ++zExp;
4822
    }
4823
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4824
    mul64To128( bSig, zSig0, &term0, &term1 );
4825
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4826
    while ( (sbits64) rem0 < 0 ) {
4827
        --zSig0;
4828
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4829
    }
4830
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4831
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4832
        mul64To128( bSig, zSig1, &term1, &term2 );
4833
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4834
        while ( (sbits64) rem1 < 0 ) {
4835
            --zSig1;
4836
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4837
        }
4838
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4839
    }
4840
    return
4841
        roundAndPackFloatx80(
4842
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4843
 
4844
}
4845
 
4846
/*----------------------------------------------------------------------------
4847
| Returns the remainder of the extended double-precision floating-point value
4848
| `a' with respect to the corresponding value `b'.  The operation is performed
4849
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4850
*----------------------------------------------------------------------------*/
4851
 
4852
floatx80 floatx80_rem( floatx80 a, floatx80 b )
4853
{
4854
    flag aSign, bSign, zSign;
4855
    int32 aExp, bExp, expDiff;
4856
    bits64 aSig0, aSig1, bSig;
4857
    bits64 q, term0, term1, alternateASig0, alternateASig1;
4858
    floatx80 z;
4859
 
4860
    aSig0 = extractFloatx80Frac( a );
4861
    aExp = extractFloatx80Exp( a );
4862
    aSign = extractFloatx80Sign( a );
4863
    bSig = extractFloatx80Frac( b );
4864
    bExp = extractFloatx80Exp( b );
4865
    bSign = extractFloatx80Sign( b );
4866
    if ( aExp == 0x7FFF ) {
4867
        if (    (bits64) ( aSig0<<1 )
4868
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4869
            return propagateFloatx80NaN( a, b );
4870
        }
4871
        goto invalid;
4872
    }
4873
    if ( bExp == 0x7FFF ) {
4874
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4875
        return a;
4876
    }
4877
    if ( bExp == 0 ) {
4878
        if ( bSig == 0 ) {
4879
 invalid:
4880
            float_raise( float_flag_invalid );
4881
            z.low = floatx80_default_nan_low;
4882
            z.high = floatx80_default_nan_high;
4883
            return z;
4884
        }
4885
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4886
    }
4887
    if ( aExp == 0 ) {
4888
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4889
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4890
    }
4891
    bSig |= LIT64( 0x8000000000000000 );
4892
    zSign = aSign;
4893
    expDiff = aExp - bExp;
4894
    aSig1 = 0;
4895
    if ( expDiff < 0 ) {
4896
        if ( expDiff < -1 ) return a;
4897
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4898
        expDiff = 0;
4899
    }
4900
    q = ( bSig <= aSig0 );
4901
    if ( q ) aSig0 -= bSig;
4902
    expDiff -= 64;
4903
    while ( 0 < expDiff ) {
4904
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4905
        q = ( 2 < q ) ? q - 2 : 0;
4906
        mul64To128( bSig, q, &term0, &term1 );
4907
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4908
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4909
        expDiff -= 62;
4910
    }
4911
    expDiff += 64;
4912
    if ( 0 < expDiff ) {
4913
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4914
        q = ( 2 < q ) ? q - 2 : 0;
4915
        q >>= 64 - expDiff;
4916
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4917
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4918
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4919
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4920
            ++q;
4921
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4922
        }
4923
    }
4924
    else {
4925
        term1 = 0;
4926
        term0 = bSig;
4927
    }
4928
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4929
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4930
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4931
              && ( q & 1 ) )
4932
       ) {
4933
        aSig0 = alternateASig0;
4934
        aSig1 = alternateASig1;
4935
        zSign = ! zSign;
4936
    }
4937
    return
4938
        normalizeRoundAndPackFloatx80(
4939
            80, zSign, bExp + expDiff, aSig0, aSig1 );
4940
 
4941
}
4942
 
4943
/*----------------------------------------------------------------------------
4944
| Returns the square root of the extended double-precision floating-point
4945
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4946
| for Binary Floating-Point Arithmetic.
4947
*----------------------------------------------------------------------------*/
4948
 
4949
floatx80 floatx80_sqrt( floatx80 a )
4950
{
4951
    flag aSign;
4952
    int32 aExp, zExp;
4953
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4954
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4955
    floatx80 z;
4956
 
4957
    aSig0 = extractFloatx80Frac( a );
4958
    aExp = extractFloatx80Exp( a );
4959
    aSign = extractFloatx80Sign( a );
4960
    if ( aExp == 0x7FFF ) {
4961
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4962
        if ( ! aSign ) return a;
4963
        goto invalid;
4964
    }
4965
    if ( aSign ) {
4966
        if ( ( aExp | aSig0 ) == 0 ) return a;
4967
 invalid:
4968
        float_raise( float_flag_invalid );
4969
        z.low = floatx80_default_nan_low;
4970
        z.high = floatx80_default_nan_high;
4971
        return z;
4972
    }
4973
    if ( aExp == 0 ) {
4974
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4975
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4976
    }
4977
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4978
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4979
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4980
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4981
    doubleZSig0 = zSig0<<1;
4982
    mul64To128( zSig0, zSig0, &term0, &term1 );
4983
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4984
    while ( (sbits64) rem0 < 0 ) {
4985
        --zSig0;
4986
        doubleZSig0 -= 2;
4987
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4988
    }
4989
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4990
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4991
        if ( zSig1 == 0 ) zSig1 = 1;
4992
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4993
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4994
        mul64To128( zSig1, zSig1, &term2, &term3 );
4995
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4996
        while ( (sbits64) rem1 < 0 ) {
4997
            --zSig1;
4998
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4999
            term3 |= 1;
5000
            term2 |= doubleZSig0;
5001
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5002
        }
5003
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5004
    }
5005
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5006
    zSig0 |= doubleZSig0;
5007
    return
5008
        roundAndPackFloatx80(
5009
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
5010
 
5011
}
5012
 
5013
/*----------------------------------------------------------------------------
5014
| Returns 1 if the extended double-precision floating-point value `a' is
5015
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
5016
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5017
| Arithmetic.
5018
*----------------------------------------------------------------------------*/
5019
 
5020
flag floatx80_eq( floatx80 a, floatx80 b )
5021
{
5022
 
5023
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5024
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5025
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5026
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5027
       ) {
5028
        if (    floatx80_is_signaling_nan( a )
5029
             || floatx80_is_signaling_nan( b ) ) {
5030
            float_raise( float_flag_invalid );
5031
        }
5032
        return 0;
5033
    }
5034
    return
5035
           ( a.low == b.low )
5036
        && (    ( a.high == b.high )
5037
             || (    ( a.low == 0 )
5038
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
5039
           );
5040
 
5041
}
5042
 
5043
/*----------------------------------------------------------------------------
5044
| Returns 1 if the extended double-precision floating-point value `a' is
5045
| less than or equal to the corresponding value `b', and 0 otherwise.  The
5046
| comparison is performed according to the IEC/IEEE Standard for Binary
5047
| Floating-Point Arithmetic.
5048
*----------------------------------------------------------------------------*/
5049
 
5050
flag floatx80_le( floatx80 a, floatx80 b )
5051
{
5052
    flag aSign, bSign;
5053
 
5054
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5055
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5056
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5057
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5058
       ) {
5059
        float_raise( float_flag_invalid );
5060
        return 0;
5061
    }
5062
    aSign = extractFloatx80Sign( a );
5063
    bSign = extractFloatx80Sign( b );
5064
    if ( aSign != bSign ) {
5065
        return
5066
               aSign
5067
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5068
                 == 0 );
5069
    }
5070
    return
5071
          aSign ? le128( b.high, b.low, a.high, a.low )
5072
        : le128( a.high, a.low, b.high, b.low );
5073
 
5074
}
5075
 
5076
/*----------------------------------------------------------------------------
5077
| Returns 1 if the extended double-precision floating-point value `a' is
5078
| less than the corresponding value `b', and 0 otherwise.  The comparison
5079
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5080
| Arithmetic.
5081
*----------------------------------------------------------------------------*/
5082
 
5083
flag floatx80_lt( floatx80 a, floatx80 b )
5084
{
5085
    flag aSign, bSign;
5086
 
5087
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5088
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5089
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5090
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5091
       ) {
5092
        float_raise( float_flag_invalid );
5093
        return 0;
5094
    }
5095
    aSign = extractFloatx80Sign( a );
5096
    bSign = extractFloatx80Sign( b );
5097
    if ( aSign != bSign ) {
5098
        return
5099
               aSign
5100
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5101
                 != 0 );
5102
    }
5103
    return
5104
          aSign ? lt128( b.high, b.low, a.high, a.low )
5105
        : lt128( a.high, a.low, b.high, b.low );
5106
 
5107
}
5108
 
5109
/*----------------------------------------------------------------------------
5110
| Returns 1 if the extended double-precision floating-point value `a' is equal
5111
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5112
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5113
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5114
*----------------------------------------------------------------------------*/
5115
 
5116
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
5117
{
5118
 
5119
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5120
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5121
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5122
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5123
       ) {
5124
        float_raise( float_flag_invalid );
5125
        return 0;
5126
    }
5127
    return
5128
           ( a.low == b.low )
5129
        && (    ( a.high == b.high )
5130
             || (    ( a.low == 0 )
5131
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
5132
           );
5133
 
5134
}
5135
 
5136
/*----------------------------------------------------------------------------
5137
| Returns 1 if the extended double-precision floating-point value `a' is less
5138
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5139
| do not cause an exception.  Otherwise, the comparison is performed according
5140
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5141
*----------------------------------------------------------------------------*/
5142
 
5143
flag floatx80_le_quiet( floatx80 a, floatx80 b )
5144
{
5145
    flag aSign, bSign;
5146
 
5147
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5148
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5149
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5150
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5151
       ) {
5152
        if (    floatx80_is_signaling_nan( a )
5153
             || floatx80_is_signaling_nan( b ) ) {
5154
            float_raise( float_flag_invalid );
5155
        }
5156
        return 0;
5157
    }
5158
    aSign = extractFloatx80Sign( a );
5159
    bSign = extractFloatx80Sign( b );
5160
    if ( aSign != bSign ) {
5161
        return
5162
               aSign
5163
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5164
                 == 0 );
5165
    }
5166
    return
5167
          aSign ? le128( b.high, b.low, a.high, a.low )
5168
        : le128( a.high, a.low, b.high, b.low );
5169
 
5170
}
5171
 
5172
/*----------------------------------------------------------------------------
5173
| Returns 1 if the extended double-precision floating-point value `a' is less
5174
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5175
| an exception.  Otherwise, the comparison is performed according to the
5176
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5177
*----------------------------------------------------------------------------*/
5178
 
5179
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
5180
{
5181
    flag aSign, bSign;
5182
 
5183
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5184
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5185
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5186
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5187
       ) {
5188
        if (    floatx80_is_signaling_nan( a )
5189
             || floatx80_is_signaling_nan( b ) ) {
5190
            float_raise( float_flag_invalid );
5191
        }
5192
        return 0;
5193
    }
5194
    aSign = extractFloatx80Sign( a );
5195
    bSign = extractFloatx80Sign( b );
5196
    if ( aSign != bSign ) {
5197
        return
5198
               aSign
5199
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5200
                 != 0 );
5201
    }
5202
    return
5203
          aSign ? lt128( b.high, b.low, a.high, a.low )
5204
        : lt128( a.high, a.low, b.high, b.low );
5205
 
5206
}
5207
 
5208
#endif
5209
 
5210
#ifdef FLOAT128
5211
 
5212
/*----------------------------------------------------------------------------
5213
| Returns the result of converting the quadruple-precision floating-point
5214
| value `a' to the 32-bit two's complement integer format.  The conversion
5215
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5216
| Arithmetic---which means in particular that the conversion is rounded
5217
| according to the current rounding mode.  If `a' is a NaN, the largest
5218
| positive integer is returned.  Otherwise, if the conversion overflows, the
5219
| largest integer with the same sign as `a' is returned.
5220
*----------------------------------------------------------------------------*/
5221
 
5222
int32 float128_to_int32( float128 a )
5223
{
5224
    flag aSign;
5225
    int32 aExp, shiftCount;
5226
    bits64 aSig0, aSig1;
5227
 
5228
    aSig1 = extractFloat128Frac1( a );
5229
    aSig0 = extractFloat128Frac0( a );
5230
    aExp = extractFloat128Exp( a );
5231
    aSign = extractFloat128Sign( a );
5232
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5233
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5234
    aSig0 |= ( aSig1 != 0 );
5235
    shiftCount = 0x4028 - aExp;
5236
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5237
    return roundAndPackInt32( aSign, aSig0 );
5238
 
5239
}
5240
 
5241
/*----------------------------------------------------------------------------
5242
| Returns the result of converting the quadruple-precision floating-point
5243
| value `a' to the 32-bit two's complement integer format.  The conversion
5244
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5245
| Arithmetic, except that the conversion is always rounded toward zero.  If
5246
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5247
| conversion overflows, the largest integer with the same sign as `a' is
5248
| returned.
5249
*----------------------------------------------------------------------------*/
5250
 
5251
int32 float128_to_int32_round_to_zero( float128 a )
5252
{
5253
    flag aSign;
5254
    int32 aExp, shiftCount;
5255
    bits64 aSig0, aSig1, savedASig;
5256
    int32 z;
5257
 
5258
    aSig1 = extractFloat128Frac1( a );
5259
    aSig0 = extractFloat128Frac0( a );
5260
    aExp = extractFloat128Exp( a );
5261
    aSign = extractFloat128Sign( a );
5262
    aSig0 |= ( aSig1 != 0 );
5263
    if ( 0x401E < aExp ) {
5264
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5265
        goto invalid;
5266
    }
5267
    else if ( aExp < 0x3FFF ) {
5268
        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
5269
        return 0;
5270
    }
5271
    aSig0 |= LIT64( 0x0001000000000000 );
5272
    shiftCount = 0x402F - aExp;
5273
    savedASig = aSig0;
5274
    aSig0 >>= shiftCount;
5275
    z = aSig0;
5276
    if ( aSign ) z = - z;
5277
    if ( ( z < 0 ) ^ aSign ) {
5278
 invalid:
5279
        float_raise( float_flag_invalid );
5280
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
5281
    }
5282
    if ( ( aSig0<<shiftCount ) != savedASig ) {
5283
        float_exception_flags |= float_flag_inexact;
5284
    }
5285
    return z;
5286
 
5287
}
5288
 
5289
/*----------------------------------------------------------------------------
5290
| Returns the result of converting the quadruple-precision floating-point
5291
| value `a' to the 64-bit two's complement integer format.  The conversion
5292
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5293
| Arithmetic---which means in particular that the conversion is rounded
5294
| according to the current rounding mode.  If `a' is a NaN, the largest
5295
| positive integer is returned.  Otherwise, if the conversion overflows, the
5296
| largest integer with the same sign as `a' is returned.
5297
*----------------------------------------------------------------------------*/
5298
 
5299
int64 float128_to_int64( float128 a )
5300
{
5301
    flag aSign;
5302
    int32 aExp, shiftCount;
5303
    bits64 aSig0, aSig1;
5304
 
5305
    aSig1 = extractFloat128Frac1( a );
5306
    aSig0 = extractFloat128Frac0( a );
5307
    aExp = extractFloat128Exp( a );
5308
    aSign = extractFloat128Sign( a );
5309
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5310
    shiftCount = 0x402F - aExp;
5311
    if ( shiftCount <= 0 ) {
5312
        if ( 0x403E < aExp ) {
5313
            float_raise( float_flag_invalid );
5314
            if (    ! aSign
5315
                 || (    ( aExp == 0x7FFF )
5316
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5317
                    )
5318
               ) {
5319
                return LIT64( 0x7FFFFFFFFFFFFFFF );
5320
            }
5321
            return (sbits64) LIT64( 0x8000000000000000 );
5322
        }
5323
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5324
    }
5325
    else {
5326
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5327
    }
5328
    return roundAndPackInt64( aSign, aSig0, aSig1 );
5329
 
5330
}
5331
 
5332
/*----------------------------------------------------------------------------
5333
| Returns the result of converting the quadruple-precision floating-point
5334
| value `a' to the 64-bit two's complement integer format.  The conversion
5335
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5336
| Arithmetic, except that the conversion is always rounded toward zero.
5337
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5338
| the conversion overflows, the largest integer with the same sign as `a' is
5339
| returned.
5340
*----------------------------------------------------------------------------*/
5341
 
5342
int64 float128_to_int64_round_to_zero( float128 a )
5343
{
5344
    flag aSign;
5345
    int32 aExp, shiftCount;
5346
    bits64 aSig0, aSig1;
5347
    int64 z;
5348
 
5349
    aSig1 = extractFloat128Frac1( a );
5350
    aSig0 = extractFloat128Frac0( a );
5351
    aExp = extractFloat128Exp( a );
5352
    aSign = extractFloat128Sign( a );
5353
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5354
    shiftCount = aExp - 0x402F;
5355
    if ( 0 < shiftCount ) {
5356
        if ( 0x403E <= aExp ) {
5357
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5358
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5359
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5360
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
5361
            }
5362
            else {
5363
                float_raise( float_flag_invalid );
5364
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5365
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5366
                }
5367
            }
5368
            return (sbits64) LIT64( 0x8000000000000000 );
5369
        }
5370
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5371
        if ( (bits64) ( aSig1<<shiftCount ) ) {
5372
            float_exception_flags |= float_flag_inexact;
5373
        }
5374
    }
5375
    else {
5376
        if ( aExp < 0x3FFF ) {
5377
            if ( aExp | aSig0 | aSig1 ) {
5378
                float_exception_flags |= float_flag_inexact;
5379
            }
5380
            return 0;
5381
        }
5382
        z = aSig0>>( - shiftCount );
5383
        if (    aSig1
5384
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5385
            float_exception_flags |= float_flag_inexact;
5386
        }
5387
    }
5388
    if ( aSign ) z = - z;
5389
    return z;
5390
 
5391
}
5392
 
5393
/*----------------------------------------------------------------------------
5394
| Returns the result of converting the quadruple-precision floating-point
5395
| value `a' to the single-precision floating-point format.  The conversion
5396
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5397
| Arithmetic.
5398
*----------------------------------------------------------------------------*/
5399
 
5400
float32 float128_to_float32( float128 a )
5401
{
5402
    flag aSign;
5403
    int32 aExp;
5404
    bits64 aSig0, aSig1;
5405
    bits32 zSig;
5406
 
5407
    aSig1 = extractFloat128Frac1( a );
5408
    aSig0 = extractFloat128Frac0( a );
5409
    aExp = extractFloat128Exp( a );
5410
    aSign = extractFloat128Sign( a );
5411
    if ( aExp == 0x7FFF ) {
5412
        if ( aSig0 | aSig1 ) {
5413
            return commonNaNToFloat32( float128ToCommonNaN( a ) );
5414
        }
5415
        return packFloat32( aSign, 0xFF, 0 );
5416
    }
5417
    aSig0 |= ( aSig1 != 0 );
5418
    shift64RightJamming( aSig0, 18, &aSig0 );
5419
    zSig = aSig0;
5420
    if ( aExp || zSig ) {
5421
        zSig |= 0x40000000;
5422
        aExp -= 0x3F81;
5423
    }
5424
    return roundAndPackFloat32( aSign, aExp, zSig );
5425
 
5426
}
5427
 
5428
/*----------------------------------------------------------------------------
5429
| Returns the result of converting the quadruple-precision floating-point
5430
| value `a' to the double-precision floating-point format.  The conversion
5431
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5432
| Arithmetic.
5433
*----------------------------------------------------------------------------*/
5434
 
5435
float64 float128_to_float64( float128 a )
5436
{
5437
    flag aSign;
5438
    int32 aExp;
5439
    bits64 aSig0, aSig1;
5440
 
5441
    aSig1 = extractFloat128Frac1( a );
5442
    aSig0 = extractFloat128Frac0( a );
5443
    aExp = extractFloat128Exp( a );
5444
    aSign = extractFloat128Sign( a );
5445
    if ( aExp == 0x7FFF ) {
5446
        if ( aSig0 | aSig1 ) {
5447
            return commonNaNToFloat64( float128ToCommonNaN( a ) );
5448
        }
5449
        return packFloat64( aSign, 0x7FF, 0 );
5450
    }
5451
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5452
    aSig0 |= ( aSig1 != 0 );
5453
    if ( aExp || aSig0 ) {
5454
        aSig0 |= LIT64( 0x4000000000000000 );
5455
        aExp -= 0x3C01;
5456
    }
5457
    return roundAndPackFloat64( aSign, aExp, aSig0 );
5458
 
5459
}
5460
 
5461
#ifdef FLOATX80
5462
 
5463
/*----------------------------------------------------------------------------
5464
| Returns the result of converting the quadruple-precision floating-point
5465
| value `a' to the extended double-precision floating-point format.  The
5466
| conversion is performed according to the IEC/IEEE Standard for Binary
5467
| Floating-Point Arithmetic.
5468
*----------------------------------------------------------------------------*/
5469
 
5470
floatx80 float128_to_floatx80( float128 a )
5471
{
5472
    flag aSign;
5473
    int32 aExp;
5474
    bits64 aSig0, aSig1;
5475
 
5476
    aSig1 = extractFloat128Frac1( a );
5477
    aSig0 = extractFloat128Frac0( a );
5478
    aExp = extractFloat128Exp( a );
5479
    aSign = extractFloat128Sign( a );
5480
    if ( aExp == 0x7FFF ) {
5481
        if ( aSig0 | aSig1 ) {
5482
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
5483
        }
5484
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5485
    }
5486
    if ( aExp == 0 ) {
5487
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5488
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5489
    }
5490
    else {
5491
        aSig0 |= LIT64( 0x0001000000000000 );
5492
    }
5493
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5494
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
5495
 
5496
}
5497
 
5498
#endif
5499
 
5500
/*----------------------------------------------------------------------------
5501
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5502
| returns the result as a quadruple-precision floating-point value.  The
5503
| operation is performed according to the IEC/IEEE Standard for Binary
5504
| Floating-Point Arithmetic.
5505
*----------------------------------------------------------------------------*/
5506
 
5507
float128 float128_round_to_int( float128 a )
5508
{
5509
    flag aSign;
5510
    int32 aExp;
5511
    bits64 lastBitMask, roundBitsMask;
5512
    int8 roundingMode;
5513
    float128 z;
5514
 
5515
    aExp = extractFloat128Exp( a );
5516
    if ( 0x402F <= aExp ) {
5517
        if ( 0x406F <= aExp ) {
5518
            if (    ( aExp == 0x7FFF )
5519
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5520
               ) {
5521
                return propagateFloat128NaN( a, a );
5522
            }
5523
            return a;
5524
        }
5525
        lastBitMask = 1;
5526
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5527
        roundBitsMask = lastBitMask - 1;
5528
        z = a;
5529
        roundingMode = float_rounding_mode;
5530
        if ( roundingMode == float_round_nearest_even ) {
5531
            if ( lastBitMask ) {
5532
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5533
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5534
            }
5535
            else {
5536
                if ( (sbits64) z.low < 0 ) {
5537
                    ++z.high;
5538
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5539
                }
5540
            }
5541
        }
5542
        else if ( roundingMode != float_round_to_zero ) {
5543
            if (   extractFloat128Sign( z )
5544
                 ^ ( roundingMode == float_round_up ) ) {
5545
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5546
            }
5547
        }
5548
        z.low &= ~ roundBitsMask;
5549
    }
5550
    else {
5551
        if ( aExp < 0x3FFF ) {
5552
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5553
            float_exception_flags |= float_flag_inexact;
5554
            aSign = extractFloat128Sign( a );
5555
            switch ( float_rounding_mode ) {
5556
             case float_round_nearest_even:
5557
                if (    ( aExp == 0x3FFE )
5558
                     && (   extractFloat128Frac0( a )
5559
                          | extractFloat128Frac1( a ) )
5560
                   ) {
5561
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5562
                }
5563
                break;
5564
             case float_round_down:
5565
                return
5566
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5567
                    : packFloat128( 0, 0, 0, 0 );
5568
             case float_round_up:
5569
                return
5570
                      aSign ? packFloat128( 1, 0, 0, 0 )
5571
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5572
            }
5573
            return packFloat128( aSign, 0, 0, 0 );
5574
        }
5575
        lastBitMask = 1;
5576
        lastBitMask <<= 0x402F - aExp;
5577
        roundBitsMask = lastBitMask - 1;
5578
        z.low = 0;
5579
        z.high = a.high;
5580
        roundingMode = float_rounding_mode;
5581
        if ( roundingMode == float_round_nearest_even ) {
5582
            z.high += lastBitMask>>1;
5583
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5584
                z.high &= ~ lastBitMask;
5585
            }
5586
        }
5587
        else if ( roundingMode != float_round_to_zero ) {
5588
            if (   extractFloat128Sign( z )
5589
                 ^ ( roundingMode == float_round_up ) ) {
5590
                z.high |= ( a.low != 0 );
5591
                z.high += roundBitsMask;
5592
            }
5593
        }
5594
        z.high &= ~ roundBitsMask;
5595
    }
5596
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5597
        float_exception_flags |= float_flag_inexact;
5598
    }
5599
    return z;
5600
 
5601
}
5602
 
5603
/*----------------------------------------------------------------------------
5604
| Returns the result of adding the absolute values of the quadruple-precision
5605
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5606
| before being returned.  `zSign' is ignored if the result is a NaN.
5607
| The addition is performed according to the IEC/IEEE Standard for Binary
5608
| Floating-Point Arithmetic.
5609
*----------------------------------------------------------------------------*/
5610
 
5611
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
5612
{
5613
    int32 aExp, bExp, zExp;
5614
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5615
    int32 expDiff;
5616
 
5617
    aSig1 = extractFloat128Frac1( a );
5618
    aSig0 = extractFloat128Frac0( a );
5619
    aExp = extractFloat128Exp( a );
5620
    bSig1 = extractFloat128Frac1( b );
5621
    bSig0 = extractFloat128Frac0( b );
5622
    bExp = extractFloat128Exp( b );
5623
    expDiff = aExp - bExp;
5624
    if ( 0 < expDiff ) {
5625
        if ( aExp == 0x7FFF ) {
5626
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5627
            return a;
5628
        }
5629
        if ( bExp == 0 ) {
5630
            --expDiff;
5631
        }
5632
        else {
5633
            bSig0 |= LIT64( 0x0001000000000000 );
5634
        }
5635
        shift128ExtraRightJamming(
5636
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5637
        zExp = aExp;
5638
    }
5639
    else if ( expDiff < 0 ) {
5640
        if ( bExp == 0x7FFF ) {
5641
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5642
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5643
        }
5644
        if ( aExp == 0 ) {
5645
            ++expDiff;
5646
        }
5647
        else {
5648
            aSig0 |= LIT64( 0x0001000000000000 );
5649
        }
5650
        shift128ExtraRightJamming(
5651
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5652
        zExp = bExp;
5653
    }
5654
    else {
5655
        if ( aExp == 0x7FFF ) {
5656
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5657
                return propagateFloat128NaN( a, b );
5658
            }
5659
            return a;
5660
        }
5661
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5662
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
5663
        zSig2 = 0;
5664
        zSig0 |= LIT64( 0x0002000000000000 );
5665
        zExp = aExp;
5666
        goto shiftRight1;
5667
    }
5668
    aSig0 |= LIT64( 0x0001000000000000 );
5669
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5670
    --zExp;
5671
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5672
    ++zExp;
5673
 shiftRight1:
5674
    shift128ExtraRightJamming(
5675
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5676
 roundAndPack:
5677
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5678
 
5679
}
5680
 
5681
/*----------------------------------------------------------------------------
5682
| Returns the result of subtracting the absolute values of the quadruple-
5683
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5684
| difference is negated before being returned.  `zSign' is ignored if the
5685
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5686
| Standard for Binary Floating-Point Arithmetic.
5687
*----------------------------------------------------------------------------*/
5688
 
5689
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
5690
{
5691
    int32 aExp, bExp, zExp;
5692
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5693
    int32 expDiff;
5694
    float128 z;
5695
 
5696
    aSig1 = extractFloat128Frac1( a );
5697
    aSig0 = extractFloat128Frac0( a );
5698
    aExp = extractFloat128Exp( a );
5699
    bSig1 = extractFloat128Frac1( b );
5700
    bSig0 = extractFloat128Frac0( b );
5701
    bExp = extractFloat128Exp( b );
5702
    expDiff = aExp - bExp;
5703
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5704
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5705
    if ( 0 < expDiff ) goto aExpBigger;
5706
    if ( expDiff < 0 ) goto bExpBigger;
5707
    if ( aExp == 0x7FFF ) {
5708
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5709
            return propagateFloat128NaN( a, b );
5710
        }
5711
        float_raise( float_flag_invalid );
5712
        z.low = float128_default_nan_low;
5713
        z.high = float128_default_nan_high;
5714
        return z;
5715
    }
5716
    if ( aExp == 0 ) {
5717
        aExp = 1;
5718
        bExp = 1;
5719
    }
5720
    if ( bSig0 < aSig0 ) goto aBigger;
5721
    if ( aSig0 < bSig0 ) goto bBigger;
5722
    if ( bSig1 < aSig1 ) goto aBigger;
5723
    if ( aSig1 < bSig1 ) goto bBigger;
5724
    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
5725
 bExpBigger:
5726
    if ( bExp == 0x7FFF ) {
5727
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5728
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5729
    }
5730
    if ( aExp == 0 ) {
5731
        ++expDiff;
5732
    }
5733
    else {
5734
        aSig0 |= LIT64( 0x4000000000000000 );
5735
    }
5736
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5737
    bSig0 |= LIT64( 0x4000000000000000 );
5738
 bBigger:
5739
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5740
    zExp = bExp;
5741
    zSign ^= 1;
5742
    goto normalizeRoundAndPack;
5743
 aExpBigger:
5744
    if ( aExp == 0x7FFF ) {
5745
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5746
        return a;
5747
    }
5748
    if ( bExp == 0 ) {
5749
        --expDiff;
5750
    }
5751
    else {
5752
        bSig0 |= LIT64( 0x4000000000000000 );
5753
    }
5754
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5755
    aSig0 |= LIT64( 0x4000000000000000 );
5756
 aBigger:
5757
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5758
    zExp = aExp;
5759
 normalizeRoundAndPack:
5760
    --zExp;
5761
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
5762
 
5763
}
5764
 
5765
/*----------------------------------------------------------------------------
5766
| Returns the result of adding the quadruple-precision floating-point values
5767
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5768
| for Binary Floating-Point Arithmetic.
5769
*----------------------------------------------------------------------------*/
5770
 
5771
float128 float128_add( float128 a, float128 b )
5772
{
5773
    flag aSign, bSign;
5774
 
5775
    aSign = extractFloat128Sign( a );
5776
    bSign = extractFloat128Sign( b );
5777
    if ( aSign == bSign ) {
5778
        return addFloat128Sigs( a, b, aSign );
5779
    }
5780
    else {
5781
        return subFloat128Sigs( a, b, aSign );
5782
    }
5783
 
5784
}
5785
 
5786
/*----------------------------------------------------------------------------
5787
| Returns the result of subtracting the quadruple-precision floating-point
5788
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5789
| Standard for Binary Floating-Point Arithmetic.
5790
*----------------------------------------------------------------------------*/
5791
 
5792
float128 float128_sub( float128 a, float128 b )
5793
{
5794
    flag aSign, bSign;
5795
 
5796
    aSign = extractFloat128Sign( a );
5797
    bSign = extractFloat128Sign( b );
5798
    if ( aSign == bSign ) {
5799
        return subFloat128Sigs( a, b, aSign );
5800
    }
5801
    else {
5802
        return addFloat128Sigs( a, b, aSign );
5803
    }
5804
 
5805
}
5806
 
5807
/*----------------------------------------------------------------------------
5808
| Returns the result of multiplying the quadruple-precision floating-point
5809
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5810
| Standard for Binary Floating-Point Arithmetic.
5811
*----------------------------------------------------------------------------*/
5812
 
5813
float128 float128_mul( float128 a, float128 b )
5814
{
5815
    flag aSign, bSign, zSign;
5816
    int32 aExp, bExp, zExp;
5817
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5818
    float128 z;
5819
 
5820
    aSig1 = extractFloat128Frac1( a );
5821
    aSig0 = extractFloat128Frac0( a );
5822
    aExp = extractFloat128Exp( a );
5823
    aSign = extractFloat128Sign( a );
5824
    bSig1 = extractFloat128Frac1( b );
5825
    bSig0 = extractFloat128Frac0( b );
5826
    bExp = extractFloat128Exp( b );
5827
    bSign = extractFloat128Sign( b );
5828
    zSign = aSign ^ bSign;
5829
    if ( aExp == 0x7FFF ) {
5830
        if (    ( aSig0 | aSig1 )
5831
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5832
            return propagateFloat128NaN( a, b );
5833
        }
5834
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5835
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5836
    }
5837
    if ( bExp == 0x7FFF ) {
5838
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5839
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5840
 invalid:
5841
            float_raise( float_flag_invalid );
5842
            z.low = float128_default_nan_low;
5843
            z.high = float128_default_nan_high;
5844
            return z;
5845
        }
5846
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5847
    }
5848
    if ( aExp == 0 ) {
5849
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5850
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5851
    }
5852
    if ( bExp == 0 ) {
5853
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5854
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5855
    }
5856
    zExp = aExp + bExp - 0x4000;
5857
    aSig0 |= LIT64( 0x0001000000000000 );
5858
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5859
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5860
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5861
    zSig2 |= ( zSig3 != 0 );
5862
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5863
        shift128ExtraRightJamming(
5864
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5865
        ++zExp;
5866
    }
5867
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5868
 
5869
}
5870
 
5871
/*----------------------------------------------------------------------------
5872
| Returns the result of dividing the quadruple-precision floating-point value
5873
| `a' by the corresponding value `b'.  The operation is performed according to
5874
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5875
*----------------------------------------------------------------------------*/
5876
 
5877
float128 float128_div( float128 a, float128 b )
5878
{
5879
    flag aSign, bSign, zSign;
5880
    int32 aExp, bExp, zExp;
5881
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5882
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5883
    float128 z;
5884
 
5885
    aSig1 = extractFloat128Frac1( a );
5886
    aSig0 = extractFloat128Frac0( a );
5887
    aExp = extractFloat128Exp( a );
5888
    aSign = extractFloat128Sign( a );
5889
    bSig1 = extractFloat128Frac1( b );
5890
    bSig0 = extractFloat128Frac0( b );
5891
    bExp = extractFloat128Exp( b );
5892
    bSign = extractFloat128Sign( b );
5893
    zSign = aSign ^ bSign;
5894
    if ( aExp == 0x7FFF ) {
5895
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5896
        if ( bExp == 0x7FFF ) {
5897
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5898
            goto invalid;
5899
        }
5900
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5901
    }
5902
    if ( bExp == 0x7FFF ) {
5903
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5904
        return packFloat128( zSign, 0, 0, 0 );
5905
    }
5906
    if ( bExp == 0 ) {
5907
        if ( ( bSig0 | bSig1 ) == 0 ) {
5908
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5909
 invalid:
5910
                float_raise( float_flag_invalid );
5911
                z.low = float128_default_nan_low;
5912
                z.high = float128_default_nan_high;
5913
                return z;
5914
            }
5915
            float_raise( float_flag_divbyzero );
5916
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5917
        }
5918
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5919
    }
5920
    if ( aExp == 0 ) {
5921
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5922
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5923
    }
5924
    zExp = aExp - bExp + 0x3FFD;
5925
    shortShift128Left(
5926
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5927
    shortShift128Left(
5928
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5929
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5930
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5931
        ++zExp;
5932
    }
5933
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5934
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5935
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5936
    while ( (sbits64) rem0 < 0 ) {
5937
        --zSig0;
5938
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5939
    }
5940
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5941
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5942
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5943
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5944
        while ( (sbits64) rem1 < 0 ) {
5945
            --zSig1;
5946
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5947
        }
5948
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5949
    }
5950
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5951
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5952
 
5953
}
5954
 
5955
/*----------------------------------------------------------------------------
5956
| Returns the remainder of the quadruple-precision floating-point value `a'
5957
| with respect to the corresponding value `b'.  The operation is performed
5958
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5959
*----------------------------------------------------------------------------*/
5960
 
5961
float128 float128_rem( float128 a, float128 b )
5962
{
5963
    flag aSign, bSign, zSign;
5964
    int32 aExp, bExp, expDiff;
5965
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5966
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5967
    sbits64 sigMean0;
5968
    float128 z;
5969
 
5970
    aSig1 = extractFloat128Frac1( a );
5971
    aSig0 = extractFloat128Frac0( a );
5972
    aExp = extractFloat128Exp( a );
5973
    aSign = extractFloat128Sign( a );
5974
    bSig1 = extractFloat128Frac1( b );
5975
    bSig0 = extractFloat128Frac0( b );
5976
    bExp = extractFloat128Exp( b );
5977
    bSign = extractFloat128Sign( b );
5978
    if ( aExp == 0x7FFF ) {
5979
        if (    ( aSig0 | aSig1 )
5980
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5981
            return propagateFloat128NaN( a, b );
5982
        }
5983
        goto invalid;
5984
    }
5985
    if ( bExp == 0x7FFF ) {
5986
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5987
        return a;
5988
    }
5989
    if ( bExp == 0 ) {
5990
        if ( ( bSig0 | bSig1 ) == 0 ) {
5991
 invalid:
5992
            float_raise( float_flag_invalid );
5993
            z.low = float128_default_nan_low;
5994
            z.high = float128_default_nan_high;
5995
            return z;
5996
        }
5997
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5998
    }
5999
    if ( aExp == 0 ) {
6000
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6001
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6002
    }
6003
    expDiff = aExp - bExp;
6004
    if ( expDiff < -1 ) return a;
6005
    shortShift128Left(
6006
        aSig0 | LIT64( 0x0001000000000000 ),
6007
        aSig1,
6008
        15 - ( expDiff < 0 ),
6009
        &aSig0,
6010
        &aSig1
6011
    );
6012
    shortShift128Left(
6013
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6014
    q = le128( bSig0, bSig1, aSig0, aSig1 );
6015
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6016
    expDiff -= 64;
6017
    while ( 0 < expDiff ) {
6018
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6019
        q = ( 4 < q ) ? q - 4 : 0;
6020
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6021
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6022
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6023
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6024
        expDiff -= 61;
6025
    }
6026
    if ( -64 < expDiff ) {
6027
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6028
        q = ( 4 < q ) ? q - 4 : 0;
6029
        q >>= - expDiff;
6030
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6031
        expDiff += 52;
6032
        if ( expDiff < 0 ) {
6033
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6034
        }
6035
        else {
6036
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6037
        }
6038
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6039
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6040
    }
6041
    else {
6042
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6043
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6044
    }
6045
    do {
6046
        alternateASig0 = aSig0;
6047
        alternateASig1 = aSig1;
6048
        ++q;
6049
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6050
    } while ( 0 <= (sbits64) aSig0 );
6051
    add128(
6052
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
6053
    if (    ( sigMean0 < 0 )
6054
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6055
        aSig0 = alternateASig0;
6056
        aSig1 = alternateASig1;
6057
    }
6058
    zSign = ( (sbits64) aSig0 < 0 );
6059
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6060
    return
6061
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
6062
 
6063
}
6064
 
6065
/*----------------------------------------------------------------------------
6066
| Returns the square root of the quadruple-precision floating-point value `a'.
6067
| The operation is performed according to the IEC/IEEE Standard for Binary
6068
| Floating-Point Arithmetic.
6069
*----------------------------------------------------------------------------*/
6070
 
6071
float128 float128_sqrt( float128 a )
6072
{
6073
    flag aSign;
6074
    int32 aExp, zExp;
6075
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6076
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6077
    float128 z;
6078
 
6079
    aSig1 = extractFloat128Frac1( a );
6080
    aSig0 = extractFloat128Frac0( a );
6081
    aExp = extractFloat128Exp( a );
6082
    aSign = extractFloat128Sign( a );
6083
    if ( aExp == 0x7FFF ) {
6084
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
6085
        if ( ! aSign ) return a;
6086
        goto invalid;
6087
    }
6088
    if ( aSign ) {
6089
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6090
 invalid:
6091
        float_raise( float_flag_invalid );
6092
        z.low = float128_default_nan_low;
6093
        z.high = float128_default_nan_high;
6094
        return z;
6095
    }
6096
    if ( aExp == 0 ) {
6097
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6098
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6099
    }
6100
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6101
    aSig0 |= LIT64( 0x0001000000000000 );
6102
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6103
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6104
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6105
    doubleZSig0 = zSig0<<1;
6106
    mul64To128( zSig0, zSig0, &term0, &term1 );
6107
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6108
    while ( (sbits64) rem0 < 0 ) {
6109
        --zSig0;
6110
        doubleZSig0 -= 2;
6111
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6112
    }
6113
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6114
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6115
        if ( zSig1 == 0 ) zSig1 = 1;
6116
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6117
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6118
        mul64To128( zSig1, zSig1, &term2, &term3 );
6119
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6120
        while ( (sbits64) rem1 < 0 ) {
6121
            --zSig1;
6122
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6123
            term3 |= 1;
6124
            term2 |= doubleZSig0;
6125
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6126
        }
6127
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6128
    }
6129
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6130
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
6131
 
6132
}
6133
 
6134
/*----------------------------------------------------------------------------
6135
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6136
| the corresponding value `b', and 0 otherwise.  The comparison is performed
6137
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6138
*----------------------------------------------------------------------------*/
6139
 
6140
flag float128_eq( float128 a, float128 b )
6141
{
6142
 
6143
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6144
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6145
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6146
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6147
       ) {
6148
        if (    float128_is_signaling_nan( a )
6149
             || float128_is_signaling_nan( b ) ) {
6150
            float_raise( float_flag_invalid );
6151
        }
6152
        return 0;
6153
    }
6154
    return
6155
           ( a.low == b.low )
6156
        && (    ( a.high == b.high )
6157
             || (    ( a.low == 0 )
6158
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
6159
           );
6160
 
6161
}
6162
 
6163
/*----------------------------------------------------------------------------
6164
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6165
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
6166
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6167
| Arithmetic.
6168
*----------------------------------------------------------------------------*/
6169
 
6170
flag float128_le( float128 a, float128 b )
6171
{
6172
    flag aSign, bSign;
6173
 
6174
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6175
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6176
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6177
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6178
       ) {
6179
        float_raise( float_flag_invalid );
6180
        return 0;
6181
    }
6182
    aSign = extractFloat128Sign( a );
6183
    bSign = extractFloat128Sign( b );
6184
    if ( aSign != bSign ) {
6185
        return
6186
               aSign
6187
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6188
                 == 0 );
6189
    }
6190
    return
6191
          aSign ? le128( b.high, b.low, a.high, a.low )
6192
        : le128( a.high, a.low, b.high, b.low );
6193
 
6194
}
6195
 
6196
/*----------------------------------------------------------------------------
6197
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6198
| the corresponding value `b', and 0 otherwise.  The comparison is performed
6199
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6200
*----------------------------------------------------------------------------*/
6201
 
6202
flag float128_lt( float128 a, float128 b )
6203
{
6204
    flag aSign, bSign;
6205
 
6206
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6207
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6208
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6209
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6210
       ) {
6211
        float_raise( float_flag_invalid );
6212
        return 0;
6213
    }
6214
    aSign = extractFloat128Sign( a );
6215
    bSign = extractFloat128Sign( b );
6216
    if ( aSign != bSign ) {
6217
        return
6218
               aSign
6219
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6220
                 != 0 );
6221
    }
6222
    return
6223
          aSign ? lt128( b.high, b.low, a.high, a.low )
6224
        : lt128( a.high, a.low, b.high, b.low );
6225
 
6226
}
6227
 
6228
/*----------------------------------------------------------------------------
6229
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6230
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6231
| raised if either operand is a NaN.  Otherwise, the comparison is performed
6232
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6233
*----------------------------------------------------------------------------*/
6234
 
6235
flag float128_eq_signaling( float128 a, float128 b )
6236
{
6237
 
6238
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6239
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6240
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6241
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6242
       ) {
6243
        float_raise( float_flag_invalid );
6244
        return 0;
6245
    }
6246
    return
6247
           ( a.low == b.low )
6248
        && (    ( a.high == b.high )
6249
             || (    ( a.low == 0 )
6250
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
6251
           );
6252
 
6253
}
6254
 
6255
/*----------------------------------------------------------------------------
6256
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6257
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6258
| cause an exception.  Otherwise, the comparison is performed according to the
6259
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6260
*----------------------------------------------------------------------------*/
6261
 
6262
flag float128_le_quiet( float128 a, float128 b )
6263
{
6264
    flag aSign, bSign;
6265
 
6266
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6267
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6268
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6269
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6270
       ) {
6271
        if (    float128_is_signaling_nan( a )
6272
             || float128_is_signaling_nan( b ) ) {
6273
            float_raise( float_flag_invalid );
6274
        }
6275
        return 0;
6276
    }
6277
    aSign = extractFloat128Sign( a );
6278
    bSign = extractFloat128Sign( b );
6279
    if ( aSign != bSign ) {
6280
        return
6281
               aSign
6282
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6283
                 == 0 );
6284
    }
6285
    return
6286
          aSign ? le128( b.high, b.low, a.high, a.low )
6287
        : le128( a.high, a.low, b.high, b.low );
6288
 
6289
}
6290
 
6291
/*----------------------------------------------------------------------------
6292
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6293
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6294
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6295
| Standard for Binary Floating-Point Arithmetic.
6296
*----------------------------------------------------------------------------*/
6297
 
6298
flag float128_lt_quiet( float128 a, float128 b )
6299
{
6300
    flag aSign, bSign;
6301
 
6302
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6303
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6304
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6305
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6306
       ) {
6307
        if (    float128_is_signaling_nan( a )
6308
             || float128_is_signaling_nan( b ) ) {
6309
            float_raise( float_flag_invalid );
6310
        }
6311
        return 0;
6312
    }
6313
    aSign = extractFloat128Sign( a );
6314
    bSign = extractFloat128Sign( b );
6315
    if ( aSign != bSign ) {
6316
        return
6317
               aSign
6318
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6319
                 != 0 );
6320
    }
6321
    return
6322
          aSign ? lt128( b.high, b.low, a.high, a.low )
6323
        : lt128( a.high, a.low, b.high, b.low );
6324
 
6325
}
6326
 
6327
#endif
6328
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.