OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [orpsocv2/] [sw/] [apps/] [testfloat/] [softfloat.c] - Blame information for rev 438

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 349 julius
 
2
/*============================================================================
3
 
4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6
 
7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16
 
17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
 
26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30
 
31
Modified for use with or1ksim's testsuite.
32
 
33
Contributor Julius Baxter <julius.baxter@orsoc.se>
34
 
35
=============================================================================*/
36
 
37
#include "milieu.h"
38
#include "softfloat.h"
39
 
40
/*----------------------------------------------------------------------------
41
| Floating-point rounding mode, extended double-precision rounding precision,
42
| and exception flags.
43
*----------------------------------------------------------------------------*/
44
int8 float_rounding_mode = float_round_nearest_even;
45
int8 float_exception_flags = 0;
46
#ifdef FLOATX80
47
int8 floatx80_rounding_precision = 80;
48
#endif
49
 
50
/*----------------------------------------------------------------------------
51
| Primitive arithmetic functions, including multi-word arithmetic, and
52
| division and square root approximations.  (Can be specialized to target if
53
| desired.)
54
*----------------------------------------------------------------------------*/
55
//#include "softfloat-macros"
56
 
57
/*----------------------------------------------------------------------------
58
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
59
| bits are shifted off, they are ``jammed'' into the least significant bit of
60
| the result by setting the least significant bit to 1.  The value of `count'
61
| can be arbitrarily large; in particular, if `count' is greater than 32, the
62
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
63
| The result is stored in the location pointed to by `zPtr'.
64
*----------------------------------------------------------------------------*/
65
 
66
INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr )
67
{
68
    bits32 z;
69
 
70
    if ( count == 0 ) {
71
        z = a;
72
    }
73
    else if ( count < 32 ) {
74
        z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
75
    }
76
    else {
77
        z = ( a != 0 );
78
    }
79
    *zPtr = z;
80
 
81
}
82
 
83
/*----------------------------------------------------------------------------
84
| Shifts `a' right by the number of bits given in `count'.  If any nonzero
85
| bits are shifted off, they are ``jammed'' into the least significant bit of
86
| the result by setting the least significant bit to 1.  The value of `count'
87
| can be arbitrarily large; in particular, if `count' is greater than 64, the
88
| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
89
| The result is stored in the location pointed to by `zPtr'.
90
*----------------------------------------------------------------------------*/
91
 
92
INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr )
93
{
94
    bits64 z;
95
 
96
    if ( count == 0 ) {
97
        z = a;
98
    }
99
    else if ( count < 64 ) {
100
        z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
101
    }
102
    else {
103
        z = ( a != 0 );
104
    }
105
    *zPtr = z;
106
 
107
}
108
 
109
/*----------------------------------------------------------------------------
110
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
111
| _plus_ the number of bits given in `count'.  The shifted result is at most
112
| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'.  The
113
| bits shifted off form a second 64-bit result as follows:  The _last_ bit
114
| shifted off is the most-significant bit of the extra result, and the other
115
| 63 bits of the extra result are all zero if and only if _all_but_the_last_
116
| bits shifted off were all zero.  This extra result is stored in the location
117
| pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
118
|     (This routine makes more sense if `a0' and `a1' are considered to form
119
| a fixed-point value with binary point between `a0' and `a1'.  This fixed-
120
| point value is shifted right by the number of bits given in `count', and
121
| the integer part of the result is returned at the location pointed to by
122
| `z0Ptr'.  The fractional part of the result may be slightly corrupted as
123
| described above, and is returned at the location pointed to by `z1Ptr'.)
124
*----------------------------------------------------------------------------*/
125
 
126
INLINE void
127
 shift64ExtraRightJamming(
128
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
129
{
130
    bits64 z0, z1;
131
    int8 negCount = ( - count ) & 63;
132
 
133
    if ( count == 0 ) {
134
        z1 = a1;
135
        z0 = a0;
136
    }
137
    else if ( count < 64 ) {
138
        z1 = ( a0<<negCount ) | ( a1 != 0 );
139
        z0 = a0>>count;
140
    }
141
    else {
142
        if ( count == 64 ) {
143
            z1 = a0 | ( a1 != 0 );
144
        }
145
        else {
146
            z1 = ( ( a0 | a1 ) != 0 );
147
        }
148
        z0 = 0;
149
    }
150
    *z1Ptr = z1;
151
    *z0Ptr = z0;
152
 
153
}
154
 
155
/*----------------------------------------------------------------------------
156
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
157
| number of bits given in `count'.  Any bits shifted off are lost.  The value
158
| of `count' can be arbitrarily large; in particular, if `count' is greater
159
| than 128, the result will be 0.  The result is broken into two 64-bit pieces
160
| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
161
*----------------------------------------------------------------------------*/
162
// Not used - commenting out to stop werrors during compile
163
/*
164
INLINE void
165
 shift128Right(
166
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
167
{
168
    bits64 z0, z1;
169
    int8 negCount = ( - count ) & 63;
170
 
171
    if ( count == 0 ) {
172
        z1 = a1;
173
        z0 = a0;
174
    }
175
    else if ( count < 64 ) {
176
        z1 = ( a0<<negCount ) | ( a1>>count );
177
        z0 = a0>>count;
178
    }
179
    else {
180
        z1 = ( count < 64 ) ? ( a0>>( count & 63 ) ) : 0;
181
        z0 = 0;
182
    }
183
    *z1Ptr = z1;
184
    *z0Ptr = z0;
185
 
186
}
187
*/
188
/*----------------------------------------------------------------------------
189
| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
190
| number of bits given in `count'.  If any nonzero bits are shifted off, they
191
| are ``jammed'' into the least significant bit of the result by setting the
192
| least significant bit to 1.  The value of `count' can be arbitrarily large;
193
| in particular, if `count' is greater than 128, the result will be either
194
| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
195
| nonzero.  The result is broken into two 64-bit pieces which are stored at
196
| the locations pointed to by `z0Ptr' and `z1Ptr'.
197
*----------------------------------------------------------------------------*/
198
 // Not used - commenting out to stop werrors during compile
199
 /*
200
INLINE void
201
 shift128RightJamming(
202
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
203
{
204
    bits64 z0, z1;
205
    int8 negCount = ( - count ) & 63;
206
 
207
    if ( count == 0 ) {
208
        z1 = a1;
209
        z0 = a0;
210
    }
211
    else if ( count < 64 ) {
212
        z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
213
        z0 = a0>>count;
214
    }
215
    else {
216
        if ( count == 64 ) {
217
            z1 = a0 | ( a1 != 0 );
218
        }
219
        else if ( count < 128 ) {
220
            z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
221
        }
222
        else {
223
            z1 = ( ( a0 | a1 ) != 0 );
224
        }
225
        z0 = 0;
226
    }
227
    *z1Ptr = z1;
228
    *z0Ptr = z0;
229
 
230
}
231
 */
232
/*----------------------------------------------------------------------------
233
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
234
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
235
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
236
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
237
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
238
| the most-significant bit of the extra result, and the other 63 bits of the
239
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
240
| were all zero.  This extra result is stored in the location pointed to by
241
| `z2Ptr'.  The value of `count' can be arbitrarily large.
242
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
243
| to form a fixed-point value with binary point between `a1' and `a2'.  This
244
| fixed-point value is shifted right by the number of bits given in `count',
245
| and the integer part of the result is returned at the locations pointed to
246
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
247
| corrupted as described above, and is returned at the location pointed to by
248
| `z2Ptr'.)
249
*----------------------------------------------------------------------------*/
250
  // Not used - commenting out to stop werrors during compile
251
  /*
252
INLINE void
253
 shift128ExtraRightJamming(
254
     bits64 a0,
255
     bits64 a1,
256
     bits64 a2,
257
     int16 count,
258
     bits64 *z0Ptr,
259
     bits64 *z1Ptr,
260
     bits64 *z2Ptr
261
 )
262
{
263
    bits64 z0, z1, z2;
264
    int8 negCount = ( - count ) & 63;
265
 
266
    if ( count == 0 ) {
267
        z2 = a2;
268
        z1 = a1;
269
        z0 = a0;
270
    }
271
    else {
272
        if ( count < 64 ) {
273
            z2 = a1<<negCount;
274
            z1 = ( a0<<negCount ) | ( a1>>count );
275
            z0 = a0>>count;
276
        }
277
        else {
278
            if ( count == 64 ) {
279
                z2 = a1;
280
                z1 = a0;
281
            }
282
            else {
283
                a2 |= a1;
284
                if ( count < 128 ) {
285
                    z2 = a0<<negCount;
286
                    z1 = a0>>( count & 63 );
287
                }
288
                else {
289
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
290
                    z1 = 0;
291
                }
292
            }
293
            z0 = 0;
294
        }
295
        z2 |= ( a2 != 0 );
296
    }
297
    *z2Ptr = z2;
298
    *z1Ptr = z1;
299
    *z0Ptr = z0;
300
 
301
}
302
*/
303
/*----------------------------------------------------------------------------
304
| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
305
| number of bits given in `count'.  Any bits shifted off are lost.  The value
306
| of `count' must be less than 64.  The result is broken into two 64-bit
307
| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
308
*----------------------------------------------------------------------------*/
309
   // Not used - commenting out to stop werrors during compile
310
   /*
311
INLINE void
312
 shortShift128Left(
313
     bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr )
314
{
315
 
316
    *z1Ptr = a1<<count;
317
    *z0Ptr =
318
        ( count == 0 ) ? a0 : ( a0<<count ) | ( a1>>( ( - count ) & 63 ) );
319
 
320
}
321
*/
322
/*----------------------------------------------------------------------------
323
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
324
| by the number of bits given in `count'.  Any bits shifted off are lost.
325
| The value of `count' must be less than 64.  The result is broken into three
326
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
327
| `z1Ptr', and `z2Ptr'.
328
*----------------------------------------------------------------------------*/
329
    // Not used - commenting out to stop werrors during compile
330
    /*
331
INLINE void
332
 shortShift192Left(
333
     bits64 a0,
334
     bits64 a1,
335
     bits64 a2,
336
     int16 count,
337
     bits64 *z0Ptr,
338
     bits64 *z1Ptr,
339
     bits64 *z2Ptr
340
 )
341
{
342
    bits64 z0, z1, z2;
343
    int8 negCount;
344
 
345
    z2 = a2<<count;
346
    z1 = a1<<count;
347
    z0 = a0<<count;
348
    if ( 0 < count ) {
349
        negCount = ( ( - count ) & 63 );
350
        z1 |= a2>>negCount;
351
        z0 |= a1>>negCount;
352
    }
353
    *z2Ptr = z2;
354
    *z1Ptr = z1;
355
    *z0Ptr = z0;
356
 
357
}
358
*/
359
/*----------------------------------------------------------------------------
360
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
361
| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
362
| any carry out is lost.  The result is broken into two 64-bit pieces which
363
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
364
*----------------------------------------------------------------------------*/
365
 
366
INLINE void
367
 add128(
368
     bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
369
{
370
    bits64 z1;
371
 
372
    z1 = a1 + b1;
373
    *z1Ptr = z1;
374
    *z0Ptr = a0 + b0 + ( z1 < a1 );
375
 
376
}
377
 
378
/*----------------------------------------------------------------------------
379
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
380
| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
381
| modulo 2^192, so any carry out is lost.  The result is broken into three
382
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
383
| `z1Ptr', and `z2Ptr'.
384
*----------------------------------------------------------------------------*/
385
// Not used - commenting out to stop werrors during compile
386
/*
387
INLINE void
388
 add192(
389
     bits64 a0,
390
     bits64 a1,
391
     bits64 a2,
392
     bits64 b0,
393
     bits64 b1,
394
     bits64 b2,
395
     bits64 *z0Ptr,
396
     bits64 *z1Ptr,
397
     bits64 *z2Ptr
398
 )
399
{
400
    bits64 z0, z1, z2;
401
    int8 carry0, carry1;
402
 
403
    z2 = a2 + b2;
404
    carry1 = ( z2 < a2 );
405
    z1 = a1 + b1;
406
    carry0 = ( z1 < a1 );
407
    z0 = a0 + b0;
408
    z1 += carry1;
409
    z0 += ( z1 < carry1 );
410
    z0 += carry0;
411
    *z2Ptr = z2;
412
    *z1Ptr = z1;
413
    *z0Ptr = z0;
414
 
415
}
416
*/
417
/*----------------------------------------------------------------------------
418
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
419
| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
420
| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
421
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
422
| `z1Ptr'.
423
*----------------------------------------------------------------------------*/
424
 
425
INLINE void
426
 sub128(
427
     bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr )
428
{
429
 
430
    *z1Ptr = a1 - b1;
431
    *z0Ptr = a0 - b0 - ( a1 < b1 );
432
 
433
}
434
 
435
/*----------------------------------------------------------------------------
436
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
437
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
438
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
439
| result is broken into three 64-bit pieces which are stored at the locations
440
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
441
*----------------------------------------------------------------------------*/
442
// Not used - commenting out to stop werrors during compile
443
/*
444
INLINE void
445
 sub192(
446
     bits64 a0,
447
     bits64 a1,
448
     bits64 a2,
449
     bits64 b0,
450
     bits64 b1,
451
     bits64 b2,
452
     bits64 *z0Ptr,
453
     bits64 *z1Ptr,
454
     bits64 *z2Ptr
455
 )
456
{
457
    bits64 z0, z1, z2;
458
    int8 borrow0, borrow1;
459
 
460
    z2 = a2 - b2;
461
    borrow1 = ( a2 < b2 );
462
    z1 = a1 - b1;
463
    borrow0 = ( a1 < b1 );
464
    z0 = a0 - b0;
465
    z0 -= ( z1 < borrow1 );
466
    z1 -= borrow1;
467
    z0 -= borrow0;
468
    *z2Ptr = z2;
469
    *z1Ptr = z1;
470
    *z0Ptr = z0;
471
 
472
}
473
*/
474
/*----------------------------------------------------------------------------
475
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
476
| into two 64-bit pieces which are stored at the locations pointed to by
477
| `z0Ptr' and `z1Ptr'.
478
*----------------------------------------------------------------------------*/
479
 
480
INLINE void
481
mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr )
482
{
483
    bits32 aHigh, aLow, bHigh, bLow;
484
    bits64 z0, zMiddleA, zMiddleB, z1;
485
 
486
    aLow = a;
487
    aHigh = a>>32;
488
    bLow = b;
489
    bHigh = b>>32;
490
    z1 = ( (bits64) aLow ) * bLow;
491
    zMiddleA = ( (bits64) aLow ) * bHigh;
492
    zMiddleB = ( (bits64) aHigh ) * bLow;
493
    z0 = ( (bits64) aHigh ) * bHigh;
494
    zMiddleA += zMiddleB;
495
    z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 );
496
    zMiddleA <<= 32;
497
    z1 += zMiddleA;
498
    z0 += ( z1 < zMiddleA );
499
    *z1Ptr = z1;
500
    *z0Ptr = z0;
501
 
502
}
503
 
504
/*----------------------------------------------------------------------------
505
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
506
| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
507
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
508
| `z2Ptr'.
509
*----------------------------------------------------------------------------*/
510
// Not used - commenting out to stop werrors during compile
511
/*
512
INLINE void
513
 mul128By64To192(
514
     bits64 a0,
515
     bits64 a1,
516
     bits64 b,
517
     bits64 *z0Ptr,
518
     bits64 *z1Ptr,
519
     bits64 *z2Ptr
520
 )
521
{
522
    bits64 z0, z1, z2, more1;
523
 
524
    mul64To128( a1, b, &z1, &z2 );
525
    mul64To128( a0, b, &z0, &more1 );
526
    add128( z0, more1, 0, z1, &z0, &z1 );
527
    *z2Ptr = z2;
528
    *z1Ptr = z1;
529
    *z0Ptr = z0;
530
 
531
}
532
*/
533
/*----------------------------------------------------------------------------
534
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
535
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
536
| product.  The product is broken into four 64-bit pieces which are stored at
537
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
538
*----------------------------------------------------------------------------*/
539
 // Not used - commenting out to stop werrors during compile
540
 /*
541
INLINE void
542
 mul128To256(
543
     bits64 a0,
544
     bits64 a1,
545
     bits64 b0,
546
     bits64 b1,
547
     bits64 *z0Ptr,
548
     bits64 *z1Ptr,
549
     bits64 *z2Ptr,
550
     bits64 *z3Ptr
551
 )
552
{
553
    bits64 z0, z1, z2, z3;
554
    bits64 more1, more2;
555
 
556
    mul64To128( a1, b1, &z2, &z3 );
557
    mul64To128( a1, b0, &z1, &more2 );
558
    add128( z1, more2, 0, z2, &z1, &z2 );
559
    mul64To128( a0, b0, &z0, &more1 );
560
    add128( z0, more1, 0, z1, &z0, &z1 );
561
    mul64To128( a0, b1, &more1, &more2 );
562
    add128( more1, more2, 0, z2, &more1, &z2 );
563
    add128( z0, z1, 0, more1, &z0, &z1 );
564
    *z3Ptr = z3;
565
    *z2Ptr = z2;
566
    *z1Ptr = z1;
567
    *z0Ptr = z0;
568
 
569
}
570
 */
571
/*----------------------------------------------------------------------------
572
| Returns an approximation to the 64-bit integer quotient obtained by dividing
573
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
574
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
575
| toward zero, the approximation returned lies between q and q + 2 inclusive.
576
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
577
| unsigned integer is returned.
578
*----------------------------------------------------------------------------*/
579
 
580
static bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b )
581
{
582
    bits64 b0, b1;
583
    bits64 rem0, rem1, term0, term1;
584
    bits64 z;
585
 
586
    if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF );
587
    b0 = b>>32;
588
    z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32;
589
    mul64To128( b, z, &term0, &term1 );
590
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
591
    while ( ( (sbits64) rem0 ) < 0 ) {
592
        z -= LIT64( 0x100000000 );
593
        b1 = b<<32;
594
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
595
    }
596
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
597
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
598
    return z;
599
 
600
}
601
 
602
/*----------------------------------------------------------------------------
603
| Returns an approximation to the square root of the 32-bit significand given
604
| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
605
| `aExp' (the least significant bit) is 1, the integer returned approximates
606
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
607
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
608
| case, the approximation returned lies strictly within +/-2 of the exact
609
| value.
610
*----------------------------------------------------------------------------*/
611
 
612
static bits32 estimateSqrt32( int16 aExp, bits32 a )
613
{
614
    static const bits16 sqrtOddAdjustments[] = {
615
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
616
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
617
    };
618
    static const bits16 sqrtEvenAdjustments[] = {
619
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
620
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
621
    };
622
    int8 index;
623
    bits32 z;
624
 
625
    index = ( a>>27 ) & 15;
626
    if ( aExp & 1 ) {
627
        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ];
628
        z = ( ( a / z )<<14 ) + ( z<<15 );
629
        a >>= 1;
630
    }
631
    else {
632
        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ];
633
        z = a / z + z;
634
        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
635
        if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 );
636
    }
637
    return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 );
638
 
639
}
640
 
641
/*----------------------------------------------------------------------------
642
| Returns the number of leading 0 bits before the most-significant 1 bit of
643
| `a'.  If `a' is zero, 32 is returned.
644
*----------------------------------------------------------------------------*/
645
 
646
static int8 countLeadingZeros32( bits32 a )
647
{
648
    static const int8 countLeadingZerosHigh[] = {
649
        8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
650
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
651
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
652
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
653
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
655
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
656
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
657
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
658
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
659
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
660
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
661
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
662
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
663
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
664
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
665
    };
666
    int8 shiftCount;
667
 
668
    shiftCount = 0;
669
    if ( a < 0x10000 ) {
670
        shiftCount += 16;
671
        a <<= 16;
672
    }
673
    if ( a < 0x1000000 ) {
674
        shiftCount += 8;
675
        a <<= 8;
676
    }
677
    shiftCount += countLeadingZerosHigh[ a>>24 ];
678
    return shiftCount;
679
 
680
}
681
 
682
/*----------------------------------------------------------------------------
683
| Returns the number of leading 0 bits before the most-significant 1 bit of
684
| `a'.  If `a' is zero, 64 is returned.
685
*----------------------------------------------------------------------------*/
686
 
687
static int8 countLeadingZeros64( bits64 a )
688
{
689
    int8 shiftCount;
690
 
691
    shiftCount = 0;
692
    if ( a < ( (bits64) 1 )<<32 ) {
693
        shiftCount += 32;
694
    }
695
    else {
696
        a >>= 32;
697
    }
698
    shiftCount += countLeadingZeros32( a );
699
    return shiftCount;
700
 
701
}
702
 
703
/*----------------------------------------------------------------------------
704
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
705
| is equal to the 128-bit value formed by concatenating `b0' and `b1'.
706
| Otherwise, returns 0.
707
*----------------------------------------------------------------------------*/
708
// Not used - commenting out to stop werrors during compile
709
/*
710
INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
711
{
712
 
713
    return ( a0 == b0 ) && ( a1 == b1 );
714
 
715
}
716
*/
717
/*----------------------------------------------------------------------------
718
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
719
| than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
720
| Otherwise, returns 0.
721
*----------------------------------------------------------------------------*/
722
 // Not used - commenting out to stop werrors during compile
723
 /*
724
INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
725
{
726
 
727
    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) );
728
 
729
}
730
 */
731
/*----------------------------------------------------------------------------
732
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
733
| than the 128-bit value formed by concatenating `b0' and `b1'.  Otherwise,
734
| returns 0.
735
*----------------------------------------------------------------------------*/
736
  // Not used - commenting out to stop werrors during compile
737
  /*
738
INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
739
{
740
 
741
    return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) );
742
 
743
}
744
  */
745
/*----------------------------------------------------------------------------
746
| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
747
| not equal to the 128-bit value formed by concatenating `b0' and `b1'.
748
| Otherwise, returns 0.
749
*----------------------------------------------------------------------------*/
750
   // Not used - commenting out to stop werrors during compile
751
   /*
752
INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 )
753
{
754
 
755
    return ( a0 != b0 ) || ( a1 != b1 );
756
 
757
}
758
   */
759
/*----------------------------------------------------------------------------
760
| Functions and definitions to determine:  (1) whether tininess for underflow
761
| is detected before or after rounding by default, (2) what (if anything)
762
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
763
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
764
| are propagated from function inputs to output.  These details are target-
765
| specific.
766
*----------------------------------------------------------------------------*/
767
//#include "softfloat-specialize"
768
 
769
/*----------------------------------------------------------------------------
770
| Underflow tininess-detection mode, statically initialized to default value.
771
| (The declaration in `softfloat.h' must match the `int8' type here.)
772
*----------------------------------------------------------------------------*/
773
int8 float_detect_tininess = float_tininess_after_rounding;
774
 
775
/*----------------------------------------------------------------------------
776
| Raises the exceptions specified by `flags'.  Floating-point traps can be
777
| defined here if desired.  It is currently not possible for such a trap
778
| to substitute a result value.  If traps are not implemented, this routine
779
| should be simply `float_exception_flags |= flags;'.
780
*----------------------------------------------------------------------------*/
781
 
782
void float_raise( int8 flags )
783
{
784
 
785
    float_exception_flags |= flags;
786
 
787
}
788
 
789
/*----------------------------------------------------------------------------
790
| Internal canonical NaN format.
791
*----------------------------------------------------------------------------*/
792
typedef struct {
793
    flag sign;
794
    bits64 high, low;
795
} commonNaNT;
796
 
797
/*----------------------------------------------------------------------------
798
| The pattern for a default generated single-precision NaN.
799
*----------------------------------------------------------------------------*/
800
#define float32_default_nan 0xFFC00000
801
 
802
/*----------------------------------------------------------------------------
803
| Returns 1 if the single-precision floating-point value `a' is a NaN;
804
| otherwise returns 0.
805
*----------------------------------------------------------------------------*/
806
 
807
flag float32_is_nan( float32 a )
808
{
809
 
810
    return ( 0xFF000000 < (bits32) ( a<<1 ) );
811
 
812
}
813
 
814
/*----------------------------------------------------------------------------
815
| Returns 1 if the single-precision floating-point value `a' is a signaling
816
| NaN; otherwise returns 0.
817
*----------------------------------------------------------------------------*/
818
 
819
flag float32_is_signaling_nan( float32 a )
820
{
821
 
822
    return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
823
 
824
}
825
 
826
/*----------------------------------------------------------------------------
827
| Returns the result of converting the single-precision floating-point NaN
828
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
829
| exception is raised.
830
*----------------------------------------------------------------------------*/
831
 
832
static commonNaNT float32ToCommonNaN( float32 a )
833
{
834
    commonNaNT z;
835
 
836
    if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
837
    z.sign = a>>31;
838
    z.low = 0;
839
    z.high = ( (bits64) a )<<41;
840
    return z;
841
 
842
}
843
 
844
/*----------------------------------------------------------------------------
845
| Returns the result of converting the canonical NaN `a' to the single-
846
| precision floating-point format.
847
*----------------------------------------------------------------------------*/
848
 
849
static float32 commonNaNToFloat32( commonNaNT a )
850
{
851
 
852
    return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 );
853
 
854
}
855
 
856
/*----------------------------------------------------------------------------
857
| Takes two single-precision floating-point values `a' and `b', one of which
858
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
859
| signaling NaN, the invalid exception is raised.
860
*----------------------------------------------------------------------------*/
861
 
862
static float32 propagateFloat32NaN( float32 a, float32 b )
863
{
864
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
865
 
866
    aIsNaN = float32_is_nan( a );
867
    aIsSignalingNaN = float32_is_signaling_nan( a );
868
    bIsNaN = float32_is_nan( b );
869
    bIsSignalingNaN = float32_is_signaling_nan( b );
870
    a |= 0x00400000;
871
    b |= 0x00400000;
872
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
873
    if ( aIsSignalingNaN ) {
874
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
875
        return bIsNaN ? b : a;
876
    }
877
    else if ( aIsNaN ) {
878
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
879
 returnLargerSignificand:
880
        if ( (bits32) ( a<<1 ) < (bits32) ( b<<1 ) ) return b;
881
        if ( (bits32) ( b<<1 ) < (bits32) ( a<<1 ) ) return a;
882
        return ( a < b ) ? a : b;
883
    }
884
    else {
885
        return b;
886
    }
887
 
888
}
889
 
890
/*----------------------------------------------------------------------------
891
| The pattern for a default generated double-precision NaN.
892
*----------------------------------------------------------------------------*/
893
#define float64_default_nan LIT64( 0xFFF8000000000000 )
894
 
895
/*----------------------------------------------------------------------------
896
| Returns 1 if the double-precision floating-point value `a' is a NaN;
897
| otherwise returns 0.
898
*----------------------------------------------------------------------------*/
899
 
900
flag float64_is_nan( float64 a )
901
{
902
 
903
    return ( LIT64( 0xFFE0000000000000 ) < (bits64) ( a<<1 ) );
904
 
905
}
906
 
907
/*----------------------------------------------------------------------------
908
| Returns 1 if the double-precision floating-point value `a' is a signaling
909
| NaN; otherwise returns 0.
910
*----------------------------------------------------------------------------*/
911
 
912
flag float64_is_signaling_nan( float64 a )
913
{
914
 
915
    return
916
           ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
917
        && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
918
 
919
}
920
 
921
/*----------------------------------------------------------------------------
922
| Returns the result of converting the double-precision floating-point NaN
923
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
924
| exception is raised.
925
*----------------------------------------------------------------------------*/
926
 
927
static commonNaNT float64ToCommonNaN( float64 a )
928
{
929
    commonNaNT z;
930
 
931
    if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
932
    z.sign = a>>63;
933
    z.low = 0;
934
    z.high = a<<12;
935
    return z;
936
 
937
}
938
 
939
/*----------------------------------------------------------------------------
940
| Returns the result of converting the canonical NaN `a' to the double-
941
| precision floating-point format.
942
*----------------------------------------------------------------------------*/
943
 
944
static float64 commonNaNToFloat64( commonNaNT a )
945
{
946
 
947
    return
948
          ( ( (bits64) a.sign )<<63 )
949
        | LIT64( 0x7FF8000000000000 )
950
        | ( a.high>>12 );
951
 
952
}
953
 
954
/*----------------------------------------------------------------------------
955
| Takes two double-precision floating-point values `a' and `b', one of which
956
| is a NaN, and returns the appropriate NaN result.  If either `a' or `b' is a
957
| signaling NaN, the invalid exception is raised.
958
*----------------------------------------------------------------------------*/
959
 
960
static float64 propagateFloat64NaN( float64 a, float64 b )
961
{
962
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
963
 
964
    aIsNaN = float64_is_nan( a );
965
    aIsSignalingNaN = float64_is_signaling_nan( a );
966
    bIsNaN = float64_is_nan( b );
967
    bIsSignalingNaN = float64_is_signaling_nan( b );
968
    a |= LIT64( 0x0008000000000000 );
969
    b |= LIT64( 0x0008000000000000 );
970
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
971
    if ( aIsSignalingNaN ) {
972
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
973
        return bIsNaN ? b : a;
974
    }
975
    else if ( aIsNaN ) {
976
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
977
 returnLargerSignificand:
978
        if ( (bits64) ( a<<1 ) < (bits64) ( b<<1 ) ) return b;
979
        if ( (bits64) ( b<<1 ) < (bits64) ( a<<1 ) ) return a;
980
        return ( a < b ) ? a : b;
981
    }
982
    else {
983
        return b;
984
    }
985
 
986
}
987
 
988
#ifdef FLOATX80
989
 
990
/*----------------------------------------------------------------------------
991
| The pattern for a default generated extended double-precision NaN.  The
992
| `high' and `low' values hold the most- and least-significant bits,
993
| respectively.
994
*----------------------------------------------------------------------------*/
995
#define floatx80_default_nan_high 0xFFFF
996
#define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
997
 
998
/*----------------------------------------------------------------------------
999
| Returns 1 if the extended double-precision floating-point value `a' is a
1000
| NaN; otherwise returns 0.
1001
*----------------------------------------------------------------------------*/
1002
 
1003
flag floatx80_is_nan( floatx80 a )
1004
{
1005
 
1006
    return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 );
1007
 
1008
}
1009
 
1010
/*----------------------------------------------------------------------------
1011
| Returns 1 if the extended double-precision floating-point value `a' is a
1012
| signaling NaN; otherwise returns 0.
1013
*----------------------------------------------------------------------------*/
1014
 
1015
flag floatx80_is_signaling_nan( floatx80 a )
1016
{
1017
    bits64 aLow;
1018
 
1019
    aLow = a.low & ~ LIT64( 0x4000000000000000 );
1020
    return
1021
           ( ( a.high & 0x7FFF ) == 0x7FFF )
1022
        && (bits64) ( aLow<<1 )
1023
        && ( a.low == aLow );
1024
 
1025
}
1026
 
1027
/*----------------------------------------------------------------------------
1028
| Returns the result of converting the extended double-precision floating-
1029
| point NaN `a' to the canonical NaN format.  If `a' is a signaling NaN, the
1030
| invalid exception is raised.
1031
*----------------------------------------------------------------------------*/
1032
 
1033
static commonNaNT floatx80ToCommonNaN( floatx80 a )
1034
{
1035
    commonNaNT z;
1036
 
1037
    if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
1038
    z.sign = a.high>>15;
1039
    z.low = 0;
1040
    z.high = a.low<<1;
1041
    return z;
1042
 
1043
}
1044
 
1045
/*----------------------------------------------------------------------------
1046
| Returns the result of converting the canonical NaN `a' to the extended
1047
| double-precision floating-point format.
1048
*----------------------------------------------------------------------------*/
1049
 
1050
static floatx80 commonNaNToFloatx80( commonNaNT a )
1051
{
1052
    floatx80 z;
1053
 
1054
    z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 );
1055
    z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF;
1056
    return z;
1057
 
1058
}
1059
 
1060
/*----------------------------------------------------------------------------
1061
| Takes two extended double-precision floating-point values `a' and `b', one
1062
| of which is a NaN, and returns the appropriate NaN result.  If either `a' or
1063
| `b' is a signaling NaN, the invalid exception is raised.
1064
*----------------------------------------------------------------------------*/
1065
 
1066
static floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b )
1067
{
1068
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
1069
 
1070
    aIsNaN = floatx80_is_nan( a );
1071
    aIsSignalingNaN = floatx80_is_signaling_nan( a );
1072
    bIsNaN = floatx80_is_nan( b );
1073
    bIsSignalingNaN = floatx80_is_signaling_nan( b );
1074
    a.low |= LIT64( 0xC000000000000000 );
1075
    b.low |= LIT64( 0xC000000000000000 );
1076
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
1077
    if ( aIsSignalingNaN ) {
1078
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
1079
        return bIsNaN ? b : a;
1080
    }
1081
    else if ( aIsNaN ) {
1082
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
1083
 returnLargerSignificand:
1084
        if ( a.low < b.low ) return b;
1085
        if ( b.low < a.low ) return a;
1086
        return ( a.high < b.high ) ? a : b;
1087
    }
1088
    else {
1089
        return b;
1090
    }
1091
 
1092
}
1093
 
1094
#endif
1095
 
1096
#ifdef FLOAT128
1097
 
1098
/*----------------------------------------------------------------------------
1099
| The pattern for a default generated quadruple-precision NaN.  The `high' and
1100
| `low' values hold the most- and least-significant bits, respectively.
1101
*----------------------------------------------------------------------------*/
1102
#define float128_default_nan_high LIT64( 0xFFFF800000000000 )
1103
#define float128_default_nan_low  LIT64( 0x0000000000000000 )
1104
 
1105
/*----------------------------------------------------------------------------
1106
| Returns 1 if the quadruple-precision floating-point value `a' is a NaN;
1107
| otherwise returns 0.
1108
*----------------------------------------------------------------------------*/
1109
 
1110
flag float128_is_nan( float128 a )
1111
{
1112
 
1113
    return
1114
           ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) )
1115
        && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
1116
 
1117
}
1118
 
1119
/*----------------------------------------------------------------------------
1120
| Returns 1 if the quadruple-precision floating-point value `a' is a
1121
| signaling NaN; otherwise returns 0.
1122
*----------------------------------------------------------------------------*/
1123
 
1124
flag float128_is_signaling_nan( float128 a )
1125
{
1126
 
1127
    return
1128
           ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
1129
        && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
1130
 
1131
}
1132
 
1133
/*----------------------------------------------------------------------------
1134
| Returns the result of converting the quadruple-precision floating-point NaN
1135
| `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid
1136
| exception is raised.
1137
*----------------------------------------------------------------------------*/
1138
 
1139
static commonNaNT float128ToCommonNaN( float128 a )
1140
{
1141
    commonNaNT z;
1142
 
1143
    if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid );
1144
    z.sign = a.high>>63;
1145
    shortShift128Left( a.high, a.low, 16, &z.high, &z.low );
1146
    return z;
1147
 
1148
}
1149
 
1150
/*----------------------------------------------------------------------------
1151
| Returns the result of converting the canonical NaN `a' to the quadruple-
1152
| precision floating-point format.
1153
*----------------------------------------------------------------------------*/
1154
 
1155
static float128 commonNaNToFloat128( commonNaNT a )
1156
{
1157
    float128 z;
1158
 
1159
    shift128Right( a.high, a.low, 16, &z.high, &z.low );
1160
    z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 );
1161
    return z;
1162
 
1163
}
1164
 
1165
/*----------------------------------------------------------------------------
1166
| Takes two quadruple-precision floating-point values `a' and `b', one of
1167
| which is a NaN, and returns the appropriate NaN result.  If either `a' or
1168
| `b' is a signaling NaN, the invalid exception is raised.
1169
*----------------------------------------------------------------------------*/
1170
 
1171
static float128 propagateFloat128NaN( float128 a, float128 b )
1172
{
1173
    flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN;
1174
 
1175
    aIsNaN = float128_is_nan( a );
1176
    aIsSignalingNaN = float128_is_signaling_nan( a );
1177
    bIsNaN = float128_is_nan( b );
1178
    bIsSignalingNaN = float128_is_signaling_nan( b );
1179
    a.high |= LIT64( 0x0000800000000000 );
1180
    b.high |= LIT64( 0x0000800000000000 );
1181
    if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid );
1182
    if ( aIsSignalingNaN ) {
1183
        if ( bIsSignalingNaN ) goto returnLargerSignificand;
1184
        return bIsNaN ? b : a;
1185
    }
1186
    else if ( aIsNaN ) {
1187
        if ( bIsSignalingNaN | ! bIsNaN ) return a;
1188
 returnLargerSignificand:
1189
        if ( lt128( a.high<<1, a.low, b.high<<1, b.low ) ) return b;
1190
        if ( lt128( b.high<<1, b.low, a.high<<1, a.low ) ) return a;
1191
        return ( a.high < b.high ) ? a : b;
1192
    }
1193
    else {
1194
        return b;
1195
    }
1196
 
1197
}
1198
 
1199
#endif
1200
 
1201
 
1202
 
1203
/*----------------------------------------------------------------------------
1204
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1205
| and 7, and returns the properly rounded 32-bit integer corresponding to the
1206
| input.  If `zSign' is 1, the input is negated before being converted to an
1207
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1208
| is simply rounded to an integer, with the inexact exception raised if the
1209
| input cannot be represented exactly as an integer.  However, if the fixed-
1210
| point input is too large, the invalid exception is raised and the largest
1211
| positive or negative integer is returned.
1212
*----------------------------------------------------------------------------*/
1213
 
1214
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
1215
{
1216
    int8 roundingMode;
1217
    flag roundNearestEven;
1218
    int8 roundIncrement, roundBits;
1219
    int32 z;
1220
 
1221
    roundingMode = float_rounding_mode;
1222
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1223
    roundIncrement = 0x40;
1224
    if ( ! roundNearestEven ) {
1225
        if ( roundingMode == float_round_to_zero ) {
1226
            roundIncrement = 0;
1227
        }
1228
        else {
1229
            roundIncrement = 0x7F;
1230
            if ( zSign ) {
1231
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1232
            }
1233
            else {
1234
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1235
            }
1236
        }
1237
    }
1238
    roundBits = absZ & 0x7F;
1239
    absZ = ( absZ + roundIncrement )>>7;
1240
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1241
    z = absZ;
1242
    if ( zSign ) z = - z;
1243
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1244
        float_raise( float_flag_invalid );
1245
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1246
    }
1247
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1248
    return z;
1249
 
1250
}
1251
 
1252
/*----------------------------------------------------------------------------
1253
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1254
| `absZ1', with binary point between bits 63 and 64 (between the input words),
1255
| and returns the properly rounded 64-bit integer corresponding to the input.
1256
| If `zSign' is 1, the input is negated before being converted to an integer.
1257
| Ordinarily, the fixed-point input is simply rounded to an integer, with
1258
| the inexact exception raised if the input cannot be represented exactly as
1259
| an integer.  However, if the fixed-point input is too large, the invalid
1260
| exception is raised and the largest positive or negative integer is
1261
| returned.
1262
*----------------------------------------------------------------------------*/
1263
 
1264
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
1265
{
1266
    int8 roundingMode;
1267
    flag roundNearestEven, increment;
1268
    int64 z;
1269
 
1270
    roundingMode = float_rounding_mode;
1271
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1272
    increment = ( (sbits64) absZ1 < 0 );
1273
    if ( ! roundNearestEven ) {
1274
        if ( roundingMode == float_round_to_zero ) {
1275
            increment = 0;
1276
        }
1277
        else {
1278
            if ( zSign ) {
1279
                increment = ( roundingMode == float_round_down ) && absZ1;
1280
            }
1281
            else {
1282
                increment = ( roundingMode == float_round_up ) && absZ1;
1283
            }
1284
        }
1285
    }
1286
    if ( increment ) {
1287
        ++absZ0;
1288
        if ( absZ0 == 0 ) goto overflow;
1289
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1290
    }
1291
    z = absZ0;
1292
    if ( zSign ) z = - z;
1293
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
1294
 overflow:
1295
        float_raise( float_flag_invalid );
1296
        return
1297
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
1298
            : LIT64( 0x7FFFFFFFFFFFFFFF );
1299
    }
1300
    if ( absZ1 ) float_exception_flags |= float_flag_inexact;
1301
    return z;
1302
 
1303
}
1304
 
1305
/*----------------------------------------------------------------------------
1306
| Returns the fraction bits of the single-precision floating-point value `a'.
1307
*----------------------------------------------------------------------------*/
1308
 
1309
INLINE bits32 extractFloat32Frac( float32 a )
1310
{
1311
 
1312
    return a & 0x007FFFFF;
1313
 
1314
}
1315
 
1316
/*----------------------------------------------------------------------------
1317
| Returns the exponent bits of the single-precision floating-point value `a'.
1318
*----------------------------------------------------------------------------*/
1319
 
1320
INLINE int16 extractFloat32Exp( float32 a )
1321
{
1322
 
1323
    return ( a>>23 ) & 0xFF;
1324
 
1325
}
1326
 
1327
/*----------------------------------------------------------------------------
1328
| Returns the sign bit of the single-precision floating-point value `a'.
1329
*----------------------------------------------------------------------------*/
1330
 
1331
INLINE flag extractFloat32Sign( float32 a )
1332
{
1333
 
1334
    return a>>31;
1335
 
1336
}
1337
 
1338
/*----------------------------------------------------------------------------
1339
| Normalizes the subnormal single-precision floating-point value represented
1340
| by the denormalized significand `aSig'.  The normalized exponent and
1341
| significand are stored at the locations pointed to by `zExpPtr' and
1342
| `zSigPtr', respectively.
1343
*----------------------------------------------------------------------------*/
1344
 
1345
static void
1346
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
1347
{
1348
    int8 shiftCount;
1349
 
1350
    shiftCount = countLeadingZeros32( aSig ) - 8;
1351
    *zSigPtr = aSig<<shiftCount;
1352
    *zExpPtr = 1 - shiftCount;
1353
 
1354
}
1355
 
1356
/*----------------------------------------------------------------------------
1357
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1358
| single-precision floating-point value, returning the result.  After being
1359
| shifted into the proper positions, the three fields are simply added
1360
| together to form the result.  This means that any integer portion of `zSig'
1361
| will be added into the exponent.  Since a properly normalized significand
1362
| will have an integer portion equal to 1, the `zExp' input should be 1 less
1363
| than the desired result exponent whenever `zSig' is a complete, normalized
1364
| significand.
1365
*----------------------------------------------------------------------------*/
1366
 
1367
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
1368
{
1369
 
1370
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
1371
 
1372
}
1373
 
1374
/*----------------------------------------------------------------------------
1375
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1376
| and significand `zSig', and returns the proper single-precision floating-
1377
| point value corresponding to the abstract input.  Ordinarily, the abstract
1378
| value is simply rounded and packed into the single-precision format, with
1379
| the inexact exception raised if the abstract input cannot be represented
1380
| exactly.  However, if the abstract value is too large, the overflow and
1381
| inexact exceptions are raised and an infinity or maximal finite value is
1382
| returned.  If the abstract value is too small, the input value is rounded to
1383
| a subnormal number, and the underflow and inexact exceptions are raised if
1384
| the abstract input cannot be represented exactly as a subnormal single-
1385
| precision floating-point number.
1386
|     The input significand `zSig' has its binary point between bits 30
1387
| and 29, which is 7 bits to the left of the usual location.  This shifted
1388
| significand must be normalized or smaller.  If `zSig' is not normalized,
1389
| `zExp' must be 0; in that case, the result returned is a subnormal number,
1390
| and it must not require rounding.  In the usual case that `zSig' is
1391
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1392
| The handling of underflow and overflow follows the IEC/IEEE Standard for
1393
| Binary Floating-Point Arithmetic.
1394
*----------------------------------------------------------------------------*/
1395
 
1396
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
1397
{
1398
    int8 roundingMode;
1399
    flag roundNearestEven;
1400
    int8 roundIncrement, roundBits;
1401
    flag isTiny;
1402
 
1403
    roundingMode = float_rounding_mode;
1404
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1405
    roundIncrement = 0x40;
1406
    if ( ! roundNearestEven ) {
1407
        if ( roundingMode == float_round_to_zero ) {
1408
            roundIncrement = 0;
1409
        }
1410
        else {
1411
            roundIncrement = 0x7F;
1412
            if ( zSign ) {
1413
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1414
            }
1415
            else {
1416
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1417
            }
1418
        }
1419
    }
1420
    roundBits = zSig & 0x7F;
1421
    if ( 0xFD <= (bits16) zExp ) {
1422
        if (    ( 0xFD < zExp )
1423
             || (    ( zExp == 0xFD )
1424
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
1425
           ) {
1426
            float_raise( float_flag_overflow | float_flag_inexact );
1427
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
1428
        }
1429
        if ( zExp < 0 ) {
1430
            isTiny =
1431
                   ( float_detect_tininess == float_tininess_before_rounding )
1432
                || ( zExp < -1 )
1433
                || ( zSig + roundIncrement < 0x80000000 );
1434
            shift32RightJamming( zSig, - zExp, &zSig );
1435
            zExp = 0;
1436
            roundBits = zSig & 0x7F;
1437
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1438
        }
1439
    }
1440
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1441
    zSig = ( zSig + roundIncrement )>>7;
1442
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1443
    if ( zSig == 0 ) zExp = 0;
1444
    return packFloat32( zSign, zExp, zSig );
1445
 
1446
}
1447
 
1448
/*----------------------------------------------------------------------------
1449
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1450
| and significand `zSig', and returns the proper single-precision floating-
1451
| point value corresponding to the abstract input.  This routine is just like
1452
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
1453
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1454
| floating-point exponent.
1455
*----------------------------------------------------------------------------*/
1456
 
1457
static float32
1458
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
1459
{
1460
    int8 shiftCount;
1461
 
1462
    shiftCount = countLeadingZeros32( zSig ) - 1;
1463
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
1464
 
1465
}
1466
 
1467
/*----------------------------------------------------------------------------
1468
| Returns the fraction bits of the double-precision floating-point value `a'.
1469
*----------------------------------------------------------------------------*/
1470
 
1471
INLINE bits64 extractFloat64Frac( float64 a )
1472
{
1473
 
1474
    return a & LIT64( 0x000FFFFFFFFFFFFF );
1475
 
1476
}
1477
 
1478
/*----------------------------------------------------------------------------
1479
| Returns the exponent bits of the double-precision floating-point value `a'.
1480
*----------------------------------------------------------------------------*/
1481
 
1482
INLINE int16 extractFloat64Exp( float64 a )
1483
{
1484
 
1485
    return ( a>>52 ) & 0x7FF;
1486
 
1487
}
1488
 
1489
/*----------------------------------------------------------------------------
1490
| Returns the sign bit of the double-precision floating-point value `a'.
1491
*----------------------------------------------------------------------------*/
1492
 
1493
INLINE flag extractFloat64Sign( float64 a )
1494
{
1495
 
1496
    return a>>63;
1497
 
1498
}
1499
 
1500
/*----------------------------------------------------------------------------
1501
| Normalizes the subnormal double-precision floating-point value represented
1502
| by the denormalized significand `aSig'.  The normalized exponent and
1503
| significand are stored at the locations pointed to by `zExpPtr' and
1504
| `zSigPtr', respectively.
1505
*----------------------------------------------------------------------------*/
1506
 
1507
static void
1508
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
1509
{
1510
    int8 shiftCount;
1511
 
1512
    shiftCount = countLeadingZeros64( aSig ) - 11;
1513
    *zSigPtr = aSig<<shiftCount;
1514
    *zExpPtr = 1 - shiftCount;
1515
 
1516
}
1517
 
1518
/*----------------------------------------------------------------------------
1519
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
1520
| double-precision floating-point value, returning the result.  After being
1521
| shifted into the proper positions, the three fields are simply added
1522
| together to form the result.  This means that any integer portion of `zSig'
1523
| will be added into the exponent.  Since a properly normalized significand
1524
| will have an integer portion equal to 1, the `zExp' input should be 1 less
1525
| than the desired result exponent whenever `zSig' is a complete, normalized
1526
| significand.
1527
*----------------------------------------------------------------------------*/
1528
 
1529
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
1530
{
1531
 
1532
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
1533
 
1534
}
1535
 
1536
/*----------------------------------------------------------------------------
1537
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1538
| and significand `zSig', and returns the proper double-precision floating-
1539
| point value corresponding to the abstract input.  Ordinarily, the abstract
1540
| value is simply rounded and packed into the double-precision format, with
1541
| the inexact exception raised if the abstract input cannot be represented
1542
| exactly.  However, if the abstract value is too large, the overflow and
1543
| inexact exceptions are raised and an infinity or maximal finite value is
1544
| returned.  If the abstract value is too small, the input value is rounded
1545
| to a subnormal number, and the underflow and inexact exceptions are raised
1546
| if the abstract input cannot be represented exactly as a subnormal double-
1547
| precision floating-point number.
1548
|     The input significand `zSig' has its binary point between bits 62
1549
| and 61, which is 10 bits to the left of the usual location.  This shifted
1550
| significand must be normalized or smaller.  If `zSig' is not normalized,
1551
| `zExp' must be 0; in that case, the result returned is a subnormal number,
1552
| and it must not require rounding.  In the usual case that `zSig' is
1553
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
1554
| The handling of underflow and overflow follows the IEC/IEEE Standard for
1555
| Binary Floating-Point Arithmetic.
1556
*----------------------------------------------------------------------------*/
1557
 
1558
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
1559
{
1560
    int8 roundingMode;
1561
    flag roundNearestEven;
1562
    int16 roundIncrement, roundBits;
1563
    flag isTiny;
1564
 
1565
    roundingMode = float_rounding_mode;
1566
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1567
    roundIncrement = 0x200;
1568
    if ( ! roundNearestEven ) {
1569
        if ( roundingMode == float_round_to_zero ) {
1570
            roundIncrement = 0;
1571
        }
1572
        else {
1573
            roundIncrement = 0x3FF;
1574
            if ( zSign ) {
1575
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1576
            }
1577
            else {
1578
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1579
            }
1580
        }
1581
    }
1582
    roundBits = zSig & 0x3FF;
1583
    if ( 0x7FD <= (bits16) zExp ) {
1584
        if (    ( 0x7FD < zExp )
1585
             || (    ( zExp == 0x7FD )
1586
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
1587
           ) {
1588
            float_raise( float_flag_overflow | float_flag_inexact );
1589
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
1590
        }
1591
        if ( zExp < 0 ) {
1592
            isTiny =
1593
                   ( float_detect_tininess == float_tininess_before_rounding )
1594
                || ( zExp < -1 )
1595
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
1596
            shift64RightJamming( zSig, - zExp, &zSig );
1597
            zExp = 0;
1598
            roundBits = zSig & 0x3FF;
1599
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1600
        }
1601
    }
1602
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1603
    zSig = ( zSig + roundIncrement )>>10;
1604
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
1605
    if ( zSig == 0 ) zExp = 0;
1606
    return packFloat64( zSign, zExp, zSig );
1607
 
1608
}
1609
 
1610
/*----------------------------------------------------------------------------
1611
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1612
| and significand `zSig', and returns the proper double-precision floating-
1613
| point value corresponding to the abstract input.  This routine is just like
1614
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
1615
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
1616
| floating-point exponent.
1617
*----------------------------------------------------------------------------*/
1618
 
1619
static float64
1620
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
1621
{
1622
    int8 shiftCount;
1623
 
1624
    shiftCount = countLeadingZeros64( zSig ) - 1;
1625
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
1626
 
1627
}
1628
 
1629
#ifdef FLOATX80
1630
 
1631
/*----------------------------------------------------------------------------
1632
| Returns the fraction bits of the extended double-precision floating-point
1633
| value `a'.
1634
*----------------------------------------------------------------------------*/
1635
 
1636
INLINE bits64 extractFloatx80Frac( floatx80 a )
1637
{
1638
 
1639
    return a.low;
1640
 
1641
}
1642
 
1643
/*----------------------------------------------------------------------------
1644
| Returns the exponent bits of the extended double-precision floating-point
1645
| value `a'.
1646
*----------------------------------------------------------------------------*/
1647
 
1648
INLINE int32 extractFloatx80Exp( floatx80 a )
1649
{
1650
 
1651
    return a.high & 0x7FFF;
1652
 
1653
}
1654
 
1655
/*----------------------------------------------------------------------------
1656
| Returns the sign bit of the extended double-precision floating-point value
1657
| `a'.
1658
*----------------------------------------------------------------------------*/
1659
 
1660
INLINE flag extractFloatx80Sign( floatx80 a )
1661
{
1662
 
1663
    return a.high>>15;
1664
 
1665
}
1666
 
1667
/*----------------------------------------------------------------------------
1668
| Normalizes the subnormal extended double-precision floating-point value
1669
| represented by the denormalized significand `aSig'.  The normalized exponent
1670
| and significand are stored at the locations pointed to by `zExpPtr' and
1671
| `zSigPtr', respectively.
1672
*----------------------------------------------------------------------------*/
1673
 
1674
static void
1675
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
1676
{
1677
    int8 shiftCount;
1678
 
1679
    shiftCount = countLeadingZeros64( aSig );
1680
    *zSigPtr = aSig<<shiftCount;
1681
    *zExpPtr = 1 - shiftCount;
1682
 
1683
}
1684
 
1685
/*----------------------------------------------------------------------------
1686
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
1687
| extended double-precision floating-point value, returning the result.
1688
*----------------------------------------------------------------------------*/
1689
 
1690
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
1691
{
1692
    floatx80 z;
1693
 
1694
    z.low = zSig;
1695
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
1696
    return z;
1697
 
1698
}
1699
 
1700
/*----------------------------------------------------------------------------
1701
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1702
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
1703
| and returns the proper extended double-precision floating-point value
1704
| corresponding to the abstract input.  Ordinarily, the abstract value is
1705
| rounded and packed into the extended double-precision format, with the
1706
| inexact exception raised if the abstract input cannot be represented
1707
| exactly.  However, if the abstract value is too large, the overflow and
1708
| inexact exceptions are raised and an infinity or maximal finite value is
1709
| returned.  If the abstract value is too small, the input value is rounded to
1710
| a subnormal number, and the underflow and inexact exceptions are raised if
1711
| the abstract input cannot be represented exactly as a subnormal extended
1712
| double-precision floating-point number.
1713
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
1714
| number of bits as single or double precision, respectively.  Otherwise, the
1715
| result is rounded to the full precision of the extended double-precision
1716
| format.
1717
|     The input significand must be normalized or smaller.  If the input
1718
| significand is not normalized, `zExp' must be 0; in that case, the result
1719
| returned is a subnormal number, and it must not require rounding.  The
1720
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
1721
| Floating-Point Arithmetic.
1722
*----------------------------------------------------------------------------*/
1723
 
1724
static floatx80
1725
 roundAndPackFloatx80(
1726
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
1727
 )
1728
{
1729
    int8 roundingMode;
1730
    flag roundNearestEven, increment, isTiny;
1731
    int64 roundIncrement, roundMask, roundBits;
1732
 
1733
    roundingMode = float_rounding_mode;
1734
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1735
    if ( roundingPrecision == 80 ) goto precision80;
1736
    if ( roundingPrecision == 64 ) {
1737
        roundIncrement = LIT64( 0x0000000000000400 );
1738
        roundMask = LIT64( 0x00000000000007FF );
1739
    }
1740
    else if ( roundingPrecision == 32 ) {
1741
        roundIncrement = LIT64( 0x0000008000000000 );
1742
        roundMask = LIT64( 0x000000FFFFFFFFFF );
1743
    }
1744
    else {
1745
        goto precision80;
1746
    }
1747
    zSig0 |= ( zSig1 != 0 );
1748
    if ( ! roundNearestEven ) {
1749
        if ( roundingMode == float_round_to_zero ) {
1750
            roundIncrement = 0;
1751
        }
1752
        else {
1753
            roundIncrement = roundMask;
1754
            if ( zSign ) {
1755
                if ( roundingMode == float_round_up ) roundIncrement = 0;
1756
            }
1757
            else {
1758
                if ( roundingMode == float_round_down ) roundIncrement = 0;
1759
            }
1760
        }
1761
    }
1762
    roundBits = zSig0 & roundMask;
1763
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
1764
        if (    ( 0x7FFE < zExp )
1765
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
1766
           ) {
1767
            goto overflow;
1768
        }
1769
        if ( zExp <= 0 ) {
1770
            isTiny =
1771
                   ( float_detect_tininess == float_tininess_before_rounding )
1772
                || ( zExp < 0 )
1773
                || ( zSig0 <= zSig0 + roundIncrement );
1774
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
1775
            zExp = 0;
1776
            roundBits = zSig0 & roundMask;
1777
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
1778
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
1779
            zSig0 += roundIncrement;
1780
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
1781
            roundIncrement = roundMask + 1;
1782
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1783
                roundMask |= roundIncrement;
1784
            }
1785
            zSig0 &= ~ roundMask;
1786
            return packFloatx80( zSign, zExp, zSig0 );
1787
        }
1788
    }
1789
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
1790
    zSig0 += roundIncrement;
1791
    if ( zSig0 < roundIncrement ) {
1792
        ++zExp;
1793
        zSig0 = LIT64( 0x8000000000000000 );
1794
    }
1795
    roundIncrement = roundMask + 1;
1796
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
1797
        roundMask |= roundIncrement;
1798
    }
1799
    zSig0 &= ~ roundMask;
1800
    if ( zSig0 == 0 ) zExp = 0;
1801
    return packFloatx80( zSign, zExp, zSig0 );
1802
 precision80:
1803
    increment = ( (sbits64) zSig1 < 0 );
1804
    if ( ! roundNearestEven ) {
1805
        if ( roundingMode == float_round_to_zero ) {
1806
            increment = 0;
1807
        }
1808
        else {
1809
            if ( zSign ) {
1810
                increment = ( roundingMode == float_round_down ) && zSig1;
1811
            }
1812
            else {
1813
                increment = ( roundingMode == float_round_up ) && zSig1;
1814
            }
1815
        }
1816
    }
1817
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
1818
        if (    ( 0x7FFE < zExp )
1819
             || (    ( zExp == 0x7FFE )
1820
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
1821
                  && increment
1822
                )
1823
           ) {
1824
            roundMask = 0;
1825
 overflow:
1826
            float_raise( float_flag_overflow | float_flag_inexact );
1827
            if (    ( roundingMode == float_round_to_zero )
1828
                 || ( zSign && ( roundingMode == float_round_up ) )
1829
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1830
               ) {
1831
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
1832
            }
1833
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1834
        }
1835
        if ( zExp <= 0 ) {
1836
            isTiny =
1837
                   ( float_detect_tininess == float_tininess_before_rounding )
1838
                || ( zExp < 0 )
1839
                || ! increment
1840
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
1841
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
1842
            zExp = 0;
1843
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
1844
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
1845
            if ( roundNearestEven ) {
1846
                increment = ( (sbits64) zSig1 < 0 );
1847
            }
1848
            else {
1849
                if ( zSign ) {
1850
                    increment = ( roundingMode == float_round_down ) && zSig1;
1851
                }
1852
                else {
1853
                    increment = ( roundingMode == float_round_up ) && zSig1;
1854
                }
1855
            }
1856
            if ( increment ) {
1857
                ++zSig0;
1858
                zSig0 &=
1859
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1860
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
1861
            }
1862
            return packFloatx80( zSign, zExp, zSig0 );
1863
        }
1864
    }
1865
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
1866
    if ( increment ) {
1867
        ++zSig0;
1868
        if ( zSig0 == 0 ) {
1869
            ++zExp;
1870
            zSig0 = LIT64( 0x8000000000000000 );
1871
        }
1872
        else {
1873
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1874
        }
1875
    }
1876
    else {
1877
        if ( zSig0 == 0 ) zExp = 0;
1878
    }
1879
    return packFloatx80( zSign, zExp, zSig0 );
1880
 
1881
}
1882
 
1883
/*----------------------------------------------------------------------------
1884
| Takes an abstract floating-point value having sign `zSign', exponent
1885
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1886
| and returns the proper extended double-precision floating-point value
1887
| corresponding to the abstract input.  This routine is just like
1888
| `roundAndPackFloatx80' except that the input significand does not have to be
1889
| normalized.
1890
*----------------------------------------------------------------------------*/
1891
 
1892
static floatx80
1893
 normalizeRoundAndPackFloatx80(
1894
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
1895
 )
1896
{
1897
    int8 shiftCount;
1898
 
1899
    if ( zSig0 == 0 ) {
1900
        zSig0 = zSig1;
1901
        zSig1 = 0;
1902
        zExp -= 64;
1903
    }
1904
    shiftCount = countLeadingZeros64( zSig0 );
1905
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1906
    zExp -= shiftCount;
1907
    return
1908
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
1909
 
1910
}
1911
 
1912
#endif
1913
 
1914
#ifdef FLOAT128
1915
 
1916
/*----------------------------------------------------------------------------
1917
| Returns the least-significant 64 fraction bits of the quadruple-precision
1918
| floating-point value `a'.
1919
*----------------------------------------------------------------------------*/
1920
 
1921
INLINE bits64 extractFloat128Frac1( float128 a )
1922
{
1923
 
1924
    return a.low;
1925
 
1926
}
1927
 
1928
/*----------------------------------------------------------------------------
1929
| Returns the most-significant 48 fraction bits of the quadruple-precision
1930
| floating-point value `a'.
1931
*----------------------------------------------------------------------------*/
1932
 
1933
INLINE bits64 extractFloat128Frac0( float128 a )
1934
{
1935
 
1936
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1937
 
1938
}
1939
 
1940
/*----------------------------------------------------------------------------
1941
| Returns the exponent bits of the quadruple-precision floating-point value
1942
| `a'.
1943
*----------------------------------------------------------------------------*/
1944
 
1945
INLINE int32 extractFloat128Exp( float128 a )
1946
{
1947
 
1948
    return ( a.high>>48 ) & 0x7FFF;
1949
 
1950
}
1951
 
1952
/*----------------------------------------------------------------------------
1953
| Returns the sign bit of the quadruple-precision floating-point value `a'.
1954
*----------------------------------------------------------------------------*/
1955
 
1956
INLINE flag extractFloat128Sign( float128 a )
1957
{
1958
 
1959
    return a.high>>63;
1960
 
1961
}
1962
 
1963
/*----------------------------------------------------------------------------
1964
| Normalizes the subnormal quadruple-precision floating-point value
1965
| represented by the denormalized significand formed by the concatenation of
1966
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1967
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1968
| significand are stored at the location pointed to by `zSig0Ptr', and the
1969
| least significant 64 bits of the normalized significand are stored at the
1970
| location pointed to by `zSig1Ptr'.
1971
*----------------------------------------------------------------------------*/
1972
 
1973
static void
1974
 normalizeFloat128Subnormal(
1975
     bits64 aSig0,
1976
     bits64 aSig1,
1977
     int32 *zExpPtr,
1978
     bits64 *zSig0Ptr,
1979
     bits64 *zSig1Ptr
1980
 )
1981
{
1982
    int8 shiftCount;
1983
 
1984
    if ( aSig0 == 0 ) {
1985
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
1986
        if ( shiftCount < 0 ) {
1987
            *zSig0Ptr = aSig1>>( - shiftCount );
1988
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
1989
        }
1990
        else {
1991
            *zSig0Ptr = aSig1<<shiftCount;
1992
            *zSig1Ptr = 0;
1993
        }
1994
        *zExpPtr = - shiftCount - 63;
1995
    }
1996
    else {
1997
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1998
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1999
        *zExpPtr = 1 - shiftCount;
2000
    }
2001
 
2002
}
2003
 
2004
/*----------------------------------------------------------------------------
2005
| Packs the sign `zSign', the exponent `zExp', and the significand formed
2006
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2007
| floating-point value, returning the result.  After being shifted into the
2008
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2009
| added together to form the most significant 32 bits of the result.  This
2010
| means that any integer portion of `zSig0' will be added into the exponent.
2011
| Since a properly normalized significand will have an integer portion equal
2012
| to 1, the `zExp' input should be 1 less than the desired result exponent
2013
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2014
| significand.
2015
*----------------------------------------------------------------------------*/
2016
 
2017
INLINE float128
2018
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
2019
{
2020
    float128 z;
2021
 
2022
    z.low = zSig1;
2023
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
2024
    return z;
2025
 
2026
}
2027
 
2028
/*----------------------------------------------------------------------------
2029
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2030
| and extended significand formed by the concatenation of `zSig0', `zSig1',
2031
| and `zSig2', and returns the proper quadruple-precision floating-point value
2032
| corresponding to the abstract input.  Ordinarily, the abstract value is
2033
| simply rounded and packed into the quadruple-precision format, with the
2034
| inexact exception raised if the abstract input cannot be represented
2035
| exactly.  However, if the abstract value is too large, the overflow and
2036
| inexact exceptions are raised and an infinity or maximal finite value is
2037
| returned.  If the abstract value is too small, the input value is rounded to
2038
| a subnormal number, and the underflow and inexact exceptions are raised if
2039
| the abstract input cannot be represented exactly as a subnormal quadruple-
2040
| precision floating-point number.
2041
|     The input significand must be normalized or smaller.  If the input
2042
| significand is not normalized, `zExp' must be 0; in that case, the result
2043
| returned is a subnormal number, and it must not require rounding.  In the
2044
| usual case that the input significand is normalized, `zExp' must be 1 less
2045
| than the ``true'' floating-point exponent.  The handling of underflow and
2046
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2047
*----------------------------------------------------------------------------*/
2048
 
2049
static float128
2050
 roundAndPackFloat128(
2051
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
2052
{
2053
    int8 roundingMode;
2054
    flag roundNearestEven, increment, isTiny;
2055
 
2056
    roundingMode = float_rounding_mode;
2057
    roundNearestEven = ( roundingMode == float_round_nearest_even );
2058
    increment = ( (sbits64) zSig2 < 0 );
2059
    if ( ! roundNearestEven ) {
2060
        if ( roundingMode == float_round_to_zero ) {
2061
            increment = 0;
2062
        }
2063
        else {
2064
            if ( zSign ) {
2065
                increment = ( roundingMode == float_round_down ) && zSig2;
2066
            }
2067
            else {
2068
                increment = ( roundingMode == float_round_up ) && zSig2;
2069
            }
2070
        }
2071
    }
2072
    if ( 0x7FFD <= (bits32) zExp ) {
2073
        if (    ( 0x7FFD < zExp )
2074
             || (    ( zExp == 0x7FFD )
2075
                  && eq128(
2076
                         LIT64( 0x0001FFFFFFFFFFFF ),
2077
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
2078
                         zSig0,
2079
                         zSig1
2080
                     )
2081
                  && increment
2082
                )
2083
           ) {
2084
            float_raise( float_flag_overflow | float_flag_inexact );
2085
            if (    ( roundingMode == float_round_to_zero )
2086
                 || ( zSign && ( roundingMode == float_round_up ) )
2087
                 || ( ! zSign && ( roundingMode == float_round_down ) )
2088
               ) {
2089
                return
2090
                    packFloat128(
2091
                        zSign,
2092
                        0x7FFE,
2093
                        LIT64( 0x0000FFFFFFFFFFFF ),
2094
                        LIT64( 0xFFFFFFFFFFFFFFFF )
2095
                    );
2096
            }
2097
            return packFloat128( zSign, 0x7FFF, 0, 0 );
2098
        }
2099
        if ( zExp < 0 ) {
2100
            isTiny =
2101
                   ( float_detect_tininess == float_tininess_before_rounding )
2102
                || ( zExp < -1 )
2103
                || ! increment
2104
                || lt128(
2105
                       zSig0,
2106
                       zSig1,
2107
                       LIT64( 0x0001FFFFFFFFFFFF ),
2108
                       LIT64( 0xFFFFFFFFFFFFFFFF )
2109
                   );
2110
            shift128ExtraRightJamming(
2111
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2112
            zExp = 0;
2113
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
2114
            if ( roundNearestEven ) {
2115
                increment = ( (sbits64) zSig2 < 0 );
2116
            }
2117
            else {
2118
                if ( zSign ) {
2119
                    increment = ( roundingMode == float_round_down ) && zSig2;
2120
                }
2121
                else {
2122
                    increment = ( roundingMode == float_round_up ) && zSig2;
2123
                }
2124
            }
2125
        }
2126
    }
2127
    if ( zSig2 ) float_exception_flags |= float_flag_inexact;
2128
    if ( increment ) {
2129
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2130
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2131
    }
2132
    else {
2133
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2134
    }
2135
    return packFloat128( zSign, zExp, zSig0, zSig1 );
2136
 
2137
}
2138
 
2139
/*----------------------------------------------------------------------------
2140
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2141
| and significand formed by the concatenation of `zSig0' and `zSig1', and
2142
| returns the proper quadruple-precision floating-point value corresponding
2143
| to the abstract input.  This routine is just like `roundAndPackFloat128'
2144
| except that the input significand has fewer bits and does not have to be
2145
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2146
| point exponent.
2147
*----------------------------------------------------------------------------*/
2148
 
2149
static float128
2150
 normalizeRoundAndPackFloat128(
2151
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
2152
{
2153
    int8 shiftCount;
2154
    bits64 zSig2;
2155
 
2156
    if ( zSig0 == 0 ) {
2157
        zSig0 = zSig1;
2158
        zSig1 = 0;
2159
        zExp -= 64;
2160
    }
2161
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
2162
    if ( 0 <= shiftCount ) {
2163
        zSig2 = 0;
2164
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2165
    }
2166
    else {
2167
        shift128ExtraRightJamming(
2168
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2169
    }
2170
    zExp -= shiftCount;
2171
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2172
 
2173
}
2174
 
2175
#endif
2176
 
2177
/*----------------------------------------------------------------------------
2178
| Returns the result of converting the 32-bit two's complement integer `a'
2179
| to the single-precision floating-point format.  The conversion is performed
2180
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2181
*----------------------------------------------------------------------------*/
2182
 
2183
float32 int32_to_float32( int32 a )
2184
{
2185
    flag zSign;
2186
 
2187
    if ( a == 0 ) return 0;
2188
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
2189
    zSign = ( a < 0 );
2190
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
2191
 
2192
}
2193
 
2194
/*----------------------------------------------------------------------------
2195
| Returns the result of converting the 32-bit two's complement integer `a'
2196
| to the double-precision floating-point format.  The conversion is performed
2197
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2198
*----------------------------------------------------------------------------*/
2199
 
2200
float64 int32_to_float64( int32 a )
2201
{
2202
    flag zSign;
2203
    uint32 absA;
2204
    int8 shiftCount;
2205
    bits64 zSig;
2206
 
2207
    if ( a == 0 ) return 0;
2208
    zSign = ( a < 0 );
2209
    absA = zSign ? - a : a;
2210
    shiftCount = countLeadingZeros32( absA ) + 21;
2211
    zSig = absA;
2212
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
2213
 
2214
}
2215
 
2216
#ifdef FLOATX80
2217
 
2218
/*----------------------------------------------------------------------------
2219
| Returns the result of converting the 32-bit two's complement integer `a'
2220
| to the extended double-precision floating-point format.  The conversion
2221
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2222
| Arithmetic.
2223
*----------------------------------------------------------------------------*/
2224
 
2225
floatx80 int32_to_floatx80( int32 a )
2226
{
2227
    flag zSign;
2228
    uint32 absA;
2229
    int8 shiftCount;
2230
    bits64 zSig;
2231
 
2232
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2233
    zSign = ( a < 0 );
2234
    absA = zSign ? - a : a;
2235
    shiftCount = countLeadingZeros32( absA ) + 32;
2236
    zSig = absA;
2237
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2238
 
2239
}
2240
 
2241
#endif
2242
 
2243
#ifdef FLOAT128
2244
 
2245
/*----------------------------------------------------------------------------
2246
| Returns the result of converting the 32-bit two's complement integer `a' to
2247
| the quadruple-precision floating-point format.  The conversion is performed
2248
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2249
*----------------------------------------------------------------------------*/
2250
 
2251
float128 int32_to_float128( int32 a )
2252
{
2253
    flag zSign;
2254
    uint32 absA;
2255
    int8 shiftCount;
2256
    bits64 zSig0;
2257
 
2258
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2259
    zSign = ( a < 0 );
2260
    absA = zSign ? - a : a;
2261
    shiftCount = countLeadingZeros32( absA ) + 17;
2262
    zSig0 = absA;
2263
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2264
 
2265
}
2266
 
2267
#endif
2268
 
2269
/*----------------------------------------------------------------------------
2270
| Returns the result of converting the 64-bit two's complement integer `a'
2271
| to the single-precision floating-point format.  The conversion is performed
2272
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2273
*----------------------------------------------------------------------------*/
2274
 
2275
float32 int64_to_float32( int64 a )
2276
{
2277
    flag zSign;
2278
    uint64 absA;
2279
    int8 shiftCount;
2280
    //bits32 zSig; // Unused variable
2281
 
2282
    if ( a == 0 ) return 0;
2283
    zSign = ( a < 0 );
2284
    absA = zSign ? - a : a;
2285
    shiftCount = countLeadingZeros64( absA ) - 40;
2286
    if ( 0 <= shiftCount ) {
2287
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
2288
    }
2289
    else {
2290
        shiftCount += 7;
2291
        if ( shiftCount < 0 ) {
2292
            shift64RightJamming( absA, - shiftCount, &absA );
2293
        }
2294
        else {
2295
            absA <<= shiftCount;
2296
        }
2297
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
2298
    }
2299
 
2300
}
2301
 
2302
/*----------------------------------------------------------------------------
2303
| Returns the result of converting the 64-bit two's complement integer `a'
2304
| to the double-precision floating-point format.  The conversion is performed
2305
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2306
*----------------------------------------------------------------------------*/
2307
 
2308
float64 int64_to_float64( int64 a )
2309
{
2310
    flag zSign;
2311
 
2312
    if ( a == 0 ) return 0;
2313
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
2314
        return packFloat64( 1, 0x43E, 0 );
2315
    }
2316
    zSign = ( a < 0 );
2317
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
2318
 
2319
}
2320
 
2321
#ifdef FLOATX80
2322
 
2323
/*----------------------------------------------------------------------------
2324
| Returns the result of converting the 64-bit two's complement integer `a'
2325
| to the extended double-precision floating-point format.  The conversion
2326
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2327
| Arithmetic.
2328
*----------------------------------------------------------------------------*/
2329
 
2330
floatx80 int64_to_floatx80( int64 a )
2331
{
2332
    flag zSign;
2333
    uint64 absA;
2334
    int8 shiftCount;
2335
 
2336
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2337
    zSign = ( a < 0 );
2338
    absA = zSign ? - a : a;
2339
    shiftCount = countLeadingZeros64( absA );
2340
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2341
 
2342
}
2343
 
2344
#endif
2345
 
2346
#ifdef FLOAT128
2347
 
2348
/*----------------------------------------------------------------------------
2349
| Returns the result of converting the 64-bit two's complement integer `a' to
2350
| the quadruple-precision floating-point format.  The conversion is performed
2351
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2352
*----------------------------------------------------------------------------*/
2353
 
2354
float128 int64_to_float128( int64 a )
2355
{
2356
    flag zSign;
2357
    uint64 absA;
2358
    int8 shiftCount;
2359
    int32 zExp;
2360
    bits64 zSig0, zSig1;
2361
 
2362
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2363
    zSign = ( a < 0 );
2364
    absA = zSign ? - a : a;
2365
    shiftCount = countLeadingZeros64( absA ) + 49;
2366
    zExp = 0x406E - shiftCount;
2367
    if ( 64 <= shiftCount ) {
2368
        zSig1 = 0;
2369
        zSig0 = absA;
2370
        shiftCount -= 64;
2371
    }
2372
    else {
2373
        zSig1 = absA;
2374
        zSig0 = 0;
2375
    }
2376
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2377
    return packFloat128( zSign, zExp, zSig0, zSig1 );
2378
 
2379
}
2380
 
2381
#endif
2382
 
2383
/*----------------------------------------------------------------------------
2384
| Returns the result of converting the single-precision floating-point value
2385
| `a' to the 32-bit two's complement integer format.  The conversion is
2386
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2387
| Arithmetic---which means in particular that the conversion is rounded
2388
| according to the current rounding mode.  If `a' is a NaN, the largest
2389
| positive integer is returned.  Otherwise, if the conversion overflows, the
2390
| largest integer with the same sign as `a' is returned.
2391
*----------------------------------------------------------------------------*/
2392
 
2393
int32 float32_to_int32( float32 a )
2394
{
2395
    flag aSign;
2396
    int16 aExp, shiftCount;
2397
    bits32 aSig;
2398
    bits64 aSig64;
2399
 
2400
    aSig = extractFloat32Frac( a );
2401
    aExp = extractFloat32Exp( a );
2402
    aSign = extractFloat32Sign( a );
2403
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
2404
    if ( aExp ) aSig |= 0x00800000;
2405
    shiftCount = 0xAF - aExp;
2406
    aSig64 = aSig;
2407
    aSig64 <<= 32;
2408
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
2409
    return roundAndPackInt32( aSign, aSig64 );
2410
 
2411
}
2412
 
2413
/*----------------------------------------------------------------------------
2414
| Returns the result of converting the single-precision floating-point value
2415
| `a' to the 32-bit two's complement integer format.  The conversion is
2416
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2417
| Arithmetic, except that the conversion is always rounded toward zero.
2418
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2419
| the conversion overflows, the largest integer with the same sign as `a' is
2420
| returned.
2421
*----------------------------------------------------------------------------*/
2422
 
2423
int32 float32_to_int32_round_to_zero( float32 a )
2424
{
2425
    flag aSign;
2426
    int16 aExp, shiftCount;
2427
    bits32 aSig;
2428
    int32 z;
2429
    aSig = extractFloat32Frac( a );
2430
    aExp = extractFloat32Exp( a );
2431
    aSign = extractFloat32Sign( a );
2432
    shiftCount = aExp - 0x9E;
2433
    if ( 0 <= shiftCount ) {
2434
        if ( a != 0xCF000000 ) {
2435
            float_raise( float_flag_invalid );
2436
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
2437
        }
2438
        return (sbits32) 0x80000000;
2439
    }
2440
    else if ( aExp <= 0x7E ) {
2441
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2442
        return 0;
2443
    }
2444
    aSig = ( aSig | 0x00800000 )<<8;
2445
    z = aSig>>( - shiftCount );
2446
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
2447
        float_exception_flags |= float_flag_inexact;
2448
    }
2449
    if ( aSign ) z = - z;
2450
    return z;
2451
 
2452
}
2453
 
2454
/*----------------------------------------------------------------------------
2455
| Returns the result of converting the single-precision floating-point value
2456
| `a' to the 64-bit two's complement integer format.  The conversion is
2457
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2458
| Arithmetic---which means in particular that the conversion is rounded
2459
| according to the current rounding mode.  If `a' is a NaN, the largest
2460
| positive integer is returned.  Otherwise, if the conversion overflows, the
2461
| largest integer with the same sign as `a' is returned.
2462
*----------------------------------------------------------------------------*/
2463
 
2464
int64 float32_to_int64( float32 a )
2465
{
2466
    flag aSign;
2467
    int16 aExp, shiftCount;
2468
    bits32 aSig;
2469
    bits64 aSig64, aSigExtra;
2470
 
2471
    aSig = extractFloat32Frac( a );
2472
    aExp = extractFloat32Exp( a );
2473
    aSign = extractFloat32Sign( a );
2474
    shiftCount = 0xBE - aExp;
2475
    if ( shiftCount < 0 ) {
2476
        float_raise( float_flag_invalid );
2477
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2478
            return LIT64( 0x7FFFFFFFFFFFFFFF );
2479
        }
2480
        return (sbits64) LIT64( 0x8000000000000000 );
2481
    }
2482
    if ( aExp ) aSig |= 0x00800000;
2483
    aSig64 = aSig;
2484
    aSig64 <<= 40;
2485
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
2486
    return roundAndPackInt64( aSign, aSig64, aSigExtra );
2487
 
2488
}
2489
 
2490
/*----------------------------------------------------------------------------
2491
| Returns the result of converting the single-precision floating-point value
2492
| `a' to the 64-bit two's complement integer format.  The conversion is
2493
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2494
| Arithmetic, except that the conversion is always rounded toward zero.  If
2495
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2496
| conversion overflows, the largest integer with the same sign as `a' is
2497
| returned.
2498
*----------------------------------------------------------------------------*/
2499
 
2500
int64 float32_to_int64_round_to_zero( float32 a )
2501
{
2502
    flag aSign;
2503
    int16 aExp, shiftCount;
2504
    bits32 aSig;
2505
    bits64 aSig64;
2506
    int64 z;
2507
 
2508
    aSig = extractFloat32Frac( a );
2509
    aExp = extractFloat32Exp( a );
2510
    aSign = extractFloat32Sign( a );
2511
    shiftCount = aExp - 0xBE;
2512
    if ( 0 <= shiftCount ) {
2513
        if ( a != 0xDF000000 ) {
2514
            float_raise( float_flag_invalid );
2515
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
2516
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2517
            }
2518
        }
2519
        return (sbits64) LIT64( 0x8000000000000000 );
2520
    }
2521
    else if ( aExp <= 0x7E ) {
2522
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2523
        return 0;
2524
    }
2525
    aSig64 = aSig | 0x00800000;
2526
    aSig64 <<= 40;
2527
    z = aSig64>>( - shiftCount );
2528
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
2529
        float_exception_flags |= float_flag_inexact;
2530
    }
2531
    if ( aSign ) z = - z;
2532
    return z;
2533
 
2534
}
2535
 
2536
/*----------------------------------------------------------------------------
2537
| Returns the result of converting the single-precision floating-point value
2538
| `a' to the double-precision floating-point format.  The conversion is
2539
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2540
| Arithmetic.
2541
*----------------------------------------------------------------------------*/
2542
 
2543
float64 float32_to_float64( float32 a )
2544
{
2545
    flag aSign;
2546
    int16 aExp;
2547
    bits32 aSig;
2548
 
2549
    aSig = extractFloat32Frac( a );
2550
    aExp = extractFloat32Exp( a );
2551
    aSign = extractFloat32Sign( a );
2552
    if ( aExp == 0xFF ) {
2553
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
2554
        return packFloat64( aSign, 0x7FF, 0 );
2555
    }
2556
    if ( aExp == 0 ) {
2557
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
2558
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2559
        --aExp;
2560
    }
2561
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
2562
 
2563
}
2564
 
2565
#ifdef FLOATX80
2566
 
2567
/*----------------------------------------------------------------------------
2568
| Returns the result of converting the single-precision floating-point value
2569
| `a' to the extended double-precision floating-point format.  The conversion
2570
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2571
| Arithmetic.
2572
*----------------------------------------------------------------------------*/
2573
 
2574
floatx80 float32_to_floatx80( float32 a )
2575
{
2576
    flag aSign;
2577
    int16 aExp;
2578
    bits32 aSig;
2579
 
2580
    aSig = extractFloat32Frac( a );
2581
    aExp = extractFloat32Exp( a );
2582
    aSign = extractFloat32Sign( a );
2583
    if ( aExp == 0xFF ) {
2584
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
2585
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2586
    }
2587
    if ( aExp == 0 ) {
2588
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2589
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2590
    }
2591
    aSig |= 0x00800000;
2592
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
2593
 
2594
}
2595
 
2596
#endif
2597
 
2598
#ifdef FLOAT128
2599
 
2600
/*----------------------------------------------------------------------------
2601
| Returns the result of converting the single-precision floating-point value
2602
| `a' to the double-precision floating-point format.  The conversion is
2603
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2604
| Arithmetic.
2605
*----------------------------------------------------------------------------*/
2606
 
2607
float128 float32_to_float128( float32 a )
2608
{
2609
    flag aSign;
2610
    int16 aExp;
2611
    bits32 aSig;
2612
 
2613
    aSig = extractFloat32Frac( a );
2614
    aExp = extractFloat32Exp( a );
2615
    aSign = extractFloat32Sign( a );
2616
    if ( aExp == 0xFF ) {
2617
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
2618
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2619
    }
2620
    if ( aExp == 0 ) {
2621
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2622
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2623
        --aExp;
2624
    }
2625
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
2626
 
2627
}
2628
 
2629
#endif
2630
 
2631
/*----------------------------------------------------------------------------
2632
| Rounds the single-precision floating-point value `a' to an integer, and
2633
| returns the result as a single-precision floating-point value.  The
2634
| operation is performed according to the IEC/IEEE Standard for Binary
2635
| Floating-Point Arithmetic.
2636
*----------------------------------------------------------------------------*/
2637
 
2638
float32 float32_round_to_int( float32 a )
2639
{
2640
    flag aSign;
2641
    int16 aExp;
2642
    bits32 lastBitMask, roundBitsMask;
2643
    int8 roundingMode;
2644
    float32 z;
2645
 
2646
    aExp = extractFloat32Exp( a );
2647
    if ( 0x96 <= aExp ) {
2648
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
2649
            return propagateFloat32NaN( a, a );
2650
        }
2651
        return a;
2652
    }
2653
    if ( aExp <= 0x7E ) {
2654
        if ( (bits32) ( a<<1 ) == 0 ) return a;
2655
        float_exception_flags |= float_flag_inexact;
2656
        aSign = extractFloat32Sign( a );
2657
        switch ( float_rounding_mode ) {
2658
         case float_round_nearest_even:
2659
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2660
                return packFloat32( aSign, 0x7F, 0 );
2661
            }
2662
            break;
2663
         case float_round_down:
2664
            return aSign ? 0xBF800000 : 0;
2665
         case float_round_up:
2666
            return aSign ? 0x80000000 : 0x3F800000;
2667
        }
2668
        return packFloat32( aSign, 0, 0 );
2669
    }
2670
    lastBitMask = 1;
2671
    lastBitMask <<= 0x96 - aExp;
2672
    roundBitsMask = lastBitMask - 1;
2673
    z = a;
2674
    roundingMode = float_rounding_mode;
2675
    if ( roundingMode == float_round_nearest_even ) {
2676
        z += lastBitMask>>1;
2677
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2678
    }
2679
    else if ( roundingMode != float_round_to_zero ) {
2680
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2681
            z += roundBitsMask;
2682
        }
2683
    }
2684
    z &= ~ roundBitsMask;
2685
    if ( z != a ) float_exception_flags |= float_flag_inexact;
2686
    return z;
2687
 
2688
}
2689
 
2690
/*----------------------------------------------------------------------------
2691
| Returns the result of adding the absolute values of the single-precision
2692
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2693
| before being returned.  `zSign' is ignored if the result is a NaN.
2694
| The addition is performed according to the IEC/IEEE Standard for Binary
2695
| Floating-Point Arithmetic.
2696
*----------------------------------------------------------------------------*/
2697
 
2698
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
2699
{
2700
    int16 aExp, bExp, zExp;
2701
    bits32 aSig, bSig, zSig;
2702
    int16 expDiff;
2703
 
2704
    aSig = extractFloat32Frac( a );
2705
    aExp = extractFloat32Exp( a );
2706
    bSig = extractFloat32Frac( b );
2707
    bExp = extractFloat32Exp( b );
2708
    expDiff = aExp - bExp;
2709
    aSig <<= 6;
2710
    bSig <<= 6;
2711
    if ( 0 < expDiff ) {
2712
        if ( aExp == 0xFF ) {
2713
            if ( aSig ) return propagateFloat32NaN( a, b );
2714
            return a;
2715
        }
2716
        if ( bExp == 0 ) {
2717
            --expDiff;
2718
        }
2719
        else {
2720
            bSig |= 0x20000000;
2721
        }
2722
        shift32RightJamming( bSig, expDiff, &bSig );
2723
        zExp = aExp;
2724
    }
2725
    else if ( expDiff < 0 ) {
2726
        if ( bExp == 0xFF ) {
2727
            if ( bSig ) return propagateFloat32NaN( a, b );
2728
            return packFloat32( zSign, 0xFF, 0 );
2729
        }
2730
        if ( aExp == 0 ) {
2731
            ++expDiff;
2732
        }
2733
        else {
2734
            aSig |= 0x20000000;
2735
        }
2736
        shift32RightJamming( aSig, - expDiff, &aSig );
2737
        zExp = bExp;
2738
    }
2739
    else {
2740
        if ( aExp == 0xFF ) {
2741
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
2742
            return a;
2743
        }
2744
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2745
        zSig = 0x40000000 + aSig + bSig;
2746
        zExp = aExp;
2747
        goto roundAndPack;
2748
    }
2749
    aSig |= 0x20000000;
2750
    zSig = ( aSig + bSig )<<1;
2751
    --zExp;
2752
    if ( (sbits32) zSig < 0 ) {
2753
        zSig = aSig + bSig;
2754
        ++zExp;
2755
    }
2756
 roundAndPack:
2757
    return roundAndPackFloat32( zSign, zExp, zSig );
2758
 
2759
}
2760
 
2761
/*----------------------------------------------------------------------------
2762
| Returns the result of subtracting the absolute values of the single-
2763
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2764
| difference is negated before being returned.  `zSign' is ignored if the
2765
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2766
| Standard for Binary Floating-Point Arithmetic.
2767
*----------------------------------------------------------------------------*/
2768
 
2769
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
2770
{
2771
    int16 aExp, bExp, zExp;
2772
    bits32 aSig, bSig, zSig;
2773
    int16 expDiff;
2774
 
2775
    aSig = extractFloat32Frac( a );
2776
    aExp = extractFloat32Exp( a );
2777
    bSig = extractFloat32Frac( b );
2778
    bExp = extractFloat32Exp( b );
2779
    expDiff = aExp - bExp;
2780
    aSig <<= 7;
2781
    bSig <<= 7;
2782
    if ( 0 < expDiff ) goto aExpBigger;
2783
    if ( expDiff < 0 ) goto bExpBigger;
2784
    if ( aExp == 0xFF ) {
2785
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
2786
        float_raise( float_flag_invalid );
2787
        return float32_default_nan;
2788
    }
2789
    if ( aExp == 0 ) {
2790
        aExp = 1;
2791
        bExp = 1;
2792
    }
2793
    if ( bSig < aSig ) goto aBigger;
2794
    if ( aSig < bSig ) goto bBigger;
2795
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
2796
 bExpBigger:
2797
    if ( bExp == 0xFF ) {
2798
        if ( bSig ) return propagateFloat32NaN( a, b );
2799
        return packFloat32( zSign ^ 1, 0xFF, 0 );
2800
    }
2801
    if ( aExp == 0 ) {
2802
        ++expDiff;
2803
    }
2804
    else {
2805
        aSig |= 0x40000000;
2806
    }
2807
    shift32RightJamming( aSig, - expDiff, &aSig );
2808
    bSig |= 0x40000000;
2809
 bBigger:
2810
    zSig = bSig - aSig;
2811
    zExp = bExp;
2812
    zSign ^= 1;
2813
    goto normalizeRoundAndPack;
2814
 aExpBigger:
2815
    if ( aExp == 0xFF ) {
2816
        if ( aSig ) return propagateFloat32NaN( a, b );
2817
        return a;
2818
    }
2819
    if ( bExp == 0 ) {
2820
        --expDiff;
2821
    }
2822
    else {
2823
        bSig |= 0x40000000;
2824
    }
2825
    shift32RightJamming( bSig, expDiff, &bSig );
2826
    aSig |= 0x40000000;
2827
 aBigger:
2828
    zSig = aSig - bSig;
2829
    zExp = aExp;
2830
 normalizeRoundAndPack:
2831
    --zExp;
2832
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
2833
 
2834
}
2835
 
2836
/*----------------------------------------------------------------------------
2837
| Returns the result of adding the single-precision floating-point values `a'
2838
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2839
| Binary Floating-Point Arithmetic.
2840
*----------------------------------------------------------------------------*/
2841
 
2842
float32 float32_add( float32 a, float32 b )
2843
{
2844
    flag aSign, bSign;
2845
 
2846
    aSign = extractFloat32Sign( a );
2847
    bSign = extractFloat32Sign( b );
2848
    if ( aSign == bSign ) {
2849
        return addFloat32Sigs( a, b, aSign );
2850
    }
2851
    else {
2852
        return subFloat32Sigs( a, b, aSign );
2853
    }
2854
 
2855
}
2856
 
2857
/*----------------------------------------------------------------------------
2858
| Returns the result of subtracting the single-precision floating-point values
2859
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2860
| for Binary Floating-Point Arithmetic.
2861
*----------------------------------------------------------------------------*/
2862
 
2863
float32 float32_sub( float32 a, float32 b )
2864
{
2865
    flag aSign, bSign;
2866
 
2867
    aSign = extractFloat32Sign( a );
2868
    bSign = extractFloat32Sign( b );
2869
    if ( aSign == bSign ) {
2870
        return subFloat32Sigs( a, b, aSign );
2871
    }
2872
    else {
2873
        return addFloat32Sigs( a, b, aSign );
2874
    }
2875
 
2876
}
2877
 
2878
/*----------------------------------------------------------------------------
2879
| Returns the result of multiplying the single-precision floating-point values
2880
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2881
| for Binary Floating-Point Arithmetic.
2882
*----------------------------------------------------------------------------*/
2883
 
2884
float32 float32_mul( float32 a, float32 b )
2885
{
2886
    flag aSign, bSign, zSign;
2887
    int16 aExp, bExp, zExp;
2888
    bits32 aSig, bSig;
2889
    bits64 zSig64;
2890
    bits32 zSig;
2891
 
2892
    aSig = extractFloat32Frac( a );
2893
    aExp = extractFloat32Exp( a );
2894
    aSign = extractFloat32Sign( a );
2895
    bSig = extractFloat32Frac( b );
2896
    bExp = extractFloat32Exp( b );
2897
    bSign = extractFloat32Sign( b );
2898
    zSign = aSign ^ bSign;
2899
    if ( aExp == 0xFF ) {
2900
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2901
            return propagateFloat32NaN( a, b );
2902
        }
2903
        if ( ( bExp | bSig ) == 0 ) {
2904
            float_raise( float_flag_invalid );
2905
            return float32_default_nan;
2906
        }
2907
        return packFloat32( zSign, 0xFF, 0 );
2908
    }
2909
    if ( bExp == 0xFF ) {
2910
        if ( bSig ) return propagateFloat32NaN( a, b );
2911
        if ( ( aExp | aSig ) == 0 ) {
2912
            float_raise( float_flag_invalid );
2913
            return float32_default_nan;
2914
        }
2915
        return packFloat32( zSign, 0xFF, 0 );
2916
    }
2917
    if ( aExp == 0 ) {
2918
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2919
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2920
    }
2921
    if ( bExp == 0 ) {
2922
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2923
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2924
    }
2925
    zExp = aExp + bExp - 0x7F;
2926
    aSig = ( aSig | 0x00800000 )<<7;
2927
    bSig = ( bSig | 0x00800000 )<<8;
2928
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
2929
    zSig = zSig64;
2930
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
2931
        zSig <<= 1;
2932
        --zExp;
2933
    }
2934
    return roundAndPackFloat32( zSign, zExp, zSig );
2935
 
2936
}
2937
 
2938
/*----------------------------------------------------------------------------
2939
| Returns the result of dividing the single-precision floating-point value `a'
2940
| by the corresponding value `b'.  The operation is performed according to the
2941
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942
*----------------------------------------------------------------------------*/
2943
 
2944
float32 float32_div( float32 a, float32 b )
2945
{
2946
    flag aSign, bSign, zSign;
2947
    int16 aExp, bExp, zExp;
2948
    bits32 aSig, bSig, zSig;
2949
 
2950
    aSig = extractFloat32Frac( a );
2951
    aExp = extractFloat32Exp( a );
2952
    aSign = extractFloat32Sign( a );
2953
    bSig = extractFloat32Frac( b );
2954
    bExp = extractFloat32Exp( b );
2955
    bSign = extractFloat32Sign( b );
2956
    zSign = aSign ^ bSign;
2957
    if ( aExp == 0xFF ) {
2958
        if ( aSig ) return propagateFloat32NaN( a, b );
2959
        if ( bExp == 0xFF ) {
2960
            if ( bSig ) return propagateFloat32NaN( a, b );
2961
            float_raise( float_flag_invalid );
2962
            return float32_default_nan;
2963
        }
2964
        return packFloat32( zSign, 0xFF, 0 );
2965
    }
2966
    if ( bExp == 0xFF ) {
2967
        if ( bSig ) return propagateFloat32NaN( a, b );
2968
        return packFloat32( zSign, 0, 0 );
2969
    }
2970
    if ( bExp == 0 ) {
2971
        if ( bSig == 0 ) {
2972
            if ( ( aExp | aSig ) == 0 ) {
2973
                float_raise( float_flag_invalid );
2974
                return float32_default_nan;
2975
            }
2976
            float_raise( float_flag_divbyzero );
2977
            return packFloat32( zSign, 0xFF, 0 );
2978
        }
2979
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2980
    }
2981
    if ( aExp == 0 ) {
2982
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2983
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2984
    }
2985
    zExp = aExp - bExp + 0x7D;
2986
    aSig = ( aSig | 0x00800000 )<<7;
2987
    bSig = ( bSig | 0x00800000 )<<8;
2988
    if ( bSig <= ( aSig + aSig ) ) {
2989
        aSig >>= 1;
2990
        ++zExp;
2991
    }
2992
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2993
    if ( ( zSig & 0x3F ) == 0 ) {
2994
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2995
    }
2996
    return roundAndPackFloat32( zSign, zExp, zSig );
2997
 
2998
}
2999
 
3000
/*----------------------------------------------------------------------------
3001
| Returns the remainder of the single-precision floating-point value `a'
3002
| with respect to the corresponding value `b'.  The operation is performed
3003
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3004
*----------------------------------------------------------------------------*/
3005
 
3006
float32 float32_rem( float32 a, float32 b )
3007
{
3008
    flag aSign, bSign, zSign;
3009
    int16 aExp, bExp, expDiff;
3010
    bits32 aSig, bSig;
3011
    bits32 q;
3012
    bits64 aSig64, bSig64, q64;
3013
    bits32 alternateASig;
3014
    sbits32 sigMean;
3015
 
3016
    aSig = extractFloat32Frac( a );
3017
    aExp = extractFloat32Exp( a );
3018
    aSign = extractFloat32Sign( a );
3019
    bSig = extractFloat32Frac( b );
3020
    bExp = extractFloat32Exp( b );
3021
    bSign = extractFloat32Sign( b );
3022
    if ( aExp == 0xFF ) {
3023
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3024
            return propagateFloat32NaN( a, b );
3025
        }
3026
        float_raise( float_flag_invalid );
3027
        return float32_default_nan;
3028
    }
3029
    if ( bExp == 0xFF ) {
3030
        if ( bSig ) return propagateFloat32NaN( a, b );
3031
        return a;
3032
    }
3033
    if ( bExp == 0 ) {
3034
        if ( bSig == 0 ) {
3035
            float_raise( float_flag_invalid );
3036
            return float32_default_nan;
3037
        }
3038
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3039
    }
3040
    if ( aExp == 0 ) {
3041
        if ( aSig == 0 ) return a;
3042
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3043
    }
3044
    expDiff = aExp - bExp;
3045
    aSig |= 0x00800000;
3046
    bSig |= 0x00800000;
3047
    if ( expDiff < 32 ) {
3048
        aSig <<= 8;
3049
        bSig <<= 8;
3050
        if ( expDiff < 0 ) {
3051
            if ( expDiff < -1 ) return a;
3052
            aSig >>= 1;
3053
        }
3054
        q = ( bSig <= aSig );
3055
        if ( q ) aSig -= bSig;
3056
        if ( 0 < expDiff ) {
3057
            q = ( ( (bits64) aSig )<<32 ) / bSig;
3058
            q >>= 32 - expDiff;
3059
            bSig >>= 2;
3060
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3061
        }
3062
        else {
3063
            aSig >>= 2;
3064
            bSig >>= 2;
3065
        }
3066
    }
3067
    else {
3068
        if ( bSig <= aSig ) aSig -= bSig;
3069
        aSig64 = ( (bits64) aSig )<<40;
3070
        bSig64 = ( (bits64) bSig )<<40;
3071
        expDiff -= 64;
3072
        while ( 0 < expDiff ) {
3073
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3074
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3075
            aSig64 = - ( ( bSig * q64 )<<38 );
3076
            expDiff -= 62;
3077
        }
3078
        expDiff += 64;
3079
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3080
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3081
        q = q64>>( 64 - expDiff );
3082
        bSig <<= 6;
3083
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3084
    }
3085
    do {
3086
        alternateASig = aSig;
3087
        ++q;
3088
        aSig -= bSig;
3089
    } while ( 0 <= (sbits32) aSig );
3090
    sigMean = aSig + alternateASig;
3091
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3092
        aSig = alternateASig;
3093
    }
3094
    zSign = ( (sbits32) aSig < 0 );
3095
    if ( zSign ) aSig = - aSig;
3096
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
3097
 
3098
}
3099
 
3100
/*----------------------------------------------------------------------------
3101
| Returns the square root of the single-precision floating-point value `a'.
3102
| The operation is performed according to the IEC/IEEE Standard for Binary
3103
| Floating-Point Arithmetic.
3104
*----------------------------------------------------------------------------*/
3105
 
3106
float32 float32_sqrt( float32 a )
3107
{
3108
    flag aSign;
3109
    int16 aExp, zExp;
3110
    bits32 aSig, zSig;
3111
    bits64 rem, term;
3112
 
3113
    aSig = extractFloat32Frac( a );
3114
    aExp = extractFloat32Exp( a );
3115
    aSign = extractFloat32Sign( a );
3116
    if ( aExp == 0xFF ) {
3117
        if ( aSig ) return propagateFloat32NaN( a, 0 );
3118
        if ( ! aSign ) return a;
3119
        float_raise( float_flag_invalid );
3120
        return float32_default_nan;
3121
    }
3122
    if ( aSign ) {
3123
        if ( ( aExp | aSig ) == 0 ) return a;
3124
        float_raise( float_flag_invalid );
3125
        return float32_default_nan;
3126
    }
3127
    if ( aExp == 0 ) {
3128
        if ( aSig == 0 ) return 0;
3129
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3130
    }
3131
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3132
    aSig = ( aSig | 0x00800000 )<<8;
3133
    zSig = estimateSqrt32( aExp, aSig ) + 2;
3134
    if ( ( zSig & 0x7F ) <= 5 ) {
3135
        if ( zSig < 2 ) {
3136
            zSig = 0x7FFFFFFF;
3137
            goto roundAndPack;
3138
        }
3139
        aSig >>= aExp & 1;
3140
        term = ( (bits64) zSig ) * zSig;
3141
        rem = ( ( (bits64) aSig )<<32 ) - term;
3142
        while ( (sbits64) rem < 0 ) {
3143
            --zSig;
3144
            rem += ( ( (bits64) zSig )<<1 ) | 1;
3145
        }
3146
        zSig |= ( rem != 0 );
3147
    }
3148
    shift32RightJamming( zSig, 1, &zSig );
3149
 roundAndPack:
3150
    return roundAndPackFloat32( 0, zExp, zSig );
3151
 
3152
}
3153
 
3154
/*----------------------------------------------------------------------------
3155
| Returns 1 if the single-precision floating-point value `a' is equal to
3156
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3157
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3158
*----------------------------------------------------------------------------*/
3159
 
3160
flag float32_eq( float32 a, float32 b )
3161
{
3162
 
3163
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3164
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3165
       ) {
3166
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3167
            float_raise( float_flag_invalid );
3168
        }
3169
        return 0;
3170
    }
3171
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
3172
 
3173
}
3174
 
3175
/*----------------------------------------------------------------------------
3176
| Returns 1 if the single-precision floating-point value `a' is less than
3177
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
3178
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3179
| Arithmetic.
3180
*----------------------------------------------------------------------------*/
3181
 
3182
flag float32_le( float32 a, float32 b )
3183
{
3184
    flag aSign, bSign;
3185
 
3186
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3187
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3188
       ) {
3189
        float_raise( float_flag_invalid );
3190
        return 0;
3191
    }
3192
    aSign = extractFloat32Sign( a );
3193
    bSign = extractFloat32Sign( b );
3194
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
3195
    return ( a == b ) || ( aSign ^ ( a < b ) );
3196
 
3197
}
3198
 
3199
/*----------------------------------------------------------------------------
3200
| Returns 1 if the single-precision floating-point value `a' is less than
3201
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3202
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3203
*----------------------------------------------------------------------------*/
3204
 
3205
flag float32_lt( float32 a, float32 b )
3206
{
3207
    flag aSign, bSign;
3208
 
3209
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3210
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3211
       ) {
3212
        float_raise( float_flag_invalid );
3213
        return 0;
3214
    }
3215
    aSign = extractFloat32Sign( a );
3216
    bSign = extractFloat32Sign( b );
3217
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
3218
    return ( a != b ) && ( aSign ^ ( a < b ) );
3219
 
3220
}
3221
 
3222
/*----------------------------------------------------------------------------
3223
| Returns 1 if the single-precision floating-point value `a' is equal to
3224
| the corresponding value `b', and 0 otherwise.  The invalid exception is
3225
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3226
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3227
*----------------------------------------------------------------------------*/
3228
 
3229
flag float32_eq_signaling( float32 a, float32 b )
3230
{
3231
 
3232
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3233
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3234
       ) {
3235
        float_raise( float_flag_invalid );
3236
        return 0;
3237
    }
3238
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
3239
 
3240
}
3241
 
3242
/*----------------------------------------------------------------------------
3243
| Returns 1 if the single-precision floating-point value `a' is less than or
3244
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3245
| cause an exception.  Otherwise, the comparison is performed according to the
3246
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247
*----------------------------------------------------------------------------*/
3248
 
3249
flag float32_le_quiet( float32 a, float32 b )
3250
{
3251
    flag aSign, bSign;
3252
    //int16 aExp, bExp; // Unused variable
3253
 
3254
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3255
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3256
       ) {
3257
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3258
            float_raise( float_flag_invalid );
3259
        }
3260
        return 0;
3261
    }
3262
    aSign = extractFloat32Sign( a );
3263
    bSign = extractFloat32Sign( b );
3264
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
3265
    return ( a == b ) || ( aSign ^ ( a < b ) );
3266
 
3267
}
3268
 
3269
/*----------------------------------------------------------------------------
3270
| Returns 1 if the single-precision floating-point value `a' is less than
3271
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3272
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3273
| Standard for Binary Floating-Point Arithmetic.
3274
*----------------------------------------------------------------------------*/
3275
 
3276
flag float32_lt_quiet( float32 a, float32 b )
3277
{
3278
    flag aSign, bSign;
3279
 
3280
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3281
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3282
       ) {
3283
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3284
            float_raise( float_flag_invalid );
3285
        }
3286
        return 0;
3287
    }
3288
    aSign = extractFloat32Sign( a );
3289
    bSign = extractFloat32Sign( b );
3290
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
3291
    return ( a != b ) && ( aSign ^ ( a < b ) );
3292
 
3293
}
3294
 
3295
/*----------------------------------------------------------------------------
3296
| Returns the result of converting the double-precision floating-point value
3297
| `a' to the 32-bit two's complement integer format.  The conversion is
3298
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3299
| Arithmetic---which means in particular that the conversion is rounded
3300
| according to the current rounding mode.  If `a' is a NaN, the largest
3301
| positive integer is returned.  Otherwise, if the conversion overflows, the
3302
| largest integer with the same sign as `a' is returned.
3303
*----------------------------------------------------------------------------*/
3304
 
3305
int32 float64_to_int32( float64 a )
3306
{
3307
    flag aSign;
3308
    int16 aExp, shiftCount;
3309
    bits64 aSig;
3310
 
3311
    aSig = extractFloat64Frac( a );
3312
    aExp = extractFloat64Exp( a );
3313
    aSign = extractFloat64Sign( a );
3314
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3315
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3316
    shiftCount = 0x42C - aExp;
3317
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3318
    return roundAndPackInt32( aSign, aSig );
3319
 
3320
}
3321
 
3322
/*----------------------------------------------------------------------------
3323
| Returns the result of converting the double-precision floating-point value
3324
| `a' to the 32-bit two's complement integer format.  The conversion is
3325
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3326
| Arithmetic, except that the conversion is always rounded toward zero.
3327
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3328
| the conversion overflows, the largest integer with the same sign as `a' is
3329
| returned.
3330
*----------------------------------------------------------------------------*/
3331
 
3332
int32 float64_to_int32_round_to_zero( float64 a )
3333
{
3334
    flag aSign;
3335
    int16 aExp, shiftCount;
3336
    bits64 aSig, savedASig;
3337
    int32 z;
3338
 
3339
    aSig = extractFloat64Frac( a );
3340
    aExp = extractFloat64Exp( a );
3341
    aSign = extractFloat64Sign( a );
3342
    if ( 0x41E < aExp ) {
3343
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3344
        goto invalid;
3345
    }
3346
    else if ( aExp < 0x3FF ) {
3347
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3348
        return 0;
3349
    }
3350
    aSig |= LIT64( 0x0010000000000000 );
3351
    shiftCount = 0x433 - aExp;
3352
    savedASig = aSig;
3353
    aSig >>= shiftCount;
3354
    z = aSig;
3355
    if ( aSign ) z = - z;
3356
    if ( ( z < 0 ) ^ aSign ) {
3357
 invalid:
3358
        float_raise( float_flag_invalid );
3359
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3360
    }
3361
    if ( ( aSig<<shiftCount ) != savedASig ) {
3362
        float_exception_flags |= float_flag_inexact;
3363
    }
3364
    return z;
3365
 
3366
}
3367
 
3368
/*----------------------------------------------------------------------------
3369
| Returns the result of converting the double-precision floating-point value
3370
| `a' to the 64-bit two's complement integer format.  The conversion is
3371
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3372
| Arithmetic---which means in particular that the conversion is rounded
3373
| according to the current rounding mode.  If `a' is a NaN, the largest
3374
| positive integer is returned.  Otherwise, if the conversion overflows, the
3375
| largest integer with the same sign as `a' is returned.
3376
*----------------------------------------------------------------------------*/
3377
 
3378
int64 float64_to_int64( float64 a )
3379
{
3380
    flag aSign;
3381
    int16 aExp, shiftCount;
3382
    bits64 aSig, aSigExtra;
3383
 
3384
    aSig = extractFloat64Frac( a );
3385
    aExp = extractFloat64Exp( a );
3386
    aSign = extractFloat64Sign( a );
3387
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3388
    shiftCount = 0x433 - aExp;
3389
    if ( shiftCount <= 0 ) {
3390
        if ( 0x43E < aExp ) {
3391
            float_raise( float_flag_invalid );
3392
            if (    ! aSign
3393
                 || (    ( aExp == 0x7FF )
3394
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3395
               ) {
3396
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3397
            }
3398
            return (sbits64) LIT64( 0x8000000000000000 );
3399
        }
3400
        aSigExtra = 0;
3401
        aSig <<= - shiftCount;
3402
    }
3403
    else {
3404
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3405
    }
3406
    return roundAndPackInt64( aSign, aSig, aSigExtra );
3407
 
3408
}
3409
 
3410
/*----------------------------------------------------------------------------
3411
| Returns the result of converting the double-precision floating-point value
3412
| `a' to the 64-bit two's complement integer format.  The conversion is
3413
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3414
| Arithmetic, except that the conversion is always rounded toward zero.
3415
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3416
| the conversion overflows, the largest integer with the same sign as `a' is
3417
| returned.
3418
*----------------------------------------------------------------------------*/
3419
 
3420
int64 float64_to_int64_round_to_zero( float64 a )
3421
{
3422
    flag aSign;
3423
    int16 aExp, shiftCount;
3424
    bits64 aSig;
3425
    int64 z;
3426
 
3427
    aSig = extractFloat64Frac( a );
3428
    aExp = extractFloat64Exp( a );
3429
    aSign = extractFloat64Sign( a );
3430
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3431
    shiftCount = aExp - 0x433;
3432
    if ( 0 <= shiftCount ) {
3433
        if ( 0x43E <= aExp ) {
3434
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
3435
                float_raise( float_flag_invalid );
3436
                if (    ! aSign
3437
                     || (    ( aExp == 0x7FF )
3438
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3439
                   ) {
3440
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3441
                }
3442
            }
3443
            return (sbits64) LIT64( 0x8000000000000000 );
3444
        }
3445
        z = aSig<<shiftCount;
3446
    }
3447
    else {
3448
        if ( aExp < 0x3FE ) {
3449
            if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3450
            return 0;
3451
        }
3452
        z = aSig>>( - shiftCount );
3453
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3454
            float_exception_flags |= float_flag_inexact;
3455
        }
3456
    }
3457
    if ( aSign ) z = - z;
3458
    return z;
3459
 
3460
}
3461
 
3462
/*----------------------------------------------------------------------------
3463
| Returns the result of converting the double-precision floating-point value
3464
| `a' to the single-precision floating-point format.  The conversion is
3465
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3466
| Arithmetic.
3467
*----------------------------------------------------------------------------*/
3468
 
3469
float32 float64_to_float32( float64 a )
3470
{
3471
    flag aSign;
3472
    int16 aExp;
3473
    bits64 aSig;
3474
    bits32 zSig;
3475
 
3476
    aSig = extractFloat64Frac( a );
3477
    aExp = extractFloat64Exp( a );
3478
    aSign = extractFloat64Sign( a );
3479
    if ( aExp == 0x7FF ) {
3480
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
3481
        return packFloat32( aSign, 0xFF, 0 );
3482
    }
3483
    shift64RightJamming( aSig, 22, &aSig );
3484
    zSig = aSig;
3485
    if ( aExp || zSig ) {
3486
        zSig |= 0x40000000;
3487
        aExp -= 0x381;
3488
    }
3489
    return roundAndPackFloat32( aSign, aExp, zSig );
3490
 
3491
}
3492
 
3493
#ifdef FLOATX80
3494
 
3495
/*----------------------------------------------------------------------------
3496
| Returns the result of converting the double-precision floating-point value
3497
| `a' to the extended double-precision floating-point format.  The conversion
3498
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3499
| Arithmetic.
3500
*----------------------------------------------------------------------------*/
3501
 
3502
floatx80 float64_to_floatx80( float64 a )
3503
{
3504
    flag aSign;
3505
    int16 aExp;
3506
    bits64 aSig;
3507
 
3508
    aSig = extractFloat64Frac( a );
3509
    aExp = extractFloat64Exp( a );
3510
    aSign = extractFloat64Sign( a );
3511
    if ( aExp == 0x7FF ) {
3512
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
3513
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3514
    }
3515
    if ( aExp == 0 ) {
3516
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3517
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3518
    }
3519
    return
3520
        packFloatx80(
3521
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3522
 
3523
}
3524
 
3525
#endif
3526
 
3527
#ifdef FLOAT128
3528
 
3529
/*----------------------------------------------------------------------------
3530
| Returns the result of converting the double-precision floating-point value
3531
| `a' to the quadruple-precision floating-point format.  The conversion is
3532
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3533
| Arithmetic.
3534
*----------------------------------------------------------------------------*/
3535
 
3536
float128 float64_to_float128( float64 a )
3537
{
3538
    flag aSign;
3539
    int16 aExp;
3540
    bits64 aSig, zSig0, zSig1;
3541
 
3542
    aSig = extractFloat64Frac( a );
3543
    aExp = extractFloat64Exp( a );
3544
    aSign = extractFloat64Sign( a );
3545
    if ( aExp == 0x7FF ) {
3546
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
3547
        return packFloat128( aSign, 0x7FFF, 0, 0 );
3548
    }
3549
    if ( aExp == 0 ) {
3550
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3551
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3552
        --aExp;
3553
    }
3554
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3555
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3556
 
3557
}
3558
 
3559
#endif
3560
 
3561
/*----------------------------------------------------------------------------
3562
| Rounds the double-precision floating-point value `a' to an integer, and
3563
| returns the result as a double-precision floating-point value.  The
3564
| operation is performed according to the IEC/IEEE Standard for Binary
3565
| Floating-Point Arithmetic.
3566
*----------------------------------------------------------------------------*/
3567
 
3568
float64 float64_round_to_int( float64 a )
3569
{
3570
    flag aSign;
3571
    int16 aExp;
3572
    bits64 lastBitMask, roundBitsMask;
3573
    int8 roundingMode;
3574
    float64 z;
3575
 
3576
    aExp = extractFloat64Exp( a );
3577
    if ( 0x433 <= aExp ) {
3578
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3579
            return propagateFloat64NaN( a, a );
3580
        }
3581
        return a;
3582
    }
3583
    if ( aExp < 0x3FF ) {
3584
        if ( (bits64) ( a<<1 ) == 0 ) return a;
3585
        float_exception_flags |= float_flag_inexact;
3586
        aSign = extractFloat64Sign( a );
3587
        switch ( float_rounding_mode ) {
3588
         case float_round_nearest_even:
3589
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3590
                return packFloat64( aSign, 0x3FF, 0 );
3591
            }
3592
            break;
3593
         case float_round_down:
3594
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
3595
         case float_round_up:
3596
            return
3597
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
3598
        }
3599
        return packFloat64( aSign, 0, 0 );
3600
    }
3601
    lastBitMask = 1;
3602
    lastBitMask <<= 0x433 - aExp;
3603
    roundBitsMask = lastBitMask - 1;
3604
    z = a;
3605
    roundingMode = float_rounding_mode;
3606
    if ( roundingMode == float_round_nearest_even ) {
3607
        z += lastBitMask>>1;
3608
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3609
    }
3610
    else if ( roundingMode != float_round_to_zero ) {
3611
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3612
            z += roundBitsMask;
3613
        }
3614
    }
3615
    z &= ~ roundBitsMask;
3616
    if ( z != a ) float_exception_flags |= float_flag_inexact;
3617
    return z;
3618
 
3619
}
3620
 
3621
/*----------------------------------------------------------------------------
3622
| Returns the result of adding the absolute values of the double-precision
3623
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3624
| before being returned.  `zSign' is ignored if the result is a NaN.
3625
| The addition is performed according to the IEC/IEEE Standard for Binary
3626
| Floating-Point Arithmetic.
3627
*----------------------------------------------------------------------------*/
3628
 
3629
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
3630
{
3631
    int16 aExp, bExp, zExp;
3632
    bits64 aSig, bSig, zSig;
3633
    int16 expDiff;
3634
 
3635
    aSig = extractFloat64Frac( a );
3636
    aExp = extractFloat64Exp( a );
3637
    bSig = extractFloat64Frac( b );
3638
    bExp = extractFloat64Exp( b );
3639
    expDiff = aExp - bExp;
3640
    aSig <<= 9;
3641
    bSig <<= 9;
3642
    if ( 0 < expDiff ) {
3643
        if ( aExp == 0x7FF ) {
3644
            if ( aSig ) return propagateFloat64NaN( a, b );
3645
            return a;
3646
        }
3647
        if ( bExp == 0 ) {
3648
            --expDiff;
3649
        }
3650
        else {
3651
            bSig |= LIT64( 0x2000000000000000 );
3652
        }
3653
        shift64RightJamming( bSig, expDiff, &bSig );
3654
        zExp = aExp;
3655
    }
3656
    else if ( expDiff < 0 ) {
3657
        if ( bExp == 0x7FF ) {
3658
            if ( bSig ) return propagateFloat64NaN( a, b );
3659
            return packFloat64( zSign, 0x7FF, 0 );
3660
        }
3661
        if ( aExp == 0 ) {
3662
            ++expDiff;
3663
        }
3664
        else {
3665
            aSig |= LIT64( 0x2000000000000000 );
3666
        }
3667
        shift64RightJamming( aSig, - expDiff, &aSig );
3668
        zExp = bExp;
3669
    }
3670
    else {
3671
        if ( aExp == 0x7FF ) {
3672
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
3673
            return a;
3674
        }
3675
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3676
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3677
        zExp = aExp;
3678
        goto roundAndPack;
3679
    }
3680
    aSig |= LIT64( 0x2000000000000000 );
3681
    zSig = ( aSig + bSig )<<1;
3682
    --zExp;
3683
    if ( (sbits64) zSig < 0 ) {
3684
        zSig = aSig + bSig;
3685
        ++zExp;
3686
    }
3687
 roundAndPack:
3688
    return roundAndPackFloat64( zSign, zExp, zSig );
3689
 
3690
}
3691
 
3692
/*----------------------------------------------------------------------------
3693
| Returns the result of subtracting the absolute values of the double-
3694
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3695
| difference is negated before being returned.  `zSign' is ignored if the
3696
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3697
| Standard for Binary Floating-Point Arithmetic.
3698
*----------------------------------------------------------------------------*/
3699
 
3700
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
3701
{
3702
    int16 aExp, bExp, zExp;
3703
    bits64 aSig, bSig, zSig;
3704
    int16 expDiff;
3705
 
3706
    aSig = extractFloat64Frac( a );
3707
    aExp = extractFloat64Exp( a );
3708
    bSig = extractFloat64Frac( b );
3709
    bExp = extractFloat64Exp( b );
3710
    expDiff = aExp - bExp;
3711
    aSig <<= 10;
3712
    bSig <<= 10;
3713
    if ( 0 < expDiff ) goto aExpBigger;
3714
    if ( expDiff < 0 ) goto bExpBigger;
3715
    if ( aExp == 0x7FF ) {
3716
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
3717
        float_raise( float_flag_invalid );
3718
        return float64_default_nan;
3719
    }
3720
    if ( aExp == 0 ) {
3721
        aExp = 1;
3722
        bExp = 1;
3723
    }
3724
    if ( bSig < aSig ) goto aBigger;
3725
    if ( aSig < bSig ) goto bBigger;
3726
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
3727
 bExpBigger:
3728
    if ( bExp == 0x7FF ) {
3729
        if ( bSig ) return propagateFloat64NaN( a, b );
3730
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3731
    }
3732
    if ( aExp == 0 ) {
3733
        ++expDiff;
3734
    }
3735
    else {
3736
        aSig |= LIT64( 0x4000000000000000 );
3737
    }
3738
    shift64RightJamming( aSig, - expDiff, &aSig );
3739
    bSig |= LIT64( 0x4000000000000000 );
3740
 bBigger:
3741
    zSig = bSig - aSig;
3742
    zExp = bExp;
3743
    zSign ^= 1;
3744
    goto normalizeRoundAndPack;
3745
 aExpBigger:
3746
    if ( aExp == 0x7FF ) {
3747
        if ( aSig ) return propagateFloat64NaN( a, b );
3748
        return a;
3749
    }
3750
    if ( bExp == 0 ) {
3751
        --expDiff;
3752
    }
3753
    else {
3754
        bSig |= LIT64( 0x4000000000000000 );
3755
    }
3756
    shift64RightJamming( bSig, expDiff, &bSig );
3757
    aSig |= LIT64( 0x4000000000000000 );
3758
 aBigger:
3759
    zSig = aSig - bSig;
3760
    zExp = aExp;
3761
 normalizeRoundAndPack:
3762
    --zExp;
3763
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
3764
 
3765
}
3766
 
3767
/*----------------------------------------------------------------------------
3768
| Returns the result of adding the double-precision floating-point values `a'
3769
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3770
| Binary Floating-Point Arithmetic.
3771
*----------------------------------------------------------------------------*/
3772
 
3773
float64 float64_add( float64 a, float64 b )
3774
{
3775
    flag aSign, bSign;
3776
 
3777
    aSign = extractFloat64Sign( a );
3778
    bSign = extractFloat64Sign( b );
3779
    if ( aSign == bSign ) {
3780
        return addFloat64Sigs( a, b, aSign );
3781
    }
3782
    else {
3783
        return subFloat64Sigs( a, b, aSign );
3784
    }
3785
 
3786
}
3787
 
3788
/*----------------------------------------------------------------------------
3789
| Returns the result of subtracting the double-precision floating-point values
3790
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3791
| for Binary Floating-Point Arithmetic.
3792
*----------------------------------------------------------------------------*/
3793
 
3794
float64 float64_sub( float64 a, float64 b )
3795
{
3796
    flag aSign, bSign;
3797
 
3798
    aSign = extractFloat64Sign( a );
3799
    bSign = extractFloat64Sign( b );
3800
    if ( aSign == bSign ) {
3801
        return subFloat64Sigs( a, b, aSign );
3802
    }
3803
    else {
3804
        return addFloat64Sigs( a, b, aSign );
3805
    }
3806
 
3807
}
3808
 
3809
/*----------------------------------------------------------------------------
3810
| Returns the result of multiplying the double-precision floating-point values
3811
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3812
| for Binary Floating-Point Arithmetic.
3813
*----------------------------------------------------------------------------*/
3814
 
3815
float64 float64_mul( float64 a, float64 b )
3816
{
3817
    flag aSign, bSign, zSign;
3818
    int16 aExp, bExp, zExp;
3819
    bits64 aSig, bSig, zSig0, zSig1;
3820
 
3821
    aSig = extractFloat64Frac( a );
3822
    aExp = extractFloat64Exp( a );
3823
    aSign = extractFloat64Sign( a );
3824
    bSig = extractFloat64Frac( b );
3825
    bExp = extractFloat64Exp( b );
3826
    bSign = extractFloat64Sign( b );
3827
    zSign = aSign ^ bSign;
3828
    if ( aExp == 0x7FF ) {
3829
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3830
            return propagateFloat64NaN( a, b );
3831
        }
3832
        if ( ( bExp | bSig ) == 0 ) {
3833
            float_raise( float_flag_invalid );
3834
            return float64_default_nan;
3835
        }
3836
        return packFloat64( zSign, 0x7FF, 0 );
3837
    }
3838
    if ( bExp == 0x7FF ) {
3839
        if ( bSig ) return propagateFloat64NaN( a, b );
3840
        if ( ( aExp | aSig ) == 0 ) {
3841
            float_raise( float_flag_invalid );
3842
            return float64_default_nan;
3843
        }
3844
        return packFloat64( zSign, 0x7FF, 0 );
3845
    }
3846
    if ( aExp == 0 ) {
3847
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3848
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3849
    }
3850
    if ( bExp == 0 ) {
3851
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3852
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3853
    }
3854
    zExp = aExp + bExp - 0x3FF;
3855
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3856
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3857
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3858
    zSig0 |= ( zSig1 != 0 );
3859
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3860
        zSig0 <<= 1;
3861
        --zExp;
3862
    }
3863
    return roundAndPackFloat64( zSign, zExp, zSig0 );
3864
 
3865
}
3866
 
3867
/*----------------------------------------------------------------------------
3868
| Returns the result of dividing the double-precision floating-point value `a'
3869
| by the corresponding value `b'.  The operation is performed according to
3870
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871
*----------------------------------------------------------------------------*/
3872
 
3873
float64 float64_div( float64 a, float64 b )
3874
{
3875
    flag aSign, bSign, zSign;
3876
    int16 aExp, bExp, zExp;
3877
    bits64 aSig, bSig, zSig;
3878
    bits64 rem0, rem1;
3879
    bits64 term0, term1;
3880
 
3881
    aSig = extractFloat64Frac( a );
3882
    aExp = extractFloat64Exp( a );
3883
    aSign = extractFloat64Sign( a );
3884
    bSig = extractFloat64Frac( b );
3885
    bExp = extractFloat64Exp( b );
3886
    bSign = extractFloat64Sign( b );
3887
    zSign = aSign ^ bSign;
3888
    if ( aExp == 0x7FF ) {
3889
        if ( aSig ) return propagateFloat64NaN( a, b );
3890
        if ( bExp == 0x7FF ) {
3891
            if ( bSig ) return propagateFloat64NaN( a, b );
3892
            float_raise( float_flag_invalid );
3893
            return float64_default_nan;
3894
        }
3895
        return packFloat64( zSign, 0x7FF, 0 );
3896
    }
3897
    if ( bExp == 0x7FF ) {
3898
        if ( bSig ) return propagateFloat64NaN( a, b );
3899
        return packFloat64( zSign, 0, 0 );
3900
    }
3901
    if ( bExp == 0 ) {
3902
        if ( bSig == 0 ) {
3903
            if ( ( aExp | aSig ) == 0 ) {
3904
                float_raise( float_flag_invalid );
3905
                return float64_default_nan;
3906
            }
3907
            float_raise( float_flag_divbyzero );
3908
            return packFloat64( zSign, 0x7FF, 0 );
3909
        }
3910
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3911
    }
3912
    if ( aExp == 0 ) {
3913
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3914
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3915
    }
3916
    zExp = aExp - bExp + 0x3FD;
3917
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3918
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3919
    if ( bSig <= ( aSig + aSig ) ) {
3920
        aSig >>= 1;
3921
        ++zExp;
3922
    }
3923
    zSig = estimateDiv128To64( aSig, 0, bSig );
3924
    if ( ( zSig & 0x1FF ) <= 2 ) {
3925
        mul64To128( bSig, zSig, &term0, &term1 );
3926
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3927
        while ( (sbits64) rem0 < 0 ) {
3928
            --zSig;
3929
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3930
        }
3931
        zSig |= ( rem1 != 0 );
3932
    }
3933
    return roundAndPackFloat64( zSign, zExp, zSig );
3934
 
3935
}
3936
 
3937
/*----------------------------------------------------------------------------
3938
| Returns the remainder of the double-precision floating-point value `a'
3939
| with respect to the corresponding value `b'.  The operation is performed
3940
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3941
*----------------------------------------------------------------------------*/
3942
 
3943
float64 float64_rem( float64 a, float64 b )
3944
{
3945
    flag aSign, bSign, zSign;
3946
    int16 aExp, bExp, expDiff;
3947
    bits64 aSig, bSig;
3948
    bits64 q, alternateASig;
3949
    sbits64 sigMean;
3950
 
3951
    aSig = extractFloat64Frac( a );
3952
    aExp = extractFloat64Exp( a );
3953
    aSign = extractFloat64Sign( a );
3954
    bSig = extractFloat64Frac( b );
3955
    bExp = extractFloat64Exp( b );
3956
    bSign = extractFloat64Sign( b );
3957
    if ( aExp == 0x7FF ) {
3958
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3959
            return propagateFloat64NaN( a, b );
3960
        }
3961
        float_raise( float_flag_invalid );
3962
        return float64_default_nan;
3963
    }
3964
    if ( bExp == 0x7FF ) {
3965
        if ( bSig ) return propagateFloat64NaN( a, b );
3966
        return a;
3967
    }
3968
    if ( bExp == 0 ) {
3969
        if ( bSig == 0 ) {
3970
            float_raise( float_flag_invalid );
3971
            return float64_default_nan;
3972
        }
3973
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3974
    }
3975
    if ( aExp == 0 ) {
3976
        if ( aSig == 0 ) return a;
3977
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3978
    }
3979
    expDiff = aExp - bExp;
3980
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3981
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3982
    if ( expDiff < 0 ) {
3983
        if ( expDiff < -1 ) return a;
3984
        aSig >>= 1;
3985
    }
3986
    q = ( bSig <= aSig );
3987
    if ( q ) aSig -= bSig;
3988
    expDiff -= 64;
3989
    while ( 0 < expDiff ) {
3990
        q = estimateDiv128To64( aSig, 0, bSig );
3991
        q = ( 2 < q ) ? q - 2 : 0;
3992
        aSig = - ( ( bSig>>2 ) * q );
3993
        expDiff -= 62;
3994
    }
3995
    expDiff += 64;
3996
    if ( 0 < expDiff ) {
3997
        q = estimateDiv128To64( aSig, 0, bSig );
3998
        q = ( 2 < q ) ? q - 2 : 0;
3999
        q >>= 64 - expDiff;
4000
        bSig >>= 2;
4001
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4002
    }
4003
    else {
4004
        aSig >>= 2;
4005
        bSig >>= 2;
4006
    }
4007
    do {
4008
        alternateASig = aSig;
4009
        ++q;
4010
        aSig -= bSig;
4011
    } while ( 0 <= (sbits64) aSig );
4012
    sigMean = aSig + alternateASig;
4013
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4014
        aSig = alternateASig;
4015
    }
4016
    zSign = ( (sbits64) aSig < 0 );
4017
    if ( zSign ) aSig = - aSig;
4018
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
4019
 
4020
}
4021
 
4022
/*----------------------------------------------------------------------------
4023
| Returns the square root of the double-precision floating-point value `a'.
4024
| The operation is performed according to the IEC/IEEE Standard for Binary
4025
| Floating-Point Arithmetic.
4026
*----------------------------------------------------------------------------*/
4027
 
4028
float64 float64_sqrt( float64 a )
4029
{
4030
    flag aSign;
4031
    int16 aExp, zExp;
4032
    bits64 aSig, zSig, doubleZSig;
4033
    bits64 rem0, rem1, term0, term1;
4034
    //float64 z; // Unused variable
4035
 
4036
    aSig = extractFloat64Frac( a );
4037
    aExp = extractFloat64Exp( a );
4038
    aSign = extractFloat64Sign( a );
4039
    if ( aExp == 0x7FF ) {
4040
        if ( aSig ) return propagateFloat64NaN( a, a );
4041
        if ( ! aSign ) return a;
4042
        float_raise( float_flag_invalid );
4043
        return float64_default_nan;
4044
    }
4045
    if ( aSign ) {
4046
        if ( ( aExp | aSig ) == 0 ) return a;
4047
        float_raise( float_flag_invalid );
4048
        return float64_default_nan;
4049
    }
4050
    if ( aExp == 0 ) {
4051
        if ( aSig == 0 ) return 0;
4052
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4053
    }
4054
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4055
    aSig |= LIT64( 0x0010000000000000 );
4056
    zSig = estimateSqrt32( aExp, aSig>>21 );
4057
    aSig <<= 9 - ( aExp & 1 );
4058
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4059
    if ( ( zSig & 0x1FF ) <= 5 ) {
4060
        doubleZSig = zSig<<1;
4061
        mul64To128( zSig, zSig, &term0, &term1 );
4062
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4063
        while ( (sbits64) rem0 < 0 ) {
4064
            --zSig;
4065
            doubleZSig -= 2;
4066
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4067
        }
4068
        zSig |= ( ( rem0 | rem1 ) != 0 );
4069
    }
4070
    return roundAndPackFloat64( 0, zExp, zSig );
4071
 
4072
}
4073
 
4074
/*----------------------------------------------------------------------------
4075
| Returns 1 if the double-precision floating-point value `a' is equal to the
4076
| corresponding value `b', and 0 otherwise.  The comparison is performed
4077
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4078
*----------------------------------------------------------------------------*/
4079
 
4080
flag float64_eq( float64 a, float64 b )
4081
{
4082
 
4083
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4084
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4085
       ) {
4086
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4087
            float_raise( float_flag_invalid );
4088
        }
4089
        return 0;
4090
    }
4091
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
4092
 
4093
}
4094
 
4095
/*----------------------------------------------------------------------------
4096
| Returns 1 if the double-precision floating-point value `a' is less than or
4097
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4098
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4099
| Arithmetic.
4100
*----------------------------------------------------------------------------*/
4101
 
4102
flag float64_le( float64 a, float64 b )
4103
{
4104
    flag aSign, bSign;
4105
 
4106
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4107
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4108
       ) {
4109
        float_raise( float_flag_invalid );
4110
        return 0;
4111
    }
4112
    aSign = extractFloat64Sign( a );
4113
    bSign = extractFloat64Sign( b );
4114
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
4115
    return ( a == b ) || ( aSign ^ ( a < b ) );
4116
 
4117
}
4118
 
4119
/*----------------------------------------------------------------------------
4120
| Returns 1 if the double-precision floating-point value `a' is less than
4121
| the corresponding value `b', and 0 otherwise.  The comparison is performed
4122
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4123
*----------------------------------------------------------------------------*/
4124
 
4125
flag float64_lt( float64 a, float64 b )
4126
{
4127
    flag aSign, bSign;
4128
 
4129
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4130
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4131
       ) {
4132
        float_raise( float_flag_invalid );
4133
        return 0;
4134
    }
4135
    aSign = extractFloat64Sign( a );
4136
    bSign = extractFloat64Sign( b );
4137
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
4138
    return ( a != b ) && ( aSign ^ ( a < b ) );
4139
 
4140
}
4141
 
4142
/*----------------------------------------------------------------------------
4143
| Returns 1 if the double-precision floating-point value `a' is equal to the
4144
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4145
| if either operand is a NaN.  Otherwise, the comparison is performed
4146
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4147
*----------------------------------------------------------------------------*/
4148
 
4149
flag float64_eq_signaling( float64 a, float64 b )
4150
{
4151
 
4152
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4153
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4154
       ) {
4155
        float_raise( float_flag_invalid );
4156
        return 0;
4157
    }
4158
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
4159
 
4160
}
4161
 
4162
/*----------------------------------------------------------------------------
4163
| Returns 1 if the double-precision floating-point value `a' is less than or
4164
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4165
| cause an exception.  Otherwise, the comparison is performed according to the
4166
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4167
*----------------------------------------------------------------------------*/
4168
 
4169
flag float64_le_quiet( float64 a, float64 b )
4170
{
4171
    flag aSign, bSign;
4172
    //int16 aExp, bExp; // Unused variable
4173
 
4174
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4175
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4176
       ) {
4177
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4178
            float_raise( float_flag_invalid );
4179
        }
4180
        return 0;
4181
    }
4182
    aSign = extractFloat64Sign( a );
4183
    bSign = extractFloat64Sign( b );
4184
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
4185
    return ( a == b ) || ( aSign ^ ( a < b ) );
4186
 
4187
}
4188
 
4189
/*----------------------------------------------------------------------------
4190
| Returns 1 if the double-precision floating-point value `a' is less than
4191
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4192
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4193
| Standard for Binary Floating-Point Arithmetic.
4194
*----------------------------------------------------------------------------*/
4195
 
4196
flag float64_lt_quiet( float64 a, float64 b )
4197
{
4198
    flag aSign, bSign;
4199
 
4200
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4201
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4202
       ) {
4203
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4204
            float_raise( float_flag_invalid );
4205
        }
4206
        return 0;
4207
    }
4208
    aSign = extractFloat64Sign( a );
4209
    bSign = extractFloat64Sign( b );
4210
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
4211
    return ( a != b ) && ( aSign ^ ( a < b ) );
4212
 
4213
}
4214
 
4215
#ifdef FLOATX80
4216
 
4217
/*----------------------------------------------------------------------------
4218
| Returns the result of converting the extended double-precision floating-
4219
| point value `a' to the 32-bit two's complement integer format.  The
4220
| conversion is performed according to the IEC/IEEE Standard for Binary
4221
| Floating-Point Arithmetic---which means in particular that the conversion
4222
| is rounded according to the current rounding mode.  If `a' is a NaN, the
4223
| largest positive integer is returned.  Otherwise, if the conversion
4224
| overflows, the largest integer with the same sign as `a' is returned.
4225
*----------------------------------------------------------------------------*/
4226
 
4227
int32 floatx80_to_int32( floatx80 a )
4228
{
4229
    flag aSign;
4230
    int32 aExp, shiftCount;
4231
    bits64 aSig;
4232
 
4233
    aSig = extractFloatx80Frac( a );
4234
    aExp = extractFloatx80Exp( a );
4235
    aSign = extractFloatx80Sign( a );
4236
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
4237
    shiftCount = 0x4037 - aExp;
4238
    if ( shiftCount <= 0 ) shiftCount = 1;
4239
    shift64RightJamming( aSig, shiftCount, &aSig );
4240
    return roundAndPackInt32( aSign, aSig );
4241
 
4242
}
4243
 
4244
/*----------------------------------------------------------------------------
4245
| Returns the result of converting the extended double-precision floating-
4246
| point value `a' to the 32-bit two's complement integer format.  The
4247
| conversion is performed according to the IEC/IEEE Standard for Binary
4248
| Floating-Point Arithmetic, except that the conversion is always rounded
4249
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4250
| Otherwise, if the conversion overflows, the largest integer with the same
4251
| sign as `a' is returned.
4252
*----------------------------------------------------------------------------*/
4253
 
4254
int32 floatx80_to_int32_round_to_zero( floatx80 a )
4255
{
4256
    flag aSign;
4257
    int32 aExp, shiftCount;
4258
    bits64 aSig, savedASig;
4259
    int32 z;
4260
 
4261
    aSig = extractFloatx80Frac( a );
4262
    aExp = extractFloatx80Exp( a );
4263
    aSign = extractFloatx80Sign( a );
4264
    if ( 0x401E < aExp ) {
4265
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
4266
        goto invalid;
4267
    }
4268
    else if ( aExp < 0x3FFF ) {
4269
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
4270
        return 0;
4271
    }
4272
    shiftCount = 0x403E - aExp;
4273
    savedASig = aSig;
4274
    aSig >>= shiftCount;
4275
    z = aSig;
4276
    if ( aSign ) z = - z;
4277
    if ( ( z < 0 ) ^ aSign ) {
4278
 invalid:
4279
        float_raise( float_flag_invalid );
4280
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4281
    }
4282
    if ( ( aSig<<shiftCount ) != savedASig ) {
4283
        float_exception_flags |= float_flag_inexact;
4284
    }
4285
    return z;
4286
 
4287
}
4288
 
4289
/*----------------------------------------------------------------------------
4290
| Returns the result of converting the extended double-precision floating-
4291
| point value `a' to the 64-bit two's complement integer format.  The
4292
| conversion is performed according to the IEC/IEEE Standard for Binary
4293
| Floating-Point Arithmetic---which means in particular that the conversion
4294
| is rounded according to the current rounding mode.  If `a' is a NaN,
4295
| the largest positive integer is returned.  Otherwise, if the conversion
4296
| overflows, the largest integer with the same sign as `a' is returned.
4297
*----------------------------------------------------------------------------*/
4298
 
4299
int64 floatx80_to_int64( floatx80 a )
4300
{
4301
    flag aSign;
4302
    int32 aExp, shiftCount;
4303
    bits64 aSig, aSigExtra;
4304
 
4305
    aSig = extractFloatx80Frac( a );
4306
    aExp = extractFloatx80Exp( a );
4307
    aSign = extractFloatx80Sign( a );
4308
    shiftCount = 0x403E - aExp;
4309
    if ( shiftCount <= 0 ) {
4310
        if ( shiftCount ) {
4311
            float_raise( float_flag_invalid );
4312
            if (    ! aSign
4313
                 || (    ( aExp == 0x7FFF )
4314
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4315
               ) {
4316
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4317
            }
4318
            return (sbits64) LIT64( 0x8000000000000000 );
4319
        }
4320
        aSigExtra = 0;
4321
    }
4322
    else {
4323
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4324
    }
4325
    return roundAndPackInt64( aSign, aSig, aSigExtra );
4326
 
4327
}
4328
 
4329
/*----------------------------------------------------------------------------
4330
| Returns the result of converting the extended double-precision floating-
4331
| point value `a' to the 64-bit two's complement integer format.  The
4332
| conversion is performed according to the IEC/IEEE Standard for Binary
4333
| Floating-Point Arithmetic, except that the conversion is always rounded
4334
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4335
| Otherwise, if the conversion overflows, the largest integer with the same
4336
| sign as `a' is returned.
4337
*----------------------------------------------------------------------------*/
4338
 
4339
int64 floatx80_to_int64_round_to_zero( floatx80 a )
4340
{
4341
    flag aSign;
4342
    int32 aExp, shiftCount;
4343
    bits64 aSig;
4344
    int64 z;
4345
 
4346
    aSig = extractFloatx80Frac( a );
4347
    aExp = extractFloatx80Exp( a );
4348
    aSign = extractFloatx80Sign( a );
4349
    shiftCount = aExp - 0x403E;
4350
    if ( 0 <= shiftCount ) {
4351
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4352
        if ( ( a.high != 0xC03E ) || aSig ) {
4353
            float_raise( float_flag_invalid );
4354
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4355
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4356
            }
4357
        }
4358
        return (sbits64) LIT64( 0x8000000000000000 );
4359
    }
4360
    else if ( aExp < 0x3FFF ) {
4361
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
4362
        return 0;
4363
    }
4364
    z = aSig>>( - shiftCount );
4365
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
4366
        float_exception_flags |= float_flag_inexact;
4367
    }
4368
    if ( aSign ) z = - z;
4369
    return z;
4370
 
4371
}
4372
 
4373
/*----------------------------------------------------------------------------
4374
| Returns the result of converting the extended double-precision floating-
4375
| point value `a' to the single-precision floating-point format.  The
4376
| conversion is performed according to the IEC/IEEE Standard for Binary
4377
| Floating-Point Arithmetic.
4378
*----------------------------------------------------------------------------*/
4379
 
4380
float32 floatx80_to_float32( floatx80 a )
4381
{
4382
    flag aSign;
4383
    int32 aExp;
4384
    bits64 aSig;
4385
 
4386
    aSig = extractFloatx80Frac( a );
4387
    aExp = extractFloatx80Exp( a );
4388
    aSign = extractFloatx80Sign( a );
4389
    if ( aExp == 0x7FFF ) {
4390
        if ( (bits64) ( aSig<<1 ) ) {
4391
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
4392
        }
4393
        return packFloat32( aSign, 0xFF, 0 );
4394
    }
4395
    shift64RightJamming( aSig, 33, &aSig );
4396
    if ( aExp || aSig ) aExp -= 0x3F81;
4397
    return roundAndPackFloat32( aSign, aExp, aSig );
4398
 
4399
}
4400
 
4401
/*----------------------------------------------------------------------------
4402
| Returns the result of converting the extended double-precision floating-
4403
| point value `a' to the double-precision floating-point format.  The
4404
| conversion is performed according to the IEC/IEEE Standard for Binary
4405
| Floating-Point Arithmetic.
4406
*----------------------------------------------------------------------------*/
4407
 
4408
float64 floatx80_to_float64( floatx80 a )
4409
{
4410
    flag aSign;
4411
    int32 aExp;
4412
    bits64 aSig, zSig;
4413
 
4414
    aSig = extractFloatx80Frac( a );
4415
    aExp = extractFloatx80Exp( a );
4416
    aSign = extractFloatx80Sign( a );
4417
    if ( aExp == 0x7FFF ) {
4418
        if ( (bits64) ( aSig<<1 ) ) {
4419
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
4420
        }
4421
        return packFloat64( aSign, 0x7FF, 0 );
4422
    }
4423
    shift64RightJamming( aSig, 1, &zSig );
4424
    if ( aExp || aSig ) aExp -= 0x3C01;
4425
    return roundAndPackFloat64( aSign, aExp, zSig );
4426
 
4427
}
4428
 
4429
#ifdef FLOAT128
4430
 
4431
/*----------------------------------------------------------------------------
4432
| Returns the result of converting the extended double-precision floating-
4433
| point value `a' to the quadruple-precision floating-point format.  The
4434
| conversion is performed according to the IEC/IEEE Standard for Binary
4435
| Floating-Point Arithmetic.
4436
*----------------------------------------------------------------------------*/
4437
 
4438
float128 floatx80_to_float128( floatx80 a )
4439
{
4440
    flag aSign;
4441
    int16 aExp;
4442
    bits64 aSig, zSig0, zSig1;
4443
 
4444
    aSig = extractFloatx80Frac( a );
4445
    aExp = extractFloatx80Exp( a );
4446
    aSign = extractFloatx80Sign( a );
4447
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
4448
        return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
4449
    }
4450
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4451
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4452
 
4453
}
4454
 
4455
#endif
4456
 
4457
/*----------------------------------------------------------------------------
4458
| Rounds the extended double-precision floating-point value `a' to an integer,
4459
| and returns the result as an extended quadruple-precision floating-point
4460
| value.  The operation is performed according to the IEC/IEEE Standard for
4461
| Binary Floating-Point Arithmetic.
4462
*----------------------------------------------------------------------------*/
4463
 
4464
floatx80 floatx80_round_to_int( floatx80 a )
4465
{
4466
    flag aSign;
4467
    int32 aExp;
4468
    bits64 lastBitMask, roundBitsMask;
4469
    int8 roundingMode;
4470
    floatx80 z;
4471
 
4472
    aExp = extractFloatx80Exp( a );
4473
    if ( 0x403E <= aExp ) {
4474
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
4475
            return propagateFloatx80NaN( a, a );
4476
        }
4477
        return a;
4478
    }
4479
    if ( aExp < 0x3FFF ) {
4480
        if (    ( aExp == 0 )
4481
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4482
            return a;
4483
        }
4484
        float_exception_flags |= float_flag_inexact;
4485
        aSign = extractFloatx80Sign( a );
4486
        switch ( float_rounding_mode ) {
4487
         case float_round_nearest_even:
4488
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
4489
               ) {
4490
                return
4491
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4492
            }
4493
            break;
4494
         case float_round_down:
4495
            return
4496
                  aSign ?
4497
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4498
                : packFloatx80( 0, 0, 0 );
4499
         case float_round_up:
4500
            return
4501
                  aSign ? packFloatx80( 1, 0, 0 )
4502
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4503
        }
4504
        return packFloatx80( aSign, 0, 0 );
4505
    }
4506
    lastBitMask = 1;
4507
    lastBitMask <<= 0x403E - aExp;
4508
    roundBitsMask = lastBitMask - 1;
4509
    z = a;
4510
    roundingMode = float_rounding_mode;
4511
    if ( roundingMode == float_round_nearest_even ) {
4512
        z.low += lastBitMask>>1;
4513
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4514
    }
4515
    else if ( roundingMode != float_round_to_zero ) {
4516
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4517
            z.low += roundBitsMask;
4518
        }
4519
    }
4520
    z.low &= ~ roundBitsMask;
4521
    if ( z.low == 0 ) {
4522
        ++z.high;
4523
        z.low = LIT64( 0x8000000000000000 );
4524
    }
4525
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
4526
    return z;
4527
 
4528
}
4529
 
4530
/*----------------------------------------------------------------------------
4531
| Returns the result of adding the absolute values of the extended double-
4532
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4533
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4534
| The addition is performed according to the IEC/IEEE Standard for Binary
4535
| Floating-Point Arithmetic.
4536
*----------------------------------------------------------------------------*/
4537
 
4538
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
4539
{
4540
    int32 aExp, bExp, zExp;
4541
    bits64 aSig, bSig, zSig0, zSig1;
4542
    int32 expDiff;
4543
 
4544
    aSig = extractFloatx80Frac( a );
4545
    aExp = extractFloatx80Exp( a );
4546
    bSig = extractFloatx80Frac( b );
4547
    bExp = extractFloatx80Exp( b );
4548
    expDiff = aExp - bExp;
4549
    if ( 0 < expDiff ) {
4550
        if ( aExp == 0x7FFF ) {
4551
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4552
            return a;
4553
        }
4554
        if ( bExp == 0 ) --expDiff;
4555
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4556
        zExp = aExp;
4557
    }
4558
    else if ( expDiff < 0 ) {
4559
        if ( bExp == 0x7FFF ) {
4560
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4561
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4562
        }
4563
        if ( aExp == 0 ) ++expDiff;
4564
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4565
        zExp = bExp;
4566
    }
4567
    else {
4568
        if ( aExp == 0x7FFF ) {
4569
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4570
                return propagateFloatx80NaN( a, b );
4571
            }
4572
            return a;
4573
        }
4574
        zSig1 = 0;
4575
        zSig0 = aSig + bSig;
4576
        if ( aExp == 0 ) {
4577
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4578
            goto roundAndPack;
4579
        }
4580
        zExp = aExp;
4581
        goto shiftRight1;
4582
    }
4583
    zSig0 = aSig + bSig;
4584
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4585
 shiftRight1:
4586
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4587
    zSig0 |= LIT64( 0x8000000000000000 );
4588
    ++zExp;
4589
 roundAndPack:
4590
    return
4591
        roundAndPackFloatx80(
4592
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4593
 
4594
}
4595
 
4596
/*----------------------------------------------------------------------------
4597
| Returns the result of subtracting the absolute values of the extended
4598
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4599
| difference is negated before being returned.  `zSign' is ignored if the
4600
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4601
| Standard for Binary Floating-Point Arithmetic.
4602
*----------------------------------------------------------------------------*/
4603
 
4604
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
4605
{
4606
    int32 aExp, bExp, zExp;
4607
    bits64 aSig, bSig, zSig0, zSig1;
4608
    int32 expDiff;
4609
    floatx80 z;
4610
 
4611
    aSig = extractFloatx80Frac( a );
4612
    aExp = extractFloatx80Exp( a );
4613
    bSig = extractFloatx80Frac( b );
4614
    bExp = extractFloatx80Exp( b );
4615
    expDiff = aExp - bExp;
4616
    if ( 0 < expDiff ) goto aExpBigger;
4617
    if ( expDiff < 0 ) goto bExpBigger;
4618
    if ( aExp == 0x7FFF ) {
4619
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4620
            return propagateFloatx80NaN( a, b );
4621
        }
4622
        float_raise( float_flag_invalid );
4623
        z.low = floatx80_default_nan_low;
4624
        z.high = floatx80_default_nan_high;
4625
        return z;
4626
    }
4627
    if ( aExp == 0 ) {
4628
        aExp = 1;
4629
        bExp = 1;
4630
    }
4631
    zSig1 = 0;
4632
    if ( bSig < aSig ) goto aBigger;
4633
    if ( aSig < bSig ) goto bBigger;
4634
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
4635
 bExpBigger:
4636
    if ( bExp == 0x7FFF ) {
4637
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4638
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4639
    }
4640
    if ( aExp == 0 ) ++expDiff;
4641
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4642
 bBigger:
4643
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4644
    zExp = bExp;
4645
    zSign ^= 1;
4646
    goto normalizeRoundAndPack;
4647
 aExpBigger:
4648
    if ( aExp == 0x7FFF ) {
4649
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4650
        return a;
4651
    }
4652
    if ( bExp == 0 ) --expDiff;
4653
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4654
 aBigger:
4655
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4656
    zExp = aExp;
4657
 normalizeRoundAndPack:
4658
    return
4659
        normalizeRoundAndPackFloatx80(
4660
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4661
 
4662
}
4663
 
4664
/*----------------------------------------------------------------------------
4665
| Returns the result of adding the extended double-precision floating-point
4666
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4667
| Standard for Binary Floating-Point Arithmetic.
4668
*----------------------------------------------------------------------------*/
4669
 
4670
floatx80 floatx80_add( floatx80 a, floatx80 b )
4671
{
4672
    flag aSign, bSign;
4673
 
4674
    aSign = extractFloatx80Sign( a );
4675
    bSign = extractFloatx80Sign( b );
4676
    if ( aSign == bSign ) {
4677
        return addFloatx80Sigs( a, b, aSign );
4678
    }
4679
    else {
4680
        return subFloatx80Sigs( a, b, aSign );
4681
    }
4682
 
4683
}
4684
 
4685
/*----------------------------------------------------------------------------
4686
| Returns the result of subtracting the extended double-precision floating-
4687
| point values `a' and `b'.  The operation is performed according to the
4688
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4689
*----------------------------------------------------------------------------*/
4690
 
4691
floatx80 floatx80_sub( floatx80 a, floatx80 b )
4692
{
4693
    flag aSign, bSign;
4694
 
4695
    aSign = extractFloatx80Sign( a );
4696
    bSign = extractFloatx80Sign( b );
4697
    if ( aSign == bSign ) {
4698
        return subFloatx80Sigs( a, b, aSign );
4699
    }
4700
    else {
4701
        return addFloatx80Sigs( a, b, aSign );
4702
    }
4703
 
4704
}
4705
 
4706
/*----------------------------------------------------------------------------
4707
| Returns the result of multiplying the extended double-precision floating-
4708
| point values `a' and `b'.  The operation is performed according to the
4709
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4710
*----------------------------------------------------------------------------*/
4711
 
4712
floatx80 floatx80_mul( floatx80 a, floatx80 b )
4713
{
4714
    flag aSign, bSign, zSign;
4715
    int32 aExp, bExp, zExp;
4716
    bits64 aSig, bSig, zSig0, zSig1;
4717
    floatx80 z;
4718
 
4719
    aSig = extractFloatx80Frac( a );
4720
    aExp = extractFloatx80Exp( a );
4721
    aSign = extractFloatx80Sign( a );
4722
    bSig = extractFloatx80Frac( b );
4723
    bExp = extractFloatx80Exp( b );
4724
    bSign = extractFloatx80Sign( b );
4725
    zSign = aSign ^ bSign;
4726
    if ( aExp == 0x7FFF ) {
4727
        if (    (bits64) ( aSig<<1 )
4728
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4729
            return propagateFloatx80NaN( a, b );
4730
        }
4731
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4732
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4733
    }
4734
    if ( bExp == 0x7FFF ) {
4735
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4736
        if ( ( aExp | aSig ) == 0 ) {
4737
 invalid:
4738
            float_raise( float_flag_invalid );
4739
            z.low = floatx80_default_nan_low;
4740
            z.high = floatx80_default_nan_high;
4741
            return z;
4742
        }
4743
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4744
    }
4745
    if ( aExp == 0 ) {
4746
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4747
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4748
    }
4749
    if ( bExp == 0 ) {
4750
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4751
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4752
    }
4753
    zExp = aExp + bExp - 0x3FFE;
4754
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4755
    if ( 0 < (sbits64) zSig0 ) {
4756
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4757
        --zExp;
4758
    }
4759
    return
4760
        roundAndPackFloatx80(
4761
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4762
 
4763
}
4764
 
4765
/*----------------------------------------------------------------------------
4766
| Returns the result of dividing the extended double-precision floating-point
4767
| value `a' by the corresponding value `b'.  The operation is performed
4768
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4769
*----------------------------------------------------------------------------*/
4770
 
4771
floatx80 floatx80_div( floatx80 a, floatx80 b )
4772
{
4773
    flag aSign, bSign, zSign;
4774
    int32 aExp, bExp, zExp;
4775
    bits64 aSig, bSig, zSig0, zSig1;
4776
    bits64 rem0, rem1, rem2, term0, term1, term2;
4777
    floatx80 z;
4778
 
4779
    aSig = extractFloatx80Frac( a );
4780
    aExp = extractFloatx80Exp( a );
4781
    aSign = extractFloatx80Sign( a );
4782
    bSig = extractFloatx80Frac( b );
4783
    bExp = extractFloatx80Exp( b );
4784
    bSign = extractFloatx80Sign( b );
4785
    zSign = aSign ^ bSign;
4786
    if ( aExp == 0x7FFF ) {
4787
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
4788
        if ( bExp == 0x7FFF ) {
4789
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4790
            goto invalid;
4791
        }
4792
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4793
    }
4794
    if ( bExp == 0x7FFF ) {
4795
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4796
        return packFloatx80( zSign, 0, 0 );
4797
    }
4798
    if ( bExp == 0 ) {
4799
        if ( bSig == 0 ) {
4800
            if ( ( aExp | aSig ) == 0 ) {
4801
 invalid:
4802
                float_raise( float_flag_invalid );
4803
                z.low = floatx80_default_nan_low;
4804
                z.high = floatx80_default_nan_high;
4805
                return z;
4806
            }
4807
            float_raise( float_flag_divbyzero );
4808
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4809
        }
4810
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4811
    }
4812
    if ( aExp == 0 ) {
4813
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4814
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4815
    }
4816
    zExp = aExp - bExp + 0x3FFE;
4817
    rem1 = 0;
4818
    if ( bSig <= aSig ) {
4819
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4820
        ++zExp;
4821
    }
4822
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4823
    mul64To128( bSig, zSig0, &term0, &term1 );
4824
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4825
    while ( (sbits64) rem0 < 0 ) {
4826
        --zSig0;
4827
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4828
    }
4829
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4830
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4831
        mul64To128( bSig, zSig1, &term1, &term2 );
4832
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4833
        while ( (sbits64) rem1 < 0 ) {
4834
            --zSig1;
4835
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4836
        }
4837
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4838
    }
4839
    return
4840
        roundAndPackFloatx80(
4841
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
4842
 
4843
}
4844
 
4845
/*----------------------------------------------------------------------------
4846
| Returns the remainder of the extended double-precision floating-point value
4847
| `a' with respect to the corresponding value `b'.  The operation is performed
4848
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4849
*----------------------------------------------------------------------------*/
4850
 
4851
floatx80 floatx80_rem( floatx80 a, floatx80 b )
4852
{
4853
    flag aSign, bSign, zSign;
4854
    int32 aExp, bExp, expDiff;
4855
    bits64 aSig0, aSig1, bSig;
4856
    bits64 q, term0, term1, alternateASig0, alternateASig1;
4857
    floatx80 z;
4858
 
4859
    aSig0 = extractFloatx80Frac( a );
4860
    aExp = extractFloatx80Exp( a );
4861
    aSign = extractFloatx80Sign( a );
4862
    bSig = extractFloatx80Frac( b );
4863
    bExp = extractFloatx80Exp( b );
4864
    bSign = extractFloatx80Sign( b );
4865
    if ( aExp == 0x7FFF ) {
4866
        if (    (bits64) ( aSig0<<1 )
4867
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4868
            return propagateFloatx80NaN( a, b );
4869
        }
4870
        goto invalid;
4871
    }
4872
    if ( bExp == 0x7FFF ) {
4873
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
4874
        return a;
4875
    }
4876
    if ( bExp == 0 ) {
4877
        if ( bSig == 0 ) {
4878
 invalid:
4879
            float_raise( float_flag_invalid );
4880
            z.low = floatx80_default_nan_low;
4881
            z.high = floatx80_default_nan_high;
4882
            return z;
4883
        }
4884
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4885
    }
4886
    if ( aExp == 0 ) {
4887
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4888
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4889
    }
4890
    bSig |= LIT64( 0x8000000000000000 );
4891
    zSign = aSign;
4892
    expDiff = aExp - bExp;
4893
    aSig1 = 0;
4894
    if ( expDiff < 0 ) {
4895
        if ( expDiff < -1 ) return a;
4896
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4897
        expDiff = 0;
4898
    }
4899
    q = ( bSig <= aSig0 );
4900
    if ( q ) aSig0 -= bSig;
4901
    expDiff -= 64;
4902
    while ( 0 < expDiff ) {
4903
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4904
        q = ( 2 < q ) ? q - 2 : 0;
4905
        mul64To128( bSig, q, &term0, &term1 );
4906
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4907
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4908
        expDiff -= 62;
4909
    }
4910
    expDiff += 64;
4911
    if ( 0 < expDiff ) {
4912
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4913
        q = ( 2 < q ) ? q - 2 : 0;
4914
        q >>= 64 - expDiff;
4915
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4916
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4917
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4918
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4919
            ++q;
4920
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4921
        }
4922
    }
4923
    else {
4924
        term1 = 0;
4925
        term0 = bSig;
4926
    }
4927
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4928
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4929
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4930
              && ( q & 1 ) )
4931
       ) {
4932
        aSig0 = alternateASig0;
4933
        aSig1 = alternateASig1;
4934
        zSign = ! zSign;
4935
    }
4936
    return
4937
        normalizeRoundAndPackFloatx80(
4938
            80, zSign, bExp + expDiff, aSig0, aSig1 );
4939
 
4940
}
4941
 
4942
/*----------------------------------------------------------------------------
4943
| Returns the square root of the extended double-precision floating-point
4944
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4945
| for Binary Floating-Point Arithmetic.
4946
*----------------------------------------------------------------------------*/
4947
 
4948
floatx80 floatx80_sqrt( floatx80 a )
4949
{
4950
    flag aSign;
4951
    int32 aExp, zExp;
4952
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4953
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4954
    floatx80 z;
4955
 
4956
    aSig0 = extractFloatx80Frac( a );
4957
    aExp = extractFloatx80Exp( a );
4958
    aSign = extractFloatx80Sign( a );
4959
    if ( aExp == 0x7FFF ) {
4960
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4961
        if ( ! aSign ) return a;
4962
        goto invalid;
4963
    }
4964
    if ( aSign ) {
4965
        if ( ( aExp | aSig0 ) == 0 ) return a;
4966
 invalid:
4967
        float_raise( float_flag_invalid );
4968
        z.low = floatx80_default_nan_low;
4969
        z.high = floatx80_default_nan_high;
4970
        return z;
4971
    }
4972
    if ( aExp == 0 ) {
4973
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4974
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4975
    }
4976
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4977
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4978
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4979
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4980
    doubleZSig0 = zSig0<<1;
4981
    mul64To128( zSig0, zSig0, &term0, &term1 );
4982
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4983
    while ( (sbits64) rem0 < 0 ) {
4984
        --zSig0;
4985
        doubleZSig0 -= 2;
4986
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4987
    }
4988
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4989
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4990
        if ( zSig1 == 0 ) zSig1 = 1;
4991
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4992
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4993
        mul64To128( zSig1, zSig1, &term2, &term3 );
4994
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4995
        while ( (sbits64) rem1 < 0 ) {
4996
            --zSig1;
4997
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4998
            term3 |= 1;
4999
            term2 |= doubleZSig0;
5000
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5001
        }
5002
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5003
    }
5004
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5005
    zSig0 |= doubleZSig0;
5006
    return
5007
        roundAndPackFloatx80(
5008
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
5009
 
5010
}
5011
 
5012
/*----------------------------------------------------------------------------
5013
| Returns 1 if the extended double-precision floating-point value `a' is
5014
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
5015
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5016
| Arithmetic.
5017
*----------------------------------------------------------------------------*/
5018
 
5019
flag floatx80_eq( floatx80 a, floatx80 b )
5020
{
5021
 
5022
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5023
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5024
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5025
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5026
       ) {
5027
        if (    floatx80_is_signaling_nan( a )
5028
             || floatx80_is_signaling_nan( b ) ) {
5029
            float_raise( float_flag_invalid );
5030
        }
5031
        return 0;
5032
    }
5033
    return
5034
           ( a.low == b.low )
5035
        && (    ( a.high == b.high )
5036
             || (    ( a.low == 0 )
5037
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
5038
           );
5039
 
5040
}
5041
 
5042
/*----------------------------------------------------------------------------
5043
| Returns 1 if the extended double-precision floating-point value `a' is
5044
| less than or equal to the corresponding value `b', and 0 otherwise.  The
5045
| comparison is performed according to the IEC/IEEE Standard for Binary
5046
| Floating-Point Arithmetic.
5047
*----------------------------------------------------------------------------*/
5048
 
5049
flag floatx80_le( floatx80 a, floatx80 b )
5050
{
5051
    flag aSign, bSign;
5052
 
5053
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5054
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5055
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5056
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5057
       ) {
5058
        float_raise( float_flag_invalid );
5059
        return 0;
5060
    }
5061
    aSign = extractFloatx80Sign( a );
5062
    bSign = extractFloatx80Sign( b );
5063
    if ( aSign != bSign ) {
5064
        return
5065
               aSign
5066
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5067
                 == 0 );
5068
    }
5069
    return
5070
          aSign ? le128( b.high, b.low, a.high, a.low )
5071
        : le128( a.high, a.low, b.high, b.low );
5072
 
5073
}
5074
 
5075
/*----------------------------------------------------------------------------
5076
| Returns 1 if the extended double-precision floating-point value `a' is
5077
| less than the corresponding value `b', and 0 otherwise.  The comparison
5078
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5079
| Arithmetic.
5080
*----------------------------------------------------------------------------*/
5081
 
5082
flag floatx80_lt( floatx80 a, floatx80 b )
5083
{
5084
    flag aSign, bSign;
5085
 
5086
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5087
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5088
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5089
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5090
       ) {
5091
        float_raise( float_flag_invalid );
5092
        return 0;
5093
    }
5094
    aSign = extractFloatx80Sign( a );
5095
    bSign = extractFloatx80Sign( b );
5096
    if ( aSign != bSign ) {
5097
        return
5098
               aSign
5099
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5100
                 != 0 );
5101
    }
5102
    return
5103
          aSign ? lt128( b.high, b.low, a.high, a.low )
5104
        : lt128( a.high, a.low, b.high, b.low );
5105
 
5106
}
5107
 
5108
/*----------------------------------------------------------------------------
5109
| Returns 1 if the extended double-precision floating-point value `a' is equal
5110
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5111
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5112
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5113
*----------------------------------------------------------------------------*/
5114
 
5115
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
5116
{
5117
 
5118
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5119
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5120
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5121
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5122
       ) {
5123
        float_raise( float_flag_invalid );
5124
        return 0;
5125
    }
5126
    return
5127
           ( a.low == b.low )
5128
        && (    ( a.high == b.high )
5129
             || (    ( a.low == 0 )
5130
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
5131
           );
5132
 
5133
}
5134
 
5135
/*----------------------------------------------------------------------------
5136
| Returns 1 if the extended double-precision floating-point value `a' is less
5137
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5138
| do not cause an exception.  Otherwise, the comparison is performed according
5139
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5140
*----------------------------------------------------------------------------*/
5141
 
5142
flag floatx80_le_quiet( floatx80 a, floatx80 b )
5143
{
5144
    flag aSign, bSign;
5145
 
5146
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5147
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5148
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5149
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5150
       ) {
5151
        if (    floatx80_is_signaling_nan( a )
5152
             || floatx80_is_signaling_nan( b ) ) {
5153
            float_raise( float_flag_invalid );
5154
        }
5155
        return 0;
5156
    }
5157
    aSign = extractFloatx80Sign( a );
5158
    bSign = extractFloatx80Sign( b );
5159
    if ( aSign != bSign ) {
5160
        return
5161
               aSign
5162
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5163
                 == 0 );
5164
    }
5165
    return
5166
          aSign ? le128( b.high, b.low, a.high, a.low )
5167
        : le128( a.high, a.low, b.high, b.low );
5168
 
5169
}
5170
 
5171
/*----------------------------------------------------------------------------
5172
| Returns 1 if the extended double-precision floating-point value `a' is less
5173
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5174
| an exception.  Otherwise, the comparison is performed according to the
5175
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5176
*----------------------------------------------------------------------------*/
5177
 
5178
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
5179
{
5180
    flag aSign, bSign;
5181
 
5182
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5183
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
5184
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5185
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
5186
       ) {
5187
        if (    floatx80_is_signaling_nan( a )
5188
             || floatx80_is_signaling_nan( b ) ) {
5189
            float_raise( float_flag_invalid );
5190
        }
5191
        return 0;
5192
    }
5193
    aSign = extractFloatx80Sign( a );
5194
    bSign = extractFloatx80Sign( b );
5195
    if ( aSign != bSign ) {
5196
        return
5197
               aSign
5198
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5199
                 != 0 );
5200
    }
5201
    return
5202
          aSign ? lt128( b.high, b.low, a.high, a.low )
5203
        : lt128( a.high, a.low, b.high, b.low );
5204
 
5205
}
5206
 
5207
#endif
5208
 
5209
#ifdef FLOAT128
5210
 
5211
/*----------------------------------------------------------------------------
5212
| Returns the result of converting the quadruple-precision floating-point
5213
| value `a' to the 32-bit two's complement integer format.  The conversion
5214
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5215
| Arithmetic---which means in particular that the conversion is rounded
5216
| according to the current rounding mode.  If `a' is a NaN, the largest
5217
| positive integer is returned.  Otherwise, if the conversion overflows, the
5218
| largest integer with the same sign as `a' is returned.
5219
*----------------------------------------------------------------------------*/
5220
 
5221
int32 float128_to_int32( float128 a )
5222
{
5223
    flag aSign;
5224
    int32 aExp, shiftCount;
5225
    bits64 aSig0, aSig1;
5226
 
5227
    aSig1 = extractFloat128Frac1( a );
5228
    aSig0 = extractFloat128Frac0( a );
5229
    aExp = extractFloat128Exp( a );
5230
    aSign = extractFloat128Sign( a );
5231
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5232
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5233
    aSig0 |= ( aSig1 != 0 );
5234
    shiftCount = 0x4028 - aExp;
5235
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5236
    return roundAndPackInt32( aSign, aSig0 );
5237
 
5238
}
5239
 
5240
/*----------------------------------------------------------------------------
5241
| Returns the result of converting the quadruple-precision floating-point
5242
| value `a' to the 32-bit two's complement integer format.  The conversion
5243
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5244
| Arithmetic, except that the conversion is always rounded toward zero.  If
5245
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5246
| conversion overflows, the largest integer with the same sign as `a' is
5247
| returned.
5248
*----------------------------------------------------------------------------*/
5249
 
5250
int32 float128_to_int32_round_to_zero( float128 a )
5251
{
5252
    flag aSign;
5253
    int32 aExp, shiftCount;
5254
    bits64 aSig0, aSig1, savedASig;
5255
    int32 z;
5256
 
5257
    aSig1 = extractFloat128Frac1( a );
5258
    aSig0 = extractFloat128Frac0( a );
5259
    aExp = extractFloat128Exp( a );
5260
    aSign = extractFloat128Sign( a );
5261
    aSig0 |= ( aSig1 != 0 );
5262
    if ( 0x401E < aExp ) {
5263
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5264
        goto invalid;
5265
    }
5266
    else if ( aExp < 0x3FFF ) {
5267
        if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
5268
        return 0;
5269
    }
5270
    aSig0 |= LIT64( 0x0001000000000000 );
5271
    shiftCount = 0x402F - aExp;
5272
    savedASig = aSig0;
5273
    aSig0 >>= shiftCount;
5274
    z = aSig0;
5275
    if ( aSign ) z = - z;
5276
    if ( ( z < 0 ) ^ aSign ) {
5277
 invalid:
5278
        float_raise( float_flag_invalid );
5279
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
5280
    }
5281
    if ( ( aSig0<<shiftCount ) != savedASig ) {
5282
        float_exception_flags |= float_flag_inexact;
5283
    }
5284
    return z;
5285
 
5286
}
5287
 
5288
/*----------------------------------------------------------------------------
5289
| Returns the result of converting the quadruple-precision floating-point
5290
| value `a' to the 64-bit two's complement integer format.  The conversion
5291
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5292
| Arithmetic---which means in particular that the conversion is rounded
5293
| according to the current rounding mode.  If `a' is a NaN, the largest
5294
| positive integer is returned.  Otherwise, if the conversion overflows, the
5295
| largest integer with the same sign as `a' is returned.
5296
*----------------------------------------------------------------------------*/
5297
 
5298
int64 float128_to_int64( float128 a )
5299
{
5300
    flag aSign;
5301
    int32 aExp, shiftCount;
5302
    bits64 aSig0, aSig1;
5303
 
5304
    aSig1 = extractFloat128Frac1( a );
5305
    aSig0 = extractFloat128Frac0( a );
5306
    aExp = extractFloat128Exp( a );
5307
    aSign = extractFloat128Sign( a );
5308
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5309
    shiftCount = 0x402F - aExp;
5310
    if ( shiftCount <= 0 ) {
5311
        if ( 0x403E < aExp ) {
5312
            float_raise( float_flag_invalid );
5313
            if (    ! aSign
5314
                 || (    ( aExp == 0x7FFF )
5315
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5316
                    )
5317
               ) {
5318
                return LIT64( 0x7FFFFFFFFFFFFFFF );
5319
            }
5320
            return (sbits64) LIT64( 0x8000000000000000 );
5321
        }
5322
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5323
    }
5324
    else {
5325
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5326
    }
5327
    return roundAndPackInt64( aSign, aSig0, aSig1 );
5328
 
5329
}
5330
 
5331
/*----------------------------------------------------------------------------
5332
| Returns the result of converting the quadruple-precision floating-point
5333
| value `a' to the 64-bit two's complement integer format.  The conversion
5334
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5335
| Arithmetic, except that the conversion is always rounded toward zero.
5336
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5337
| the conversion overflows, the largest integer with the same sign as `a' is
5338
| returned.
5339
*----------------------------------------------------------------------------*/
5340
 
5341
int64 float128_to_int64_round_to_zero( float128 a )
5342
{
5343
    flag aSign;
5344
    int32 aExp, shiftCount;
5345
    bits64 aSig0, aSig1;
5346
    int64 z;
5347
 
5348
    aSig1 = extractFloat128Frac1( a );
5349
    aSig0 = extractFloat128Frac0( a );
5350
    aExp = extractFloat128Exp( a );
5351
    aSign = extractFloat128Sign( a );
5352
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5353
    shiftCount = aExp - 0x402F;
5354
    if ( 0 < shiftCount ) {
5355
        if ( 0x403E <= aExp ) {
5356
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5357
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5358
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5359
                if ( aSig1 ) float_exception_flags |= float_flag_inexact;
5360
            }
5361
            else {
5362
                float_raise( float_flag_invalid );
5363
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5364
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5365
                }
5366
            }
5367
            return (sbits64) LIT64( 0x8000000000000000 );
5368
        }
5369
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5370
        if ( (bits64) ( aSig1<<shiftCount ) ) {
5371
            float_exception_flags |= float_flag_inexact;
5372
        }
5373
    }
5374
    else {
5375
        if ( aExp < 0x3FFF ) {
5376
            if ( aExp | aSig0 | aSig1 ) {
5377
                float_exception_flags |= float_flag_inexact;
5378
            }
5379
            return 0;
5380
        }
5381
        z = aSig0>>( - shiftCount );
5382
        if (    aSig1
5383
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5384
            float_exception_flags |= float_flag_inexact;
5385
        }
5386
    }
5387
    if ( aSign ) z = - z;
5388
    return z;
5389
 
5390
}
5391
 
5392
/*----------------------------------------------------------------------------
5393
| Returns the result of converting the quadruple-precision floating-point
5394
| value `a' to the single-precision floating-point format.  The conversion
5395
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5396
| Arithmetic.
5397
*----------------------------------------------------------------------------*/
5398
 
5399
float32 float128_to_float32( float128 a )
5400
{
5401
    flag aSign;
5402
    int32 aExp;
5403
    bits64 aSig0, aSig1;
5404
    bits32 zSig;
5405
 
5406
    aSig1 = extractFloat128Frac1( a );
5407
    aSig0 = extractFloat128Frac0( a );
5408
    aExp = extractFloat128Exp( a );
5409
    aSign = extractFloat128Sign( a );
5410
    if ( aExp == 0x7FFF ) {
5411
        if ( aSig0 | aSig1 ) {
5412
            return commonNaNToFloat32( float128ToCommonNaN( a ) );
5413
        }
5414
        return packFloat32( aSign, 0xFF, 0 );
5415
    }
5416
    aSig0 |= ( aSig1 != 0 );
5417
    shift64RightJamming( aSig0, 18, &aSig0 );
5418
    zSig = aSig0;
5419
    if ( aExp || zSig ) {
5420
        zSig |= 0x40000000;
5421
        aExp -= 0x3F81;
5422
    }
5423
    return roundAndPackFloat32( aSign, aExp, zSig );
5424
 
5425
}
5426
 
5427
/*----------------------------------------------------------------------------
5428
| Returns the result of converting the quadruple-precision floating-point
5429
| value `a' to the double-precision floating-point format.  The conversion
5430
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5431
| Arithmetic.
5432
*----------------------------------------------------------------------------*/
5433
 
5434
float64 float128_to_float64( float128 a )
5435
{
5436
    flag aSign;
5437
    int32 aExp;
5438
    bits64 aSig0, aSig1;
5439
 
5440
    aSig1 = extractFloat128Frac1( a );
5441
    aSig0 = extractFloat128Frac0( a );
5442
    aExp = extractFloat128Exp( a );
5443
    aSign = extractFloat128Sign( a );
5444
    if ( aExp == 0x7FFF ) {
5445
        if ( aSig0 | aSig1 ) {
5446
            return commonNaNToFloat64( float128ToCommonNaN( a ) );
5447
        }
5448
        return packFloat64( aSign, 0x7FF, 0 );
5449
    }
5450
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5451
    aSig0 |= ( aSig1 != 0 );
5452
    if ( aExp || aSig0 ) {
5453
        aSig0 |= LIT64( 0x4000000000000000 );
5454
        aExp -= 0x3C01;
5455
    }
5456
    return roundAndPackFloat64( aSign, aExp, aSig0 );
5457
 
5458
}
5459
 
5460
#ifdef FLOATX80
5461
 
5462
/*----------------------------------------------------------------------------
5463
| Returns the result of converting the quadruple-precision floating-point
5464
| value `a' to the extended double-precision floating-point format.  The
5465
| conversion is performed according to the IEC/IEEE Standard for Binary
5466
| Floating-Point Arithmetic.
5467
*----------------------------------------------------------------------------*/
5468
 
5469
floatx80 float128_to_floatx80( float128 a )
5470
{
5471
    flag aSign;
5472
    int32 aExp;
5473
    bits64 aSig0, aSig1;
5474
 
5475
    aSig1 = extractFloat128Frac1( a );
5476
    aSig0 = extractFloat128Frac0( a );
5477
    aExp = extractFloat128Exp( a );
5478
    aSign = extractFloat128Sign( a );
5479
    if ( aExp == 0x7FFF ) {
5480
        if ( aSig0 | aSig1 ) {
5481
            return commonNaNToFloatx80( float128ToCommonNaN( a ) );
5482
        }
5483
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484
    }
5485
    if ( aExp == 0 ) {
5486
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5487
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5488
    }
5489
    else {
5490
        aSig0 |= LIT64( 0x0001000000000000 );
5491
    }
5492
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5493
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
5494
 
5495
}
5496
 
5497
#endif
5498
 
5499
/*----------------------------------------------------------------------------
5500
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5501
| returns the result as a quadruple-precision floating-point value.  The
5502
| operation is performed according to the IEC/IEEE Standard for Binary
5503
| Floating-Point Arithmetic.
5504
*----------------------------------------------------------------------------*/
5505
 
5506
float128 float128_round_to_int( float128 a )
5507
{
5508
    flag aSign;
5509
    int32 aExp;
5510
    bits64 lastBitMask, roundBitsMask;
5511
    int8 roundingMode;
5512
    float128 z;
5513
 
5514
    aExp = extractFloat128Exp( a );
5515
    if ( 0x402F <= aExp ) {
5516
        if ( 0x406F <= aExp ) {
5517
            if (    ( aExp == 0x7FFF )
5518
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5519
               ) {
5520
                return propagateFloat128NaN( a, a );
5521
            }
5522
            return a;
5523
        }
5524
        lastBitMask = 1;
5525
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5526
        roundBitsMask = lastBitMask - 1;
5527
        z = a;
5528
        roundingMode = float_rounding_mode;
5529
        if ( roundingMode == float_round_nearest_even ) {
5530
            if ( lastBitMask ) {
5531
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5532
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5533
            }
5534
            else {
5535
                if ( (sbits64) z.low < 0 ) {
5536
                    ++z.high;
5537
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5538
                }
5539
            }
5540
        }
5541
        else if ( roundingMode != float_round_to_zero ) {
5542
            if (   extractFloat128Sign( z )
5543
                 ^ ( roundingMode == float_round_up ) ) {
5544
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5545
            }
5546
        }
5547
        z.low &= ~ roundBitsMask;
5548
    }
5549
    else {
5550
        if ( aExp < 0x3FFF ) {
5551
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5552
            float_exception_flags |= float_flag_inexact;
5553
            aSign = extractFloat128Sign( a );
5554
            switch ( float_rounding_mode ) {
5555
             case float_round_nearest_even:
5556
                if (    ( aExp == 0x3FFE )
5557
                     && (   extractFloat128Frac0( a )
5558
                          | extractFloat128Frac1( a ) )
5559
                   ) {
5560
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5561
                }
5562
                break;
5563
             case float_round_down:
5564
                return
5565
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5566
                    : packFloat128( 0, 0, 0, 0 );
5567
             case float_round_up:
5568
                return
5569
                      aSign ? packFloat128( 1, 0, 0, 0 )
5570
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5571
            }
5572
            return packFloat128( aSign, 0, 0, 0 );
5573
        }
5574
        lastBitMask = 1;
5575
        lastBitMask <<= 0x402F - aExp;
5576
        roundBitsMask = lastBitMask - 1;
5577
        z.low = 0;
5578
        z.high = a.high;
5579
        roundingMode = float_rounding_mode;
5580
        if ( roundingMode == float_round_nearest_even ) {
5581
            z.high += lastBitMask>>1;
5582
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5583
                z.high &= ~ lastBitMask;
5584
            }
5585
        }
5586
        else if ( roundingMode != float_round_to_zero ) {
5587
            if (   extractFloat128Sign( z )
5588
                 ^ ( roundingMode == float_round_up ) ) {
5589
                z.high |= ( a.low != 0 );
5590
                z.high += roundBitsMask;
5591
            }
5592
        }
5593
        z.high &= ~ roundBitsMask;
5594
    }
5595
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5596
        float_exception_flags |= float_flag_inexact;
5597
    }
5598
    return z;
5599
 
5600
}
5601
 
5602
/*----------------------------------------------------------------------------
5603
| Returns the result of adding the absolute values of the quadruple-precision
5604
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5605
| before being returned.  `zSign' is ignored if the result is a NaN.
5606
| The addition is performed according to the IEC/IEEE Standard for Binary
5607
| Floating-Point Arithmetic.
5608
*----------------------------------------------------------------------------*/
5609
 
5610
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
5611
{
5612
    int32 aExp, bExp, zExp;
5613
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5614
    int32 expDiff;
5615
 
5616
    aSig1 = extractFloat128Frac1( a );
5617
    aSig0 = extractFloat128Frac0( a );
5618
    aExp = extractFloat128Exp( a );
5619
    bSig1 = extractFloat128Frac1( b );
5620
    bSig0 = extractFloat128Frac0( b );
5621
    bExp = extractFloat128Exp( b );
5622
    expDiff = aExp - bExp;
5623
    if ( 0 < expDiff ) {
5624
        if ( aExp == 0x7FFF ) {
5625
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5626
            return a;
5627
        }
5628
        if ( bExp == 0 ) {
5629
            --expDiff;
5630
        }
5631
        else {
5632
            bSig0 |= LIT64( 0x0001000000000000 );
5633
        }
5634
        shift128ExtraRightJamming(
5635
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5636
        zExp = aExp;
5637
    }
5638
    else if ( expDiff < 0 ) {
5639
        if ( bExp == 0x7FFF ) {
5640
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5641
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5642
        }
5643
        if ( aExp == 0 ) {
5644
            ++expDiff;
5645
        }
5646
        else {
5647
            aSig0 |= LIT64( 0x0001000000000000 );
5648
        }
5649
        shift128ExtraRightJamming(
5650
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5651
        zExp = bExp;
5652
    }
5653
    else {
5654
        if ( aExp == 0x7FFF ) {
5655
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5656
                return propagateFloat128NaN( a, b );
5657
            }
5658
            return a;
5659
        }
5660
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5661
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
5662
        zSig2 = 0;
5663
        zSig0 |= LIT64( 0x0002000000000000 );
5664
        zExp = aExp;
5665
        goto shiftRight1;
5666
    }
5667
    aSig0 |= LIT64( 0x0001000000000000 );
5668
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5669
    --zExp;
5670
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5671
    ++zExp;
5672
 shiftRight1:
5673
    shift128ExtraRightJamming(
5674
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5675
 roundAndPack:
5676
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5677
 
5678
}
5679
 
5680
/*----------------------------------------------------------------------------
5681
| Returns the result of subtracting the absolute values of the quadruple-
5682
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5683
| difference is negated before being returned.  `zSign' is ignored if the
5684
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5685
| Standard for Binary Floating-Point Arithmetic.
5686
*----------------------------------------------------------------------------*/
5687
 
5688
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
5689
{
5690
    int32 aExp, bExp, zExp;
5691
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5692
    int32 expDiff;
5693
    float128 z;
5694
 
5695
    aSig1 = extractFloat128Frac1( a );
5696
    aSig0 = extractFloat128Frac0( a );
5697
    aExp = extractFloat128Exp( a );
5698
    bSig1 = extractFloat128Frac1( b );
5699
    bSig0 = extractFloat128Frac0( b );
5700
    bExp = extractFloat128Exp( b );
5701
    expDiff = aExp - bExp;
5702
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5703
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5704
    if ( 0 < expDiff ) goto aExpBigger;
5705
    if ( expDiff < 0 ) goto bExpBigger;
5706
    if ( aExp == 0x7FFF ) {
5707
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5708
            return propagateFloat128NaN( a, b );
5709
        }
5710
        float_raise( float_flag_invalid );
5711
        z.low = float128_default_nan_low;
5712
        z.high = float128_default_nan_high;
5713
        return z;
5714
    }
5715
    if ( aExp == 0 ) {
5716
        aExp = 1;
5717
        bExp = 1;
5718
    }
5719
    if ( bSig0 < aSig0 ) goto aBigger;
5720
    if ( aSig0 < bSig0 ) goto bBigger;
5721
    if ( bSig1 < aSig1 ) goto aBigger;
5722
    if ( aSig1 < bSig1 ) goto bBigger;
5723
    return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
5724
 bExpBigger:
5725
    if ( bExp == 0x7FFF ) {
5726
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5727
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5728
    }
5729
    if ( aExp == 0 ) {
5730
        ++expDiff;
5731
    }
5732
    else {
5733
        aSig0 |= LIT64( 0x4000000000000000 );
5734
    }
5735
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5736
    bSig0 |= LIT64( 0x4000000000000000 );
5737
 bBigger:
5738
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5739
    zExp = bExp;
5740
    zSign ^= 1;
5741
    goto normalizeRoundAndPack;
5742
 aExpBigger:
5743
    if ( aExp == 0x7FFF ) {
5744
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5745
        return a;
5746
    }
5747
    if ( bExp == 0 ) {
5748
        --expDiff;
5749
    }
5750
    else {
5751
        bSig0 |= LIT64( 0x4000000000000000 );
5752
    }
5753
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5754
    aSig0 |= LIT64( 0x4000000000000000 );
5755
 aBigger:
5756
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5757
    zExp = aExp;
5758
 normalizeRoundAndPack:
5759
    --zExp;
5760
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
5761
 
5762
}
5763
 
5764
/*----------------------------------------------------------------------------
5765
| Returns the result of adding the quadruple-precision floating-point values
5766
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5767
| for Binary Floating-Point Arithmetic.
5768
*----------------------------------------------------------------------------*/
5769
 
5770
float128 float128_add( float128 a, float128 b )
5771
{
5772
    flag aSign, bSign;
5773
 
5774
    aSign = extractFloat128Sign( a );
5775
    bSign = extractFloat128Sign( b );
5776
    if ( aSign == bSign ) {
5777
        return addFloat128Sigs( a, b, aSign );
5778
    }
5779
    else {
5780
        return subFloat128Sigs( a, b, aSign );
5781
    }
5782
 
5783
}
5784
 
5785
/*----------------------------------------------------------------------------
5786
| Returns the result of subtracting the quadruple-precision floating-point
5787
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5788
| Standard for Binary Floating-Point Arithmetic.
5789
*----------------------------------------------------------------------------*/
5790
 
5791
float128 float128_sub( float128 a, float128 b )
5792
{
5793
    flag aSign, bSign;
5794
 
5795
    aSign = extractFloat128Sign( a );
5796
    bSign = extractFloat128Sign( b );
5797
    if ( aSign == bSign ) {
5798
        return subFloat128Sigs( a, b, aSign );
5799
    }
5800
    else {
5801
        return addFloat128Sigs( a, b, aSign );
5802
    }
5803
 
5804
}
5805
 
5806
/*----------------------------------------------------------------------------
5807
| Returns the result of multiplying the quadruple-precision floating-point
5808
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5809
| Standard for Binary Floating-Point Arithmetic.
5810
*----------------------------------------------------------------------------*/
5811
 
5812
float128 float128_mul( float128 a, float128 b )
5813
{
5814
    flag aSign, bSign, zSign;
5815
    int32 aExp, bExp, zExp;
5816
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5817
    float128 z;
5818
 
5819
    aSig1 = extractFloat128Frac1( a );
5820
    aSig0 = extractFloat128Frac0( a );
5821
    aExp = extractFloat128Exp( a );
5822
    aSign = extractFloat128Sign( a );
5823
    bSig1 = extractFloat128Frac1( b );
5824
    bSig0 = extractFloat128Frac0( b );
5825
    bExp = extractFloat128Exp( b );
5826
    bSign = extractFloat128Sign( b );
5827
    zSign = aSign ^ bSign;
5828
    if ( aExp == 0x7FFF ) {
5829
        if (    ( aSig0 | aSig1 )
5830
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5831
            return propagateFloat128NaN( a, b );
5832
        }
5833
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5834
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5835
    }
5836
    if ( bExp == 0x7FFF ) {
5837
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5838
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5839
 invalid:
5840
            float_raise( float_flag_invalid );
5841
            z.low = float128_default_nan_low;
5842
            z.high = float128_default_nan_high;
5843
            return z;
5844
        }
5845
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5846
    }
5847
    if ( aExp == 0 ) {
5848
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5849
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5850
    }
5851
    if ( bExp == 0 ) {
5852
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5853
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5854
    }
5855
    zExp = aExp + bExp - 0x4000;
5856
    aSig0 |= LIT64( 0x0001000000000000 );
5857
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5858
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5859
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5860
    zSig2 |= ( zSig3 != 0 );
5861
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5862
        shift128ExtraRightJamming(
5863
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5864
        ++zExp;
5865
    }
5866
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5867
 
5868
}
5869
 
5870
/*----------------------------------------------------------------------------
5871
| Returns the result of dividing the quadruple-precision floating-point value
5872
| `a' by the corresponding value `b'.  The operation is performed according to
5873
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5874
*----------------------------------------------------------------------------*/
5875
 
5876
float128 float128_div( float128 a, float128 b )
5877
{
5878
    flag aSign, bSign, zSign;
5879
    int32 aExp, bExp, zExp;
5880
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5881
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5882
    float128 z;
5883
 
5884
    aSig1 = extractFloat128Frac1( a );
5885
    aSig0 = extractFloat128Frac0( a );
5886
    aExp = extractFloat128Exp( a );
5887
    aSign = extractFloat128Sign( a );
5888
    bSig1 = extractFloat128Frac1( b );
5889
    bSig0 = extractFloat128Frac0( b );
5890
    bExp = extractFloat128Exp( b );
5891
    bSign = extractFloat128Sign( b );
5892
    zSign = aSign ^ bSign;
5893
    if ( aExp == 0x7FFF ) {
5894
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5895
        if ( bExp == 0x7FFF ) {
5896
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5897
            goto invalid;
5898
        }
5899
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5900
    }
5901
    if ( bExp == 0x7FFF ) {
5902
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5903
        return packFloat128( zSign, 0, 0, 0 );
5904
    }
5905
    if ( bExp == 0 ) {
5906
        if ( ( bSig0 | bSig1 ) == 0 ) {
5907
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5908
 invalid:
5909
                float_raise( float_flag_invalid );
5910
                z.low = float128_default_nan_low;
5911
                z.high = float128_default_nan_high;
5912
                return z;
5913
            }
5914
            float_raise( float_flag_divbyzero );
5915
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5916
        }
5917
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5918
    }
5919
    if ( aExp == 0 ) {
5920
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5921
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5922
    }
5923
    zExp = aExp - bExp + 0x3FFD;
5924
    shortShift128Left(
5925
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5926
    shortShift128Left(
5927
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5928
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5929
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5930
        ++zExp;
5931
    }
5932
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5933
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5934
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5935
    while ( (sbits64) rem0 < 0 ) {
5936
        --zSig0;
5937
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5938
    }
5939
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5940
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5941
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5942
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5943
        while ( (sbits64) rem1 < 0 ) {
5944
            --zSig1;
5945
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5946
        }
5947
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5948
    }
5949
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5950
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5951
 
5952
}
5953
 
5954
/*----------------------------------------------------------------------------
5955
| Returns the remainder of the quadruple-precision floating-point value `a'
5956
| with respect to the corresponding value `b'.  The operation is performed
5957
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5958
*----------------------------------------------------------------------------*/
5959
 
5960
float128 float128_rem( float128 a, float128 b )
5961
{
5962
    flag aSign, bSign, zSign;
5963
    int32 aExp, bExp, expDiff;
5964
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5965
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5966
    sbits64 sigMean0;
5967
    float128 z;
5968
 
5969
    aSig1 = extractFloat128Frac1( a );
5970
    aSig0 = extractFloat128Frac0( a );
5971
    aExp = extractFloat128Exp( a );
5972
    aSign = extractFloat128Sign( a );
5973
    bSig1 = extractFloat128Frac1( b );
5974
    bSig0 = extractFloat128Frac0( b );
5975
    bExp = extractFloat128Exp( b );
5976
    bSign = extractFloat128Sign( b );
5977
    if ( aExp == 0x7FFF ) {
5978
        if (    ( aSig0 | aSig1 )
5979
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5980
            return propagateFloat128NaN( a, b );
5981
        }
5982
        goto invalid;
5983
    }
5984
    if ( bExp == 0x7FFF ) {
5985
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5986
        return a;
5987
    }
5988
    if ( bExp == 0 ) {
5989
        if ( ( bSig0 | bSig1 ) == 0 ) {
5990
 invalid:
5991
            float_raise( float_flag_invalid );
5992
            z.low = float128_default_nan_low;
5993
            z.high = float128_default_nan_high;
5994
            return z;
5995
        }
5996
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5997
    }
5998
    if ( aExp == 0 ) {
5999
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6000
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6001
    }
6002
    expDiff = aExp - bExp;
6003
    if ( expDiff < -1 ) return a;
6004
    shortShift128Left(
6005
        aSig0 | LIT64( 0x0001000000000000 ),
6006
        aSig1,
6007
        15 - ( expDiff < 0 ),
6008
        &aSig0,
6009
        &aSig1
6010
    );
6011
    shortShift128Left(
6012
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6013
    q = le128( bSig0, bSig1, aSig0, aSig1 );
6014
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6015
    expDiff -= 64;
6016
    while ( 0 < expDiff ) {
6017
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6018
        q = ( 4 < q ) ? q - 4 : 0;
6019
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6020
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6021
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6022
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6023
        expDiff -= 61;
6024
    }
6025
    if ( -64 < expDiff ) {
6026
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6027
        q = ( 4 < q ) ? q - 4 : 0;
6028
        q >>= - expDiff;
6029
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6030
        expDiff += 52;
6031
        if ( expDiff < 0 ) {
6032
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6033
        }
6034
        else {
6035
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6036
        }
6037
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6038
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6039
    }
6040
    else {
6041
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6042
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6043
    }
6044
    do {
6045
        alternateASig0 = aSig0;
6046
        alternateASig1 = aSig1;
6047
        ++q;
6048
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6049
    } while ( 0 <= (sbits64) aSig0 );
6050
    add128(
6051
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
6052
    if (    ( sigMean0 < 0 )
6053
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6054
        aSig0 = alternateASig0;
6055
        aSig1 = alternateASig1;
6056
    }
6057
    zSign = ( (sbits64) aSig0 < 0 );
6058
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6059
    return
6060
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
6061
 
6062
}
6063
 
6064
/*----------------------------------------------------------------------------
6065
| Returns the square root of the quadruple-precision floating-point value `a'.
6066
| The operation is performed according to the IEC/IEEE Standard for Binary
6067
| Floating-Point Arithmetic.
6068
*----------------------------------------------------------------------------*/
6069
 
6070
float128 float128_sqrt( float128 a )
6071
{
6072
    flag aSign;
6073
    int32 aExp, zExp;
6074
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6075
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6076
    float128 z;
6077
 
6078
    aSig1 = extractFloat128Frac1( a );
6079
    aSig0 = extractFloat128Frac0( a );
6080
    aExp = extractFloat128Exp( a );
6081
    aSign = extractFloat128Sign( a );
6082
    if ( aExp == 0x7FFF ) {
6083
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
6084
        if ( ! aSign ) return a;
6085
        goto invalid;
6086
    }
6087
    if ( aSign ) {
6088
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6089
 invalid:
6090
        float_raise( float_flag_invalid );
6091
        z.low = float128_default_nan_low;
6092
        z.high = float128_default_nan_high;
6093
        return z;
6094
    }
6095
    if ( aExp == 0 ) {
6096
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6097
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6098
    }
6099
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6100
    aSig0 |= LIT64( 0x0001000000000000 );
6101
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6102
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6103
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6104
    doubleZSig0 = zSig0<<1;
6105
    mul64To128( zSig0, zSig0, &term0, &term1 );
6106
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6107
    while ( (sbits64) rem0 < 0 ) {
6108
        --zSig0;
6109
        doubleZSig0 -= 2;
6110
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6111
    }
6112
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6113
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6114
        if ( zSig1 == 0 ) zSig1 = 1;
6115
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6116
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6117
        mul64To128( zSig1, zSig1, &term2, &term3 );
6118
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6119
        while ( (sbits64) rem1 < 0 ) {
6120
            --zSig1;
6121
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6122
            term3 |= 1;
6123
            term2 |= doubleZSig0;
6124
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6125
        }
6126
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6127
    }
6128
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6129
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
6130
 
6131
}
6132
 
6133
/*----------------------------------------------------------------------------
6134
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6135
| the corresponding value `b', and 0 otherwise.  The comparison is performed
6136
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6137
*----------------------------------------------------------------------------*/
6138
 
6139
flag float128_eq( float128 a, float128 b )
6140
{
6141
 
6142
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6143
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6144
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6145
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6146
       ) {
6147
        if (    float128_is_signaling_nan( a )
6148
             || float128_is_signaling_nan( b ) ) {
6149
            float_raise( float_flag_invalid );
6150
        }
6151
        return 0;
6152
    }
6153
    return
6154
           ( a.low == b.low )
6155
        && (    ( a.high == b.high )
6156
             || (    ( a.low == 0 )
6157
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
6158
           );
6159
 
6160
}
6161
 
6162
/*----------------------------------------------------------------------------
6163
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6164
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
6165
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
6166
| Arithmetic.
6167
*----------------------------------------------------------------------------*/
6168
 
6169
flag float128_le( float128 a, float128 b )
6170
{
6171
    flag aSign, bSign;
6172
 
6173
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6174
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6175
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6176
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6177
       ) {
6178
        float_raise( float_flag_invalid );
6179
        return 0;
6180
    }
6181
    aSign = extractFloat128Sign( a );
6182
    bSign = extractFloat128Sign( b );
6183
    if ( aSign != bSign ) {
6184
        return
6185
               aSign
6186
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6187
                 == 0 );
6188
    }
6189
    return
6190
          aSign ? le128( b.high, b.low, a.high, a.low )
6191
        : le128( a.high, a.low, b.high, b.low );
6192
 
6193
}
6194
 
6195
/*----------------------------------------------------------------------------
6196
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6197
| the corresponding value `b', and 0 otherwise.  The comparison is performed
6198
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6199
*----------------------------------------------------------------------------*/
6200
 
6201
flag float128_lt( float128 a, float128 b )
6202
{
6203
    flag aSign, bSign;
6204
 
6205
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6206
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6207
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6208
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6209
       ) {
6210
        float_raise( float_flag_invalid );
6211
        return 0;
6212
    }
6213
    aSign = extractFloat128Sign( a );
6214
    bSign = extractFloat128Sign( b );
6215
    if ( aSign != bSign ) {
6216
        return
6217
               aSign
6218
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6219
                 != 0 );
6220
    }
6221
    return
6222
          aSign ? lt128( b.high, b.low, a.high, a.low )
6223
        : lt128( a.high, a.low, b.high, b.low );
6224
 
6225
}
6226
 
6227
/*----------------------------------------------------------------------------
6228
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6229
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6230
| raised if either operand is a NaN.  Otherwise, the comparison is performed
6231
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6232
*----------------------------------------------------------------------------*/
6233
 
6234
flag float128_eq_signaling( float128 a, float128 b )
6235
{
6236
 
6237
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6238
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6239
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6240
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6241
       ) {
6242
        float_raise( float_flag_invalid );
6243
        return 0;
6244
    }
6245
    return
6246
           ( a.low == b.low )
6247
        && (    ( a.high == b.high )
6248
             || (    ( a.low == 0 )
6249
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
6250
           );
6251
 
6252
}
6253
 
6254
/*----------------------------------------------------------------------------
6255
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6256
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6257
| cause an exception.  Otherwise, the comparison is performed according to the
6258
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6259
*----------------------------------------------------------------------------*/
6260
 
6261
flag float128_le_quiet( float128 a, float128 b )
6262
{
6263
    flag aSign, bSign;
6264
 
6265
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6266
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6267
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6268
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6269
       ) {
6270
        if (    float128_is_signaling_nan( a )
6271
             || float128_is_signaling_nan( b ) ) {
6272
            float_raise( float_flag_invalid );
6273
        }
6274
        return 0;
6275
    }
6276
    aSign = extractFloat128Sign( a );
6277
    bSign = extractFloat128Sign( b );
6278
    if ( aSign != bSign ) {
6279
        return
6280
               aSign
6281
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6282
                 == 0 );
6283
    }
6284
    return
6285
          aSign ? le128( b.high, b.low, a.high, a.low )
6286
        : le128( a.high, a.low, b.high, b.low );
6287
 
6288
}
6289
 
6290
/*----------------------------------------------------------------------------
6291
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6292
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6293
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6294
| Standard for Binary Floating-Point Arithmetic.
6295
*----------------------------------------------------------------------------*/
6296
 
6297
flag float128_lt_quiet( float128 a, float128 b )
6298
{
6299
    flag aSign, bSign;
6300
 
6301
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6302
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6303
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6304
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6305
       ) {
6306
        if (    float128_is_signaling_nan( a )
6307
             || float128_is_signaling_nan( b ) ) {
6308
            float_raise( float_flag_invalid );
6309
        }
6310
        return 0;
6311
    }
6312
    aSign = extractFloat128Sign( a );
6313
    bSign = extractFloat128Sign( b );
6314
    if ( aSign != bSign ) {
6315
        return
6316
               aSign
6317
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6318
                 != 0 );
6319
    }
6320
    return
6321
          aSign ? lt128( b.high, b.low, a.high, a.low )
6322
        : lt128( a.high, a.low, b.high, b.low );
6323
 
6324
}
6325
 
6326
#endif
6327
 

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.