OpenCores
URL https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk

Subversion Repositories openrisc_me

[/] [openrisc/] [trunk/] [gnu-src/] [gcc-4.5.1/] [libgcc/] [config/] [libbid/] [bid128_add.c] - Blame information for rev 280

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 272 jeremybenn
/* Copyright (C) 2007, 2009  Free Software Foundation, Inc.
2
 
3
This file is part of GCC.
4
 
5
GCC is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free
7
Software Foundation; either version 3, or (at your option) any later
8
version.
9
 
10
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11
WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
for more details.
14
 
15
Under Section 7 of GPL version 3, you are granted additional
16
permissions described in the GCC Runtime Library Exception, version
17
3.1, as published by the Free Software Foundation.
18
 
19
You should have received a copy of the GNU General Public License and
20
a copy of the GCC Runtime Library Exception along with this program;
21
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22
<http://www.gnu.org/licenses/>.  */
23
 
24
#include "bid_internal.h"
25
 
26
 
27
#if DECIMAL_CALL_BY_REFERENCE
28
void
29
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
30
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
31
             _EXC_INFO_PARAM) {
32
  UINT64 x = *px;
33
#if !DECIMAL_GLOBAL_ROUNDING
34
  unsigned int rnd_mode = *prnd_mode;
35
#endif
36
#else
37
UINT64
38
bid64dq_add (UINT64 x, UINT128 y
39
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
40
             _EXC_INFO_PARAM) {
41
#endif
42
  UINT64 res = 0xbaddbaddbaddbaddull;
43
  UINT128 x1;
44
 
45
#if DECIMAL_CALL_BY_REFERENCE
46
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
47
  bid64qq_add (&res, &x1, py
48
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
49
               _EXC_INFO_ARG);
50
#else
51
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
52
  res = bid64qq_add (x1, y
53
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
54
                     _EXC_INFO_ARG);
55
#endif
56
  BID_RETURN (res);
57
}
58
 
59
 
60
#if DECIMAL_CALL_BY_REFERENCE
61
void
62
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
63
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
64
             _EXC_INFO_PARAM) {
65
  UINT64 y = *py;
66
#if !DECIMAL_GLOBAL_ROUNDING
67
  unsigned int rnd_mode = *prnd_mode;
68
#endif
69
#else
70
UINT64
71
bid64qd_add (UINT128 x, UINT64 y
72
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
73
             _EXC_INFO_PARAM) {
74
#endif
75
  UINT64 res = 0xbaddbaddbaddbaddull;
76
  UINT128 y1;
77
 
78
#if DECIMAL_CALL_BY_REFERENCE
79
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
80
  bid64qq_add (&res, px, &y1
81
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
82
               _EXC_INFO_ARG);
83
#else
84
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
85
  res = bid64qq_add (x, y1
86
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
87
                     _EXC_INFO_ARG);
88
#endif
89
  BID_RETURN (res);
90
}
91
 
92
 
93
#if DECIMAL_CALL_BY_REFERENCE
94
void
95
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
96
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
97
             _EXC_INFO_PARAM) {
98
  UINT128 x = *px, y = *py;
99
#if !DECIMAL_GLOBAL_ROUNDING
100
  unsigned int rnd_mode = *prnd_mode;
101
#endif
102
#else
103
UINT64
104
bid64qq_add (UINT128 x, UINT128 y
105
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
106
             _EXC_INFO_PARAM) {
107
#endif
108
 
109
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
110
  };
111
  UINT64 res = 0xbaddbaddbaddbaddull;
112
 
113
  BID_SWAP128 (one);
114
#if DECIMAL_CALL_BY_REFERENCE
115
  bid64qqq_fma (&res, &one, &x, &y
116
                _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
117
                _EXC_INFO_ARG);
118
#else
119
  res = bid64qqq_fma (one, x, y
120
                      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
121
                      _EXC_INFO_ARG);
122
#endif
123
  BID_RETURN (res);
124
}
125
 
126
 
127
#if DECIMAL_CALL_BY_REFERENCE
128
void
129
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
130
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
131
              _EXC_INFO_PARAM) {
132
  UINT64 x = *px, y = *py;
133
#if !DECIMAL_GLOBAL_ROUNDING
134
  unsigned int rnd_mode = *prnd_mode;
135
#endif
136
#else
137
UINT128
138
bid128dd_add (UINT64 x, UINT64 y
139
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
140
              _EXC_INFO_PARAM) {
141
#endif
142
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
143
  };
144
  UINT128 x1, y1;
145
 
146
#if DECIMAL_CALL_BY_REFERENCE
147
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
148
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
149
  bid128_add (&res, &x1, &y1
150
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
151
              _EXC_INFO_ARG);
152
#else
153
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
154
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
155
  res = bid128_add (x1, y1
156
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
157
                    _EXC_INFO_ARG);
158
#endif
159
  BID_RETURN (res);
160
}
161
 
162
 
163
#if DECIMAL_CALL_BY_REFERENCE
164
void
165
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
166
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
167
              _EXC_INFO_PARAM) {
168
  UINT64 x = *px;
169
#if !DECIMAL_GLOBAL_ROUNDING
170
  unsigned int rnd_mode = *prnd_mode;
171
#endif
172
#else
173
UINT128
174
bid128dq_add (UINT64 x, UINT128 y
175
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
176
              _EXC_INFO_PARAM) {
177
#endif
178
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
179
  };
180
  UINT128 x1;
181
 
182
#if DECIMAL_CALL_BY_REFERENCE
183
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
184
  bid128_add (&res, &x1, py
185
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
186
              _EXC_INFO_ARG);
187
#else
188
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
189
  res = bid128_add (x1, y
190
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
191
                    _EXC_INFO_ARG);
192
#endif
193
  BID_RETURN (res);
194
}
195
 
196
 
197
#if DECIMAL_CALL_BY_REFERENCE
198
void
199
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
200
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
201
              _EXC_INFO_PARAM) {
202
  UINT64 y = *py;
203
#if !DECIMAL_GLOBAL_ROUNDING
204
  unsigned int rnd_mode = *prnd_mode;
205
#endif
206
#else
207
UINT128
208
bid128qd_add (UINT128 x, UINT64 y
209
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
210
              _EXC_INFO_PARAM) {
211
#endif
212
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
213
  };
214
  UINT128 y1;
215
 
216
#if DECIMAL_CALL_BY_REFERENCE
217
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
218
  bid128_add (&res, px, &y1
219
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
220
              _EXC_INFO_ARG);
221
#else
222
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
223
  res = bid128_add (x, y1
224
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
225
                    _EXC_INFO_ARG);
226
#endif
227
  BID_RETURN (res);
228
}
229
 
230
 
231
// bid128_add stands for bid128qq_add
232
 
233
 
234
/*****************************************************************************
235
 *  BID64/BID128 sub
236
 ****************************************************************************/
237
 
238
#if DECIMAL_CALL_BY_REFERENCE
239
void
240
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
241
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
242
             _EXC_INFO_PARAM) {
243
  UINT64 x = *px;
244
#if !DECIMAL_GLOBAL_ROUNDING
245
  unsigned int rnd_mode = *prnd_mode;
246
#endif
247
#else
248
UINT64
249
bid64dq_sub (UINT64 x, UINT128 y
250
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
251
             _EXC_INFO_PARAM) {
252
#endif
253
  UINT64 res = 0xbaddbaddbaddbaddull;
254
  UINT128 x1;
255
 
256
#if DECIMAL_CALL_BY_REFERENCE
257
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
258
  bid64qq_sub (&res, &x1, py
259
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
260
               _EXC_INFO_ARG);
261
#else
262
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
263
  res = bid64qq_sub (x1, y
264
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
265
                     _EXC_INFO_ARG);
266
#endif
267
  BID_RETURN (res);
268
}
269
 
270
 
271
#if DECIMAL_CALL_BY_REFERENCE
272
void
273
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
274
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
275
             _EXC_INFO_PARAM) {
276
  UINT64 y = *py;
277
#if !DECIMAL_GLOBAL_ROUNDING
278
  unsigned int rnd_mode = *prnd_mode;
279
#endif
280
#else
281
UINT64
282
bid64qd_sub (UINT128 x, UINT64 y
283
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
284
             _EXC_INFO_PARAM) {
285
#endif
286
  UINT64 res = 0xbaddbaddbaddbaddull;
287
  UINT128 y1;
288
 
289
#if DECIMAL_CALL_BY_REFERENCE
290
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
291
  bid64qq_sub (&res, px, &y1
292
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
293
               _EXC_INFO_ARG);
294
#else
295
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
296
  res = bid64qq_sub (x, y1
297
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
298
                     _EXC_INFO_ARG);
299
#endif
300
  BID_RETURN (res);
301
}
302
 
303
 
304
#if DECIMAL_CALL_BY_REFERENCE
305
void
306
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
307
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
308
             _EXC_INFO_PARAM) {
309
  UINT128 x = *px, y = *py;
310
#if !DECIMAL_GLOBAL_ROUNDING
311
  unsigned int rnd_mode = *prnd_mode;
312
#endif
313
#else
314
UINT64
315
bid64qq_sub (UINT128 x, UINT128 y
316
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
317
             _EXC_INFO_PARAM) {
318
#endif
319
 
320
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
321
  };
322
  UINT64 res = 0xbaddbaddbaddbaddull;
323
  UINT64 y_sign;
324
 
325
  BID_SWAP128 (one);
326
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {        // y is not NAN
327
    // change its sign
328
    y_sign = y.w[HIGH_128W] & MASK_SIGN;        // 0 for positive, MASK_SIGN for negative
329
    if (y_sign)
330
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
331
    else
332
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
333
  }
334
#if DECIMAL_CALL_BY_REFERENCE
335
  bid64qqq_fma (&res, &one, &x, &y
336
                _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
337
                _EXC_INFO_ARG);
338
#else
339
  res = bid64qqq_fma (one, x, y
340
                      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
341
                      _EXC_INFO_ARG);
342
#endif
343
  BID_RETURN (res);
344
}
345
 
346
 
347
#if DECIMAL_CALL_BY_REFERENCE
348
void
349
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
350
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
351
              _EXC_INFO_PARAM) {
352
  UINT64 x = *px, y = *py;
353
#if !DECIMAL_GLOBAL_ROUNDING
354
  unsigned int rnd_mode = *prnd_mode;
355
#endif
356
#else
357
UINT128
358
bid128dd_sub (UINT64 x, UINT64 y
359
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
360
              _EXC_INFO_PARAM) {
361
#endif
362
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
363
  };
364
  UINT128 x1, y1;
365
 
366
#if DECIMAL_CALL_BY_REFERENCE
367
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
368
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
369
  bid128_sub (&res, &x1, &y1
370
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
371
              _EXC_INFO_ARG);
372
#else
373
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
374
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
375
  res = bid128_sub (x1, y1
376
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
377
                    _EXC_INFO_ARG);
378
#endif
379
  BID_RETURN (res);
380
}
381
 
382
 
383
#if DECIMAL_CALL_BY_REFERENCE
384
void
385
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
386
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
387
              _EXC_INFO_PARAM) {
388
  UINT64 x = *px;
389
#if !DECIMAL_GLOBAL_ROUNDING
390
  unsigned int rnd_mode = *prnd_mode;
391
#endif
392
#else
393
UINT128
394
bid128dq_sub (UINT64 x, UINT128 y
395
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
396
              _EXC_INFO_PARAM) {
397
#endif
398
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
399
  };
400
  UINT128 x1;
401
 
402
#if DECIMAL_CALL_BY_REFERENCE
403
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
404
  bid128_sub (&res, &x1, py
405
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
406
              _EXC_INFO_ARG);
407
#else
408
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
409
  res = bid128_sub (x1, y
410
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
411
                    _EXC_INFO_ARG);
412
#endif
413
  BID_RETURN (res);
414
}
415
 
416
 
417
#if DECIMAL_CALL_BY_REFERENCE
418
void
419
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
420
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
421
              _EXC_INFO_PARAM) {
422
  UINT64 y = *py;
423
#if !DECIMAL_GLOBAL_ROUNDING
424
  unsigned int rnd_mode = *prnd_mode;
425
#endif
426
#else
427
UINT128
428
bid128qd_sub (UINT128 x, UINT64 y
429
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
430
              _EXC_INFO_PARAM) {
431
#endif
432
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
433
  };
434
  UINT128 y1;
435
 
436
#if DECIMAL_CALL_BY_REFERENCE
437
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
438
  bid128_sub (&res, px, &y1
439
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
440
              _EXC_INFO_ARG);
441
#else
442
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
443
  res = bid128_sub (x, y1
444
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
445
                    _EXC_INFO_ARG);
446
#endif
447
  BID_RETURN (res);
448
}
449
 
450
#if DECIMAL_CALL_BY_REFERENCE
451
void
452
bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
453
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
454
            _EXC_INFO_PARAM) {
455
  UINT128 x = *px, y = *py;
456
#if !DECIMAL_GLOBAL_ROUNDING
457
  unsigned int rnd_mode = *prnd_mode;
458
#endif
459
#else
460
UINT128
461
bid128_add (UINT128 x, UINT128 y
462
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
463
            _EXC_INFO_PARAM) {
464
#endif
465
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
466
  };
467
  UINT64 x_sign, y_sign, tmp_sign;
468
  UINT64 x_exp, y_exp, tmp_exp; // e1 = x_exp, e2 = y_exp
469
  UINT64 C1_hi, C2_hi, tmp_signif_hi;
470
  UINT64 C1_lo, C2_lo, tmp_signif_lo;
471
  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
472
  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
473
  UINT64 tmp64, tmp64A, tmp64B;
474
  BID_UI64DOUBLE tmp1, tmp2;
475
  int x_nr_bits, y_nr_bits;
476
  int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
477
  UINT64 halfulp64;
478
  UINT128 halfulp128;
479
  UINT128 C1, C2;
480
  UINT128 ten2m1;
481
  UINT128 highf2star;           // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
482
  UINT256 P256, Q256, R256;
483
  int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
484
  int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
485
  int second_pass = 0;
486
 
487
  BID_SWAP128 (x);
488
  BID_SWAP128 (y);
489
  x_sign = x.w[1] & MASK_SIGN;  // 0 for positive, MASK_SIGN for negative
490
  y_sign = y.w[1] & MASK_SIGN;  // 0 for positive, MASK_SIGN for negative
491
 
492
  // check for NaN or Infinity
493
  if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
494
      || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
495
    // x is special or y is special
496
    if ((x.w[1] & MASK_NAN) == MASK_NAN) {      // x is NAN
497
      // check first for non-canonical NaN payload
498
      if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
499
          (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
500
           && (x.w[0] > 0x38c15b09ffffffffull))) {
501
        x.w[1] = x.w[1] & 0xffffc00000000000ull;
502
        x.w[0] = 0x0ull;
503
      }
504
      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {  // x is SNAN
505
        // set invalid flag
506
        *pfpsf |= INVALID_EXCEPTION;
507
        // return quiet (x)
508
        res.w[1] = x.w[1] & 0xfc003fffffffffffull;
509
        // clear out also G[6]-G[16]
510
        res.w[0] = x.w[0];
511
      } else {  // x is QNaN
512
        // return x
513
        res.w[1] = x.w[1] & 0xfc003fffffffffffull;
514
        // clear out G[6]-G[16]
515
        res.w[0] = x.w[0];
516
        // if y = SNaN signal invalid exception
517
        if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
518
          // set invalid flag
519
          *pfpsf |= INVALID_EXCEPTION;
520
        }
521
      }
522
      BID_SWAP128 (res);
523
      BID_RETURN (res);
524
    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {       // y is NAN
525
      // check first for non-canonical NaN payload
526
      if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
527
          (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
528
           && (y.w[0] > 0x38c15b09ffffffffull))) {
529
        y.w[1] = y.w[1] & 0xffffc00000000000ull;
530
        y.w[0] = 0x0ull;
531
      }
532
      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {  // y is SNAN
533
        // set invalid flag
534
        *pfpsf |= INVALID_EXCEPTION;
535
        // return quiet (y)
536
        res.w[1] = y.w[1] & 0xfc003fffffffffffull;
537
        // clear out also G[6]-G[16]
538
        res.w[0] = y.w[0];
539
      } else {  // y is QNaN
540
        // return y
541
        res.w[1] = y.w[1] & 0xfc003fffffffffffull;
542
        // clear out G[6]-G[16]
543
        res.w[0] = y.w[0];
544
      }
545
      BID_SWAP128 (res);
546
      BID_RETURN (res);
547
    } else {    // neither x not y is NaN; at least one is infinity
548
      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {        // x is infinity
549
        if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {      // y is infinity
550
          // if same sign, return either of them
551
          if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
552
            res.w[1] = x_sign | MASK_INF;
553
            res.w[0] = 0x0ull;
554
          } else {      // x and y are infinities of opposite signs
555
            // set invalid flag
556
            *pfpsf |= INVALID_EXCEPTION;
557
            // return QNaN Indefinite
558
            res.w[1] = 0x7c00000000000000ull;
559
            res.w[0] = 0x0000000000000000ull;
560
          }
561
        } else {        // y is 0 or finite
562
          // return x
563
          res.w[1] = x_sign | MASK_INF;
564
          res.w[0] = 0x0ull;
565
        }
566
      } else {  // x is not NaN or infinity, so y must be infinity
567
        res.w[1] = y_sign | MASK_INF;
568
        res.w[0] = 0x0ull;
569
      }
570
      BID_SWAP128 (res);
571
      BID_RETURN (res);
572
    }
573
  }
574
  // unpack the arguments
575
 
576
  // unpack x 
577
  C1_hi = x.w[1] & MASK_COEFF;
578
  C1_lo = x.w[0];
579
  // test for non-canonical values:
580
  // - values whose encoding begins with x00, x01, or x10 and whose 
581
  //   coefficient is larger than 10^34 -1, or
582
  // - values whose encoding begins with x1100, x1101, x1110 (if NaNs 
583
  //   and infinities were eliminated already this test is reduced to 
584
  //   checking for x10x) 
585
 
586
  // x is not infinity; check for non-canonical values - treated as zero
587
  if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
588
    // G0_G1=11; non-canonical
589
    x_exp = (x.w[1] << 2) & MASK_EXP;   // biased and shifted left 49 bits
590
    C1_hi = 0;   // significand high
591
    C1_lo = 0;   // significand low
592
  } else {      // G0_G1 != 11
593
    x_exp = x.w[1] & MASK_EXP;  // biased and shifted left 49 bits
594
    if (C1_hi > 0x0001ed09bead87c0ull ||
595
        (C1_hi == 0x0001ed09bead87c0ull
596
         && C1_lo > 0x378d8e63ffffffffull)) {
597
      // x is non-canonical if coefficient is larger than 10^34 -1
598
      C1_hi = 0;
599
      C1_lo = 0;
600
    } else {    // canonical
601
      ;
602
    }
603
  }
604
 
605
  // unpack y  
606
  C2_hi = y.w[1] & MASK_COEFF;
607
  C2_lo = y.w[0];
608
  // y is not infinity; check for non-canonical values - treated as zero 
609
  if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
610
    // G0_G1=11; non-canonical 
611
    y_exp = (y.w[1] << 2) & MASK_EXP;   // biased and shifted left 49 bits
612
    C2_hi = 0;   // significand high
613
    C2_lo = 0;   // significand low 
614
  } else {      // G0_G1 != 11 
615
    y_exp = y.w[1] & MASK_EXP;  // biased and shifted left 49 bits
616
    if (C2_hi > 0x0001ed09bead87c0ull ||
617
        (C2_hi == 0x0001ed09bead87c0ull
618
         && C2_lo > 0x378d8e63ffffffffull)) {
619
      // y is non-canonical if coefficient is larger than 10^34 -1 
620
      C2_hi = 0;
621
      C2_lo = 0;
622
    } else {    // canonical
623
      ;
624
    }
625
  }
626
 
627
  if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
628
    // x is 0 and y is not special
629
    // if y is 0 return 0 with the smaller exponent
630
    if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
631
      if (x_exp < y_exp)
632
        res.w[1] = x_exp;
633
      else
634
        res.w[1] = y_exp;
635
      if (x_sign && y_sign)
636
        res.w[1] = res.w[1] | x_sign;   // both negative
637
      else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
638
        res.w[1] = res.w[1] | 0x8000000000000000ull;    // -0
639
      // else; // res = +0
640
      res.w[0] = 0;
641
    } else {
642
      // for 0 + y return y, with the preferred exponent
643
      if (y_exp <= x_exp) {
644
        res.w[1] = y.w[1];
645
        res.w[0] = y.w[0];
646
      } else {  // if y_exp > x_exp
647
        // return (C2 * 10^scale) * 10^(y_exp - scale)
648
        // where scale = min (P34-q2, y_exp-x_exp)
649
        // determine q2 = nr. of decimal digits in y
650
        //  determine first the nr. of bits in y (y_nr_bits)
651
 
652
        if (C2_hi == 0) {        // y_bits is the nr. of bits in C2_lo
653
          if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
654
            // split the 64-bit value in two 32-bit halves to avoid 
655
            // rounding errors
656
            if (C2_lo >= 0x0000000100000000ull) {       // y >= 2^32
657
              tmp2.d = (double) (C2_lo >> 32);  // exact conversion
658
              y_nr_bits =
659
                32 +
660
                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
661
            } else {    // y < 2^32
662
              tmp2.d = (double) (C2_lo);        // exact conversion
663
              y_nr_bits =
664
                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
665
            }
666
          } else {      // if y < 2^53
667
            tmp2.d = (double) C2_lo;    // exact conversion
668
            y_nr_bits =
669
              ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
670
          }
671
        } else {        // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
672
          tmp2.d = (double) C2_hi;      // exact conversion
673
          y_nr_bits =
674
            64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
675
        }
676
        q2 = nr_digits[y_nr_bits].digits;
677
        if (q2 == 0) {
678
          q2 = nr_digits[y_nr_bits].digits1;
679
          if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
680
              (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
681
               C2_lo >= nr_digits[y_nr_bits].threshold_lo))
682
            q2++;
683
        }
684
        // return (C2 * 10^scale) * 10^(y_exp - scale)
685
        // where scale = min (P34-q2, y_exp-x_exp)
686
        scale = P34 - q2;
687
        ind = (y_exp - x_exp) >> 49;
688
        if (ind < scale)
689
          scale = ind;
690
        if (scale == 0) {
691
          res.w[1] = y.w[1];
692
          res.w[0] = y.w[0];
693
        } else if (q2 <= 19) {  // y fits in 64 bits 
694
          if (scale <= 19) {    // 10^scale fits in 64 bits
695
            // 64 x 64 C2_lo * ten2k64[scale]
696
            __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
697
          } else {      // 10^scale fits in 128 bits
698
            // 64 x 128 C2_lo * ten2k128[scale - 20]
699
            __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
700
          }
701
        } else {        // y fits in 128 bits, but 10^scale must fit in 64 bits 
702
          // 64 x 128 ten2k64[scale] * C2
703
          C2.w[1] = C2_hi;
704
          C2.w[0] = C2_lo;
705
          __mul_128x64_to_128 (res, ten2k64[scale], C2);
706
        }
707
        // subtract scale from the exponent
708
        y_exp = y_exp - ((UINT64) scale << 49);
709
        res.w[1] = res.w[1] | y_sign | y_exp;
710
      }
711
    }
712
    BID_SWAP128 (res);
713
    BID_RETURN (res);
714
  } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
715
    // y is 0 and x is not special, and not zero
716
    // for x + 0 return x, with the preferred exponent
717
    if (x_exp <= y_exp) {
718
      res.w[1] = x.w[1];
719
      res.w[0] = x.w[0];
720
    } else {    // if x_exp > y_exp
721
      // return (C1 * 10^scale) * 10^(x_exp - scale)
722
      // where scale = min (P34-q1, x_exp-y_exp)
723
      // determine q1 = nr. of decimal digits in x
724
      //  determine first the nr. of bits in x
725
      if (C1_hi == 0) {  // x_bits is the nr. of bits in C1_lo
726
        if (C1_lo >= 0x0020000000000000ull) {   // x >= 2^53
727
          // split the 64-bit value in two 32-bit halves to avoid 
728
          // rounding errors
729
          if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
730
            tmp1.d = (double) (C1_lo >> 32);    // exact conversion
731
            x_nr_bits =
732
              32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
733
                    0x3ff);
734
          } else {      // x < 2^32
735
            tmp1.d = (double) (C1_lo);  // exact conversion
736
            x_nr_bits =
737
              ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
738
          }
739
        } else {        // if x < 2^53
740
          tmp1.d = (double) C1_lo;      // exact conversion
741
          x_nr_bits =
742
            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
743
        }
744
      } else {  // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
745
        tmp1.d = (double) C1_hi;        // exact conversion
746
        x_nr_bits =
747
          64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
748
      }
749
      q1 = nr_digits[x_nr_bits].digits;
750
      if (q1 == 0) {
751
        q1 = nr_digits[x_nr_bits].digits1;
752
        if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
753
            (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
754
             C1_lo >= nr_digits[x_nr_bits].threshold_lo))
755
          q1++;
756
      }
757
      // return (C1 * 10^scale) * 10^(x_exp - scale)
758
      // where scale = min (P34-q1, x_exp-y_exp)  
759
      scale = P34 - q1;
760
      ind = (x_exp - y_exp) >> 49;
761
      if (ind < scale)
762
        scale = ind;
763
      if (scale == 0) {
764
        res.w[1] = x.w[1];
765
        res.w[0] = x.w[0];
766
      } else if (q1 <= 19) {    // x fits in 64 bits  
767
        if (scale <= 19) {      // 10^scale fits in 64 bits
768
          // 64 x 64 C1_lo * ten2k64[scale] 
769
          __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
770
        } else {        // 10^scale fits in 128 bits
771
          // 64 x 128 C1_lo * ten2k128[scale - 20]
772
          __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
773
        }
774
      } else {  // x fits in 128 bits, but 10^scale must fit in 64 bits
775
        // 64 x 128 ten2k64[scale] * C1
776
        C1.w[1] = C1_hi;
777
        C1.w[0] = C1_lo;
778
        __mul_128x64_to_128 (res, ten2k64[scale], C1);
779
      }
780
      // subtract scale from the exponent
781
      x_exp = x_exp - ((UINT64) scale << 49);
782
      res.w[1] = res.w[1] | x_sign | x_exp;
783
    }
784
    BID_SWAP128 (res);
785
    BID_RETURN (res);
786
  } else {      // x and y are not canonical, not special, and are not zero
787
    // note that the result may still be zero, and then it has to have the
788
    // preferred exponent
789
    if (x_exp < y_exp) {        // if exp_x < exp_y then swap x and y 
790
      tmp_sign = x_sign;
791
      tmp_exp = x_exp;
792
      tmp_signif_hi = C1_hi;
793
      tmp_signif_lo = C1_lo;
794
      x_sign = y_sign;
795
      x_exp = y_exp;
796
      C1_hi = C2_hi;
797
      C1_lo = C2_lo;
798
      y_sign = tmp_sign;
799
      y_exp = tmp_exp;
800
      C2_hi = tmp_signif_hi;
801
      C2_lo = tmp_signif_lo;
802
    }
803
    // q1 = nr. of decimal digits in x
804
    //  determine first the nr. of bits in x
805
    if (C1_hi == 0) {    // x_bits is the nr. of bits in C1_lo
806
      if (C1_lo >= 0x0020000000000000ull) {     // x >= 2^53
807
        //split the 64-bit value in two 32-bit halves to avoid rounding errors
808
        if (C1_lo >= 0x0000000100000000ull) {   // x >= 2^32
809
          tmp1.d = (double) (C1_lo >> 32);      // exact conversion
810
          x_nr_bits =
811
            32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
812
        } else {        // x < 2^32
813
          tmp1.d = (double) (C1_lo);    // exact conversion
814
          x_nr_bits =
815
            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
816
        }
817
      } else {  // if x < 2^53
818
        tmp1.d = (double) C1_lo;        // exact conversion
819
        x_nr_bits =
820
          ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
821
      }
822
    } else {    // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
823
      tmp1.d = (double) C1_hi;  // exact conversion
824
      x_nr_bits =
825
        64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
826
    }
827
 
828
    q1 = nr_digits[x_nr_bits].digits;
829
    if (q1 == 0) {
830
      q1 = nr_digits[x_nr_bits].digits1;
831
      if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
832
          (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
833
           C1_lo >= nr_digits[x_nr_bits].threshold_lo))
834
        q1++;
835
    }
836
    // q2 = nr. of decimal digits in y
837
    //  determine first the nr. of bits in y (y_nr_bits)
838
    if (C2_hi == 0) {    // y_bits is the nr. of bits in C2_lo
839
      if (C2_lo >= 0x0020000000000000ull) {     // y >= 2^53
840
        //split the 64-bit value in two 32-bit halves to avoid rounding errors
841
        if (C2_lo >= 0x0000000100000000ull) {   // y >= 2^32
842
          tmp2.d = (double) (C2_lo >> 32);      // exact conversion
843
          y_nr_bits =
844
            32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
845
        } else {        // y < 2^32
846
          tmp2.d = (double) (C2_lo);    // exact conversion
847
          y_nr_bits =
848
            ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
849
        }
850
      } else {  // if y < 2^53
851
        tmp2.d = (double) C2_lo;        // exact conversion
852
        y_nr_bits =
853
          ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
854
      }
855
    } else {    // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
856
      tmp2.d = (double) C2_hi;  // exact conversion
857
      y_nr_bits =
858
        64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
859
    }
860
 
861
    q2 = nr_digits[y_nr_bits].digits;
862
    if (q2 == 0) {
863
      q2 = nr_digits[y_nr_bits].digits1;
864
      if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
865
          (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
866
           C2_lo >= nr_digits[y_nr_bits].threshold_lo))
867
        q2++;
868
    }
869
 
870
    delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
871
 
872
    if (delta >= P34) {
873
      // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
874
      // n = C1 * 10^e1 or n = (C1 +/- 10^(q1-P34)) * 10^e1
875
      // the result is inexact; the preferred exponent is the least possible
876
 
877
      if (delta >= P34 + 1) {
878
        // for RN the result is the operand with the larger magnitude,
879
        // possibly scaled up by 10^(P34-q1)
880
        // an overflow cannot occur in this case (rounding to nearest)
881
        if (q1 < P34) { // scale C1 up by 10^(P34-q1)
882
          // Note: because delta >= P34+1 it is certain that 
883
          //     x_exp - ((UINT64)scale << 49) will stay above e_min
884
          scale = P34 - q1;
885
          if (q1 <= 19) {       // C1 fits in 64 bits
886
            // 1 <= q1 <= 19 => 15 <= scale <= 33
887
            if (scale <= 19) {  // 10^scale fits in 64 bits
888
              __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
889
            } else {    // if 20 <= scale <= 33
890
              // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
891
              // (C1 * 10^(scale-19)) fits in 64 bits
892
              C1_lo = C1_lo * ten2k64[scale - 19];
893
              __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
894
            }
895
          } else {      //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
896
            // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
897
            C1.w[1] = C1_hi;
898
            C1.w[0] = C1_lo;
899
            // C1 = ten2k64[P34 - q1] * C1
900
            __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
901
          }
902
          x_exp = x_exp - ((UINT64) scale << 49);
903
          C1_hi = C1.w[1];
904
          C1_lo = C1.w[0];
905
        }
906
        // some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1) 
907
        // (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) => 
908
        // subtract 1 ulp
909
        // Note: do this only for rounding to nearest; for other rounding 
910
        // modes the correction will be applied next
911
        if ((rnd_mode == ROUNDING_TO_NEAREST
912
             || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
913
            && C1_hi == 0x0000314dc6448d93ull
914
            && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
915
            && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
916
                                                             && (C2_hi >
917
                                                                 midpoint128
918
                                                                 [q2 -
919
                                                                  20].
920
                                                                 w[1]
921
                                                                 ||
922
                                                                 (C2_hi
923
                                                                  ==
924
                                                                  midpoint128
925
                                                                  [q2 -
926
                                                                   20].
927
                                                                  w[1]
928
                                                                  &&
929
                                                                  C2_lo
930
                                                                  >
931
                                                                  midpoint128
932
                                                                  [q2 -
933
                                                                   20].
934
                                                                  w
935
                                                                  [0])))))
936
        {
937
          // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
938
          C1_hi = 0x0001ed09bead87c0ull;
939
          C1_lo = 0x378d8e63ffffffffull;
940
          x_exp = x_exp - EXP_P1;
941
        }
942
        if (rnd_mode != ROUNDING_TO_NEAREST) {
943
          if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
944
              (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
945
            // add 1 ulp and then check for overflow
946
            C1_lo = C1_lo + 1;
947
            if (C1_lo == 0) {    // rounding overflow in the low 64 bits
948
              C1_hi = C1_hi + 1;
949
            }
950
            if (C1_hi == 0x0001ed09bead87c0ull
951
                && C1_lo == 0x378d8e6400000000ull) {
952
              // C1 = 10^34 => rounding overflow
953
              C1_hi = 0x0000314dc6448d93ull;
954
              C1_lo = 0x38c15b0a00000000ull;    // 10^33
955
              x_exp = x_exp + EXP_P1;
956
              if (x_exp == EXP_MAX_P1) {        // overflow
957
                C1_hi = 0x7800000000000000ull;  // +inf
958
                C1_lo = 0x0ull;
959
                x_exp = 0;       // x_sign is preserved
960
                // set overflow flag (the inexact flag was set too)
961
                *pfpsf |= OVERFLOW_EXCEPTION;
962
              }
963
            }
964
          } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
965
                     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
966
                     (rnd_mode == ROUNDING_TO_ZERO
967
                      && x_sign != y_sign)) {
968
            // subtract 1 ulp from C1
969
            // Note: because delta >= P34 + 1 the result cannot be zero
970
            C1_lo = C1_lo - 1;
971
            if (C1_lo == 0xffffffffffffffffull)
972
              C1_hi = C1_hi - 1;
973
            // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and 
974
            // decrease the exponent by 1 (because delta >= P34 + 1 the
975
            // exponent will not become less than e_min)
976
            // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
977
            // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
978
            if (C1_hi == 0x0000314dc6448d93ull
979
                && C1_lo == 0x38c15b09ffffffffull) {
980
              // make C1 = 10^34  - 1
981
              C1_hi = 0x0001ed09bead87c0ull;
982
              C1_lo = 0x378d8e63ffffffffull;
983
              x_exp = x_exp - EXP_P1;
984
            }
985
          } else {
986
            ;   // the result is already correct
987
          }
988
        }
989
        // set the inexact flag
990
        *pfpsf |= INEXACT_EXCEPTION;
991
        // assemble the result
992
        res.w[1] = x_sign | x_exp | C1_hi;
993
        res.w[0] = C1_lo;
994
      } else {  // delta = P34 
995
        // in most cases, the smaller operand may be < or = or > 1/2 ulp of the
996
        // larger operand
997
        // however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
998
        // to accuracy loss after subtraction, and will be treated separately
999
        if (x_sign == y_sign || (q1 <= 20
1000
                                 && (C1_hi != 0
1001
                                     || C1_lo != ten2k64[q1 - 1]))
1002
            || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
1003
                             || C1_lo != ten2k128[q1 - 21].w[0]))) {
1004
          // if x_sign == y_sign or C1 != 10^(q1-1)
1005
          // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
1006
          // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
1007
          if (q2 <= 19) {       // C2 and 5*10^(q2-1) both fit in 64 bits
1008
            halfulp64 = midpoint64[q2 - 1];     // 5 * 10^(q2-1)
1009
            if (C2_lo < halfulp64) {    // n2 < 1/2 ulp (n1)
1010
              // for RN the result is the operand with the larger magnitude, 
1011
              // possibly scaled up by 10^(P34-q1)
1012
              // an overflow cannot occur in this case (rounding to nearest)
1013
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
1014
                // Note: because delta = P34 it is certain that
1015
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
1016
                scale = P34 - q1;
1017
                if (q1 <= 19) { // C1 fits in 64 bits
1018
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1019
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1020
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1021
                  } else {      // if 20 <= scale <= 33
1022
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1023
                    // (C1 * 10^(scale-19)) fits in 64 bits
1024
                    C1_lo = C1_lo * ten2k64[scale - 19];
1025
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1026
                  }
1027
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1028
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1029
                  C1.w[1] = C1_hi;
1030
                  C1.w[0] = C1_lo;
1031
                  // C1 = ten2k64[P34 - q1] * C1
1032
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1033
                }
1034
                x_exp = x_exp - ((UINT64) scale << 49);
1035
                C1_hi = C1.w[1];
1036
                C1_lo = C1.w[0];
1037
              }
1038
              if (rnd_mode != ROUNDING_TO_NEAREST) {
1039
                if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1040
                    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1041
                  // add 1 ulp and then check for overflow
1042
                  C1_lo = C1_lo + 1;
1043
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
1044
                    C1_hi = C1_hi + 1;
1045
                  }
1046
                  if (C1_hi == 0x0001ed09bead87c0ull
1047
                      && C1_lo == 0x378d8e6400000000ull) {
1048
                    // C1 = 10^34 => rounding overflow
1049
                    C1_hi = 0x0000314dc6448d93ull;
1050
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
1051
                    x_exp = x_exp + EXP_P1;
1052
                    if (x_exp == EXP_MAX_P1) {  // overflow
1053
                      C1_hi = 0x7800000000000000ull;    // +inf
1054
                      C1_lo = 0x0ull;
1055
                      x_exp = 0; // x_sign is preserved
1056
                      // set overflow flag (the inexact flag was set too)
1057
                      *pfpsf |= OVERFLOW_EXCEPTION;
1058
                    }
1059
                  }
1060
                } else
1061
                  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1062
                      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1063
                      || (rnd_mode == ROUNDING_TO_ZERO
1064
                          && x_sign != y_sign)) {
1065
                  // subtract 1 ulp from C1
1066
                  // Note: because delta = P34 the result cannot be zero
1067
                  C1_lo = C1_lo - 1;
1068
                  if (C1_lo == 0xffffffffffffffffull)
1069
                    C1_hi = C1_hi - 1;
1070
                  // if the coefficient is 10^33-1 then make it 10^34-1 and 
1071
                  // decrease the exponent by 1 (because delta = P34 the
1072
                  // exponent will not become less than e_min)
1073
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1074
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1075
                  if (C1_hi == 0x0000314dc6448d93ull
1076
                      && C1_lo == 0x38c15b09ffffffffull) {
1077
                    // make C1 = 10^34  - 1
1078
                    C1_hi = 0x0001ed09bead87c0ull;
1079
                    C1_lo = 0x378d8e63ffffffffull;
1080
                    x_exp = x_exp - EXP_P1;
1081
                  }
1082
                } else {
1083
                  ;     // the result is already correct
1084
                }
1085
              }
1086
              // set the inexact flag
1087
              *pfpsf |= INEXACT_EXCEPTION;
1088
              // assemble the result
1089
              res.w[1] = x_sign | x_exp | C1_hi;
1090
              res.w[0] = C1_lo;
1091
            } else if ((C2_lo == halfulp64)
1092
                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1093
              // n2 = 1/2 ulp (n1) and C1 is even
1094
              // the result is the operand with the larger magnitude,
1095
              // possibly scaled up by 10^(P34-q1)
1096
              // an overflow cannot occur in this case (rounding to nearest)
1097
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
1098
                // Note: because delta = P34 it is certain that
1099
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
1100
                scale = P34 - q1;
1101
                if (q1 <= 19) { // C1 fits in 64 bits
1102
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1103
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1104
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1105
                  } else {      // if 20 <= scale <= 33 
1106
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1107
                    // (C1 * 10^(scale-19)) fits in 64 bits  
1108
                    C1_lo = C1_lo * ten2k64[scale - 19];
1109
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1110
                  }
1111
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1112
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
1113
                  C1.w[1] = C1_hi;
1114
                  C1.w[0] = C1_lo;
1115
                  // C1 = ten2k64[P34 - q1] * C1 
1116
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1117
                }
1118
                x_exp = x_exp - ((UINT64) scale << 49);
1119
                C1_hi = C1.w[1];
1120
                C1_lo = C1.w[0];
1121
              }
1122
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
1123
                   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
1124
                                          && x_sign == y_sign)
1125
                  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
1126
                  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
1127
                // add 1 ulp and then check for overflow
1128
                C1_lo = C1_lo + 1;
1129
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
1130
                  C1_hi = C1_hi + 1;
1131
                }
1132
                if (C1_hi == 0x0001ed09bead87c0ull
1133
                    && C1_lo == 0x378d8e6400000000ull) {
1134
                  // C1 = 10^34 => rounding overflow
1135
                  C1_hi = 0x0000314dc6448d93ull;
1136
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
1137
                  x_exp = x_exp + EXP_P1;
1138
                  if (x_exp == EXP_MAX_P1) {    // overflow
1139
                    C1_hi = 0x7800000000000000ull;      // +inf
1140
                    C1_lo = 0x0ull;
1141
                    x_exp = 0;   // x_sign is preserved
1142
                    // set overflow flag (the inexact flag was set too)
1143
                    *pfpsf |= OVERFLOW_EXCEPTION;
1144
                  }
1145
                }
1146
              } else
1147
                if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
1148
                     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
1149
                                            && !x_sign && y_sign)
1150
                    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1151
                    || (rnd_mode == ROUNDING_TO_ZERO
1152
                        && x_sign != y_sign)) {
1153
                // subtract 1 ulp from C1
1154
                // Note: because delta = P34 the result cannot be zero
1155
                C1_lo = C1_lo - 1;
1156
                if (C1_lo == 0xffffffffffffffffull)
1157
                  C1_hi = C1_hi - 1;
1158
                // if the coefficient is 10^33 - 1 then make it 10^34 - 1
1159
                // and decrease the exponent by 1 (because delta = P34
1160
                // the exponent will not become less than e_min)
1161
                // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1162
                // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1163
                if (C1_hi == 0x0000314dc6448d93ull
1164
                    && C1_lo == 0x38c15b09ffffffffull) {
1165
                  // make C1 = 10^34  - 1
1166
                  C1_hi = 0x0001ed09bead87c0ull;
1167
                  C1_lo = 0x378d8e63ffffffffull;
1168
                  x_exp = x_exp - EXP_P1;
1169
                }
1170
              } else {
1171
                ;       // the result is already correct
1172
              }
1173
              // set the inexact flag
1174
              *pfpsf |= INEXACT_EXCEPTION;
1175
              // assemble the result 
1176
              res.w[1] = x_sign | x_exp | C1_hi;
1177
              res.w[0] = C1_lo;
1178
            } else {    // if C2_lo > halfulp64 || 
1179
              // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
1180
              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1181
              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1182
              if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
1183
                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1184
                // because q1 < P34 we must first replace C1 by 
1185
                // C1 * 10^(P34-q1), and must decrease the exponent by 
1186
                // (P34-q1) (it will still be at least e_min)
1187
                scale = P34 - q1;
1188
                if (q1 <= 19) { // C1 fits in 64 bits
1189
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1190
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1191
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1192
                  } else {      // if 20 <= scale <= 33
1193
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1194
                    // (C1 * 10^(scale-19)) fits in 64 bits
1195
                    C1_lo = C1_lo * ten2k64[scale - 19];
1196
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1197
                  }
1198
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1199
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1200
                  C1.w[1] = C1_hi;
1201
                  C1.w[0] = C1_lo;
1202
                  // C1 = ten2k64[P34 - q1] * C1
1203
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1204
                }
1205
                x_exp = x_exp - ((UINT64) scale << 49);
1206
                C1_hi = C1.w[1];
1207
                C1_lo = C1.w[0];
1208
                // check for rounding overflow
1209
                if (C1_hi == 0x0001ed09bead87c0ull
1210
                    && C1_lo == 0x378d8e6400000000ull) {
1211
                  // C1 = 10^34 => rounding overflow 
1212
                  C1_hi = 0x0000314dc6448d93ull;
1213
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
1214
                  x_exp = x_exp + EXP_P1;
1215
                }
1216
              }
1217
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1218
                  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1219
                      && C2_lo != halfulp64)
1220
                  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1221
                  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1222
                  || (rnd_mode == ROUNDING_TO_ZERO
1223
                      && x_sign != y_sign)) {
1224
                // the result is x - 1
1225
                // for RN n1 * n2 < 0; underflow not possible
1226
                C1_lo = C1_lo - 1;
1227
                if (C1_lo == 0xffffffffffffffffull)
1228
                  C1_hi--;
1229
                // check if we crossed into the lower decade
1230
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
1231
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
1232
                  C1_lo = 0x378d8e63ffffffffull;
1233
                  x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
1234
                }
1235
              } else
1236
                if ((rnd_mode == ROUNDING_TO_NEAREST
1237
                     && x_sign == y_sign)
1238
                    || (rnd_mode == ROUNDING_TIES_AWAY
1239
                        && x_sign == y_sign)
1240
                    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1241
                    || (rnd_mode == ROUNDING_UP && !x_sign
1242
                        && !y_sign)) {
1243
                // the result is x + 1
1244
                // for RN x_sign = y_sign, i.e. n1*n2 > 0
1245
                C1_lo = C1_lo + 1;
1246
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
1247
                  C1_hi = C1_hi + 1;
1248
                }
1249
                if (C1_hi == 0x0001ed09bead87c0ull
1250
                    && C1_lo == 0x378d8e6400000000ull) {
1251
                  // C1 = 10^34 => rounding overflow
1252
                  C1_hi = 0x0000314dc6448d93ull;
1253
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
1254
                  x_exp = x_exp + EXP_P1;
1255
                  if (x_exp == EXP_MAX_P1) {    // overflow
1256
                    C1_hi = 0x7800000000000000ull;      // +inf
1257
                    C1_lo = 0x0ull;
1258
                    x_exp = 0;   // x_sign is preserved
1259
                    // set the overflow flag
1260
                    *pfpsf |= OVERFLOW_EXCEPTION;
1261
                  }
1262
                }
1263
              } else {
1264
                ;       // the result is x
1265
              }
1266
              // set the inexact flag
1267
              *pfpsf |= INEXACT_EXCEPTION;
1268
              // assemble the result
1269
              res.w[1] = x_sign | x_exp | C1_hi;
1270
              res.w[0] = C1_lo;
1271
            }
1272
          } else {      // if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in 
1273
            // most cases) fit only in more than 64 bits
1274
            halfulp128 = midpoint128[q2 - 20];  // 5 * 10^(q2-1)
1275
            if ((C2_hi < halfulp128.w[1])
1276
                || (C2_hi == halfulp128.w[1]
1277
                    && C2_lo < halfulp128.w[0])) {
1278
              // n2 < 1/2 ulp (n1)
1279
              // the result is the operand with the larger magnitude,
1280
              // possibly scaled up by 10^(P34-q1)
1281
              // an overflow cannot occur in this case (rounding to nearest)
1282
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
1283
                // Note: because delta = P34 it is certain that
1284
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
1285
                scale = P34 - q1;
1286
                if (q1 <= 19) { // C1 fits in 64 bits
1287
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1288
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1289
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1290
                  } else {      // if 20 <= scale <= 33 
1291
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1292
                    // (C1 * 10^(scale-19)) fits in 64 bits  
1293
                    C1_lo = C1_lo * ten2k64[scale - 19];
1294
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1295
                  }
1296
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1297
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
1298
                  C1.w[1] = C1_hi;
1299
                  C1.w[0] = C1_lo;
1300
                  // C1 = ten2k64[P34 - q1] * C1 
1301
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1302
                }
1303
                C1_hi = C1.w[1];
1304
                C1_lo = C1.w[0];
1305
                x_exp = x_exp - ((UINT64) scale << 49);
1306
              }
1307
              if (rnd_mode != ROUNDING_TO_NEAREST) {
1308
                if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1309
                    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1310
                  // add 1 ulp and then check for overflow
1311
                  C1_lo = C1_lo + 1;
1312
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
1313
                    C1_hi = C1_hi + 1;
1314
                  }
1315
                  if (C1_hi == 0x0001ed09bead87c0ull
1316
                      && C1_lo == 0x378d8e6400000000ull) {
1317
                    // C1 = 10^34 => rounding overflow
1318
                    C1_hi = 0x0000314dc6448d93ull;
1319
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
1320
                    x_exp = x_exp + EXP_P1;
1321
                    if (x_exp == EXP_MAX_P1) {  // overflow
1322
                      C1_hi = 0x7800000000000000ull;    // +inf
1323
                      C1_lo = 0x0ull;
1324
                      x_exp = 0; // x_sign is preserved
1325
                      // set overflow flag (the inexact flag was set too)
1326
                      *pfpsf |= OVERFLOW_EXCEPTION;
1327
                    }
1328
                  }
1329
                } else
1330
                  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1331
                      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1332
                      || (rnd_mode == ROUNDING_TO_ZERO
1333
                          && x_sign != y_sign)) {
1334
                  // subtract 1 ulp from C1
1335
                  // Note: because delta = P34 the result cannot be zero
1336
                  C1_lo = C1_lo - 1;
1337
                  if (C1_lo == 0xffffffffffffffffull)
1338
                    C1_hi = C1_hi - 1;
1339
                  // if the coefficient is 10^33-1 then make it 10^34-1 and
1340
                  // decrease the exponent by 1 (because delta = P34 the
1341
                  // exponent will not become less than e_min)
1342
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1343
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1344
                  if (C1_hi == 0x0000314dc6448d93ull
1345
                      && C1_lo == 0x38c15b09ffffffffull) {
1346
                    // make C1 = 10^34  - 1
1347
                    C1_hi = 0x0001ed09bead87c0ull;
1348
                    C1_lo = 0x378d8e63ffffffffull;
1349
                    x_exp = x_exp - EXP_P1;
1350
                  }
1351
                } else {
1352
                  ;     // the result is already correct
1353
                }
1354
              }
1355
              // set the inexact flag 
1356
              *pfpsf |= INEXACT_EXCEPTION;
1357
              // assemble the result 
1358
              res.w[1] = x_sign | x_exp | C1_hi;
1359
              res.w[0] = C1_lo;
1360
            } else if ((C2_hi == halfulp128.w[1]
1361
                        && C2_lo == halfulp128.w[0])
1362
                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
1363
              // midpoint & lsb in C1 is 0
1364
              // n2 = 1/2 ulp (n1) and C1 is even
1365
              // the result is the operand with the larger magnitude,
1366
              // possibly scaled up by 10^(P34-q1)
1367
              // an overflow cannot occur in this case (rounding to nearest)
1368
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
1369
                // Note: because delta = P34 it is certain that
1370
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
1371
                scale = P34 - q1;
1372
                if (q1 <= 19) { // C1 fits in 64 bits
1373
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1374
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1375
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1376
                  } else {      // if 20 <= scale <= 33
1377
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1378
                    // (C1 * 10^(scale-19)) fits in 64 bits
1379
                    C1_lo = C1_lo * ten2k64[scale - 19];
1380
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1381
                  }
1382
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1383
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1384
                  C1.w[1] = C1_hi;
1385
                  C1.w[0] = C1_lo;
1386
                  // C1 = ten2k64[P34 - q1] * C1
1387
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1388
                }
1389
                x_exp = x_exp - ((UINT64) scale << 49);
1390
                C1_hi = C1.w[1];
1391
                C1_lo = C1.w[0];
1392
              }
1393
              if (rnd_mode != ROUNDING_TO_NEAREST) {
1394
                if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
1395
                    || (rnd_mode == ROUNDING_UP && !y_sign)) {
1396
                  // add 1 ulp and then check for overflow
1397
                  C1_lo = C1_lo + 1;
1398
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
1399
                    C1_hi = C1_hi + 1;
1400
                  }
1401
                  if (C1_hi == 0x0001ed09bead87c0ull
1402
                      && C1_lo == 0x378d8e6400000000ull) {
1403
                    // C1 = 10^34 => rounding overflow
1404
                    C1_hi = 0x0000314dc6448d93ull;
1405
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
1406
                    x_exp = x_exp + EXP_P1;
1407
                    if (x_exp == EXP_MAX_P1) {  // overflow
1408
                      C1_hi = 0x7800000000000000ull;    // +inf
1409
                      C1_lo = 0x0ull;
1410
                      x_exp = 0; // x_sign is preserved
1411
                      // set overflow flag (the inexact flag was set too)
1412
                      *pfpsf |= OVERFLOW_EXCEPTION;
1413
                    }
1414
                  }
1415
                } else if ((rnd_mode == ROUNDING_DOWN && y_sign)
1416
                           || (rnd_mode == ROUNDING_TO_ZERO
1417
                               && x_sign != y_sign)) {
1418
                  // subtract 1 ulp from C1
1419
                  // Note: because delta >= P34 + 1 the result cannot be zero
1420
                  C1_lo = C1_lo - 1;
1421
                  if (C1_lo == 0xffffffffffffffffull)
1422
                    C1_hi = C1_hi - 1;
1423
                  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
1424
                  // and decrease the exponent by 1 (because delta >= P34 + 1
1425
                  // the exponent will not become less than e_min)
1426
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
1427
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
1428
                  if (C1_hi == 0x0000314dc6448d93ull
1429
                      && C1_lo == 0x38c15b09ffffffffull) {
1430
                    // make C1 = 10^34  - 1
1431
                    C1_hi = 0x0001ed09bead87c0ull;
1432
                    C1_lo = 0x378d8e63ffffffffull;
1433
                    x_exp = x_exp - EXP_P1;
1434
                  }
1435
                } else {
1436
                  ;     // the result is already correct
1437
                }
1438
              }
1439
              // set the inexact flag
1440
              *pfpsf |= INEXACT_EXCEPTION;
1441
              // assemble the result
1442
              res.w[1] = x_sign | x_exp | C1_hi;
1443
              res.w[0] = C1_lo;
1444
            } else {    // if C2 > halfulp128 ||
1445
              // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
1446
              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
1447
              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
1448
              if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
1449
                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
1450
                // because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
1451
                // and must decrease the exponent by (P34-q1) (it will still be
1452
                // at least e_min)
1453
                scale = P34 - q1;
1454
                if (q1 <= 19) { // C1 fits in 64 bits
1455
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
1456
                  if (scale <= 19) {    // 10^scale fits in 64 bits
1457
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
1458
                  } else {      // if 20 <= scale <= 33
1459
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1460
                    // (C1 * 10^(scale-19)) fits in 64 bits
1461
                    C1_lo = C1_lo * ten2k64[scale - 19];
1462
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
1463
                  }
1464
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1465
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1466
                  C1.w[1] = C1_hi;
1467
                  C1.w[0] = C1_lo;
1468
                  // C1 = ten2k64[P34 - q1] * C1
1469
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
1470
                }
1471
                C1_hi = C1.w[1];
1472
                C1_lo = C1.w[0];
1473
                x_exp = x_exp - ((UINT64) scale << 49);
1474
              }
1475
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
1476
                  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
1477
                      && (C2_hi != halfulp128.w[1]
1478
                          || C2_lo != halfulp128.w[0]))
1479
                  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
1480
                  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
1481
                  || (rnd_mode == ROUNDING_TO_ZERO
1482
                      && x_sign != y_sign)) {
1483
                // the result is x - 1
1484
                // for RN n1 * n2 < 0; underflow not possible
1485
                C1_lo = C1_lo - 1;
1486
                if (C1_lo == 0xffffffffffffffffull)
1487
                  C1_hi--;
1488
                // check if we crossed into the lower decade
1489
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
1490
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
1491
                  C1_lo = 0x378d8e63ffffffffull;
1492
                  x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
1493
                }
1494
              } else
1495
                if ((rnd_mode == ROUNDING_TO_NEAREST
1496
                     && x_sign == y_sign)
1497
                    || (rnd_mode == ROUNDING_TIES_AWAY
1498
                        && x_sign == y_sign)
1499
                    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
1500
                    || (rnd_mode == ROUNDING_UP && !x_sign
1501
                        && !y_sign)) {
1502
                // the result is x + 1
1503
                // for RN x_sign = y_sign, i.e. n1*n2 > 0
1504
                C1_lo = C1_lo + 1;
1505
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
1506
                  C1_hi = C1_hi + 1;
1507
                }
1508
                if (C1_hi == 0x0001ed09bead87c0ull
1509
                    && C1_lo == 0x378d8e6400000000ull) {
1510
                  // C1 = 10^34 => rounding overflow
1511
                  C1_hi = 0x0000314dc6448d93ull;
1512
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
1513
                  x_exp = x_exp + EXP_P1;
1514
                  if (x_exp == EXP_MAX_P1) {    // overflow
1515
                    C1_hi = 0x7800000000000000ull;      // +inf
1516
                    C1_lo = 0x0ull;
1517
                    x_exp = 0;   // x_sign is preserved
1518
                    // set the overflow flag
1519
                    *pfpsf |= OVERFLOW_EXCEPTION;
1520
                  }
1521
                }
1522
              } else {
1523
                ;       // the result is x
1524
              }
1525
              // set the inexact flag
1526
              *pfpsf |= INEXACT_EXCEPTION;
1527
              // assemble the result
1528
              res.w[1] = x_sign | x_exp | C1_hi;
1529
              res.w[0] = C1_lo;
1530
            }
1531
          }     // end q1 >= 20
1532
          // end case where C1 != 10^(q1-1)
1533
        } else {        // C1 = 10^(q1-1) and x_sign != y_sign
1534
          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1535
          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
1536
          // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
1537
          // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34 
1538
          // digits and n = C' * 10^(e2+x1)
1539
          // If the result has P34+1 digits, redo the steps above with x1+1
1540
          // If the result has P34-1 digits or less, redo the steps above with 
1541
          // x1-1 but only if initially x1 >= 1
1542
          // NOTE: these two steps can be improved, e.g we could guess if
1543
          // P34+1 or P34-1 digits will be obtained by adding/subtracting 
1544
          // just the top 64 bits of the two operands
1545
          // The result cannot be zero, and it cannot overflow
1546
          x1 = q2 - 1;  // 0 <= x1 <= P34-1
1547
          // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
1548
          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 1 <= scale <= P34
1549
          scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
1550
          // either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
1551
          // but their product fits with certainty in 128 bits
1552
          if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1553
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1554
          } else {      // if (scale >= 1
1555
            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1556
            if (q1 <= 19) {     // C1 fits in 64 bits
1557
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1558
            } else {    // q1 >= 20
1559
              C1.w[1] = C1_hi;
1560
              C1.w[0] = C1_lo;
1561
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1562
            }
1563
          }
1564
          tmp64 = C1.w[0];       // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1565
 
1566
          // now round C2 to q2-x1 = 1 decimal digit
1567
          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1568
          ind = x1 - 1; // -1 <= ind <= P34 - 2
1569
          if (ind >= 0) {        // if (x1 >= 1)
1570
            C2.w[0] = C2_lo;
1571
            C2.w[1] = C2_hi;
1572
            if (ind <= 18) {
1573
              C2.w[0] = C2.w[0] + midpoint64[ind];
1574
              if (C2.w[0] < C2_lo)
1575
                C2.w[1]++;
1576
            } else {    // 19 <= ind <= 32
1577
              C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
1578
              C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
1579
              if (C2.w[0] < C2_lo)
1580
                C2.w[1]++;
1581
            }
1582
            // the approximation of 10^(-x1) was rounded up to 118 bits
1583
            __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
1584
            // calculate C2* and f2*
1585
            // C2* is actually floor(C2*) in this case
1586
            // C2* and f2* need shifting and masking, as shown by
1587
            // shiftright128[] and maskhigh128[]
1588
            // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
1589
            // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
1590
            // if (0 < f2* < 10^(-x1)) then
1591
            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1592
            //       shift; C2* has p decimal digits, correct by Prop. 1)
1593
            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1594
            //       shift; C2* has p decimal digits, correct by Pr. 1)
1595
            // else
1596
            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1597
            //       correct by Property 1)
1598
            // n = C2* * 10^(e2+x1)
1599
 
1600
            if (ind <= 2) {
1601
              highf2star.w[1] = 0x0;
1602
              highf2star.w[0] = 0x0;     // low f2* ok
1603
            } else if (ind <= 21) {
1604
              highf2star.w[1] = 0x0;
1605
              highf2star.w[0] = R256.w[2] & maskhigh128[ind];    // low f2* ok
1606
            } else {
1607
              highf2star.w[1] = R256.w[3] & maskhigh128[ind];
1608
              highf2star.w[0] = R256.w[2];       // low f2* is ok
1609
            }
1610
            // shift right C2* by Ex-128 = shiftright128[ind]
1611
            if (ind >= 3) {
1612
              shift = shiftright128[ind];
1613
              if (shift < 64) { // 3 <= shift <= 63
1614
                R256.w[2] =
1615
                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1616
                R256.w[3] = (R256.w[3] >> shift);
1617
              } else {  // 66 <= shift <= 102
1618
                R256.w[2] = (R256.w[3] >> (shift - 64));
1619
                R256.w[3] = 0x0ULL;
1620
              }
1621
            }
1622
            // redundant
1623
            is_inexact_lt_midpoint = 0;
1624
            is_inexact_gt_midpoint = 0;
1625
            is_midpoint_lt_even = 0;
1626
            is_midpoint_gt_even = 0;
1627
            // determine inexactness of the rounding of C2*
1628
            // (cannot be followed by a second rounding)
1629
            // if (0 < f2* - 1/2 < 10^(-x1)) then
1630
            //   the result is exact
1631
            // else (if f2* - 1/2 > T* then)
1632
            //   the result is inexact
1633
            if (ind <= 2) {
1634
              if (R256.w[1] > 0x8000000000000000ull ||
1635
                  (R256.w[1] == 0x8000000000000000ull
1636
                   && R256.w[0] > 0x0ull)) {
1637
                // f2* > 1/2 and the result may be exact
1638
                tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
1639
                if ((tmp64A > ten2mk128trunc[ind].w[1]
1640
                     || (tmp64A == ten2mk128trunc[ind].w[1]
1641
                         && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
1642
                  // set the inexact flag
1643
                  *pfpsf |= INEXACT_EXCEPTION;
1644
                  // this rounding is applied to C2 only!
1645
                  // x_sign != y_sign
1646
                  is_inexact_gt_midpoint = 1;
1647
                }       // else the result is exact
1648
                // rounding down, unless a midpoint in [ODD, EVEN]
1649
              } else {  // the result is inexact; f2* <= 1/2
1650
                // set the inexact flag
1651
                *pfpsf |= INEXACT_EXCEPTION;
1652
                // this rounding is applied to C2 only!
1653
                // x_sign != y_sign
1654
                is_inexact_lt_midpoint = 1;
1655
              }
1656
            } else if (ind <= 21) {     // if 3 <= ind <= 21
1657
              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1658
                                            && highf2star.w[0] >
1659
                                            onehalf128[ind])
1660
                  || (highf2star.w[1] == 0x0
1661
                      && highf2star.w[0] == onehalf128[ind]
1662
                      && (R256.w[1] || R256.w[0]))) {
1663
                // f2* > 1/2 and the result may be exact
1664
                // Calculate f2* - 1/2
1665
                tmp64A = highf2star.w[0] - onehalf128[ind];
1666
                tmp64B = highf2star.w[1];
1667
                if (tmp64A > highf2star.w[0])
1668
                  tmp64B--;
1669
                if (tmp64B || tmp64A
1670
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
1671
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1672
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
1673
                  // set the inexact flag
1674
                  *pfpsf |= INEXACT_EXCEPTION;
1675
                  // this rounding is applied to C2 only!
1676
                  // x_sign != y_sign
1677
                  is_inexact_gt_midpoint = 1;
1678
                }       // else the result is exact
1679
              } else {  // the result is inexact; f2* <= 1/2
1680
                // set the inexact flag
1681
                *pfpsf |= INEXACT_EXCEPTION;
1682
                // this rounding is applied to C2 only!
1683
                // x_sign != y_sign
1684
                is_inexact_lt_midpoint = 1;
1685
              }
1686
            } else {    // if 22 <= ind <= 33
1687
              if (highf2star.w[1] > onehalf128[ind]
1688
                  || (highf2star.w[1] == onehalf128[ind]
1689
                      && (highf2star.w[0] || R256.w[1]
1690
                          || R256.w[0]))) {
1691
                // f2* > 1/2 and the result may be exact
1692
                // Calculate f2* - 1/2
1693
                // tmp64A = highf2star.w[0];
1694
                tmp64B = highf2star.w[1] - onehalf128[ind];
1695
                if (tmp64B || highf2star.w[0]
1696
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
1697
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1698
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
1699
                  // set the inexact flag
1700
                  *pfpsf |= INEXACT_EXCEPTION;
1701
                  // this rounding is applied to C2 only!
1702
                  // x_sign != y_sign
1703
                  is_inexact_gt_midpoint = 1;
1704
                }       // else the result is exact
1705
              } else {  // the result is inexact; f2* <= 1/2
1706
                // set the inexact flag
1707
                *pfpsf |= INEXACT_EXCEPTION;
1708
                // this rounding is applied to C2 only!
1709
                // x_sign != y_sign
1710
                is_inexact_lt_midpoint = 1;
1711
              }
1712
            }
1713
            // check for midpoints after determining inexactness
1714
            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1715
                && (highf2star.w[0] == 0)
1716
                && (R256.w[1] < ten2mk128trunc[ind].w[1]
1717
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
1718
                        && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
1719
              // the result is a midpoint
1720
              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
1721
                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1722
                R256.w[2]--;
1723
                if (R256.w[2] == 0xffffffffffffffffull)
1724
                  R256.w[3]--;
1725
                // this rounding is applied to C2 only!
1726
                // x_sign != y_sign
1727
                is_midpoint_lt_even = 1;
1728
                is_inexact_lt_midpoint = 0;
1729
                is_inexact_gt_midpoint = 0;
1730
              } else {
1731
                // else MP in [ODD, EVEN]
1732
                // this rounding is applied to C2 only!
1733
                // x_sign != y_sign
1734
                is_midpoint_gt_even = 1;
1735
                is_inexact_lt_midpoint = 0;
1736
                is_inexact_gt_midpoint = 0;
1737
              }
1738
            }
1739
          } else {      // if (ind == -1) only when x1 = 0
1740
            R256.w[2] = C2_lo;
1741
            R256.w[3] = C2_hi;
1742
            is_midpoint_lt_even = 0;
1743
            is_midpoint_gt_even = 0;
1744
            is_inexact_lt_midpoint = 0;
1745
            is_inexact_gt_midpoint = 0;
1746
          }
1747
          // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
1748
          // because x_sign != y_sign this last operation is exact
1749
          C1.w[0] = C1.w[0] - R256.w[2];
1750
          C1.w[1] = C1.w[1] - R256.w[3];
1751
          if (C1.w[0] > tmp64)
1752
            C1.w[1]--;  // borrow
1753
          if (C1.w[1] >= 0x8000000000000000ull) {       // negative coefficient!
1754
            C1.w[0] = ~C1.w[0];
1755
            C1.w[0]++;
1756
            C1.w[1] = ~C1.w[1];
1757
            if (C1.w[0] == 0x0)
1758
              C1.w[1]++;
1759
            tmp_sign = y_sign;  // the result will have the sign of y
1760
          } else {
1761
            tmp_sign = x_sign;
1762
          }
1763
          // the difference has exactly P34 digits
1764
          x_sign = tmp_sign;
1765
          if (x1 >= 1)
1766
            y_exp = y_exp + ((UINT64) x1 << 49);
1767
          C1_hi = C1.w[1];
1768
          C1_lo = C1.w[0];
1769
          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
1770
          if (rnd_mode != ROUNDING_TO_NEAREST) {
1771
            if ((!x_sign
1772
                 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
1773
                     ||
1774
                     ((rnd_mode == ROUNDING_TIES_AWAY
1775
                       || rnd_mode == ROUNDING_UP)
1776
                      && is_midpoint_gt_even))) || (x_sign
1777
                                                    &&
1778
                                                    ((rnd_mode ==
1779
                                                      ROUNDING_DOWN
1780
                                                      &&
1781
                                                      is_inexact_lt_midpoint)
1782
                                                     ||
1783
                                                     ((rnd_mode ==
1784
                                                       ROUNDING_TIES_AWAY
1785
                                                       || rnd_mode ==
1786
                                                       ROUNDING_DOWN)
1787
                                                      &&
1788
                                                      is_midpoint_gt_even))))
1789
            {
1790
              // C1 = C1 + 1
1791
              C1_lo = C1_lo + 1;
1792
              if (C1_lo == 0) {  // rounding overflow in the low 64 bits
1793
                C1_hi = C1_hi + 1;
1794
              }
1795
              if (C1_hi == 0x0001ed09bead87c0ull
1796
                  && C1_lo == 0x378d8e6400000000ull) {
1797
                // C1 = 10^34 => rounding overflow
1798
                C1_hi = 0x0000314dc6448d93ull;
1799
                C1_lo = 0x38c15b0a00000000ull;  // 10^33
1800
                y_exp = y_exp + EXP_P1;
1801
              }
1802
            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
1803
                       &&
1804
                       ((x_sign
1805
                         && (rnd_mode == ROUNDING_UP
1806
                             || rnd_mode == ROUNDING_TO_ZERO))
1807
                        || (!x_sign
1808
                            && (rnd_mode == ROUNDING_DOWN
1809
                                || rnd_mode == ROUNDING_TO_ZERO)))) {
1810
              // C1 = C1 - 1
1811
              C1_lo = C1_lo - 1;
1812
              if (C1_lo == 0xffffffffffffffffull)
1813
                C1_hi--;
1814
              // check if we crossed into the lower decade
1815
              if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
1816
                C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
1817
                C1_lo = 0x378d8e63ffffffffull;
1818
                y_exp = y_exp - EXP_P1;
1819
                // no underflow, because delta + q2 >= P34 + 1
1820
              }
1821
            } else {
1822
              ; // exact, the result is already correct
1823
            }
1824
          }
1825
          // assemble the result
1826
          res.w[1] = x_sign | y_exp | C1_hi;
1827
          res.w[0] = C1_lo;
1828
        }
1829
      } // end delta = P34
1830
    } else {    // if (|delta| <= P34 - 1)
1831
      if (delta >= 0) {  // if (0 <= delta <= P34 - 1)
1832
        if (delta <= P34 - 1 - q2) {
1833
          // calculate C' directly; the result is exact
1834
          // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
1835
          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1836
          // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
1837
          // but their product fits with certainty in 128 bits (actually in 113)
1838
          scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49) 
1839
 
1840
          if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
1841
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1842
            C1_hi = C1.w[1];
1843
            C1_lo = C1.w[0];
1844
          } else if (scale >= 1) {
1845
            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits 
1846
            if (q1 <= 19) {     // C1 fits in 64 bits
1847
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1848
            } else {    // q1 >= 20
1849
              C1.w[1] = C1_hi;
1850
              C1.w[0] = C1_lo;
1851
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1852
            }
1853
            C1_hi = C1.w[1];
1854
            C1_lo = C1.w[0];
1855
          } else {      // if (scale == 0) C1 is unchanged
1856
            C1.w[0] = C1_lo;     // C1.w[1] = C1_hi; 
1857
          }
1858
          // now add C2
1859
          if (x_sign == y_sign) {
1860
            // the result cannot overflow
1861
            C1_lo = C1_lo + C2_lo;
1862
            C1_hi = C1_hi + C2_hi;
1863
            if (C1_lo < C1.w[0])
1864
              C1_hi++;
1865
          } else {      // if x_sign != y_sign
1866
            C1_lo = C1_lo - C2_lo;
1867
            C1_hi = C1_hi - C2_hi;
1868
            if (C1_lo > C1.w[0])
1869
              C1_hi--;
1870
            // the result can be zero, but it cannot overflow
1871
            if (C1_lo == 0 && C1_hi == 0) {
1872
              // assemble the result
1873
              if (x_exp < y_exp)
1874
                res.w[1] = x_exp;
1875
              else
1876
                res.w[1] = y_exp;
1877
              res.w[0] = 0;
1878
              if (rnd_mode == ROUNDING_DOWN) {
1879
                res.w[1] |= 0x8000000000000000ull;
1880
              }
1881
              BID_SWAP128 (res);
1882
              BID_RETURN (res);
1883
            }
1884
            if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
1885
              C1_lo = ~C1_lo;
1886
              C1_lo++;
1887
              C1_hi = ~C1_hi;
1888
              if (C1_lo == 0x0)
1889
                C1_hi++;
1890
              x_sign = y_sign;  // the result will have the sign of y
1891
            }
1892
          }
1893
          // assemble the result
1894
          res.w[1] = x_sign | y_exp | C1_hi;
1895
          res.w[0] = C1_lo;
1896
        } else if (delta == P34 - q2) {
1897
          // calculate C' directly; the result may be inexact if it requires 
1898
          // P34+1 decimal digits; in this case the 'cutoff' point for addition
1899
          // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
1900
          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1901
          // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
1902
          // but their product fits with certainty in 128 bits (actually in 113)
1903
          scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1904
          if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
1905
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
1906
          } else if (scale >= 1) {
1907
            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1908
            if (q1 <= 19) {     // C1 fits in 64 bits
1909
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
1910
            } else {    // q1 >= 20
1911
              C1.w[1] = C1_hi;
1912
              C1.w[0] = C1_lo;
1913
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
1914
            }
1915
          } else {      // if (scale == 0) C1 is unchanged
1916
            C1.w[1] = C1_hi;
1917
            C1.w[0] = C1_lo;     // only the low part is necessary
1918
          }
1919
          C1_hi = C1.w[1];
1920
          C1_lo = C1.w[0];
1921
          // now add C2
1922
          if (x_sign == y_sign) {
1923
            // the result can overflow!
1924
            C1_lo = C1_lo + C2_lo;
1925
            C1_hi = C1_hi + C2_hi;
1926
            if (C1_lo < C1.w[0])
1927
              C1_hi++;
1928
            // test for overflow, possible only when C1 >= 10^34
1929
            if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {  // C1 >= 10^34
1930
              // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
1931
              // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
1932
              // decimal digits
1933
              // Calculate C'' = C' + 1/2 * 10^x
1934
              if (C1_lo >= 0xfffffffffffffffbull) {     // low half add has carry
1935
                C1_lo = C1_lo + 5;
1936
                C1_hi = C1_hi + 1;
1937
              } else {
1938
                C1_lo = C1_lo + 5;
1939
              }
1940
              // the approximation of 10^(-1) was rounded up to 118 bits
1941
              // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
1942
              // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
1943
              C1.w[1] = C1_hi;
1944
              C1.w[0] = C1_lo;   // C''
1945
              ten2m1.w[1] = 0x1999999999999999ull;
1946
              ten2m1.w[0] = 0x9999999999999a00ull;
1947
              __mul_128x128_to_256 (P256, C1, ten2m1);  // P256 = C*, f*
1948
              // C* is actually floor(C*) in this case
1949
              // the top Ex = 128 bits of 10^(-1) are 
1950
              // T* = 0x00199999999999999999999999999999
1951
              // if (0 < f* < 10^(-x)) then
1952
              //   if floor(C*) is even then C = floor(C*) - logical right 
1953
              //       shift; C has p decimal digits, correct by Prop. 1)
1954
              //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
1955
              //       shift; C has p decimal digits, correct by Pr. 1)
1956
              // else
1957
              //   C = floor(C*) (logical right shift; C has p decimal digits,
1958
              //       correct by Property 1)
1959
              // n = C * 10^(e2+x)
1960
              if ((P256.w[1] || P256.w[0])
1961
                  && (P256.w[1] < 0x1999999999999999ull
1962
                      || (P256.w[1] == 0x1999999999999999ull
1963
                          && P256.w[0] <= 0x9999999999999999ull))) {
1964
                // the result is a midpoint
1965
                if (P256.w[2] & 0x01) {
1966
                  is_midpoint_gt_even = 1;
1967
                  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
1968
                  P256.w[2]--;
1969
                  if (P256.w[2] == 0xffffffffffffffffull)
1970
                    P256.w[3]--;
1971
                } else {
1972
                  is_midpoint_lt_even = 1;
1973
                }
1974
              }
1975
              // n = Cstar * 10^(e2+1)
1976
              y_exp = y_exp + EXP_P1;
1977
              // C* != 10^P because C* has P34 digits
1978
              // check for overflow
1979
              if (y_exp == EXP_MAX_P1
1980
                  && (rnd_mode == ROUNDING_TO_NEAREST
1981
                      || rnd_mode == ROUNDING_TIES_AWAY)) {
1982
                // overflow for RN
1983
                res.w[1] = x_sign | 0x7800000000000000ull;      // +/-inf
1984
                res.w[0] = 0x0ull;
1985
                // set the inexact flag
1986
                *pfpsf |= INEXACT_EXCEPTION;
1987
                // set the overflow flag
1988
                *pfpsf |= OVERFLOW_EXCEPTION;
1989
                BID_SWAP128 (res);
1990
                BID_RETURN (res);
1991
              }
1992
              // if (0 < f* - 1/2 < 10^(-x)) then 
1993
              //   the result of the addition is exact 
1994
              // else 
1995
              //   the result of the addition is inexact
1996
              if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {     // the result may be exact
1997
                tmp64 = P256.w[1] - 0x8000000000000000ull;      // f* - 1/2
1998
                if ((tmp64 > 0x1999999999999999ull
1999
                     || (tmp64 == 0x1999999999999999ull
2000
                         && P256.w[0] >= 0x9999999999999999ull))) {
2001
                  // set the inexact flag
2002
                  *pfpsf |= INEXACT_EXCEPTION;
2003
                  is_inexact = 1;
2004
                }       // else the result is exact
2005
              } else {  // the result is inexact
2006
                // set the inexact flag
2007
                *pfpsf |= INEXACT_EXCEPTION;
2008
                is_inexact = 1;
2009
              }
2010
              C1_hi = P256.w[3];
2011
              C1_lo = P256.w[2];
2012
              if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2013
                is_inexact_lt_midpoint = is_inexact
2014
                  && (P256.w[1] & 0x8000000000000000ull);
2015
                is_inexact_gt_midpoint = is_inexact
2016
                  && !(P256.w[1] & 0x8000000000000000ull);
2017
              }
2018
              // general correction from RN to RA, RM, RP, RZ; 
2019
              // result uses y_exp
2020
              if (rnd_mode != ROUNDING_TO_NEAREST) {
2021
                if ((!x_sign
2022
                     &&
2023
                     ((rnd_mode == ROUNDING_UP
2024
                       && is_inexact_lt_midpoint)
2025
                      ||
2026
                      ((rnd_mode == ROUNDING_TIES_AWAY
2027
                        || rnd_mode == ROUNDING_UP)
2028
                       && is_midpoint_gt_even))) || (x_sign
2029
                                                     &&
2030
                                                     ((rnd_mode ==
2031
                                                       ROUNDING_DOWN
2032
                                                       &&
2033
                                                       is_inexact_lt_midpoint)
2034
                                                      ||
2035
                                                      ((rnd_mode ==
2036
                                                        ROUNDING_TIES_AWAY
2037
                                                        || rnd_mode ==
2038
                                                        ROUNDING_DOWN)
2039
                                                       &&
2040
                                                       is_midpoint_gt_even))))
2041
                {
2042
                  // C1 = C1 + 1
2043
                  C1_lo = C1_lo + 1;
2044
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
2045
                    C1_hi = C1_hi + 1;
2046
                  }
2047
                  if (C1_hi == 0x0001ed09bead87c0ull
2048
                      && C1_lo == 0x378d8e6400000000ull) {
2049
                    // C1 = 10^34 => rounding overflow
2050
                    C1_hi = 0x0000314dc6448d93ull;
2051
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
2052
                    y_exp = y_exp + EXP_P1;
2053
                  }
2054
                } else
2055
                  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2056
                      &&
2057
                      ((x_sign
2058
                        && (rnd_mode == ROUNDING_UP
2059
                            || rnd_mode == ROUNDING_TO_ZERO))
2060
                       || (!x_sign
2061
                           && (rnd_mode == ROUNDING_DOWN
2062
                               || rnd_mode == ROUNDING_TO_ZERO)))) {
2063
                  // C1 = C1 - 1
2064
                  C1_lo = C1_lo - 1;
2065
                  if (C1_lo == 0xffffffffffffffffull)
2066
                    C1_hi--;
2067
                  // check if we crossed into the lower decade
2068
                  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {       // 10^33 - 1
2069
                    C1_hi = 0x0001ed09bead87c0ull;      // 10^34 - 1
2070
                    C1_lo = 0x378d8e63ffffffffull;
2071
                    y_exp = y_exp - EXP_P1;
2072
                    // no underflow, because delta + q2 >= P34 + 1
2073
                  }
2074
                } else {
2075
                  ;     // exact, the result is already correct
2076
                }
2077
                // in all cases check for overflow (RN and RA solved already)
2078
                if (y_exp == EXP_MAX_P1) {      // overflow
2079
                  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||  // RM and res < 0
2080
                      (rnd_mode == ROUNDING_UP && !x_sign)) {   // RP and res > 0
2081
                    C1_hi = 0x7800000000000000ull;      // +inf
2082
                    C1_lo = 0x0ull;
2083
                  } else {      // RM and res > 0, RP and res < 0, or RZ
2084
                    C1_hi = 0x5fffed09bead87c0ull;
2085
                    C1_lo = 0x378d8e63ffffffffull;
2086
                  }
2087
                  y_exp = 0;     // x_sign is preserved
2088
                  // set the inexact flag (in case the exact addition was exact)
2089
                  *pfpsf |= INEXACT_EXCEPTION;
2090
                  // set the overflow flag
2091
                  *pfpsf |= OVERFLOW_EXCEPTION;
2092
                }
2093
              }
2094
            }   // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2095
          } else {      // if x_sign != y_sign the result is exact
2096
            C1_lo = C1_lo - C2_lo;
2097
            C1_hi = C1_hi - C2_hi;
2098
            if (C1_lo > C1.w[0])
2099
              C1_hi--;
2100
            // the result can be zero, but it cannot overflow
2101
            if (C1_lo == 0 && C1_hi == 0) {
2102
              // assemble the result
2103
              if (x_exp < y_exp)
2104
                res.w[1] = x_exp;
2105
              else
2106
                res.w[1] = y_exp;
2107
              res.w[0] = 0;
2108
              if (rnd_mode == ROUNDING_DOWN) {
2109
                res.w[1] |= 0x8000000000000000ull;
2110
              }
2111
              BID_SWAP128 (res);
2112
              BID_RETURN (res);
2113
            }
2114
            if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
2115
              C1_lo = ~C1_lo;
2116
              C1_lo++;
2117
              C1_hi = ~C1_hi;
2118
              if (C1_lo == 0x0)
2119
                C1_hi++;
2120
              x_sign = y_sign;  // the result will have the sign of y
2121
            }
2122
          }
2123
          // assemble the result
2124
          res.w[1] = x_sign | y_exp | C1_hi;
2125
          res.w[0] = C1_lo;
2126
        } else {        // if (delta >= P34 + 1 - q2)
2127
          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
2128
          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
2129
          // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
2130
          // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
2131
          // If the result has P34+1 digits, redo the steps above with x1+1
2132
          // If the result has P34-1 digits or less, redo the steps above with 
2133
          // x1-1 but only if initially x1 >= 1
2134
          // NOTE: these two steps can be improved, e.g. we could guess if
2135
          // P34+1 or P34-1 digits will be obtained by adding/subtracting just
2136
          // the top 64 bits of the two operands
2137
          // The result cannot be zero, but it can overflow
2138
          x1 = delta + q2 - P34;        // 1 <= x1 <= P34-1
2139
        roundC2:
2140
          // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
2141
          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
2142
          scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
2143
          // either C1 or 10^(e1-e2-x1) may not fit in 64 bits,
2144
          // but their product fits with certainty in 128 bits (actually in 113)
2145
          if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
2146
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2147
          } else if (scale >= 1) {
2148
            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
2149
            if (q1 <= 19) {     // C1 fits in 64 bits
2150
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2151
            } else {    // q1 >= 20
2152
              C1.w[1] = C1_hi;
2153
              C1.w[0] = C1_lo;
2154
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2155
            }
2156
          } else {      // if (scale == 0) C1 is unchanged
2157
            C1.w[1] = C1_hi;
2158
            C1.w[0] = C1_lo;
2159
          }
2160
          tmp64 = C1.w[0];       // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
2161
 
2162
          // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
2163
          // (but if we got here a second time after x1 = x1 - 1, then 
2164
          // x1 >= 0; note that for x1 = 0 C2 is unchanged)
2165
          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
2166
          ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
2167
          // during a second pass, then ind = -1
2168
          if (ind >= 0) {        // if (x1 >= 1)
2169
            C2.w[0] = C2_lo;
2170
            C2.w[1] = C2_hi;
2171
            if (ind <= 18) {
2172
              C2.w[0] = C2.w[0] + midpoint64[ind];
2173
              if (C2.w[0] < C2_lo)
2174
                C2.w[1]++;
2175
            } else {    // 19 <= ind <= 32
2176
              C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
2177
              C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
2178
              if (C2.w[0] < C2_lo)
2179
                C2.w[1]++;
2180
            }
2181
            // the approximation of 10^(-x1) was rounded up to 118 bits
2182
            __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
2183
            // calculate C2* and f2*
2184
            // C2* is actually floor(C2*) in this case
2185
            // C2* and f2* need shifting and masking, as shown by
2186
            // shiftright128[] and maskhigh128[]
2187
            // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
2188
            // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
2189
            // if (0 < f2* < 10^(-x1)) then
2190
            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
2191
            //       shift; C2* has p decimal digits, correct by Prop. 1)
2192
            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
2193
            //       shift; C2* has p decimal digits, correct by Pr. 1)
2194
            // else
2195
            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
2196
            //       correct by Property 1)
2197
            // n = C2* * 10^(e2+x1)
2198
 
2199
            if (ind <= 2) {
2200
              highf2star.w[1] = 0x0;
2201
              highf2star.w[0] = 0x0;     // low f2* ok
2202
            } else if (ind <= 21) {
2203
              highf2star.w[1] = 0x0;
2204
              highf2star.w[0] = R256.w[2] & maskhigh128[ind];    // low f2* ok
2205
            } else {
2206
              highf2star.w[1] = R256.w[3] & maskhigh128[ind];
2207
              highf2star.w[0] = R256.w[2];       // low f2* is ok
2208
            }
2209
            // shift right C2* by Ex-128 = shiftright128[ind]
2210
            if (ind >= 3) {
2211
              shift = shiftright128[ind];
2212
              if (shift < 64) { // 3 <= shift <= 63
2213
                R256.w[2] =
2214
                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
2215
                R256.w[3] = (R256.w[3] >> shift);
2216
              } else {  // 66 <= shift <= 102
2217
                R256.w[2] = (R256.w[3] >> (shift - 64));
2218
                R256.w[3] = 0x0ULL;
2219
              }
2220
            }
2221
            if (second_pass) {
2222
              is_inexact_lt_midpoint = 0;
2223
              is_inexact_gt_midpoint = 0;
2224
              is_midpoint_lt_even = 0;
2225
              is_midpoint_gt_even = 0;
2226
            }
2227
            // determine inexactness of the rounding of C2* (this may be 
2228
            // followed by a second rounding only if we get P34+1 
2229
            // decimal digits)
2230
            // if (0 < f2* - 1/2 < 10^(-x1)) then
2231
            //   the result is exact
2232
            // else (if f2* - 1/2 > T* then)
2233
            //   the result is inexact
2234
            if (ind <= 2) {
2235
              if (R256.w[1] > 0x8000000000000000ull ||
2236
                  (R256.w[1] == 0x8000000000000000ull
2237
                   && R256.w[0] > 0x0ull)) {
2238
                // f2* > 1/2 and the result may be exact
2239
                tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
2240
                if ((tmp64A > ten2mk128trunc[ind].w[1]
2241
                     || (tmp64A == ten2mk128trunc[ind].w[1]
2242
                         && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
2243
                  // set the inexact flag
2244
                  // *pfpsf |= INEXACT_EXCEPTION;
2245
                  tmp_inexact = 1;      // may be set again during a second pass
2246
                  // this rounding is applied to C2 only!
2247
                  if (x_sign == y_sign)
2248
                    is_inexact_lt_midpoint = 1;
2249
                  else  // if (x_sign != y_sign)
2250
                    is_inexact_gt_midpoint = 1;
2251
                }       // else the result is exact
2252
                // rounding down, unless a midpoint in [ODD, EVEN]
2253
              } else {  // the result is inexact; f2* <= 1/2
2254
                // set the inexact flag
2255
                // *pfpsf |= INEXACT_EXCEPTION;
2256
                tmp_inexact = 1;        // just in case we will round a second time
2257
                // rounding up, unless a midpoint in [EVEN, ODD]
2258
                // this rounding is applied to C2 only!
2259
                if (x_sign == y_sign)
2260
                  is_inexact_gt_midpoint = 1;
2261
                else    // if (x_sign != y_sign)
2262
                  is_inexact_lt_midpoint = 1;
2263
              }
2264
            } else if (ind <= 21) {     // if 3 <= ind <= 21
2265
              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
2266
                                            && highf2star.w[0] >
2267
                                            onehalf128[ind])
2268
                  || (highf2star.w[1] == 0x0
2269
                      && highf2star.w[0] == onehalf128[ind]
2270
                      && (R256.w[1] || R256.w[0]))) {
2271
                // f2* > 1/2 and the result may be exact
2272
                // Calculate f2* - 1/2
2273
                tmp64A = highf2star.w[0] - onehalf128[ind];
2274
                tmp64B = highf2star.w[1];
2275
                if (tmp64A > highf2star.w[0])
2276
                  tmp64B--;
2277
                if (tmp64B || tmp64A
2278
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
2279
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2280
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
2281
                  // set the inexact flag
2282
                  // *pfpsf |= INEXACT_EXCEPTION;
2283
                  tmp_inexact = 1;      // may be set again during a second pass
2284
                  // this rounding is applied to C2 only!
2285
                  if (x_sign == y_sign)
2286
                    is_inexact_lt_midpoint = 1;
2287
                  else  // if (x_sign != y_sign)
2288
                    is_inexact_gt_midpoint = 1;
2289
                }       // else the result is exact
2290
              } else {  // the result is inexact; f2* <= 1/2
2291
                // set the inexact flag
2292
                // *pfpsf |= INEXACT_EXCEPTION;
2293
                tmp_inexact = 1;        // may be set again during a second pass
2294
                // rounding up, unless a midpoint in [EVEN, ODD]
2295
                // this rounding is applied to C2 only!
2296
                if (x_sign == y_sign)
2297
                  is_inexact_gt_midpoint = 1;
2298
                else    // if (x_sign != y_sign)
2299
                  is_inexact_lt_midpoint = 1;
2300
              }
2301
            } else {    // if 22 <= ind <= 33
2302
              if (highf2star.w[1] > onehalf128[ind]
2303
                  || (highf2star.w[1] == onehalf128[ind]
2304
                      && (highf2star.w[0] || R256.w[1]
2305
                          || R256.w[0]))) {
2306
                // f2* > 1/2 and the result may be exact
2307
                // Calculate f2* - 1/2
2308
                // tmp64A = highf2star.w[0];
2309
                tmp64B = highf2star.w[1] - onehalf128[ind];
2310
                if (tmp64B || highf2star.w[0]
2311
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
2312
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2313
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
2314
                  // set the inexact flag
2315
                  // *pfpsf |= INEXACT_EXCEPTION;
2316
                  tmp_inexact = 1;      // may be set again during a second pass
2317
                  // this rounding is applied to C2 only!
2318
                  if (x_sign == y_sign)
2319
                    is_inexact_lt_midpoint = 1;
2320
                  else  // if (x_sign != y_sign)
2321
                    is_inexact_gt_midpoint = 1;
2322
                }       // else the result is exact
2323
              } else {  // the result is inexact; f2* <= 1/2
2324
                // set the inexact flag
2325
                // *pfpsf |= INEXACT_EXCEPTION;
2326
                tmp_inexact = 1;        // may be set again during a second pass
2327
                // rounding up, unless a midpoint in [EVEN, ODD]
2328
                // this rounding is applied to C2 only!
2329
                if (x_sign == y_sign)
2330
                  is_inexact_gt_midpoint = 1;
2331
                else    // if (x_sign != y_sign)
2332
                  is_inexact_lt_midpoint = 1;
2333
              }
2334
            }
2335
            // check for midpoints
2336
            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
2337
                && (highf2star.w[0] == 0)
2338
                && (R256.w[1] < ten2mk128trunc[ind].w[1]
2339
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
2340
                        && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
2341
              // the result is a midpoint
2342
              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
2343
                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
2344
                R256.w[2]--;
2345
                if (R256.w[2] == 0xffffffffffffffffull)
2346
                  R256.w[3]--;
2347
                // this rounding is applied to C2 only!
2348
                if (x_sign == y_sign)
2349
                  is_midpoint_gt_even = 1;
2350
                else    // if (x_sign != y_sign)
2351
                  is_midpoint_lt_even = 1;
2352
                is_inexact_lt_midpoint = 0;
2353
                is_inexact_gt_midpoint = 0;
2354
              } else {
2355
                // else MP in [ODD, EVEN]
2356
                // this rounding is applied to C2 only!
2357
                if (x_sign == y_sign)
2358
                  is_midpoint_lt_even = 1;
2359
                else    // if (x_sign != y_sign)
2360
                  is_midpoint_gt_even = 1;
2361
                is_inexact_lt_midpoint = 0;
2362
                is_inexact_gt_midpoint = 0;
2363
              }
2364
            }
2365
            // end if (ind >= 0)
2366
          } else {      // if (ind == -1); only during a 2nd pass, and when x1 = 0
2367
            R256.w[2] = C2_lo;
2368
            R256.w[3] = C2_hi;
2369
            tmp_inexact = 0;
2370
            // to correct a possible setting to 1 from 1st pass
2371
            if (second_pass) {
2372
              is_midpoint_lt_even = 0;
2373
              is_midpoint_gt_even = 0;
2374
              is_inexact_lt_midpoint = 0;
2375
              is_inexact_gt_midpoint = 0;
2376
            }
2377
          }
2378
          // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
2379
          if (x_sign == y_sign) {       // addition; could overflow
2380
            // no second pass is possible this way (only for x_sign != y_sign)
2381
            C1.w[0] = C1.w[0] + R256.w[2];
2382
            C1.w[1] = C1.w[1] + R256.w[3];
2383
            if (C1.w[0] < tmp64)
2384
              C1.w[1]++;        // carry
2385
            // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
2386
            // with x1=x1+1 
2387
            if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {     // C1 >= 10^34
2388
              // chop off one more digit from the sum, but make sure there is
2389
              // no double-rounding error (see table - double rounding logic)
2390
              // now round C1 from P34+1 to P34 decimal digits
2391
              // C1' = C1 + 1/2 * 10 = C1 + 5
2392
              if (C1.w[0] >= 0xfffffffffffffffbull) {    // low half add has carry
2393
                C1.w[0] = C1.w[0] + 5;
2394
                C1.w[1] = C1.w[1] + 1;
2395
              } else {
2396
                C1.w[0] = C1.w[0] + 5;
2397
              }
2398
              // the approximation of 10^(-1) was rounded up to 118 bits
2399
              __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);     // Q256 = C1*, f1*
2400
              // C1* is actually floor(C1*) in this case
2401
              // the top 128 bits of 10^(-1) are
2402
              // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
2403
              // if (0 < f1* < 10^(-1)) then
2404
              //   if floor(C1*) is even then C1* = floor(C1*) - logical right
2405
              //       shift; C1* has p decimal digits, correct by Prop. 1)
2406
              //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
2407
              //       shift; C1* has p decimal digits, correct by Pr. 1)
2408
              // else
2409
              //   C1* = floor(C1*) (logical right shift; C has p decimal digits
2410
              //       correct by Property 1)
2411
              // n = C1* * 10^(e2+x1+1)
2412
              if ((Q256.w[1] || Q256.w[0])
2413
                  && (Q256.w[1] < ten2mk128trunc[0].w[1]
2414
                      || (Q256.w[1] == ten2mk128trunc[0].w[1]
2415
                          && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
2416
                // the result is a midpoint
2417
                if (is_inexact_lt_midpoint) {   // for the 1st rounding
2418
                  is_inexact_gt_midpoint = 1;
2419
                  is_inexact_lt_midpoint = 0;
2420
                  is_midpoint_gt_even = 0;
2421
                  is_midpoint_lt_even = 0;
2422
                } else if (is_inexact_gt_midpoint) {    // for the 1st rounding
2423
                  Q256.w[2]--;
2424
                  if (Q256.w[2] == 0xffffffffffffffffull)
2425
                    Q256.w[3]--;
2426
                  is_inexact_gt_midpoint = 0;
2427
                  is_inexact_lt_midpoint = 1;
2428
                  is_midpoint_gt_even = 0;
2429
                  is_midpoint_lt_even = 0;
2430
                } else if (is_midpoint_gt_even) {       // for the 1st rounding
2431
                  // Note: cannot have is_midpoint_lt_even
2432
                  is_inexact_gt_midpoint = 0;
2433
                  is_inexact_lt_midpoint = 1;
2434
                  is_midpoint_gt_even = 0;
2435
                  is_midpoint_lt_even = 0;
2436
                } else {        // the first rounding must have been exact
2437
                  if (Q256.w[2] & 0x01) {       // MP in [EVEN, ODD]
2438
                    // the truncated result is correct
2439
                    Q256.w[2]--;
2440
                    if (Q256.w[2] == 0xffffffffffffffffull)
2441
                      Q256.w[3]--;
2442
                    is_inexact_gt_midpoint = 0;
2443
                    is_inexact_lt_midpoint = 0;
2444
                    is_midpoint_gt_even = 1;
2445
                    is_midpoint_lt_even = 0;
2446
                  } else {      // MP in [ODD, EVEN]
2447
                    is_inexact_gt_midpoint = 0;
2448
                    is_inexact_lt_midpoint = 0;
2449
                    is_midpoint_gt_even = 0;
2450
                    is_midpoint_lt_even = 1;
2451
                  }
2452
                }
2453
                tmp_inexact = 1;        // in all cases
2454
              } else {  // the result is not a midpoint 
2455
                // determine inexactness of the rounding of C1 (the sum C1+C2*)
2456
                // if (0 < f1* - 1/2 < 10^(-1)) then
2457
                //   the result is exact
2458
                // else (if f1* - 1/2 > T* then)
2459
                //   the result is inexact
2460
                // ind = 0
2461
                if (Q256.w[1] > 0x8000000000000000ull
2462
                    || (Q256.w[1] == 0x8000000000000000ull
2463
                        && Q256.w[0] > 0x0ull)) {
2464
                  // f1* > 1/2 and the result may be exact
2465
                  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;        // f1* - 1/2
2466
                  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
2467
                       || (Q256.w[1] == ten2mk128trunc[0].w[1]
2468
                           && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
2469
                    is_inexact_gt_midpoint = 0;
2470
                    is_inexact_lt_midpoint = 1;
2471
                    is_midpoint_gt_even = 0;
2472
                    is_midpoint_lt_even = 0;
2473
                    // set the inexact flag
2474
                    tmp_inexact = 1;
2475
                    // *pfpsf |= INEXACT_EXCEPTION;
2476
                  } else {      // else the result is exact for the 2nd rounding
2477
                    if (tmp_inexact) {  // if the previous rounding was inexact
2478
                      if (is_midpoint_lt_even) {
2479
                        is_inexact_gt_midpoint = 1;
2480
                        is_midpoint_lt_even = 0;
2481
                      } else if (is_midpoint_gt_even) {
2482
                        is_inexact_lt_midpoint = 1;
2483
                        is_midpoint_gt_even = 0;
2484
                      } else {
2485
                        ;       // no change
2486
                      }
2487
                    }
2488
                  }
2489
                  // rounding down, unless a midpoint in [ODD, EVEN]
2490
                } else {        // the result is inexact; f1* <= 1/2
2491
                  is_inexact_gt_midpoint = 1;
2492
                  is_inexact_lt_midpoint = 0;
2493
                  is_midpoint_gt_even = 0;
2494
                  is_midpoint_lt_even = 0;
2495
                  // set the inexact flag
2496
                  tmp_inexact = 1;
2497
                  // *pfpsf |= INEXACT_EXCEPTION;
2498
                }
2499
              } // end 'the result is not a midpoint'
2500
              // n = C1 * 10^(e2+x1)
2501
              C1.w[1] = Q256.w[3];
2502
              C1.w[0] = Q256.w[2];
2503
              y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
2504
            } else {    // C1 < 10^34
2505
              // C1.w[1] and C1.w[0] already set
2506
              // n = C1 * 10^(e2+x1)
2507
              y_exp = y_exp + ((UINT64) x1 << 49);
2508
            }
2509
            // check for overflow
2510
            if (y_exp == EXP_MAX_P1
2511
                && (rnd_mode == ROUNDING_TO_NEAREST
2512
                    || rnd_mode == ROUNDING_TIES_AWAY)) {
2513
              res.w[1] = 0x7800000000000000ull | x_sign;        // +/-inf
2514
              res.w[0] = 0x0ull;
2515
              // set the inexact flag
2516
              *pfpsf |= INEXACT_EXCEPTION;
2517
              // set the overflow flag
2518
              *pfpsf |= OVERFLOW_EXCEPTION;
2519
              BID_SWAP128 (res);
2520
              BID_RETURN (res);
2521
            }   // else no overflow
2522
          } else {      // if x_sign != y_sign the result of this subtract. is exact
2523
            C1.w[0] = C1.w[0] - R256.w[2];
2524
            C1.w[1] = C1.w[1] - R256.w[3];
2525
            if (C1.w[0] > tmp64)
2526
              C1.w[1]--;        // borrow
2527
            if (C1.w[1] >= 0x8000000000000000ull) {     // negative coefficient!
2528
              C1.w[0] = ~C1.w[0];
2529
              C1.w[0]++;
2530
              C1.w[1] = ~C1.w[1];
2531
              if (C1.w[0] == 0x0)
2532
                C1.w[1]++;
2533
              tmp_sign = y_sign;
2534
              // the result will have the sign of y if last rnd
2535
            } else {
2536
              tmp_sign = x_sign;
2537
            }
2538
            // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
2539
            //   redo the calculation with x1=x1-1;
2540
            // redo the calculation also if C1 = 10^33 and 
2541
            //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
2542
            //   (the last part should have really been 
2543
            //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
2544
            //    the rounding of C2, but the position flags have been reversed)
2545
            // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
2546
            if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {        // C1=10^33
2547
              x1 = x1 - 1;      // x1 >= 0
2548
              if (x1 >= 0) {
2549
                // clear position flags and tmp_inexact
2550
                is_midpoint_lt_even = 0;
2551
                is_midpoint_gt_even = 0;
2552
                is_inexact_lt_midpoint = 0;
2553
                is_inexact_gt_midpoint = 0;
2554
                tmp_inexact = 0;
2555
                second_pass = 1;
2556
                goto roundC2;   // else result has less than P34 digits
2557
              }
2558
            }
2559
            // if the coefficient of the result is 10^34 it means that this
2560
            // must be the second pass, and we are done 
2561
            if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {  // if  C1 = 10^34
2562
              C1.w[1] = 0x0000314dc6448d93ull;  // C1 = 10^33
2563
              C1.w[0] = 0x38c15b0a00000000ull;
2564
              y_exp = y_exp + ((UINT64) 1 << 49);
2565
            }
2566
            x_sign = tmp_sign;
2567
            if (x1 >= 1)
2568
              y_exp = y_exp + ((UINT64) x1 << 49);
2569
            // x1 = -1 is possible at the end of a second pass when the 
2570
            // first pass started with x1 = 1 
2571
          }
2572
          C1_hi = C1.w[1];
2573
          C1_lo = C1.w[0];
2574
          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2575
          if (rnd_mode != ROUNDING_TO_NEAREST) {
2576
            if ((!x_sign
2577
                 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
2578
                     ||
2579
                     ((rnd_mode == ROUNDING_TIES_AWAY
2580
                       || rnd_mode == ROUNDING_UP)
2581
                      && is_midpoint_gt_even))) || (x_sign
2582
                                                    &&
2583
                                                    ((rnd_mode ==
2584
                                                      ROUNDING_DOWN
2585
                                                      &&
2586
                                                      is_inexact_lt_midpoint)
2587
                                                     ||
2588
                                                     ((rnd_mode ==
2589
                                                       ROUNDING_TIES_AWAY
2590
                                                       || rnd_mode ==
2591
                                                       ROUNDING_DOWN)
2592
                                                      &&
2593
                                                      is_midpoint_gt_even))))
2594
            {
2595
              // C1 = C1 + 1
2596
              C1_lo = C1_lo + 1;
2597
              if (C1_lo == 0) {  // rounding overflow in the low 64 bits
2598
                C1_hi = C1_hi + 1;
2599
              }
2600
              if (C1_hi == 0x0001ed09bead87c0ull
2601
                  && C1_lo == 0x378d8e6400000000ull) {
2602
                // C1 = 10^34 => rounding overflow
2603
                C1_hi = 0x0000314dc6448d93ull;
2604
                C1_lo = 0x38c15b0a00000000ull;  // 10^33
2605
                y_exp = y_exp + EXP_P1;
2606
              }
2607
            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
2608
                       &&
2609
                       ((x_sign
2610
                         && (rnd_mode == ROUNDING_UP
2611
                             || rnd_mode == ROUNDING_TO_ZERO))
2612
                        || (!x_sign
2613
                            && (rnd_mode == ROUNDING_DOWN
2614
                                || rnd_mode == ROUNDING_TO_ZERO)))) {
2615
              // C1 = C1 - 1
2616
              C1_lo = C1_lo - 1;
2617
              if (C1_lo == 0xffffffffffffffffull)
2618
                C1_hi--;
2619
              // check if we crossed into the lower decade
2620
              if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
2621
                C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
2622
                C1_lo = 0x378d8e63ffffffffull;
2623
                y_exp = y_exp - EXP_P1;
2624
                // no underflow, because delta + q2 >= P34 + 1
2625
              }
2626
            } else {
2627
              ; // exact, the result is already correct
2628
            }
2629
            // in all cases check for overflow (RN and RA solved already)
2630
            if (y_exp == EXP_MAX_P1) {  // overflow
2631
              if ((rnd_mode == ROUNDING_DOWN && x_sign) ||      // RM and res < 0
2632
                  (rnd_mode == ROUNDING_UP && !x_sign)) {       // RP and res > 0
2633
                C1_hi = 0x7800000000000000ull;  // +inf
2634
                C1_lo = 0x0ull;
2635
              } else {  // RM and res > 0, RP and res < 0, or RZ
2636
                C1_hi = 0x5fffed09bead87c0ull;
2637
                C1_lo = 0x378d8e63ffffffffull;
2638
              }
2639
              y_exp = 0; // x_sign is preserved
2640
              // set the inexact flag (in case the exact addition was exact)
2641
              *pfpsf |= INEXACT_EXCEPTION;
2642
              // set the overflow flag
2643
              *pfpsf |= OVERFLOW_EXCEPTION;
2644
            }
2645
          }
2646
          // assemble the result
2647
          res.w[1] = x_sign | y_exp | C1_hi;
2648
          res.w[0] = C1_lo;
2649
          if (tmp_inexact)
2650
            *pfpsf |= INEXACT_EXCEPTION;
2651
        }
2652
      } else {  // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
2653
        // NOTE: the following, up to "} else { // if x_sign != y_sign 
2654
        // the result is exact" is identical to "else if (delta == P34 - q2) {"
2655
        // from above; also, the code is not symmetric: a+b and b+a may take
2656
        // different paths (need to unify eventually!) 
2657
        // calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be 
2658
        // inexact if it requires P34 + 1 decimal digits; in either case the 
2659
        // 'cutoff' point for addition is at the position of the lsb of C2
2660
        // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
2661
        // exponent is e2; either C1 or 10^(e1-e2) may not fit in 64 bits,
2662
        // but their product fits with certainty in 128 bits (actually in 113)
2663
        // Note that 0 <= e1 - e2 <= P34 - 2
2664
        //   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
2665
        //   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
2666
        //   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
2667
        //   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
2668
        scale = delta - q1 + q2;        // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
2669
        if (scale >= 20) {      // 10^(e1-e2) does not fit in 64 bits, but C1 does
2670
          __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
2671
        } else if (scale >= 1) {
2672
          // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
2673
          if (q1 <= 19) {       // C1 fits in 64 bits
2674
            __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
2675
          } else {      // q1 >= 20
2676
            C1.w[1] = C1_hi;
2677
            C1.w[0] = C1_lo;
2678
            __mul_128x64_to_128 (C1, ten2k64[scale], C1);
2679
          }
2680
        } else {        // if (scale == 0) C1 is unchanged
2681
          C1.w[1] = C1_hi;
2682
          C1.w[0] = C1_lo;       // only the low part is necessary
2683
        }
2684
        C1_hi = C1.w[1];
2685
        C1_lo = C1.w[0];
2686
        // now add C2
2687
        if (x_sign == y_sign) {
2688
          // the result can overflow!
2689
          C1_lo = C1_lo + C2_lo;
2690
          C1_hi = C1_hi + C2_hi;
2691
          if (C1_lo < C1.w[0])
2692
            C1_hi++;
2693
          // test for overflow, possible only when C1 >= 10^34
2694
          if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {    // C1 >= 10^34
2695
            // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
2696
            // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
2697
            // decimal digits
2698
            // Calculate C'' = C' + 1/2 * 10^x
2699
            if (C1_lo >= 0xfffffffffffffffbull) {       // low half add has carry
2700
              C1_lo = C1_lo + 5;
2701
              C1_hi = C1_hi + 1;
2702
            } else {
2703
              C1_lo = C1_lo + 5;
2704
            }
2705
            // the approximation of 10^(-1) was rounded up to 118 bits
2706
            // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
2707
            // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
2708
            C1.w[1] = C1_hi;
2709
            C1.w[0] = C1_lo;     // C''
2710
            ten2m1.w[1] = 0x1999999999999999ull;
2711
            ten2m1.w[0] = 0x9999999999999a00ull;
2712
            __mul_128x128_to_256 (P256, C1, ten2m1);    // P256 = C*, f*
2713
            // C* is actually floor(C*) in this case
2714
            // the top Ex = 128 bits of 10^(-1) are 
2715
            // T* = 0x00199999999999999999999999999999
2716
            // if (0 < f* < 10^(-x)) then
2717
            //   if floor(C*) is even then C = floor(C*) - logical right 
2718
            //       shift; C has p decimal digits, correct by Prop. 1)
2719
            //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
2720
            //       shift; C has p decimal digits, correct by Pr. 1)
2721
            // else
2722
            //   C = floor(C*) (logical right shift; C has p decimal digits,
2723
            //       correct by Property 1)
2724
            // n = C * 10^(e2+x)
2725
            if ((P256.w[1] || P256.w[0])
2726
                && (P256.w[1] < 0x1999999999999999ull
2727
                    || (P256.w[1] == 0x1999999999999999ull
2728
                        && P256.w[0] <= 0x9999999999999999ull))) {
2729
              // the result is a midpoint
2730
              if (P256.w[2] & 0x01) {
2731
                is_midpoint_gt_even = 1;
2732
                // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
2733
                P256.w[2]--;
2734
                if (P256.w[2] == 0xffffffffffffffffull)
2735
                  P256.w[3]--;
2736
              } else {
2737
                is_midpoint_lt_even = 1;
2738
              }
2739
            }
2740
            // n = Cstar * 10^(e2+1)
2741
            y_exp = y_exp + EXP_P1;
2742
            // C* != 10^P34 because C* has P34 digits
2743
            // check for overflow
2744
            if (y_exp == EXP_MAX_P1
2745
                && (rnd_mode == ROUNDING_TO_NEAREST
2746
                    || rnd_mode == ROUNDING_TIES_AWAY)) {
2747
              // overflow for RN
2748
              res.w[1] = x_sign | 0x7800000000000000ull;        // +/-inf
2749
              res.w[0] = 0x0ull;
2750
              // set the inexact flag
2751
              *pfpsf |= INEXACT_EXCEPTION;
2752
              // set the overflow flag
2753
              *pfpsf |= OVERFLOW_EXCEPTION;
2754
              BID_SWAP128 (res);
2755
              BID_RETURN (res);
2756
            }
2757
            // if (0 < f* - 1/2 < 10^(-x)) then 
2758
            //   the result of the addition is exact 
2759
            // else 
2760
            //   the result of the addition is inexact
2761
            if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {       // the result may be exact
2762
              tmp64 = P256.w[1] - 0x8000000000000000ull;        // f* - 1/2
2763
              if ((tmp64 > 0x1999999999999999ull
2764
                   || (tmp64 == 0x1999999999999999ull
2765
                       && P256.w[0] >= 0x9999999999999999ull))) {
2766
                // set the inexact flag
2767
                *pfpsf |= INEXACT_EXCEPTION;
2768
                is_inexact = 1;
2769
              } // else the result is exact
2770
            } else {    // the result is inexact
2771
              // set the inexact flag
2772
              *pfpsf |= INEXACT_EXCEPTION;
2773
              is_inexact = 1;
2774
            }
2775
            C1_hi = P256.w[3];
2776
            C1_lo = P256.w[2];
2777
            if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2778
              is_inexact_lt_midpoint = is_inexact
2779
                && (P256.w[1] & 0x8000000000000000ull);
2780
              is_inexact_gt_midpoint = is_inexact
2781
                && !(P256.w[1] & 0x8000000000000000ull);
2782
            }
2783
            // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2784
            if (rnd_mode != ROUNDING_TO_NEAREST) {
2785
              if ((!x_sign
2786
                   && ((rnd_mode == ROUNDING_UP
2787
                        && is_inexact_lt_midpoint)
2788
                       || ((rnd_mode == ROUNDING_TIES_AWAY
2789
                            || rnd_mode == ROUNDING_UP)
2790
                           && is_midpoint_gt_even))) || (x_sign
2791
                                                         &&
2792
                                                         ((rnd_mode ==
2793
                                                           ROUNDING_DOWN
2794
                                                           &&
2795
                                                           is_inexact_lt_midpoint)
2796
                                                          ||
2797
                                                          ((rnd_mode ==
2798
                                                            ROUNDING_TIES_AWAY
2799
                                                            || rnd_mode
2800
                                                            ==
2801
                                                            ROUNDING_DOWN)
2802
                                                           &&
2803
                                                           is_midpoint_gt_even))))
2804
              {
2805
                // C1 = C1 + 1
2806
                C1_lo = C1_lo + 1;
2807
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
2808
                  C1_hi = C1_hi + 1;
2809
                }
2810
                if (C1_hi == 0x0001ed09bead87c0ull
2811
                    && C1_lo == 0x378d8e6400000000ull) {
2812
                  // C1 = 10^34 => rounding overflow
2813
                  C1_hi = 0x0000314dc6448d93ull;
2814
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
2815
                  y_exp = y_exp + EXP_P1;
2816
                }
2817
              } else
2818
                if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2819
                    ((x_sign && (rnd_mode == ROUNDING_UP ||
2820
                                 rnd_mode == ROUNDING_TO_ZERO)) ||
2821
                     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
2822
                                  rnd_mode == ROUNDING_TO_ZERO)))) {
2823
                // C1 = C1 - 1
2824
                C1_lo = C1_lo - 1;
2825
                if (C1_lo == 0xffffffffffffffffull)
2826
                  C1_hi--;
2827
                // check if we crossed into the lower decade
2828
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
2829
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
2830
                  C1_lo = 0x378d8e63ffffffffull;
2831
                  y_exp = y_exp - EXP_P1;
2832
                  // no underflow, because delta + q2 >= P34 + 1
2833
                }
2834
              } else {
2835
                ;       // exact, the result is already correct
2836
              }
2837
              // in all cases check for overflow (RN and RA solved already)
2838
              if (y_exp == EXP_MAX_P1) {        // overflow
2839
                if ((rnd_mode == ROUNDING_DOWN && x_sign) ||    // RM and res < 0
2840
                    (rnd_mode == ROUNDING_UP && !x_sign)) {     // RP and res > 0
2841
                  C1_hi = 0x7800000000000000ull;        // +inf
2842
                  C1_lo = 0x0ull;
2843
                } else {        // RM and res > 0, RP and res < 0, or RZ
2844
                  C1_hi = 0x5fffed09bead87c0ull;
2845
                  C1_lo = 0x378d8e63ffffffffull;
2846
                }
2847
                y_exp = 0;       // x_sign is preserved
2848
                // set the inexact flag (in case the exact addition was exact)
2849
                *pfpsf |= INEXACT_EXCEPTION;
2850
                // set the overflow flag
2851
                *pfpsf |= OVERFLOW_EXCEPTION;
2852
              }
2853
            }
2854
          }     // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2855
          // assemble the result
2856
          res.w[1] = x_sign | y_exp | C1_hi;
2857
          res.w[0] = C1_lo;
2858
        } else {        // if x_sign != y_sign the result is exact
2859
          C1_lo = C2_lo - C1_lo;
2860
          C1_hi = C2_hi - C1_hi;
2861
          if (C1_lo > C2_lo)
2862
            C1_hi--;
2863
          if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
2864
            C1_lo = ~C1_lo;
2865
            C1_lo++;
2866
            C1_hi = ~C1_hi;
2867
            if (C1_lo == 0x0)
2868
              C1_hi++;
2869
            x_sign = y_sign;    // the result will have the sign of y
2870
          }
2871
          // the result can be zero, but it cannot overflow
2872
          if (C1_lo == 0 && C1_hi == 0) {
2873
            // assemble the result
2874
            if (x_exp < y_exp)
2875
              res.w[1] = x_exp;
2876
            else
2877
              res.w[1] = y_exp;
2878
            res.w[0] = 0;
2879
            if (rnd_mode == ROUNDING_DOWN) {
2880
              res.w[1] |= 0x8000000000000000ull;
2881
            }
2882
            BID_SWAP128 (res);
2883
            BID_RETURN (res);
2884
          }
2885
          // assemble the result
2886
          res.w[1] = y_sign | y_exp | C1_hi;
2887
          res.w[0] = C1_lo;
2888
        }
2889
      }
2890
    }
2891
    BID_SWAP128 (res);
2892
    BID_RETURN (res)
2893
  }
2894
}
2895
 
2896
 
2897
 
2898
// bid128_sub stands for bid128qq_sub
2899
 
2900
/*****************************************************************************
2901
 *  BID128 sub
2902
 ****************************************************************************/
2903
 
2904
#if DECIMAL_CALL_BY_REFERENCE
2905
void
2906
bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
2907
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2908
            _EXC_INFO_PARAM) {
2909
  UINT128 x = *px, y = *py;
2910
#if !DECIMAL_GLOBAL_ROUNDING
2911
  unsigned int rnd_mode = *prnd_mode;
2912
#endif
2913
#else
2914
UINT128
2915
bid128_sub (UINT128 x, UINT128 y
2916
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2917
            _EXC_INFO_PARAM) {
2918
#endif
2919
 
2920
  UINT128 res;
2921
  UINT64 y_sign;
2922
 
2923
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {        // y is not NAN
2924
    // change its sign
2925
    y_sign = y.w[HIGH_128W] & MASK_SIGN;        // 0 for positive, MASK_SIGN for negative
2926
    if (y_sign)
2927
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
2928
    else
2929
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
2930
  }
2931
#if DECIMAL_CALL_BY_REFERENCE
2932
  bid128_add (&res, &x, &y
2933
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2934
              _EXC_INFO_ARG);
2935
#else
2936
  res = bid128_add (x, y
2937
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2938
                    _EXC_INFO_ARG);
2939
#endif
2940
  BID_RETURN (res);
2941
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.