URL https://opencores.org/ocsvn/openrisc/openrisc/trunk
Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-stable/] [gcc-4.5.1/] [libgcc/] [config/] [libbid/] [bid128_add.c] - Blame information for rev 826

Details | Compare with Previous | View Log

/* Copyright (C) 2007, 2009  Free Software Foundation, Inc.
 
This file is part of GCC.
 
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
 
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.
 
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
 
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
 
#include "bid_internal.h"
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_add (UINT64 x, UINT128 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;
  UINT128 x1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, &x1, py
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
               _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x1, y
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_add (UINT128 x, UINT64 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;
  UINT128 y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_add (&res, px, &y1
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
               _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_add (x, y1
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_add (UINT128 x, UINT128 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
 
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;
 
  BID_SWAP128 (one);
#if DECIMAL_CALL_BY_REFERENCE
  bid64qqq_fma (&res, &one, &x, &y
                _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                _EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
                      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_add (UINT64 x, UINT64 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 x1, y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, &y1
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y1
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_add (UINT64 x, UINT128 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 x1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, &x1, py
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x1, y
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_add (UINT128 x, UINT64 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_add (&res, px, &y1
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_add (x, y1
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
// bid128_add stands for bid128qq_add
 
 
/*****************************************************************************
 *  BID64/BID128 sub
 ****************************************************************************/
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64dq_sub (UINT64 x, UINT128 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;
  UINT128 x1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, &x1, py
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
               _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x1, y
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qd_sub (UINT128 x, UINT64 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
  UINT64 res = 0xbaddbaddbaddbaddull;
  UINT128 y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64qq_sub (&res, px, &y1
               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
               _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid64qq_sub (x, y1
                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                     _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT64
bid64qq_sub (UINT128 x, UINT128 y
             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
             _EXC_INFO_PARAM) {
#endif
 
  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
  };
  UINT64 res = 0xbaddbaddbaddbaddull;
  UINT64 y_sign;
 
  BID_SWAP128 (one);
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {        // y is not NAN
    // change its sign
    y_sign = y.w[HIGH_128W] & MASK_SIGN;        // 0 for positive, MASK_SIGN for negative
    if (y_sign)
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
    else
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
  }
#if DECIMAL_CALL_BY_REFERENCE
  bid64qqq_fma (&res, &one, &x, &y
                _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                _EXC_INFO_ARG);
#else
  res = bid64qqq_fma (one, x, y
                      _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                      _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dd_sub (UINT64 x, UINT64 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 x1, y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, &y1
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y1
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 x = *px;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128dq_sub (UINT64 x, UINT128 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 x1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, &x1, py
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x1, y
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
  UINT64 y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128qd_sub (UINT128 x, UINT64 y
              _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
              _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT128 y1;
 
#if DECIMAL_CALL_BY_REFERENCE
  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  bid128_sub (&res, px, &y1
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
  res = bid128_sub (x, y1
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
            _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128_add (UINT128 x, UINT128 y
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
            _EXC_INFO_PARAM) {
#endif
  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
  };
  UINT64 x_sign, y_sign, tmp_sign;
  UINT64 x_exp, y_exp, tmp_exp; // e1 = x_exp, e2 = y_exp
  UINT64 C1_hi, C2_hi, tmp_signif_hi;
  UINT64 C1_lo, C2_lo, tmp_signif_lo;
  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
  UINT64 tmp64, tmp64A, tmp64B;
  BID_UI64DOUBLE tmp1, tmp2;
  int x_nr_bits, y_nr_bits;
  int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
  UINT64 halfulp64;
  UINT128 halfulp128;
  UINT128 C1, C2;
  UINT128 ten2m1;
  UINT128 highf2star;           // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
  UINT256 P256, Q256, R256;
  int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
  int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
  int second_pass = 0;
 
  BID_SWAP128 (x);
  BID_SWAP128 (y);
  x_sign = x.w[1] & MASK_SIGN;  // 0 for positive, MASK_SIGN for negative
  y_sign = y.w[1] & MASK_SIGN;  // 0 for positive, MASK_SIGN for negative
 
  // check for NaN or Infinity
  if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
      || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
    // x is special or y is special
    if ((x.w[1] & MASK_NAN) == MASK_NAN) {      // x is NAN
      // check first for non-canonical NaN payload
      if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
          (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
           && (x.w[0] > 0x38c15b09ffffffffull))) {
        x.w[1] = x.w[1] & 0xffffc00000000000ull;
        x.w[0] = 0x0ull;
      }
      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) {  // x is SNAN
        // set invalid flag
        *pfpsf |= INVALID_EXCEPTION;
        // return quiet (x)
        res.w[1] = x.w[1] & 0xfc003fffffffffffull;
        // clear out also G[6]-G[16]
        res.w[0] = x.w[0];
      } else {  // x is QNaN
        // return x
        res.w[1] = x.w[1] & 0xfc003fffffffffffull;
        // clear out G[6]-G[16]
        res.w[0] = x.w[0];
        // if y = SNaN signal invalid exception
        if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
          // set invalid flag
          *pfpsf |= INVALID_EXCEPTION;
        }
      }
      BID_SWAP128 (res);
      BID_RETURN (res);
    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {       // y is NAN
      // check first for non-canonical NaN payload
      if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
          (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
           && (y.w[0] > 0x38c15b09ffffffffull))) {
        y.w[1] = y.w[1] & 0xffffc00000000000ull;
        y.w[0] = 0x0ull;
      }
      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {  // y is SNAN
        // set invalid flag
        *pfpsf |= INVALID_EXCEPTION;
        // return quiet (y)
        res.w[1] = y.w[1] & 0xfc003fffffffffffull;
        // clear out also G[6]-G[16]
        res.w[0] = y.w[0];
      } else {  // y is QNaN
        // return y
        res.w[1] = y.w[1] & 0xfc003fffffffffffull;
        // clear out G[6]-G[16]
        res.w[0] = y.w[0];
      }
      BID_SWAP128 (res);
      BID_RETURN (res);
    } else {    // neither x not y is NaN; at least one is infinity
      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {        // x is infinity
        if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {      // y is infinity
          // if same sign, return either of them
          if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
            res.w[1] = x_sign | MASK_INF;
            res.w[0] = 0x0ull;
          } else {      // x and y are infinities of opposite signs
            // set invalid flag
            *pfpsf |= INVALID_EXCEPTION;
            // return QNaN Indefinite
            res.w[1] = 0x7c00000000000000ull;
            res.w[0] = 0x0000000000000000ull;
          }
        } else {        // y is 0 or finite
          // return x
          res.w[1] = x_sign | MASK_INF;
          res.w[0] = 0x0ull;
        }
      } else {  // x is not NaN or infinity, so y must be infinity
        res.w[1] = y_sign | MASK_INF;
        res.w[0] = 0x0ull;
      }
      BID_SWAP128 (res);
      BID_RETURN (res);
    }
  }
  // unpack the arguments
 
  // unpack x 
  C1_hi = x.w[1] & MASK_COEFF;
  C1_lo = x.w[0];
  // test for non-canonical values:
  // - values whose encoding begins with x00, x01, or x10 and whose 
  //   coefficient is larger than 10^34 -1, or
  // - values whose encoding begins with x1100, x1101, x1110 (if NaNs 
  //   and infinitis were eliminated already this test is reduced to 
  //   checking for x10x) 
 
  // x is not infinity; check for non-canonical values - treated as zero
  if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
    // G0_G1=11; non-canonical
    x_exp = (x.w[1] << 2) & MASK_EXP;   // biased and shifted left 49 bits
    C1_hi = 0;   // significand high
    C1_lo = 0;   // significand low
  } else {      // G0_G1 != 11
    x_exp = x.w[1] & MASK_EXP;  // biased and shifted left 49 bits
    if (C1_hi > 0x0001ed09bead87c0ull ||
        (C1_hi == 0x0001ed09bead87c0ull
         && C1_lo > 0x378d8e63ffffffffull)) {
      // x is non-canonical if coefficient is larger than 10^34 -1
      C1_hi = 0;
      C1_lo = 0;
    } else {    // canonical
      ;
    }
  }
 
  // unpack y  
  C2_hi = y.w[1] & MASK_COEFF;
  C2_lo = y.w[0];
  // y is not infinity; check for non-canonical values - treated as zero 
  if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
    // G0_G1=11; non-canonical 
    y_exp = (y.w[1] << 2) & MASK_EXP;   // biased and shifted left 49 bits
    C2_hi = 0;   // significand high
    C2_lo = 0;   // significand low 
  } else {      // G0_G1 != 11 
    y_exp = y.w[1] & MASK_EXP;  // biased and shifted left 49 bits
    if (C2_hi > 0x0001ed09bead87c0ull ||
        (C2_hi == 0x0001ed09bead87c0ull
         && C2_lo > 0x378d8e63ffffffffull)) {
      // y is non-canonical if coefficient is larger than 10^34 -1 
      C2_hi = 0;
      C2_lo = 0;
    } else {    // canonical
      ;
    }
  }
 
  if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
    // x is 0 and y is not special
    // if y is 0 return 0 with the smaller exponent
    if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
      if (x_exp < y_exp)
        res.w[1] = x_exp;
      else
        res.w[1] = y_exp;
      if (x_sign && y_sign)
        res.w[1] = res.w[1] | x_sign;   // both negative
      else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
        res.w[1] = res.w[1] | 0x8000000000000000ull;    // -0
      // else; // res = +0
      res.w[0] = 0;
    } else {
      // for 0 + y return y, with the preferred exponent
      if (y_exp <= x_exp) {
        res.w[1] = y.w[1];
        res.w[0] = y.w[0];
      } else {  // if y_exp > x_exp
        // return (C2 * 10^scale) * 10^(y_exp - scale)
        // where scale = min (P34-q2, y_exp-x_exp)
        // determine q2 = nr. of decimal digits in y
        //  determine first the nr. of bits in y (y_nr_bits)
 
        if (C2_hi == 0) {        // y_bits is the nr. of bits in C2_lo
          if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
            // split the 64-bit value in two 32-bit halves to avoid 
            // rounding errors
            if (C2_lo >= 0x0000000100000000ull) {       // y >= 2^32
              tmp2.d = (double) (C2_lo >> 32);  // exact conversion
              y_nr_bits =
                32 +
                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
            } else {    // y < 2^32
              tmp2.d = (double) (C2_lo);        // exact conversion
              y_nr_bits =
                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
            }
          } else {      // if y < 2^53
            tmp2.d = (double) C2_lo;    // exact conversion
            y_nr_bits =
              ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
          }
        } else {        // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
          tmp2.d = (double) C2_hi;      // exact conversion
          y_nr_bits =
            64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
        }
        q2 = nr_digits[y_nr_bits].digits;
        if (q2 == 0) {
          q2 = nr_digits[y_nr_bits].digits1;
          if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
              (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
               C2_lo >= nr_digits[y_nr_bits].threshold_lo))
            q2++;
        }
        // return (C2 * 10^scale) * 10^(y_exp - scale)
        // where scale = min (P34-q2, y_exp-x_exp)
        scale = P34 - q2;
        ind = (y_exp - x_exp) >> 49;
        if (ind < scale)
          scale = ind;
        if (scale == 0) {
          res.w[1] = y.w[1];
          res.w[0] = y.w[0];
        } else if (q2 <= 19) {  // y fits in 64 bits 
          if (scale <= 19) {    // 10^scale fits in 64 bits
            // 64 x 64 C2_lo * ten2k64[scale]
            __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
          } else {      // 10^scale fits in 128 bits
            // 64 x 128 C2_lo * ten2k128[scale - 20]
            __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
          }
        } else {        // y fits in 128 bits, but 10^scale must fit in 64 bits 
          // 64 x 128 ten2k64[scale] * C2
          C2.w[1] = C2_hi;
          C2.w[0] = C2_lo;
          __mul_128x64_to_128 (res, ten2k64[scale], C2);
        }
        // subtract scale from the exponent
        y_exp = y_exp - ((UINT64) scale << 49);
        res.w[1] = res.w[1] | y_sign | y_exp;
      }
    }
    BID_SWAP128 (res);
    BID_RETURN (res);
  } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
    // y is 0 and x is not special, and not zero
    // for x + 0 return x, with the preferred exponent
    if (x_exp <= y_exp) {
      res.w[1] = x.w[1];
      res.w[0] = x.w[0];
    } else {    // if x_exp > y_exp
      // return (C1 * 10^scale) * 10^(x_exp - scale)
      // where scale = min (P34-q1, x_exp-y_exp)
      // determine q1 = nr. of decimal digits in x
      //  determine first the nr. of bits in x
      if (C1_hi == 0) {  // x_bits is the nr. of bits in C1_lo
        if (C1_lo >= 0x0020000000000000ull) {   // x >= 2^53
          // split the 64-bit value in two 32-bit halves to avoid 
          // rounding errors
          if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
            tmp1.d = (double) (C1_lo >> 32);    // exact conversion
            x_nr_bits =
              32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
                    0x3ff);
          } else {      // x < 2^32
            tmp1.d = (double) (C1_lo);  // exact conversion
            x_nr_bits =
              ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
          }
        } else {        // if x < 2^53
          tmp1.d = (double) C1_lo;      // exact conversion
          x_nr_bits =
            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
        }
      } else {  // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
        tmp1.d = (double) C1_hi;        // exact conversion
        x_nr_bits =
          64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
      }
      q1 = nr_digits[x_nr_bits].digits;
      if (q1 == 0) {
        q1 = nr_digits[x_nr_bits].digits1;
        if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
            (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
             C1_lo >= nr_digits[x_nr_bits].threshold_lo))
          q1++;
      }
      // return (C1 * 10^scale) * 10^(x_exp - scale)
      // where scale = min (P34-q1, x_exp-y_exp)  
      scale = P34 - q1;
      ind = (x_exp - y_exp) >> 49;
      if (ind < scale)
        scale = ind;
      if (scale == 0) {
        res.w[1] = x.w[1];
        res.w[0] = x.w[0];
      } else if (q1 <= 19) {    // x fits in 64 bits  
        if (scale <= 19) {      // 10^scale fits in 64 bits
          // 64 x 64 C1_lo * ten2k64[scale] 
          __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
        } else {        // 10^scale fits in 128 bits
          // 64 x 128 C1_lo * ten2k128[scale - 20]
          __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
        }
      } else {  // x fits in 128 bits, but 10^scale must fit in 64 bits
        // 64 x 128 ten2k64[scale] * C1
        C1.w[1] = C1_hi;
        C1.w[0] = C1_lo;
        __mul_128x64_to_128 (res, ten2k64[scale], C1);
      }
      // subtract scale from the exponent
      x_exp = x_exp - ((UINT64) scale << 49);
      res.w[1] = res.w[1] | x_sign | x_exp;
    }
    BID_SWAP128 (res);
    BID_RETURN (res);
  } else {      // x and y are not canonical, not special, and are not zero
    // note that the result may still be zero, and then it has to have the
    // preferred exponent
    if (x_exp < y_exp) {        // if exp_x < exp_y then swap x and y 
      tmp_sign = x_sign;
      tmp_exp = x_exp;
      tmp_signif_hi = C1_hi;
      tmp_signif_lo = C1_lo;
      x_sign = y_sign;
      x_exp = y_exp;
      C1_hi = C2_hi;
      C1_lo = C2_lo;
      y_sign = tmp_sign;
      y_exp = tmp_exp;
      C2_hi = tmp_signif_hi;
      C2_lo = tmp_signif_lo;
    }
    // q1 = nr. of decimal digits in x
    //  determine first the nr. of bits in x
    if (C1_hi == 0) {    // x_bits is the nr. of bits in C1_lo
      if (C1_lo >= 0x0020000000000000ull) {     // x >= 2^53
        //split the 64-bit value in two 32-bit halves to avoid rounding errors
        if (C1_lo >= 0x0000000100000000ull) {   // x >= 2^32
          tmp1.d = (double) (C1_lo >> 32);      // exact conversion
          x_nr_bits =
            32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
        } else {        // x < 2^32
          tmp1.d = (double) (C1_lo);    // exact conversion
          x_nr_bits =
            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
        }
      } else {  // if x < 2^53
        tmp1.d = (double) C1_lo;        // exact conversion
        x_nr_bits =
          ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
      }
    } else {    // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
      tmp1.d = (double) C1_hi;  // exact conversion
      x_nr_bits =
        64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
    }
 
    q1 = nr_digits[x_nr_bits].digits;
    if (q1 == 0) {
      q1 = nr_digits[x_nr_bits].digits1;
      if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
          (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
           C1_lo >= nr_digits[x_nr_bits].threshold_lo))
        q1++;
    }
    // q2 = nr. of decimal digits in y
    //  determine first the nr. of bits in y (y_nr_bits)
    if (C2_hi == 0) {    // y_bits is the nr. of bits in C2_lo
      if (C2_lo >= 0x0020000000000000ull) {     // y >= 2^53
        //split the 64-bit value in two 32-bit halves to avoid rounding errors
        if (C2_lo >= 0x0000000100000000ull) {   // y >= 2^32
          tmp2.d = (double) (C2_lo >> 32);      // exact conversion
          y_nr_bits =
            32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
        } else {        // y < 2^32
          tmp2.d = (double) (C2_lo);    // exact conversion
          y_nr_bits =
            ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
        }
      } else {  // if y < 2^53
        tmp2.d = (double) C2_lo;        // exact conversion
        y_nr_bits =
          ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
      }
    } else {    // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
      tmp2.d = (double) C2_hi;  // exact conversion
      y_nr_bits =
        64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
    }
 
    q2 = nr_digits[y_nr_bits].digits;
    if (q2 == 0) {
      q2 = nr_digits[y_nr_bits].digits1;
      if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
          (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
           C2_lo >= nr_digits[y_nr_bits].threshold_lo))
        q2++;
    }
 
    delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
 
    if (delta >= P34) {
      // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
      // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
      // the result is inexact; the preferred exponent is the least possible
 
      if (delta >= P34 + 1) {
        // for RN the result is the operand with the larger magnitude,
        // possibly scaled up by 10^(P34-q1)
        // an overflow cannot occur in this case (rounding to nearest)
        if (q1 < P34) { // scale C1 up by 10^(P34-q1)
          // Note: because delta >= P34+1 it is certain that 
          //     x_exp - ((UINT64)scale << 49) will stay above e_min
          scale = P34 - q1;
          if (q1 <= 19) {       // C1 fits in 64 bits
            // 1 <= q1 <= 19 => 15 <= scale <= 33
            if (scale <= 19) {  // 10^scale fits in 64 bits
              __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
            } else {    // if 20 <= scale <= 33
              // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
              // (C1 * 10^(scale-19)) fits in 64 bits
              C1_lo = C1_lo * ten2k64[scale - 19];
              __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
            }
          } else {      //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
            // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
            C1.w[1] = C1_hi;
            C1.w[0] = C1_lo;
            // C1 = ten2k64[P34 - q1] * C1
            __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
          }
          x_exp = x_exp - ((UINT64) scale << 49);
          C1_hi = C1.w[1];
          C1_lo = C1.w[0];
        }
        // some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1) 
        // (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) => 
        // subtract 1 ulp
        // Note: do this only for rounding to nearest; for other rounding 
        // modes the correction will be applied next
        if ((rnd_mode == ROUNDING_TO_NEAREST
             || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
            && C1_hi == 0x0000314dc6448d93ull
            && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
            && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
                                                             && (C2_hi >
                                                                 midpoint128
                                                                 [q2 -
                                                                  20].
                                                                 w[1]
                                                                 ||
                                                                 (C2_hi
                                                                  ==
                                                                  midpoint128
                                                                  [q2 -
                                                                   20].
                                                                  w[1]
                                                                  &&
                                                                  C2_lo
                                                                  >
                                                                  midpoint128
                                                                  [q2 -
                                                                   20].
                                                                  w
                                                                  [0])))))
        {
          // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
          C1_hi = 0x0001ed09bead87c0ull;
          C1_lo = 0x378d8e63ffffffffull;
          x_exp = x_exp - EXP_P1;
        }
        if (rnd_mode != ROUNDING_TO_NEAREST) {
          if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
              (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
            // add 1 ulp and then check for overflow
            C1_lo = C1_lo + 1;
            if (C1_lo == 0) {    // rounding overflow in the low 64 bits
              C1_hi = C1_hi + 1;
            }
            if (C1_hi == 0x0001ed09bead87c0ull
                && C1_lo == 0x378d8e6400000000ull) {
              // C1 = 10^34 => rounding overflow
              C1_hi = 0x0000314dc6448d93ull;
              C1_lo = 0x38c15b0a00000000ull;    // 10^33
              x_exp = x_exp + EXP_P1;
              if (x_exp == EXP_MAX_P1) {        // overflow
                C1_hi = 0x7800000000000000ull;  // +inf
                C1_lo = 0x0ull;
                x_exp = 0;       // x_sign is preserved
                // set overflow flag (the inexact flag was set too)
                *pfpsf |= OVERFLOW_EXCEPTION;
              }
            }
          } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
                     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
                     (rnd_mode == ROUNDING_TO_ZERO
                      && x_sign != y_sign)) {
            // subtract 1 ulp from C1
            // Note: because delta >= P34 + 1 the result cannot be zero
            C1_lo = C1_lo - 1;
            if (C1_lo == 0xffffffffffffffffull)
              C1_hi = C1_hi - 1;
            // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and 
            // decrease the exponent by 1 (because delta >= P34 + 1 the
            // exponent will not become less than e_min)
            // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
            // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
            if (C1_hi == 0x0000314dc6448d93ull
                && C1_lo == 0x38c15b09ffffffffull) {
              // make C1 = 10^34  - 1
              C1_hi = 0x0001ed09bead87c0ull;
              C1_lo = 0x378d8e63ffffffffull;
              x_exp = x_exp - EXP_P1;
            }
          } else {
            ;   // the result is already correct
          }
        }
        // set the inexact flag
        *pfpsf |= INEXACT_EXCEPTION;
        // assemble the result
        res.w[1] = x_sign | x_exp | C1_hi;
        res.w[0] = C1_lo;
      } else {  // delta = P34 
        // in most cases, the smaller operand may be < or = or > 1/2 ulp of the
        // larger operand
        // however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
        // to accuracy loss after subtraction, and will be treated separately
        if (x_sign == y_sign || (q1 <= 20
                                 && (C1_hi != 0
                                     || C1_lo != ten2k64[q1 - 1]))
            || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
                             || C1_lo != ten2k128[q1 - 21].w[0]))) {
          // if x_sign == y_sign or C1 != 10^(q1-1)
          // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
          // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
          if (q2 <= 19) {       // C2 and 5*10^(q2-1) both fit in 64 bits
            halfulp64 = midpoint64[q2 - 1];     // 5 * 10^(q2-1)
            if (C2_lo < halfulp64) {    // n2 < 1/2 ulp (n1)
              // for RN the result is the operand with the larger magnitude, 
              // possibly scaled up by 10^(P34-q1)
              // an overflow cannot occur in this case (rounding to nearest)
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
                // Note: because delta = P34 it is certain that
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                x_exp = x_exp - ((UINT64) scale << 49);
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
              }
              if (rnd_mode != ROUNDING_TO_NEAREST) {
                if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
                    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
                  // add 1 ulp and then check for overflow
                  C1_lo = C1_lo + 1;
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
                    C1_hi = C1_hi + 1;
                  }
                  if (C1_hi == 0x0001ed09bead87c0ull
                      && C1_lo == 0x378d8e6400000000ull) {
                    // C1 = 10^34 => rounding overflow
                    C1_hi = 0x0000314dc6448d93ull;
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
                    x_exp = x_exp + EXP_P1;
                    if (x_exp == EXP_MAX_P1) {  // overflow
                      C1_hi = 0x7800000000000000ull;    // +inf
                      C1_lo = 0x0ull;
                      x_exp = 0; // x_sign is preserved
                      // set overflow flag (the inexact flag was set too)
                      *pfpsf |= OVERFLOW_EXCEPTION;
                    }
                  }
                } else
                  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
                      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
                      || (rnd_mode == ROUNDING_TO_ZERO
                          && x_sign != y_sign)) {
                  // subtract 1 ulp from C1
                  // Note: because delta >= P34 + 1 the result cannot be zero
                  C1_lo = C1_lo - 1;
                  if (C1_lo == 0xffffffffffffffffull)
                    C1_hi = C1_hi - 1;
                  // if the coefficient is 10^33-1 then make it 10^34-1 and 
                  // decrease the exponent by 1 (because delta >= P34 + 1 the
                  // exponent will not become less than e_min)
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
                  if (C1_hi == 0x0000314dc6448d93ull
                      && C1_lo == 0x38c15b09ffffffffull) {
                    // make C1 = 10^34  - 1
                    C1_hi = 0x0001ed09bead87c0ull;
                    C1_lo = 0x378d8e63ffffffffull;
                    x_exp = x_exp - EXP_P1;
                  }
                } else {
                  ;     // the result is already correct
                }
              }
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            } else if ((C2_lo == halfulp64)
                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
              // n2 = 1/2 ulp (n1) and C1 is even
              // the result is the operand with the larger magnitude,
              // possibly scaled up by 10^(P34-q1)
              // an overflow cannot occur in this case (rounding to nearest)
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
                // Note: because delta = P34 it is certain that
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33 
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits  
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1 
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                x_exp = x_exp - ((UINT64) scale << 49);
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
              }
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
                   && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
                                          && x_sign == y_sign)
                  || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
                  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
                // add 1 ulp and then check for overflow
                C1_lo = C1_lo + 1;
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
                  C1_hi = C1_hi + 1;
                }
                if (C1_hi == 0x0001ed09bead87c0ull
                    && C1_lo == 0x378d8e6400000000ull) {
                  // C1 = 10^34 => rounding overflow
                  C1_hi = 0x0000314dc6448d93ull;
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
                  x_exp = x_exp + EXP_P1;
                  if (x_exp == EXP_MAX_P1) {    // overflow
                    C1_hi = 0x7800000000000000ull;      // +inf
                    C1_lo = 0x0ull;
                    x_exp = 0;   // x_sign is preserved
                    // set overflow flag (the inexact flag was set too)
                    *pfpsf |= OVERFLOW_EXCEPTION;
                  }
                }
              } else
                if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
                     && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
                                            && !x_sign && y_sign)
                    || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
                    || (rnd_mode == ROUNDING_TO_ZERO
                        && x_sign != y_sign)) {
                // subtract 1 ulp from C1
                // Note: because delta >= P34 + 1 the result cannot be zero
                C1_lo = C1_lo - 1;
                if (C1_lo == 0xffffffffffffffffull)
                  C1_hi = C1_hi - 1;
                // if the coefficient is 10^33 - 1 then make it 10^34 - 1
                // and decrease the exponent by 1 (because delta >= P34 + 1
                // the exponent will not become less than e_min)
                // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
                // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
                if (C1_hi == 0x0000314dc6448d93ull
                    && C1_lo == 0x38c15b09ffffffffull) {
                  // make C1 = 10^34  - 1
                  C1_hi = 0x0001ed09bead87c0ull;
                  C1_lo = 0x378d8e63ffffffffull;
                  x_exp = x_exp - EXP_P1;
                }
              } else {
                ;       // the result is already correct
              }
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result 
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            } else {    // if C2_lo > halfulp64 || 
              // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
              if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
                // because q1 < P34 we must first replace C1 by 
                // C1 * 10^(P34-q1), and must decrease the exponent by 
                // (P34-q1) (it will still be at least e_min)
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                x_exp = x_exp - ((UINT64) scale << 49);
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
                // check for rounding overflow
                if (C1_hi == 0x0001ed09bead87c0ull
                    && C1_lo == 0x378d8e6400000000ull) {
                  // C1 = 10^34 => rounding overflow 
                  C1_hi = 0x0000314dc6448d93ull;
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
                  x_exp = x_exp + EXP_P1;
                }
              }
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
                  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
                      && C2_lo != halfulp64)
                  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
                  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
                  || (rnd_mode == ROUNDING_TO_ZERO
                      && x_sign != y_sign)) {
                // the result is x - 1
                // for RN n1 * n2 < 0; underflow not possible
                C1_lo = C1_lo - 1;
                if (C1_lo == 0xffffffffffffffffull)
                  C1_hi--;
                // check if we crossed into the lower decade
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
                  C1_lo = 0x378d8e63ffffffffull;
                  x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
                }
              } else
                if ((rnd_mode == ROUNDING_TO_NEAREST
                     && x_sign == y_sign)
                    || (rnd_mode == ROUNDING_TIES_AWAY
                        && x_sign == y_sign)
                    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
                    || (rnd_mode == ROUNDING_UP && !x_sign
                        && !y_sign)) {
                // the result is x + 1
                // for RN x_sign = y_sign, i.e. n1*n2 > 0
                C1_lo = C1_lo + 1;
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
                  C1_hi = C1_hi + 1;
                }
                if (C1_hi == 0x0001ed09bead87c0ull
                    && C1_lo == 0x378d8e6400000000ull) {
                  // C1 = 10^34 => rounding overflow
                  C1_hi = 0x0000314dc6448d93ull;
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
                  x_exp = x_exp + EXP_P1;
                  if (x_exp == EXP_MAX_P1) {    // overflow
                    C1_hi = 0x7800000000000000ull;      // +inf
                    C1_lo = 0x0ull;
                    x_exp = 0;   // x_sign is preserved
                    // set the overflow flag
                    *pfpsf |= OVERFLOW_EXCEPTION;
                  }
                }
              } else {
                ;       // the result is x
              }
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            }
          } else {      // if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in 
            // most cases) fit only in more than 64 bits
            halfulp128 = midpoint128[q2 - 20];  // 5 * 10^(q2-1)
            if ((C2_hi < halfulp128.w[1])
                || (C2_hi == halfulp128.w[1]
                    && C2_lo < halfulp128.w[0])) {
              // n2 < 1/2 ulp (n1)
              // the result is the operand with the larger magnitude,
              // possibly scaled up by 10^(P34-q1)
              // an overflow cannot occur in this case (rounding to nearest)
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
                // Note: because delta = P34 it is certain that
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33 
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits  
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1 
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
                x_exp = x_exp - ((UINT64) scale << 49);
              }
              if (rnd_mode != ROUNDING_TO_NEAREST) {
                if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
                    (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
                  // add 1 ulp and then check for overflow
                  C1_lo = C1_lo + 1;
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
                    C1_hi = C1_hi + 1;
                  }
                  if (C1_hi == 0x0001ed09bead87c0ull
                      && C1_lo == 0x378d8e6400000000ull) {
                    // C1 = 10^34 => rounding overflow
                    C1_hi = 0x0000314dc6448d93ull;
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
                    x_exp = x_exp + EXP_P1;
                    if (x_exp == EXP_MAX_P1) {  // overflow
                      C1_hi = 0x7800000000000000ull;    // +inf
                      C1_lo = 0x0ull;
                      x_exp = 0; // x_sign is preserved
                      // set overflow flag (the inexact flag was set too)
                      *pfpsf |= OVERFLOW_EXCEPTION;
                    }
                  }
                } else
                  if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
                      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
                      || (rnd_mode == ROUNDING_TO_ZERO
                          && x_sign != y_sign)) {
                  // subtract 1 ulp from C1
                  // Note: because delta >= P34 + 1 the result cannot be zero
                  C1_lo = C1_lo - 1;
                  if (C1_lo == 0xffffffffffffffffull)
                    C1_hi = C1_hi - 1;
                  // if the coefficient is 10^33-1 then make it 10^34-1 and
                  // decrease the exponent by 1 (because delta >= P34 + 1 the
                  // exponent will not become less than e_min)
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
                  if (C1_hi == 0x0000314dc6448d93ull
                      && C1_lo == 0x38c15b09ffffffffull) {
                    // make C1 = 10^34  - 1
                    C1_hi = 0x0001ed09bead87c0ull;
                    C1_lo = 0x378d8e63ffffffffull;
                    x_exp = x_exp - EXP_P1;
                  }
                } else {
                  ;     // the result is already correct
                }
              }
              // set the inexact flag 
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result 
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            } else if ((C2_hi == halfulp128.w[1]
                        && C2_lo == halfulp128.w[0])
                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
              // midpoint & lsb in C1 is 0
              // n2 = 1/2 ulp (n1) and C1 is even
              // the result is the operand with the larger magnitude,
              // possibly scaled up by 10^(P34-q1)
              // an overflow cannot occur in this case (rounding to nearest)
              if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
                // Note: because delta = P34 it is certain that
                //     x_exp - ((UINT64)scale << 49) will stay above e_min
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                x_exp = x_exp - ((UINT64) scale << 49);
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
              }
              if (rnd_mode != ROUNDING_TO_NEAREST) {
                if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
                    || (rnd_mode == ROUNDING_UP && !y_sign)) {
                  // add 1 ulp and then check for overflow
                  C1_lo = C1_lo + 1;
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
                    C1_hi = C1_hi + 1;
                  }
                  if (C1_hi == 0x0001ed09bead87c0ull
                      && C1_lo == 0x378d8e6400000000ull) {
                    // C1 = 10^34 => rounding overflow
                    C1_hi = 0x0000314dc6448d93ull;
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
                    x_exp = x_exp + EXP_P1;
                    if (x_exp == EXP_MAX_P1) {  // overflow
                      C1_hi = 0x7800000000000000ull;    // +inf
                      C1_lo = 0x0ull;
                      x_exp = 0; // x_sign is preserved
                      // set overflow flag (the inexact flag was set too)
                      *pfpsf |= OVERFLOW_EXCEPTION;
                    }
                  }
                } else if ((rnd_mode == ROUNDING_DOWN && y_sign)
                           || (rnd_mode == ROUNDING_TO_ZERO
                               && x_sign != y_sign)) {
                  // subtract 1 ulp from C1
                  // Note: because delta >= P34 + 1 the result cannot be zero
                  C1_lo = C1_lo - 1;
                  if (C1_lo == 0xffffffffffffffffull)
                    C1_hi = C1_hi - 1;
                  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
                  // and decrease the exponent by 1 (because delta >= P34 + 1
                  // the exponent will not become less than e_min)
                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
                  if (C1_hi == 0x0000314dc6448d93ull
                      && C1_lo == 0x38c15b09ffffffffull) {
                    // make C1 = 10^34  - 1
                    C1_hi = 0x0001ed09bead87c0ull;
                    C1_lo = 0x378d8e63ffffffffull;
                    x_exp = x_exp - EXP_P1;
                  }
                } else {
                  ;     // the result is already correct
                }
              }
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            } else {    // if C2 > halfulp128 ||
              // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
              if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
                // because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
                // and must decrease the exponent by (P34-q1) (it will still be
                // at least e_min)
                scale = P34 - q1;
                if (q1 <= 19) { // C1 fits in 64 bits
                  // 1 <= q1 <= 19 => 15 <= scale <= 33
                  if (scale <= 19) {    // 10^scale fits in 64 bits
                    __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
                  } else {      // if 20 <= scale <= 33
                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
                    // (C1 * 10^(scale-19)) fits in 64 bits
                    C1_lo = C1_lo * ten2k64[scale - 19];
                    __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
                  }
                } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
                  C1.w[1] = C1_hi;
                  C1.w[0] = C1_lo;
                  // C1 = ten2k64[P34 - q1] * C1
                  __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
                }
                C1_hi = C1.w[1];
                C1_lo = C1.w[0];
                x_exp = x_exp - ((UINT64) scale << 49);
              }
              if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
                  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
                      && (C2_hi != halfulp128.w[1]
                          || C2_lo != halfulp128.w[0]))
                  || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
                  || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
                  || (rnd_mode == ROUNDING_TO_ZERO
                      && x_sign != y_sign)) {
                // the result is x - 1
                // for RN n1 * n2 < 0; underflow not possible
                C1_lo = C1_lo - 1;
                if (C1_lo == 0xffffffffffffffffull)
                  C1_hi--;
                // check if we crossed into the lower decade
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
                  C1_lo = 0x378d8e63ffffffffull;
                  x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
                }
              } else
                if ((rnd_mode == ROUNDING_TO_NEAREST
                     && x_sign == y_sign)
                    || (rnd_mode == ROUNDING_TIES_AWAY
                        && x_sign == y_sign)
                    || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
                    || (rnd_mode == ROUNDING_UP && !x_sign
                        && !y_sign)) {
                // the result is x + 1
                // for RN x_sign = y_sign, i.e. n1*n2 > 0
                C1_lo = C1_lo + 1;
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
                  C1_hi = C1_hi + 1;
                }
                if (C1_hi == 0x0001ed09bead87c0ull
                    && C1_lo == 0x378d8e6400000000ull) {
                  // C1 = 10^34 => rounding overflow
                  C1_hi = 0x0000314dc6448d93ull;
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
                  x_exp = x_exp + EXP_P1;
                  if (x_exp == EXP_MAX_P1) {    // overflow
                    C1_hi = 0x7800000000000000ull;      // +inf
                    C1_lo = 0x0ull;
                    x_exp = 0;   // x_sign is preserved
                    // set the overflow flag
                    *pfpsf |= OVERFLOW_EXCEPTION;
                  }
                }
              } else {
                ;       // the result is x
              }
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // assemble the result
              res.w[1] = x_sign | x_exp | C1_hi;
              res.w[0] = C1_lo;
            }
          }     // end q1 >= 20
          // end case where C1 != 10^(q1-1)
        } else {        // C1 = 10^(q1-1) and x_sign != y_sign
          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
          // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
          // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34 
          // digits and n = C' * 10^(e2+x1)
          // If the result has P34+1 digits, redo the steps above with x1+1
          // If the result has P34-1 digits or less, redo the steps above with 
          // x1-1 but only if initially x1 >= 1
          // NOTE: these two steps can be improved, e.g we could guess if
          // P34+1 or P34-1 digits will be obtained by adding/subtracting 
          // just the top 64 bits of the two operands
          // The result cannot be zero, and it cannot overflow
          x1 = q2 - 1;  // 0 <= x1 <= P34-1
          // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
          scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
          // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
          // but their product fits with certainty in 128 bits
          if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
          } else {      // if (scale >= 1
            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
            if (q1 <= 19) {     // C1 fits in 64 bits
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
            } else {    // q1 >= 20
              C1.w[1] = C1_hi;
              C1.w[0] = C1_lo;
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
            }
          }
          tmp64 = C1.w[0];       // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
 
          // now round C2 to q2-x1 = 1 decimal digit
          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
          ind = x1 - 1; // -1 <= ind <= P34 - 2
          if (ind >= 0) {        // if (x1 >= 1)
            C2.w[0] = C2_lo;
            C2.w[1] = C2_hi;
            if (ind <= 18) {
              C2.w[0] = C2.w[0] + midpoint64[ind];
              if (C2.w[0] < C2_lo)
                C2.w[1]++;
            } else {    // 19 <= ind <= 32
              C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
              C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
              if (C2.w[0] < C2_lo)
                C2.w[1]++;
            }
            // the approximation of 10^(-x1) was rounded up to 118 bits
            __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
            // calculate C2* and f2*
            // C2* is actually floor(C2*) in this case
            // C2* and f2* need shifting and masking, as shown by
            // shiftright128[] and maskhigh128[]
            // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
            // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
            // if (0 < f2* < 10^(-x1)) then
            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
            //       shift; C2* has p decimal digits, correct by Prop. 1)
            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
            //       shift; C2* has p decimal digits, correct by Pr. 1)
            // else
            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
            //       correct by Property 1)
            // n = C2* * 10^(e2+x1)
 
            if (ind <= 2) {
              highf2star.w[1] = 0x0;
              highf2star.w[0] = 0x0;     // low f2* ok
            } else if (ind <= 21) {
              highf2star.w[1] = 0x0;
              highf2star.w[0] = R256.w[2] & maskhigh128[ind];    // low f2* ok
            } else {
              highf2star.w[1] = R256.w[3] & maskhigh128[ind];
              highf2star.w[0] = R256.w[2];       // low f2* is ok
            }
            // shift right C2* by Ex-128 = shiftright128[ind]
            if (ind >= 3) {
              shift = shiftright128[ind];
              if (shift < 64) { // 3 <= shift <= 63
                R256.w[2] =
                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
                R256.w[3] = (R256.w[3] >> shift);
              } else {  // 66 <= shift <= 102
                R256.w[2] = (R256.w[3] >> (shift - 64));
                R256.w[3] = 0x0ULL;
              }
            }
            // redundant
            is_inexact_lt_midpoint = 0;
            is_inexact_gt_midpoint = 0;
            is_midpoint_lt_even = 0;
            is_midpoint_gt_even = 0;
            // determine inexactness of the rounding of C2*
            // (cannot be followed by a second rounding)
            // if (0 < f2* - 1/2 < 10^(-x1)) then
            //   the result is exact
            // else (if f2* - 1/2 > T* then)
            //   the result of is inexact
            if (ind <= 2) {
              if (R256.w[1] > 0x8000000000000000ull ||
                  (R256.w[1] == 0x8000000000000000ull
                   && R256.w[0] > 0x0ull)) {
                // f2* > 1/2 and the result may be exact
                tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
                if ((tmp64A > ten2mk128trunc[ind].w[1]
                     || (tmp64A == ten2mk128trunc[ind].w[1]
                         && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
                  // set the inexact flag
                  *pfpsf |= INEXACT_EXCEPTION;
                  // this rounding is applied to C2 only!
                  // x_sign != y_sign
                  is_inexact_gt_midpoint = 1;
                }       // else the result is exact
                // rounding down, unless a midpoint in [ODD, EVEN]
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                // this rounding is applied to C2 only!
                // x_sign != y_sign
                is_inexact_lt_midpoint = 1;
              }
            } else if (ind <= 21) {     // if 3 <= ind <= 21
              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
                                            && highf2star.w[0] >
                                            onehalf128[ind])
                  || (highf2star.w[1] == 0x0
                      && highf2star.w[0] == onehalf128[ind]
                      && (R256.w[1] || R256.w[0]))) {
                // f2* > 1/2 and the result may be exact
                // Calculate f2* - 1/2
                tmp64A = highf2star.w[0] - onehalf128[ind];
                tmp64B = highf2star.w[1];
                if (tmp64A > highf2star.w[0])
                  tmp64B--;
                if (tmp64B || tmp64A
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
                  // set the inexact flag
                  *pfpsf |= INEXACT_EXCEPTION;
                  // this rounding is applied to C2 only!
                  // x_sign != y_sign
                  is_inexact_gt_midpoint = 1;
                }       // else the result is exact
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                // this rounding is applied to C2 only!
                // x_sign != y_sign
                is_inexact_lt_midpoint = 1;
              }
            } else {    // if 22 <= ind <= 33
              if (highf2star.w[1] > onehalf128[ind]
                  || (highf2star.w[1] == onehalf128[ind]
                      && (highf2star.w[0] || R256.w[1]
                          || R256.w[0]))) {
                // f2* > 1/2 and the result may be exact
                // Calculate f2* - 1/2
                // tmp64A = highf2star.w[0];
                tmp64B = highf2star.w[1] - onehalf128[ind];
                if (tmp64B || highf2star.w[0]
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
                  // set the inexact flag
                  *pfpsf |= INEXACT_EXCEPTION;
                  // this rounding is applied to C2 only!
                  // x_sign != y_sign
                  is_inexact_gt_midpoint = 1;
                }       // else the result is exact
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                // this rounding is applied to C2 only!
                // x_sign != y_sign
                is_inexact_lt_midpoint = 1;
              }
            }
            // check for midpoints after determining inexactness
            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
                && (highf2star.w[0] == 0)
                && (R256.w[1] < ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
              // the result is a midpoint
              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
                R256.w[2]--;
                if (R256.w[2] == 0xffffffffffffffffull)
                  R256.w[3]--;
                // this rounding is applied to C2 only!
                // x_sign != y_sign
                is_midpoint_lt_even = 1;
                is_inexact_lt_midpoint = 0;
                is_inexact_gt_midpoint = 0;
              } else {
                // else MP in [ODD, EVEN]
                // this rounding is applied to C2 only!
                // x_sign != y_sign
                is_midpoint_gt_even = 1;
                is_inexact_lt_midpoint = 0;
                is_inexact_gt_midpoint = 0;
              }
            }
          } else {      // if (ind == -1) only when x1 = 0
            R256.w[2] = C2_lo;
            R256.w[3] = C2_hi;
            is_midpoint_lt_even = 0;
            is_midpoint_gt_even = 0;
            is_inexact_lt_midpoint = 0;
            is_inexact_gt_midpoint = 0;
          }
          // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
          // because x_sign != y_sign this last operation is exact
          C1.w[0] = C1.w[0] - R256.w[2];
          C1.w[1] = C1.w[1] - R256.w[3];
          if (C1.w[0] > tmp64)
            C1.w[1]--;  // borrow
          if (C1.w[1] >= 0x8000000000000000ull) {       // negative coefficient!
            C1.w[0] = ~C1.w[0];
            C1.w[0]++;
            C1.w[1] = ~C1.w[1];
            if (C1.w[0] == 0x0)
              C1.w[1]++;
            tmp_sign = y_sign;  // the result will have the sign of y
          } else {
            tmp_sign = x_sign;
          }
          // the difference has exactly P34 digits
          x_sign = tmp_sign;
          if (x1 >= 1)
            y_exp = y_exp + ((UINT64) x1 << 49);
          C1_hi = C1.w[1];
          C1_lo = C1.w[0];
          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
          if (rnd_mode != ROUNDING_TO_NEAREST) {
            if ((!x_sign
                 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
                     ||
                     ((rnd_mode == ROUNDING_TIES_AWAY
                       || rnd_mode == ROUNDING_UP)
                      && is_midpoint_gt_even))) || (x_sign
                                                    &&
                                                    ((rnd_mode ==
                                                      ROUNDING_DOWN
                                                      &&
                                                      is_inexact_lt_midpoint)
                                                     ||
                                                     ((rnd_mode ==
                                                       ROUNDING_TIES_AWAY
                                                       || rnd_mode ==
                                                       ROUNDING_DOWN)
                                                      &&
                                                      is_midpoint_gt_even))))
            {
              // C1 = C1 + 1
              C1_lo = C1_lo + 1;
              if (C1_lo == 0) {  // rounding overflow in the low 64 bits
                C1_hi = C1_hi + 1;
              }
              if (C1_hi == 0x0001ed09bead87c0ull
                  && C1_lo == 0x378d8e6400000000ull) {
                // C1 = 10^34 => rounding overflow
                C1_hi = 0x0000314dc6448d93ull;
                C1_lo = 0x38c15b0a00000000ull;  // 10^33
                y_exp = y_exp + EXP_P1;
              }
            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
                       &&
                       ((x_sign
                         && (rnd_mode == ROUNDING_UP
                             || rnd_mode == ROUNDING_TO_ZERO))
                        || (!x_sign
                            && (rnd_mode == ROUNDING_DOWN
                                || rnd_mode == ROUNDING_TO_ZERO)))) {
              // C1 = C1 - 1
              C1_lo = C1_lo - 1;
              if (C1_lo == 0xffffffffffffffffull)
                C1_hi--;
              // check if we crossed into the lower decade
              if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
                C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
                C1_lo = 0x378d8e63ffffffffull;
                y_exp = y_exp - EXP_P1;
                // no underflow, because delta + q2 >= P34 + 1
              }
            } else {
              ; // exact, the result is already correct
            }
          }
          // assemble the result
          res.w[1] = x_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
        }
      } // end delta = P34
    } else {    // if (|delta| <= P34 - 1)
      if (delta >= 0) {  // if (0 <= delta <= P34 - 1)
        if (delta <= P34 - 1 - q2) {
          // calculate C' directly; the result is exact
          // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
          // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
          // but their product fits with certainty in 128 bits (actually in 113)
          scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49) 
 
          if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
            C1_hi = C1.w[1];
            C1_lo = C1.w[0];
          } else if (scale >= 1) {
            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits 
            if (q1 <= 19) {     // C1 fits in 64 bits
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
            } else {    // q1 >= 20
              C1.w[1] = C1_hi;
              C1.w[0] = C1_lo;
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
            }
            C1_hi = C1.w[1];
            C1_lo = C1.w[0];
          } else {      // if (scale == 0) C1 is unchanged
            C1.w[0] = C1_lo;     // C1.w[1] = C1_hi; 
          }
          // now add C2
          if (x_sign == y_sign) {
            // the result cannot overflow
            C1_lo = C1_lo + C2_lo;
            C1_hi = C1_hi + C2_hi;
            if (C1_lo < C1.w[0])
              C1_hi++;
          } else {      // if x_sign != y_sign
            C1_lo = C1_lo - C2_lo;
            C1_hi = C1_hi - C2_hi;
            if (C1_lo > C1.w[0])
              C1_hi--;
            // the result can be zero, but it cannot overflow
            if (C1_lo == 0 && C1_hi == 0) {
              // assemble the result
              if (x_exp < y_exp)
                res.w[1] = x_exp;
              else
                res.w[1] = y_exp;
              res.w[0] = 0;
              if (rnd_mode == ROUNDING_DOWN) {
                res.w[1] |= 0x8000000000000000ull;
              }
              BID_SWAP128 (res);
              BID_RETURN (res);
            }
            if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
              C1_lo = ~C1_lo;
              C1_lo++;
              C1_hi = ~C1_hi;
              if (C1_lo == 0x0)
                C1_hi++;
              x_sign = y_sign;  // the result will have the sign of y
            }
          }
          // assemble the result
          res.w[1] = x_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
        } else if (delta == P34 - q2) {
          // calculate C' directly; the result may be inexact if it requires 
          // P34+1 decimal digits; in this case the 'cutoff' point for addition
          // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
          // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
          // but their product fits with certainty in 128 bits (actually in 113)
          scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
          if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
          } else if (scale >= 1) {
            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
            if (q1 <= 19) {     // C1 fits in 64 bits
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
            } else {    // q1 >= 20
              C1.w[1] = C1_hi;
              C1.w[0] = C1_lo;
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
            }
          } else {      // if (scale == 0) C1 is unchanged
            C1.w[1] = C1_hi;
            C1.w[0] = C1_lo;     // only the low part is necessary
          }
          C1_hi = C1.w[1];
          C1_lo = C1.w[0];
          // now add C2
          if (x_sign == y_sign) {
            // the result can overflow!
            C1_lo = C1_lo + C2_lo;
            C1_hi = C1_hi + C2_hi;
            if (C1_lo < C1.w[0])
              C1_hi++;
            // test for overflow, possible only when C1 >= 10^34
            if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {  // C1 >= 10^34
              // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
              // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
              // decimal digits
              // Calculate C'' = C' + 1/2 * 10^x
              if (C1_lo >= 0xfffffffffffffffbull) {     // low half add has carry
                C1_lo = C1_lo + 5;
                C1_hi = C1_hi + 1;
              } else {
                C1_lo = C1_lo + 5;
              }
              // the approximation of 10^(-1) was rounded up to 118 bits
              // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
              // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
              C1.w[1] = C1_hi;
              C1.w[0] = C1_lo;   // C''
              ten2m1.w[1] = 0x1999999999999999ull;
              ten2m1.w[0] = 0x9999999999999a00ull;
              __mul_128x128_to_256 (P256, C1, ten2m1);  // P256 = C*, f*
              // C* is actually floor(C*) in this case
              // the top Ex = 128 bits of 10^(-1) are 
              // T* = 0x00199999999999999999999999999999
              // if (0 < f* < 10^(-x)) then
              //   if floor(C*) is even then C = floor(C*) - logical right 
              //       shift; C has p decimal digits, correct by Prop. 1)
              //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
              //       shift; C has p decimal digits, correct by Pr. 1)
              // else
              //   C = floor(C*) (logical right shift; C has p decimal digits,
              //       correct by Property 1)
              // n = C * 10^(e2+x)
              if ((P256.w[1] || P256.w[0])
                  && (P256.w[1] < 0x1999999999999999ull
                      || (P256.w[1] == 0x1999999999999999ull
                          && P256.w[0] <= 0x9999999999999999ull))) {
                // the result is a midpoint
                if (P256.w[2] & 0x01) {
                  is_midpoint_gt_even = 1;
                  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
                  P256.w[2]--;
                  if (P256.w[2] == 0xffffffffffffffffull)
                    P256.w[3]--;
                } else {
                  is_midpoint_lt_even = 1;
                }
              }
              // n = Cstar * 10^(e2+1)
              y_exp = y_exp + EXP_P1;
              // C* != 10^P because C* has P34 digits
              // check for overflow
              if (y_exp == EXP_MAX_P1
                  && (rnd_mode == ROUNDING_TO_NEAREST
                      || rnd_mode == ROUNDING_TIES_AWAY)) {
                // overflow for RN
                res.w[1] = x_sign | 0x7800000000000000ull;      // +/-inf
                res.w[0] = 0x0ull;
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                // set the overflow flag
                *pfpsf |= OVERFLOW_EXCEPTION;
                BID_SWAP128 (res);
                BID_RETURN (res);
              }
              // if (0 < f* - 1/2 < 10^(-x)) then 
              //   the result of the addition is exact 
              // else 
              //   the result of the addition is inexact
              if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {     // the result may be exact
                tmp64 = P256.w[1] - 0x8000000000000000ull;      // f* - 1/2
                if ((tmp64 > 0x1999999999999999ull
                     || (tmp64 == 0x1999999999999999ull
                         && P256.w[0] >= 0x9999999999999999ull))) {
                  // set the inexact flag
                  *pfpsf |= INEXACT_EXCEPTION;
                  is_inexact = 1;
                }       // else the result is exact
              } else {  // the result is inexact
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                is_inexact = 1;
              }
              C1_hi = P256.w[3];
              C1_lo = P256.w[2];
              if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
                is_inexact_lt_midpoint = is_inexact
                  && (P256.w[1] & 0x8000000000000000ull);
                is_inexact_gt_midpoint = is_inexact
                  && !(P256.w[1] & 0x8000000000000000ull);
              }
              // general correction from RN to RA, RM, RP, RZ; 
              // result uses y_exp
              if (rnd_mode != ROUNDING_TO_NEAREST) {
                if ((!x_sign
                     &&
                     ((rnd_mode == ROUNDING_UP
                       && is_inexact_lt_midpoint)
                      ||
                      ((rnd_mode == ROUNDING_TIES_AWAY
                        || rnd_mode == ROUNDING_UP)
                       && is_midpoint_gt_even))) || (x_sign
                                                     &&
                                                     ((rnd_mode ==
                                                       ROUNDING_DOWN
                                                       &&
                                                       is_inexact_lt_midpoint)
                                                      ||
                                                      ((rnd_mode ==
                                                        ROUNDING_TIES_AWAY
                                                        || rnd_mode ==
                                                        ROUNDING_DOWN)
                                                       &&
                                                       is_midpoint_gt_even))))
                {
                  // C1 = C1 + 1
                  C1_lo = C1_lo + 1;
                  if (C1_lo == 0) {      // rounding overflow in the low 64 bits
                    C1_hi = C1_hi + 1;
                  }
                  if (C1_hi == 0x0001ed09bead87c0ull
                      && C1_lo == 0x378d8e6400000000ull) {
                    // C1 = 10^34 => rounding overflow
                    C1_hi = 0x0000314dc6448d93ull;
                    C1_lo = 0x38c15b0a00000000ull;      // 10^33
                    y_exp = y_exp + EXP_P1;
                  }
                } else
                  if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
                      &&
                      ((x_sign
                        && (rnd_mode == ROUNDING_UP
                            || rnd_mode == ROUNDING_TO_ZERO))
                       || (!x_sign
                           && (rnd_mode == ROUNDING_DOWN
                               || rnd_mode == ROUNDING_TO_ZERO)))) {
                  // C1 = C1 - 1
                  C1_lo = C1_lo - 1;
                  if (C1_lo == 0xffffffffffffffffull)
                    C1_hi--;
                  // check if we crossed into the lower decade
                  if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {       // 10^33 - 1
                    C1_hi = 0x0001ed09bead87c0ull;      // 10^34 - 1
                    C1_lo = 0x378d8e63ffffffffull;
                    y_exp = y_exp - EXP_P1;
                    // no underflow, because delta + q2 >= P34 + 1
                  }
                } else {
                  ;     // exact, the result is already correct
                }
                // in all cases check for overflow (RN and RA solved already)
                if (y_exp == EXP_MAX_P1) {      // overflow
                  if ((rnd_mode == ROUNDING_DOWN && x_sign) ||  // RM and res < 0
                      (rnd_mode == ROUNDING_UP && !x_sign)) {   // RP and res > 0
                    C1_hi = 0x7800000000000000ull;      // +inf
                    C1_lo = 0x0ull;
                  } else {      // RM and res > 0, RP and res < 0, or RZ
                    C1_hi = 0x5fffed09bead87c0ull;
                    C1_lo = 0x378d8e63ffffffffull;
                  }
                  y_exp = 0;     // x_sign is preserved
                  // set the inexact flag (in case the exact addition was exact)
                  *pfpsf |= INEXACT_EXCEPTION;
                  // set the overflow flag
                  *pfpsf |= OVERFLOW_EXCEPTION;
                }
              }
            }   // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
          } else {      // if x_sign != y_sign the result is exact
            C1_lo = C1_lo - C2_lo;
            C1_hi = C1_hi - C2_hi;
            if (C1_lo > C1.w[0])
              C1_hi--;
            // the result can be zero, but it cannot overflow
            if (C1_lo == 0 && C1_hi == 0) {
              // assemble the result
              if (x_exp < y_exp)
                res.w[1] = x_exp;
              else
                res.w[1] = y_exp;
              res.w[0] = 0;
              if (rnd_mode == ROUNDING_DOWN) {
                res.w[1] |= 0x8000000000000000ull;
              }
              BID_SWAP128 (res);
              BID_RETURN (res);
            }
            if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
              C1_lo = ~C1_lo;
              C1_lo++;
              C1_hi = ~C1_hi;
              if (C1_lo == 0x0)
                C1_hi++;
              x_sign = y_sign;  // the result will have the sign of y
            }
          }
          // assemble the result
          res.w[1] = x_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
        } else {        // if (delta >= P34 + 1 - q2)
          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
          // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
          // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
          // If the result has P34+1 digits, redo the steps above with x1+1
          // If the result has P34-1 digits or less, redo the steps above with 
          // x1-1 but only if initially x1 >= 1
          // NOTE: these two steps can be improved, e.g we could guess if
          // P34+1 or P34-1 digits will be obtained by adding/subtracting just
          // the top 64 bits of the two operands
          // The result cannot be zero, but it can overflow
          x1 = delta + q2 - P34;        // 1 <= x1 <= P34-1
        roundC2:
          // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
          scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
          // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
          // but their product fits with certainty in 128 bits (actually in 113)
          if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
            __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
          } else if (scale >= 1) {
            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
            if (q1 <= 19) {     // C1 fits in 64 bits
              __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
            } else {    // q1 >= 20
              C1.w[1] = C1_hi;
              C1.w[0] = C1_lo;
              __mul_128x64_to_128 (C1, ten2k64[scale], C1);
            }
          } else {      // if (scale == 0) C1 is unchanged
            C1.w[1] = C1_hi;
            C1.w[0] = C1_lo;
          }
          tmp64 = C1.w[0];       // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
 
          // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
          // (but if we got here a second time after x1 = x1 - 1, then 
          // x1 >= 0; note that for x1 = 0 C2 is unchanged)
          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
          ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
          // during a second pass, then ind = -1
          if (ind >= 0) {        // if (x1 >= 1)
            C2.w[0] = C2_lo;
            C2.w[1] = C2_hi;
            if (ind <= 18) {
              C2.w[0] = C2.w[0] + midpoint64[ind];
              if (C2.w[0] < C2_lo)
                C2.w[1]++;
            } else {    // 19 <= ind <= 32
              C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
              C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
              if (C2.w[0] < C2_lo)
                C2.w[1]++;
            }
            // the approximation of 10^(-x1) was rounded up to 118 bits
            __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
            // calculate C2* and f2*
            // C2* is actually floor(C2*) in this case
            // C2* and f2* need shifting and masking, as shown by
            // shiftright128[] and maskhigh128[]
            // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
            // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
            // if (0 < f2* < 10^(-x1)) then
            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
            //       shift; C2* has p decimal digits, correct by Prop. 1)
            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
            //       shift; C2* has p decimal digits, correct by Pr. 1)
            // else
            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
            //       correct by Property 1)
            // n = C2* * 10^(e2+x1)
 
            if (ind <= 2) {
              highf2star.w[1] = 0x0;
              highf2star.w[0] = 0x0;     // low f2* ok
            } else if (ind <= 21) {
              highf2star.w[1] = 0x0;
              highf2star.w[0] = R256.w[2] & maskhigh128[ind];    // low f2* ok
            } else {
              highf2star.w[1] = R256.w[3] & maskhigh128[ind];
              highf2star.w[0] = R256.w[2];       // low f2* is ok
            }
            // shift right C2* by Ex-128 = shiftright128[ind]
            if (ind >= 3) {
              shift = shiftright128[ind];
              if (shift < 64) { // 3 <= shift <= 63
                R256.w[2] =
                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
                R256.w[3] = (R256.w[3] >> shift);
              } else {  // 66 <= shift <= 102
                R256.w[2] = (R256.w[3] >> (shift - 64));
                R256.w[3] = 0x0ULL;
              }
            }
            if (second_pass) {
              is_inexact_lt_midpoint = 0;
              is_inexact_gt_midpoint = 0;
              is_midpoint_lt_even = 0;
              is_midpoint_gt_even = 0;
            }
            // determine inexactness of the rounding of C2* (this may be 
            // followed by a second rounding only if we get P34+1 
            // decimal digits)
            // if (0 < f2* - 1/2 < 10^(-x1)) then
            //   the result is exact
            // else (if f2* - 1/2 > T* then)
            //   the result of is inexact
            if (ind <= 2) {
              if (R256.w[1] > 0x8000000000000000ull ||
                  (R256.w[1] == 0x8000000000000000ull
                   && R256.w[0] > 0x0ull)) {
                // f2* > 1/2 and the result may be exact
                tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
                if ((tmp64A > ten2mk128trunc[ind].w[1]
                     || (tmp64A == ten2mk128trunc[ind].w[1]
                         && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
                  // set the inexact flag
                  // *pfpsf |= INEXACT_EXCEPTION;
                  tmp_inexact = 1;      // may be set again during a second pass
                  // this rounding is applied to C2 only!
                  if (x_sign == y_sign)
                    is_inexact_lt_midpoint = 1;
                  else  // if (x_sign != y_sign)
                    is_inexact_gt_midpoint = 1;
                }       // else the result is exact
                // rounding down, unless a midpoint in [ODD, EVEN]
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                // *pfpsf |= INEXACT_EXCEPTION;
                tmp_inexact = 1;        // just in case we will round a second time
                // rounding up, unless a midpoint in [EVEN, ODD]
                // this rounding is applied to C2 only!
                if (x_sign == y_sign)
                  is_inexact_gt_midpoint = 1;
                else    // if (x_sign != y_sign)
                  is_inexact_lt_midpoint = 1;
              }
            } else if (ind <= 21) {     // if 3 <= ind <= 21
              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
                                            && highf2star.w[0] >
                                            onehalf128[ind])
                  || (highf2star.w[1] == 0x0
                      && highf2star.w[0] == onehalf128[ind]
                      && (R256.w[1] || R256.w[0]))) {
                // f2* > 1/2 and the result may be exact
                // Calculate f2* - 1/2
                tmp64A = highf2star.w[0] - onehalf128[ind];
                tmp64B = highf2star.w[1];
                if (tmp64A > highf2star.w[0])
                  tmp64B--;
                if (tmp64B || tmp64A
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
                  // set the inexact flag
                  // *pfpsf |= INEXACT_EXCEPTION;
                  tmp_inexact = 1;      // may be set again during a second pass
                  // this rounding is applied to C2 only!
                  if (x_sign == y_sign)
                    is_inexact_lt_midpoint = 1;
                  else  // if (x_sign != y_sign)
                    is_inexact_gt_midpoint = 1;
                }       // else the result is exact
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                // *pfpsf |= INEXACT_EXCEPTION;
                tmp_inexact = 1;        // may be set again during a second pass
                // rounding up, unless a midpoint in [EVEN, ODD]
                // this rounding is applied to C2 only!
                if (x_sign == y_sign)
                  is_inexact_gt_midpoint = 1;
                else    // if (x_sign != y_sign)
                  is_inexact_lt_midpoint = 1;
              }
            } else {    // if 22 <= ind <= 33
              if (highf2star.w[1] > onehalf128[ind]
                  || (highf2star.w[1] == onehalf128[ind]
                      && (highf2star.w[0] || R256.w[1]
                          || R256.w[0]))) {
                // f2* > 1/2 and the result may be exact
                // Calculate f2* - 1/2
                // tmp64A = highf2star.w[0];
                tmp64B = highf2star.w[1] - onehalf128[ind];
                if (tmp64B || highf2star.w[0]
                    || R256.w[1] > ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] > ten2mk128trunc[ind].w[0])) {
                  // set the inexact flag
                  // *pfpsf |= INEXACT_EXCEPTION;
                  tmp_inexact = 1;      // may be set again during a second pass
                  // this rounding is applied to C2 only!
                  if (x_sign == y_sign)
                    is_inexact_lt_midpoint = 1;
                  else  // if (x_sign != y_sign)
                    is_inexact_gt_midpoint = 1;
                }       // else the result is exact
              } else {  // the result is inexact; f2* <= 1/2
                // set the inexact flag
                // *pfpsf |= INEXACT_EXCEPTION;
                tmp_inexact = 1;        // may be set again during a second pass
                // rounding up, unless a midpoint in [EVEN, ODD]
                // this rounding is applied to C2 only!
                if (x_sign == y_sign)
                  is_inexact_gt_midpoint = 1;
                else    // if (x_sign != y_sign)
                  is_inexact_lt_midpoint = 1;
              }
            }
            // check for midpoints
            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
                && (highf2star.w[0] == 0)
                && (R256.w[1] < ten2mk128trunc[ind].w[1]
                    || (R256.w[1] == ten2mk128trunc[ind].w[1]
                        && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
              // the result is a midpoint
              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
                R256.w[2]--;
                if (R256.w[2] == 0xffffffffffffffffull)
                  R256.w[3]--;
                // this rounding is applied to C2 only!
                if (x_sign == y_sign)
                  is_midpoint_gt_even = 1;
                else    // if (x_sign != y_sign)
                  is_midpoint_lt_even = 1;
                is_inexact_lt_midpoint = 0;
                is_inexact_gt_midpoint = 0;
              } else {
                // else MP in [ODD, EVEN]
                // this rounding is applied to C2 only!
                if (x_sign == y_sign)
                  is_midpoint_lt_even = 1;
                else    // if (x_sign != y_sign)
                  is_midpoint_gt_even = 1;
                is_inexact_lt_midpoint = 0;
                is_inexact_gt_midpoint = 0;
              }
            }
            // end if (ind >= 0)
          } else {      // if (ind == -1); only during a 2nd pass, and when x1 = 0
            R256.w[2] = C2_lo;
            R256.w[3] = C2_hi;
            tmp_inexact = 0;
            // to correct a possible setting to 1 from 1st pass
            if (second_pass) {
              is_midpoint_lt_even = 0;
              is_midpoint_gt_even = 0;
              is_inexact_lt_midpoint = 0;
              is_inexact_gt_midpoint = 0;
            }
          }
          // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
          if (x_sign == y_sign) {       // addition; could overflow
            // no second pass is possible this way (only for x_sign != y_sign)
            C1.w[0] = C1.w[0] + R256.w[2];
            C1.w[1] = C1.w[1] + R256.w[3];
            if (C1.w[0] < tmp64)
              C1.w[1]++;        // carry
            // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
            // with x1=x1+1 
            if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {     // C1 >= 10^34
              // chop off one more digit from the sum, but make sure there is
              // no double-rounding error (see table - double rounding logic)
              // now round C1 from P34+1 to P34 decimal digits
              // C1' = C1 + 1/2 * 10 = C1 + 5
              if (C1.w[0] >= 0xfffffffffffffffbull) {    // low half add has carry
                C1.w[0] = C1.w[0] + 5;
                C1.w[1] = C1.w[1] + 1;
              } else {
                C1.w[0] = C1.w[0] + 5;
              }
              // the approximation of 10^(-1) was rounded up to 118 bits
              __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);     // Q256 = C1*, f1*
              // C1* is actually floor(C1*) in this case
              // the top 128 bits of 10^(-1) are
              // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
              // if (0 < f1* < 10^(-1)) then
              //   if floor(C1*) is even then C1* = floor(C1*) - logical right
              //       shift; C1* has p decimal digits, correct by Prop. 1)
              //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
              //       shift; C1* has p decimal digits, correct by Pr. 1)
              // else
              //   C1* = floor(C1*) (logical right shift; C has p decimal digits
              //       correct by Property 1)
              // n = C1* * 10^(e2+x1+1)
              if ((Q256.w[1] || Q256.w[0])
                  && (Q256.w[1] < ten2mk128trunc[0].w[1]
                      || (Q256.w[1] == ten2mk128trunc[0].w[1]
                          && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
                // the result is a midpoint
                if (is_inexact_lt_midpoint) {   // for the 1st rounding
                  is_inexact_gt_midpoint = 1;
                  is_inexact_lt_midpoint = 0;
                  is_midpoint_gt_even = 0;
                  is_midpoint_lt_even = 0;
                } else if (is_inexact_gt_midpoint) {    // for the 1st rounding
                  Q256.w[2]--;
                  if (Q256.w[2] == 0xffffffffffffffffull)
                    Q256.w[3]--;
                  is_inexact_gt_midpoint = 0;
                  is_inexact_lt_midpoint = 1;
                  is_midpoint_gt_even = 0;
                  is_midpoint_lt_even = 0;
                } else if (is_midpoint_gt_even) {       // for the 1st rounding
                  // Note: cannot have is_midpoint_lt_even
                  is_inexact_gt_midpoint = 0;
                  is_inexact_lt_midpoint = 1;
                  is_midpoint_gt_even = 0;
                  is_midpoint_lt_even = 0;
                } else {        // the first rounding must have been exact
                  if (Q256.w[2] & 0x01) {       // MP in [EVEN, ODD]
                    // the truncated result is correct
                    Q256.w[2]--;
                    if (Q256.w[2] == 0xffffffffffffffffull)
                      Q256.w[3]--;
                    is_inexact_gt_midpoint = 0;
                    is_inexact_lt_midpoint = 0;
                    is_midpoint_gt_even = 1;
                    is_midpoint_lt_even = 0;
                  } else {      // MP in [ODD, EVEN]
                    is_inexact_gt_midpoint = 0;
                    is_inexact_lt_midpoint = 0;
                    is_midpoint_gt_even = 0;
                    is_midpoint_lt_even = 1;
                  }
                }
                tmp_inexact = 1;        // in all cases
              } else {  // the result is not a midpoint 
                // determine inexactness of the rounding of C1 (the sum C1+C2*)
                // if (0 < f1* - 1/2 < 10^(-1)) then
                //   the result is exact
                // else (if f1* - 1/2 > T* then)
                //   the result of is inexact
                // ind = 0
                if (Q256.w[1] > 0x8000000000000000ull
                    || (Q256.w[1] == 0x8000000000000000ull
                        && Q256.w[0] > 0x0ull)) {
                  // f1* > 1/2 and the result may be exact
                  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;        // f1* - 1/2
                  if ((Q256.w[1] > ten2mk128trunc[0].w[1]
                       || (Q256.w[1] == ten2mk128trunc[0].w[1]
                           && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
                    is_inexact_gt_midpoint = 0;
                    is_inexact_lt_midpoint = 1;
                    is_midpoint_gt_even = 0;
                    is_midpoint_lt_even = 0;
                    // set the inexact flag
                    tmp_inexact = 1;
                    // *pfpsf |= INEXACT_EXCEPTION;
                  } else {      // else the result is exact for the 2nd rounding
                    if (tmp_inexact) {  // if the previous rounding was inexact
                      if (is_midpoint_lt_even) {
                        is_inexact_gt_midpoint = 1;
                        is_midpoint_lt_even = 0;
                      } else if (is_midpoint_gt_even) {
                        is_inexact_lt_midpoint = 1;
                        is_midpoint_gt_even = 0;
                      } else {
                        ;       // no change
                      }
                    }
                  }
                  // rounding down, unless a midpoint in [ODD, EVEN]
                } else {        // the result is inexact; f1* <= 1/2
                  is_inexact_gt_midpoint = 1;
                  is_inexact_lt_midpoint = 0;
                  is_midpoint_gt_even = 0;
                  is_midpoint_lt_even = 0;
                  // set the inexact flag
                  tmp_inexact = 1;
                  // *pfpsf |= INEXACT_EXCEPTION;
                }
              } // end 'the result is not a midpoint'
              // n = C1 * 10^(e2+x1)
              C1.w[1] = Q256.w[3];
              C1.w[0] = Q256.w[2];
              y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
            } else {    // C1 < 10^34
              // C1.w[1] and C1.w[0] already set
              // n = C1 * 10^(e2+x1)
              y_exp = y_exp + ((UINT64) x1 << 49);
            }
            // check for overflow
            if (y_exp == EXP_MAX_P1
                && (rnd_mode == ROUNDING_TO_NEAREST
                    || rnd_mode == ROUNDING_TIES_AWAY)) {
              res.w[1] = 0x7800000000000000ull | x_sign;        // +/-inf
              res.w[0] = 0x0ull;
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // set the overflow flag
              *pfpsf |= OVERFLOW_EXCEPTION;
              BID_SWAP128 (res);
              BID_RETURN (res);
            }   // else no overflow
          } else {      // if x_sign != y_sign the result of this subtract. is exact
            C1.w[0] = C1.w[0] - R256.w[2];
            C1.w[1] = C1.w[1] - R256.w[3];
            if (C1.w[0] > tmp64)
              C1.w[1]--;        // borrow
            if (C1.w[1] >= 0x8000000000000000ull) {     // negative coefficient!
              C1.w[0] = ~C1.w[0];
              C1.w[0]++;
              C1.w[1] = ~C1.w[1];
              if (C1.w[0] == 0x0)
                C1.w[1]++;
              tmp_sign = y_sign;
              // the result will have the sign of y if last rnd
            } else {
              tmp_sign = x_sign;
            }
            // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
            //   redo the calculation with x1=x1-1;
            // redo the calculation also if C1 = 10^33 and 
            //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
            //   (the last part should have really been 
            //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
            //    the rounding of C2, but the position flags have been reversed)
            // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
            if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {        // C1=10^33
              x1 = x1 - 1;      // x1 >= 0
              if (x1 >= 0) {
                // clear position flags and tmp_inexact
                is_midpoint_lt_even = 0;
                is_midpoint_gt_even = 0;
                is_inexact_lt_midpoint = 0;
                is_inexact_gt_midpoint = 0;
                tmp_inexact = 0;
                second_pass = 1;
                goto roundC2;   // else result has less than P34 digits
              }
            }
            // if the coefficient of the result is 10^34 it means that this
            // must be the second pass, and we are done 
            if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) {  // if  C1 = 10^34
              C1.w[1] = 0x0000314dc6448d93ull;  // C1 = 10^33
              C1.w[0] = 0x38c15b0a00000000ull;
              y_exp = y_exp + ((UINT64) 1 << 49);
            }
            x_sign = tmp_sign;
            if (x1 >= 1)
              y_exp = y_exp + ((UINT64) x1 << 49);
            // x1 = -1 is possible at the end of a second pass when the 
            // first pass started with x1 = 1 
          }
          C1_hi = C1.w[1];
          C1_lo = C1.w[0];
          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
          if (rnd_mode != ROUNDING_TO_NEAREST) {
            if ((!x_sign
                 && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
                     ||
                     ((rnd_mode == ROUNDING_TIES_AWAY
                       || rnd_mode == ROUNDING_UP)
                      && is_midpoint_gt_even))) || (x_sign
                                                    &&
                                                    ((rnd_mode ==
                                                      ROUNDING_DOWN
                                                      &&
                                                      is_inexact_lt_midpoint)
                                                     ||
                                                     ((rnd_mode ==
                                                       ROUNDING_TIES_AWAY
                                                       || rnd_mode ==
                                                       ROUNDING_DOWN)
                                                      &&
                                                      is_midpoint_gt_even))))
            {
              // C1 = C1 + 1
              C1_lo = C1_lo + 1;
              if (C1_lo == 0) {  // rounding overflow in the low 64 bits
                C1_hi = C1_hi + 1;
              }
              if (C1_hi == 0x0001ed09bead87c0ull
                  && C1_lo == 0x378d8e6400000000ull) {
                // C1 = 10^34 => rounding overflow
                C1_hi = 0x0000314dc6448d93ull;
                C1_lo = 0x38c15b0a00000000ull;  // 10^33
                y_exp = y_exp + EXP_P1;
              }
            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
                       &&
                       ((x_sign
                         && (rnd_mode == ROUNDING_UP
                             || rnd_mode == ROUNDING_TO_ZERO))
                        || (!x_sign
                            && (rnd_mode == ROUNDING_DOWN
                                || rnd_mode == ROUNDING_TO_ZERO)))) {
              // C1 = C1 - 1
              C1_lo = C1_lo - 1;
              if (C1_lo == 0xffffffffffffffffull)
                C1_hi--;
              // check if we crossed into the lower decade
              if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
                C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
                C1_lo = 0x378d8e63ffffffffull;
                y_exp = y_exp - EXP_P1;
                // no underflow, because delta + q2 >= P34 + 1
              }
            } else {
              ; // exact, the result is already correct
            }
            // in all cases check for overflow (RN and RA solved already)
            if (y_exp == EXP_MAX_P1) {  // overflow
              if ((rnd_mode == ROUNDING_DOWN && x_sign) ||      // RM and res < 0
                  (rnd_mode == ROUNDING_UP && !x_sign)) {       // RP and res > 0
                C1_hi = 0x7800000000000000ull;  // +inf
                C1_lo = 0x0ull;
              } else {  // RM and res > 0, RP and res < 0, or RZ
                C1_hi = 0x5fffed09bead87c0ull;
                C1_lo = 0x378d8e63ffffffffull;
              }
              y_exp = 0; // x_sign is preserved
              // set the inexact flag (in case the exact addition was exact)
              *pfpsf |= INEXACT_EXCEPTION;
              // set the overflow flag
              *pfpsf |= OVERFLOW_EXCEPTION;
            }
          }
          // assemble the result
          res.w[1] = x_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
          if (tmp_inexact)
            *pfpsf |= INEXACT_EXCEPTION;
        }
      } else {  // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
        // NOTE: the following, up to "} else { // if x_sign != y_sign 
        // the result is exact" is identical to "else if (delta == P34 - q2) {"
        // from above; also, the code is not symmetric: a+b and b+a may take
        // different paths (need to unify eventually!) 
        // calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be 
        // inexact if it requires P34 + 1 decimal digits; in either case the 
        // 'cutoff' point for addition is at the position of the lsb of C2
        // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
        // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
        // but their product fits with certainty in 128 bits (actually in 113)
        // Note that 0 <= e1 - e2 <= P34 - 2
        //   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
        //   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
        //   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
        //   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
        scale = delta - q1 + q2;        // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
        if (scale >= 20) {      // 10^(e1-e2) does not fit in 64 bits, but C1 does
          __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
        } else if (scale >= 1) {
          // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
          if (q1 <= 19) {       // C1 fits in 64 bits
            __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
          } else {      // q1 >= 20
            C1.w[1] = C1_hi;
            C1.w[0] = C1_lo;
            __mul_128x64_to_128 (C1, ten2k64[scale], C1);
          }
        } else {        // if (scale == 0) C1 is unchanged
          C1.w[1] = C1_hi;
          C1.w[0] = C1_lo;       // only the low part is necessary
        }
        C1_hi = C1.w[1];
        C1_lo = C1.w[0];
        // now add C2
        if (x_sign == y_sign) {
          // the result can overflow!
          C1_lo = C1_lo + C2_lo;
          C1_hi = C1_hi + C2_hi;
          if (C1_lo < C1.w[0])
            C1_hi++;
          // test for overflow, possible only when C1 >= 10^34
          if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {    // C1 >= 10^34
            // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
            // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
            // decimal digits
            // Calculate C'' = C' + 1/2 * 10^x
            if (C1_lo >= 0xfffffffffffffffbull) {       // low half add has carry
              C1_lo = C1_lo + 5;
              C1_hi = C1_hi + 1;
            } else {
              C1_lo = C1_lo + 5;
            }
            // the approximation of 10^(-1) was rounded up to 118 bits
            // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
            // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
            C1.w[1] = C1_hi;
            C1.w[0] = C1_lo;     // C''
            ten2m1.w[1] = 0x1999999999999999ull;
            ten2m1.w[0] = 0x9999999999999a00ull;
            __mul_128x128_to_256 (P256, C1, ten2m1);    // P256 = C*, f*
            // C* is actually floor(C*) in this case
            // the top Ex = 128 bits of 10^(-1) are 
            // T* = 0x00199999999999999999999999999999
            // if (0 < f* < 10^(-x)) then
            //   if floor(C*) is even then C = floor(C*) - logical right 
            //       shift; C has p decimal digits, correct by Prop. 1)
            //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
            //       shift; C has p decimal digits, correct by Pr. 1)
            // else
            //   C = floor(C*) (logical right shift; C has p decimal digits,
            //       correct by Property 1)
            // n = C * 10^(e2+x)
            if ((P256.w[1] || P256.w[0])
                && (P256.w[1] < 0x1999999999999999ull
                    || (P256.w[1] == 0x1999999999999999ull
                        && P256.w[0] <= 0x9999999999999999ull))) {
              // the result is a midpoint
              if (P256.w[2] & 0x01) {
                is_midpoint_gt_even = 1;
                // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
                P256.w[2]--;
                if (P256.w[2] == 0xffffffffffffffffull)
                  P256.w[3]--;
              } else {
                is_midpoint_lt_even = 1;
              }
            }
            // n = Cstar * 10^(e2+1)
            y_exp = y_exp + EXP_P1;
            // C* != 10^P34 because C* has P34 digits
            // check for overflow
            if (y_exp == EXP_MAX_P1
                && (rnd_mode == ROUNDING_TO_NEAREST
                    || rnd_mode == ROUNDING_TIES_AWAY)) {
              // overflow for RN
              res.w[1] = x_sign | 0x7800000000000000ull;        // +/-inf
              res.w[0] = 0x0ull;
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              // set the overflow flag
              *pfpsf |= OVERFLOW_EXCEPTION;
              BID_SWAP128 (res);
              BID_RETURN (res);
            }
            // if (0 < f* - 1/2 < 10^(-x)) then 
            //   the result of the addition is exact 
            // else 
            //   the result of the addition is inexact
            if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {       // the result may be exact
              tmp64 = P256.w[1] - 0x8000000000000000ull;        // f* - 1/2
              if ((tmp64 > 0x1999999999999999ull
                   || (tmp64 == 0x1999999999999999ull
                       && P256.w[0] >= 0x9999999999999999ull))) {
                // set the inexact flag
                *pfpsf |= INEXACT_EXCEPTION;
                is_inexact = 1;
              } // else the result is exact
            } else {    // the result is inexact
              // set the inexact flag
              *pfpsf |= INEXACT_EXCEPTION;
              is_inexact = 1;
            }
            C1_hi = P256.w[3];
            C1_lo = P256.w[2];
            if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
              is_inexact_lt_midpoint = is_inexact
                && (P256.w[1] & 0x8000000000000000ull);
              is_inexact_gt_midpoint = is_inexact
                && !(P256.w[1] & 0x8000000000000000ull);
            }
            // general correction from RN to RA, RM, RP, RZ; result uses y_exp
            if (rnd_mode != ROUNDING_TO_NEAREST) {
              if ((!x_sign
                   && ((rnd_mode == ROUNDING_UP
                        && is_inexact_lt_midpoint)
                       || ((rnd_mode == ROUNDING_TIES_AWAY
                            || rnd_mode == ROUNDING_UP)
                           && is_midpoint_gt_even))) || (x_sign
                                                         &&
                                                         ((rnd_mode ==
                                                           ROUNDING_DOWN
                                                           &&
                                                           is_inexact_lt_midpoint)
                                                          ||
                                                          ((rnd_mode ==
                                                            ROUNDING_TIES_AWAY
                                                            || rnd_mode
                                                            ==
                                                            ROUNDING_DOWN)
                                                           &&
                                                           is_midpoint_gt_even))))
              {
                // C1 = C1 + 1
                C1_lo = C1_lo + 1;
                if (C1_lo == 0) {        // rounding overflow in the low 64 bits
                  C1_hi = C1_hi + 1;
                }
                if (C1_hi == 0x0001ed09bead87c0ull
                    && C1_lo == 0x378d8e6400000000ull) {
                  // C1 = 10^34 => rounding overflow
                  C1_hi = 0x0000314dc6448d93ull;
                  C1_lo = 0x38c15b0a00000000ull;        // 10^33
                  y_exp = y_exp + EXP_P1;
                }
              } else
                if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
                    ((x_sign && (rnd_mode == ROUNDING_UP ||
                                 rnd_mode == ROUNDING_TO_ZERO)) ||
                     (!x_sign && (rnd_mode == ROUNDING_DOWN ||
                                  rnd_mode == ROUNDING_TO_ZERO)))) {
                // C1 = C1 - 1
                C1_lo = C1_lo - 1;
                if (C1_lo == 0xffffffffffffffffull)
                  C1_hi--;
                // check if we crossed into the lower decade
                if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
                  C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
                  C1_lo = 0x378d8e63ffffffffull;
                  y_exp = y_exp - EXP_P1;
                  // no underflow, because delta + q2 >= P34 + 1
                }
              } else {
                ;       // exact, the result is already correct
              }
              // in all cases check for overflow (RN and RA solved already)
              if (y_exp == EXP_MAX_P1) {        // overflow
                if ((rnd_mode == ROUNDING_DOWN && x_sign) ||    // RM and res < 0
                    (rnd_mode == ROUNDING_UP && !x_sign)) {     // RP and res > 0
                  C1_hi = 0x7800000000000000ull;        // +inf
                  C1_lo = 0x0ull;
                } else {        // RM and res > 0, RP and res < 0, or RZ
                  C1_hi = 0x5fffed09bead87c0ull;
                  C1_lo = 0x378d8e63ffffffffull;
                }
                y_exp = 0;       // x_sign is preserved
                // set the inexact flag (in case the exact addition was exact)
                *pfpsf |= INEXACT_EXCEPTION;
                // set the overflow flag
                *pfpsf |= OVERFLOW_EXCEPTION;
              }
            }
          }     // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
          // assemble the result
          res.w[1] = x_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
        } else {        // if x_sign != y_sign the result is exact
          C1_lo = C2_lo - C1_lo;
          C1_hi = C2_hi - C1_hi;
          if (C1_lo > C2_lo)
            C1_hi--;
          if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
            C1_lo = ~C1_lo;
            C1_lo++;
            C1_hi = ~C1_hi;
            if (C1_lo == 0x0)
              C1_hi++;
            x_sign = y_sign;    // the result will have the sign of y
          }
          // the result can be zero, but it cannot overflow
          if (C1_lo == 0 && C1_hi == 0) {
            // assemble the result
            if (x_exp < y_exp)
              res.w[1] = x_exp;
            else
              res.w[1] = y_exp;
            res.w[0] = 0;
            if (rnd_mode == ROUNDING_DOWN) {
              res.w[1] |= 0x8000000000000000ull;
            }
            BID_SWAP128 (res);
            BID_RETURN (res);
          }
          // assemble the result
          res.w[1] = y_sign | y_exp | C1_hi;
          res.w[0] = C1_lo;
        }
      }
    }
    BID_SWAP128 (res);
    BID_RETURN (res)
  }
}
 
 
 
// bid128_sub stands for bid128qq_sub
 
/*****************************************************************************
 *  BID128 sub
 ****************************************************************************/
 
#if DECIMAL_CALL_BY_REFERENCE
void
bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
            _EXC_INFO_PARAM) {
  UINT128 x = *px, y = *py;
#if !DECIMAL_GLOBAL_ROUNDING
  unsigned int rnd_mode = *prnd_mode;
#endif
#else
UINT128
bid128_sub (UINT128 x, UINT128 y
            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
            _EXC_INFO_PARAM) {
#endif
 
  UINT128 res;
  UINT64 y_sign;
 
  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {        // y is not NAN
    // change its sign
    y_sign = y.w[HIGH_128W] & MASK_SIGN;        // 0 for positive, MASK_SIGN for negative
    if (y_sign)
      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
    else
      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
  }
#if DECIMAL_CALL_BY_REFERENCE
  bid128_add (&res, &x, &y
              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
              _EXC_INFO_ARG);
#else
  res = bid128_add (x, y
                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
                    _EXC_INFO_ARG);
#endif
  BID_RETURN (res);
}
Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-stable/] [gcc-4.5.1/] [libgcc/] [config/] [libbid/] [bid128_add.c] - Blame information for rev 826

Line No.	Rev	Author	Line
1	272	jeremybenn	`/* Copyright (C) 2007, 2009 Free Software Foundation, Inc.`
2
3			`This file is part of GCC.`
4
5			`GCC is free software; you can redistribute it and/or modify it under`
6			`the terms of the GNU General Public License as published by the Free`
7			`Software Foundation; either version 3, or (at your option) any later`
8			`version.`
9
10			`GCC is distributed in the hope that it will be useful, but WITHOUT ANY`
11			`WARRANTY; without even the implied warranty of MERCHANTABILITY or`
12			`FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
13			`for more details.`
14
15			`Under Section 7 of GPL version 3, you are granted additional`
16			`permissions described in the GCC Runtime Library Exception, version`
17			`3.1, as published by the Free Software Foundation.`
18
19			`You should have received a copy of the GNU General Public License and`
20			`a copy of the GCC Runtime Library Exception along with this program;`
21			`see the files COPYING3 and COPYING.RUNTIME respectively. If not, see`
22			`<http://www.gnu.org/licenses/>. */`
23
24			`#include "bid_internal.h"`
25
26
27			`#if DECIMAL_CALL_BY_REFERENCE`
28			`void`
29			`bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py`
30			`_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM`
31			`_EXC_INFO_PARAM) {`
32			`UINT64 x = *px;`
33			`#if !DECIMAL_GLOBAL_ROUNDING`
34			`unsigned int rnd_mode = *prnd_mode;`
35			`#endif`
36			`#else`
37			`UINT64`
38			`bid64dq_add (UINT64 x, UINT128 y`
39			`_RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM`
40			`_EXC_INFO_PARAM) {`