OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [libbid/] [bid_sqrt_macros.h] - Blame information for rev 801

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 734 jeremybenn
/* Copyright (C) 2007, 2009  Free Software Foundation, Inc.
2
 
3
This file is part of GCC.
4
 
5
GCC is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free
7
Software Foundation; either version 3, or (at your option) any later
8
version.
9
 
10
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11
WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13
for more details.
14
 
15
Under Section 7 of GPL version 3, you are granted additional
16
permissions described in the GCC Runtime Library Exception, version
17
3.1, as published by the Free Software Foundation.
18
 
19
You should have received a copy of the GNU General Public License and
20
a copy of the GCC Runtime Library Exception along with this program;
21
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22
<http://www.gnu.org/licenses/>.  */
23
 
24
#ifndef _SQRT_MACROS_H_
25
#define _SQRT_MACROS_H_
26
 
27
#define FENCE __fence
28
 
29
#if DOUBLE_EXTENDED_ON
30
 
31
extern BINARY80 SQRT80 (BINARY80);
32
 
33
 
34
__BID_INLINE__ UINT64
35
short_sqrt128 (UINT128 A10) {
36
  BINARY80 lx, ly, l64;
37
  int_float f64;
38
 
39
  // 2^64
40
  f64.i = 0x5f800000;
41
  l64 = (BINARY80) f64.d;
42
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
43
  ly = SQRT80 (lx);
44
  return (UINT64) ly;
45
}
46
 
47
 
48
__BID_INLINE__ void
49
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
50
  UINT256 C4;
51
  UINT128 CS;
52
  UINT64 X;
53
  SINT64 SE;
54
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
55
    l1, l0, lp, lCl;
56
  int_float fx, f64, fm64;
57
  int *ple = (int *) &lx;
58
 
59
  // 2^64
60
  f64.i = 0x5f800000;
61
  l64 = (BINARY80) f64.d;
62
 
63
  l128 = l64 * l64;
64
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
65
  l2 = (BINARY80) C256.w[2] * l128;
66
  lx = FENCE (lx + l2);
67
  l1 = (BINARY80) C256.w[1] * l64;
68
  lx = FENCE (lx + l1);
69
  l0 = (BINARY80) C256.w[0];
70
  lx = FENCE (lx + l0);
71
  // sqrt(C256)
72
  lS = SQRT80 (lx);
73
 
74
  // get coefficient
75
  // 2^(-64)
76
  fm64.i = 0x1f800000;
77
  lm64 = (BINARY80) fm64.d;
78
  CS.w[1] = (UINT64) (lS * lm64);
79
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);
80
 
81
  ///////////////////////////////////////
82
  //  CAUTION!
83
  //  little endian code only
84
  //  add solution for big endian
85
  //////////////////////////////////////
86
  lSH = lS;
87
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
88
 
89
  // correction for C256 rounding
90
  lCl = FENCE (l3 - lx);
91
  lCl = FENCE (lCl + l2);
92
  lCl = FENCE (lCl + l1);
93
  lCl = FENCE (lCl + l0);
94
 
95
  lSL = lS - lSH;
96
 
97
  //////////////////////////////////////////
98
  //   Watch for compiler re-ordering
99
  //
100
  /////////////////////////////////////////
101
  // C256-S^2
102
  lxL = FENCE (lx - lSH * lSH);
103
  lp = lSH * lSL;
104
  lp += lp;
105
  lxL = FENCE (lxL - lp);
106
  lSL *= lSL;
107
  lxL = FENCE (lxL - lSL);
108
  lCl += lxL;
109
 
110
  // correction term
111
  lE = lCl / (lS + lS);
112
 
113
  // get low part of coefficient
114
  X = CS.w[0];
115
  if (lCl >= 0) {
116
    SE = (SINT64) (lE);
117
    CS.w[0] += SE;
118
    if (CS.w[0] < X)
119
      CS.w[1]++;
120
  } else {
121
    SE = (SINT64) (-lE);
122
    CS.w[0] -= SE;
123
    if (CS.w[0] > X)
124
      CS.w[1]--;
125
  }
126
 
127
  pCS->w[0] = CS.w[0];
128
  pCS->w[1] = CS.w[1];
129
}
130
 
131
#else
132
 
133
extern double sqrt (double);
134
 
135
__BID_INLINE__ UINT64
136
short_sqrt128 (UINT128 A10) {
137
  UINT256 ARS, ARS0, AE0, AE, S;
138
 
139
  UINT64 MY, ES, CY;
140
  double lx, l64;
141
  int_double f64, ly;
142
  int ey, k;
143
 
144
  // 2^64
145
  f64.i = 0x43f0000000000000ull;
146
  l64 = f64.d;
147
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
148
  ly.d = 1.0 / sqrt (lx);
149
 
150
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
151
  ey = 0x3ff - (ly.i >> 52);
152
 
153
  // A10*RS^2
154
  __mul_64x128_to_192 (ARS0, MY, A10);
155
  __mul_64x192_to_256 (ARS, MY, ARS0);
156
 
157
  // shr by 2*ey+40, to get a 64-bit value
158
  k = (ey << 1) + 104 - 64;
159
  if (k >= 128) {
160
    if (k > 128)
161
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
162
    else
163
      ES = ARS.w[2];
164
  } else {
165
    if (k >= 64) {
166
      ARS.w[0] = ARS.w[1];
167
      ARS.w[1] = ARS.w[2];
168
      k -= 64;
169
    }
170
    if (k) {
171
      __shr_128 (ARS, ARS, k);
172
    }
173
    ES = ARS.w[0];
174
  }
175
 
176
  ES = ((SINT64) ES) >> 1;
177
 
178
  if (((SINT64) ES) < 0) {
179
    ES = -ES;
180
 
181
    // A*RS*eps (scaled by 2^64)
182
    __mul_64x192_to_256 (AE0, ES, ARS0);
183
 
184
    AE.w[0] = AE0.w[1];
185
    AE.w[1] = AE0.w[2];
186
    AE.w[2] = AE0.w[3];
187
 
188
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
189
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
190
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
191
  } else {
192
    // A*RS*eps (scaled by 2^64)
193
    __mul_64x192_to_256 (AE0, ES, ARS0);
194
 
195
    AE.w[0] = AE0.w[1];
196
    AE.w[1] = AE0.w[2];
197
    AE.w[2] = AE0.w[3];
198
 
199
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
200
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
201
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
202
  }
203
 
204
  k = ey + 51;
205
 
206
  if (k >= 64) {
207
    if (k >= 128) {
208
      S.w[0] = S.w[2];
209
      S.w[1] = 0;
210
      k -= 128;
211
    } else {
212
      S.w[0] = S.w[1];
213
      S.w[1] = S.w[2];
214
    }
215
    k -= 64;
216
  }
217
  if (k) {
218
    __shr_128 (S, S, k);
219
  }
220
 
221
 
222
  return (UINT64) ((S.w[0] + 1) >> 1);
223
 
224
}
225
 
226
 
227
 
228
__BID_INLINE__ void
229
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
230
  UINT512 ARS0, ARS;
231
  UINT256 ARS00, AE, AE2, S;
232
  UINT128 ES, ES2, ARS1;
233
  UINT64 ES32, CY, MY;
234
  double l64, l128, lx, l2, l1, l0;
235
  int_double f64, ly;
236
  int ey, k, k2;
237
 
238
  // 2^64
239
  f64.i = 0x43f0000000000000ull;
240
  l64 = f64.d;
241
 
242
  l128 = l64 * l64;
243
  lx = (double) C256.w[3] * l64 * l128;
244
  l2 = (double) C256.w[2] * l128;
245
  lx = FENCE (lx + l2);
246
  l1 = (double) C256.w[1] * l64;
247
  lx = FENCE (lx + l1);
248
  l0 = (double) C256.w[0];
249
  lx = FENCE (lx + l0);
250
  // sqrt(C256)
251
  ly.d = 1.0 / sqrt (lx);
252
 
253
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
254
  ey = 0x3ff - (ly.i >> 52);
255
 
256
  // A10*RS^2, scaled by 2^(2*ey+104)
257
  __mul_64x256_to_320 (ARS0, MY, C256);
258
  __mul_64x320_to_384 (ARS, MY, ARS0);
259
 
260
  // shr by k=(2*ey+104)-128
261
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
262
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
263
  k = (ey << 1) + 104 - 128 - 192;
264
  k2 = 64 - k;
265
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
266
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
267
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
268
 
269
  // A*RS >> 192 (for error term computation)
270
  ARS1.w[0] = ARS0.w[3];
271
  ARS1.w[1] = ARS0.w[4];
272
 
273
  // A*RS>>64
274
  ARS00.w[0] = ARS0.w[1];
275
  ARS00.w[1] = ARS0.w[2];
276
  ARS00.w[2] = ARS0.w[3];
277
  ARS00.w[3] = ARS0.w[4];
278
 
279
  if (((SINT64) ES.w[1]) < 0) {
280
    ES.w[0] = -ES.w[0];
281
    ES.w[1] = -ES.w[1];
282
    if (ES.w[0])
283
      ES.w[1]--;
284
 
285
    // A*RS*eps 
286
    __mul_128x128_to_256 (AE, ES, ARS1);
287
 
288
    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
289
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
290
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
291
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
292
  } else {
293
    // A*RS*eps 
294
    __mul_128x128_to_256 (AE, ES, ARS1);
295
 
296
    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
297
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
298
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
299
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
300
  }
301
 
302
  // 3/2*eps^2, scaled by 2^128
303
  ES32 = ES.w[1] + (ES.w[1] >> 1);
304
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
305
  // A*RS*3/2*eps^2
306
  __mul_128x128_to_256 (AE2, ES2, ARS1);
307
 
308
  // result, scaled by 2^(ey+52-64)
309
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
310
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
311
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
312
  S.w[3] = S.w[3] + AE2.w[3] + CY;
313
 
314
  // k in (0, 64)
315
  k = ey + 51 - 128;
316
  k2 = 64 - k;
317
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
318
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);
319
 
320
  // round to nearest
321
  S.w[0]++;
322
  if (!S.w[0])
323
    S.w[1]++;
324
 
325
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
326
  pCS->w[1] = S.w[1] >> 1;
327
 
328
}
329
 
330
#endif
331
#endif

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.