1 |
734 |
jeremybenn |
/* Copyright (C) 2011 Free Software Foundation, Inc.
|
2 |
|
|
Contributed by Embecosm on behalf of Adapteva, Inc.
|
3 |
|
|
|
4 |
|
|
This file is part of GCC.
|
5 |
|
|
|
6 |
|
|
GCC is free software; you can redistribute it and/or modify it under
|
7 |
|
|
the terms of the GNU General Public License as published by the Free
|
8 |
|
|
Software Foundation; either version 3, or (at your option) any later
|
9 |
|
|
version.
|
10 |
|
|
|
11 |
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
12 |
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
13 |
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
14 |
|
|
for more details.
|
15 |
|
|
|
16 |
|
|
Under Section 7 of GPL version 3, you are granted additional
|
17 |
|
|
permissions described in the GCC Runtime Library Exception, version
|
18 |
|
|
3.1, as published by the Free Software Foundation.
|
19 |
|
|
|
20 |
|
|
You should have received a copy of the GNU General Public License and
|
21 |
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
22 |
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
23 |
|
|
. */
|
24 |
|
|
|
25 |
|
|
#include "../epiphany-asm.h"
|
26 |
|
|
|
27 |
|
|
.section _fast_div_text,"a",@progbits;
|
28 |
|
|
.balign 8;
|
29 |
|
|
_fast_div_table:
|
30 |
|
|
.word 0x007fffff// mantissa mask
|
31 |
|
|
.word 0x40257ebb// hold constant a = 2.58586
|
32 |
|
|
|
33 |
|
|
.word 0x3f000000// hold constant 126 shifted to bits [30:23]
|
34 |
|
|
.word 0xc0ba2e88// hold constant b = -5.81818
|
35 |
|
|
|
36 |
|
|
.word 0x4087c1e8// hold constant c = 4.24242
|
37 |
|
|
.word 0x40000000// to hold constant 2 for Newton-Raphson iterations
|
38 |
|
|
|
39 |
|
|
.global SYM(__fast_recipsf2)
|
40 |
|
|
FUNC(__fast_recipsf2)
|
41 |
|
|
SYM(__fast_recipsf2):
|
42 |
|
|
|
43 |
|
|
//###################
|
44 |
|
|
//# input operands:
|
45 |
|
|
//###################
|
46 |
|
|
// Divisor
|
47 |
|
|
//R0
|
48 |
|
|
// Function address (used with negative offsets to read _fast_div_table)
|
49 |
|
|
//R1
|
50 |
|
|
/* Scratch registers: two single (TMP0/TMP5) and two pairs. */
|
51 |
|
|
#define P0L TMP1
|
52 |
|
|
#define P0H TMP2
|
53 |
|
|
#define P1L TMP3
|
54 |
|
|
#define P1H TMP4
|
55 |
|
|
|
56 |
|
|
//#########################################
|
57 |
|
|
//# Constants to be used in the algorithm
|
58 |
|
|
//#########################################
|
59 |
|
|
ldrd P0L , [ R1 , -3 ]
|
60 |
|
|
|
61 |
|
|
ldrd P1L , [ R1 , -2 ]
|
62 |
|
|
|
63 |
|
|
|
64 |
|
|
|
65 |
|
|
//#############################################################################
|
66 |
|
|
//# The Algorithm
|
67 |
|
|
//#
|
68 |
|
|
//# Operation: C=A/B
|
69 |
|
|
//# stage 1 - find the reciprocal 1/B according to the following scheme:
|
70 |
|
|
//# B = (2^E)*m (1
|
71 |
|
|
//# 1/B = 1/((2^E)*m) = 1/((2^(E+1))*m1) (0.5
|
72 |
|
|
//# = (2^-(E+1))*(1/m1) = (2^E1)*(1/m1)
|
73 |
|
|
//#
|
74 |
|
|
//# Now we can find the new exponent:
|
75 |
|
|
//# e1 = E1+127 = -E-1+127 = -e+127-1+127 = 253-e **
|
76 |
|
|
//# 1/m1 alreadt has the exponent 127, so we have to add 126-e.
|
77 |
|
|
//# the exponent might underflow, which we can detect as a sign change.
|
78 |
|
|
//# Since the architeture uses flush-to-zero for subnormals, we can
|
79 |
|
|
//# give the result 0. then.
|
80 |
|
|
//#
|
81 |
|
|
//# The 1/m1 term with 0.5
|
82 |
|
|
//# 1/m1 = 2.58586*(m1^2) - 5.81818*m1 + 4.24242
|
83 |
|
|
//#
|
84 |
|
|
//# Next step is to use two iterations of Newton-Raphson algorithm to complete
|
85 |
|
|
//# the reciprocal calculation.
|
86 |
|
|
//#
|
87 |
|
|
//# Final result is achieved by multiplying A with 1/B
|
88 |
|
|
//#############################################################################
|
89 |
|
|
|
90 |
|
|
|
91 |
|
|
|
92 |
|
|
// R0 exponent and sign "replacement" into TMP0
|
93 |
|
|
AND TMP0,R0,P0L ;
|
94 |
|
|
ORR TMP0,TMP0,P1L
|
95 |
|
|
SUB TMP5,R0,TMP0 // R0 sign/exponent extraction into TMP5
|
96 |
|
|
// Calculate new mantissa
|
97 |
|
|
FMADD P1H,TMP0,P0H ;
|
98 |
|
|
// Calculate new exponent offset 126 - "old exponent"
|
99 |
|
|
SUB P1L,P1L,TMP5
|
100 |
|
|
ldrd P0L , [ R1 , -1 ]
|
101 |
|
|
FMADD P0L,TMP0,P1H ;
|
102 |
|
|
eor P1H,r0,P1L // check for overflow (N-BIT).
|
103 |
|
|
blt .Lret_0
|
104 |
|
|
// P0L exponent and sign "replacement"
|
105 |
|
|
sub P0L,P0L,TMP5
|
106 |
|
|
|
107 |
|
|
// Newton-Raphson iteration #1
|
108 |
|
|
MOV TMP0,P0H ;
|
109 |
|
|
FMSUB P0H,R0,P0L ;
|
110 |
|
|
FMUL P0L,P0H,P0L ;
|
111 |
|
|
// Newton-Raphson iteration #2
|
112 |
|
|
FMSUB TMP0,R0,P0L ;
|
113 |
|
|
FMUL R0,TMP0,P0L ;
|
114 |
|
|
jr lr
|
115 |
|
|
.Lret_0:ldrd P0L , [ R1 , -3 ]
|
116 |
|
|
lsr TMP0,r0,31 ; extract sign
|
117 |
|
|
lsl TMP0,TMP0,31
|
118 |
|
|
add P0L,P0L,r0 ; check for NaN input
|
119 |
|
|
eor P0L,P0L,r0
|
120 |
|
|
movgte r0,TMP0
|
121 |
|
|
jr lr
|
122 |
|
|
// Quotient calculation is expected by the caller: FMUL quotient,divident,R0
|
123 |
|
|
;
|
124 |
|
|
ENDFUNC(__fast_recipsf2)
|