1 |
3 |
mgraep |
-- Copyright (c) 2013 Malte Graeper (mgraep@t-online.de) All rights reserved.
|
2 |
|
|
|
3 |
2 |
mgraep |
library IEEE;
|
4 |
|
|
use IEEE.std_logic_1164.all;
|
5 |
|
|
use IEEE.numeric_std.all;
|
6 |
|
|
|
7 |
|
|
-- Helper package for the qfp32 divider: leading-zero-block detection.
package qfp32_divider_p is

  -- Reports, for every whole block of `block_size` bits that fits into
  -- `data`, whether all bits from the leftmost bit of `data` down to the end
  -- of that block are zero.
  --
  --   data       : vector to inspect (any range; it is copied to a
  --                (data'length-1 downto 0) vector internally)
  --   block_size : number of bits per block
  --                (assumed > 0; it is used as a divisor -- TODO confirm
  --                that callers never pass 0)
  --
  -- Returns a vector (data'length/block_size - 1 downto 0). Bit i is '1'
  -- when the top (i+1)*block_size bits of `data` are all zero, else '0'.
  function zero_blocks (
    data       : unsigned;
    block_size : integer)
    return std_ulogic_vector;

end package qfp32_divider_p;

package body qfp32_divider_p is

  -- See the declaration in the package header for the contract.
  function zero_blocks (
    data       : unsigned;
    block_size : integer)
    return std_ulogic_vector
  is
    constant total_bits : integer := data'length;
    constant num_blocks : integer := total_bits/block_size;  -- integer division, remainder bits are ignored
    variable msb_first  : unsigned(total_bits-1 downto 0);
    variable flags      : std_ulogic_vector(num_blocks-1 downto 0);
  begin

    -- workaround for slice problems:
    -- if the parameter is an unsigned built from a std_ulogic_vector concat
    -- there are problems => use to_unsigned instead of std_ulogic_vector.
    -- Copying into a local "downto" vector gives the slices below a known range.
    msb_first := data;

    for blk in 0 to num_blocks-1 loop
      -- compare the leading (blk+1) blocks against an all-zero vector of the same width
      if msb_first(total_bits-1 downto total_bits-(blk+1)*block_size) = to_unsigned(0,(blk+1)*block_size) then
        flags(blk) := '1';
      else
        flags(blk) := '0';
      end if;
    end loop;  -- blk

    return flags;
  end zero_blocks;

end package body qfp32_divider_p;
|
45 |
|
|
|
46 |
|
|
|
47 |
|
|
library IEEE;
|
48 |
|
|
use IEEE.std_logic_1164.all;
|
49 |
|
|
use IEEE.numeric_std.all;
|
50 |
|
|
|
51 |
|
|
library work;
|
52 |
|
|
use work.qfp_p.all;
|
53 |
|
|
use work.qfp32_divider_p.all;
|
54 |
|
|
|
55 |
|
|
-- Sequential divider for the qfp32 number format.
-- qfp32_t / qfp32_raw_t come from work.qfp_p (not visible in this file).
entity qfp32_divider is
  port (
    -- clock and asynchronous, active-low reset
    clk_i      : in  std_ulogic;
    reset_n_i  : in  std_ulogic;

    -- request handshake
    start_i    : in  std_ulogic;        -- request a division; only accepted while no division is busy
    ready_o    : out std_ulogic;        -- '1' while neither a start is pending nor a division is running

    -- operands
    regA_i     : in  qfp32_t;           -- dividend
    regB_i     : in  qfp32_t;           -- divisor

    -- result
    complete_o : out std_ulogic;        -- pulses for one clock when the iteration has finished
    result_o   : out qfp32_raw_t        -- raw (not yet normalized) quotient, exponent, overflow flag and sign
  );
end entity qfp32_divider;
|
71 |
|
|
|
72 |
|
|
architecture Rtl of qfp32_divider is

  -- Two-stage divider:
  --   stage 1 (p1_*, combinational): aligns dividend and divisor in steps of
  --                                  8 bits and derives sign/exponent
  --   stage 2 (p2_*, registered)   : bit-serial shift/subtract loop, one
  --                                  quotient bit per clock, 29 bits in total
  --
  -- r=(1/d)*2^(29)
  -- shifting rem left each time (in loop), the result is effectively multiplied by 2^(29)
  -- QFPx0:  d = v*2^24
  -- QFPx8:  d = v*2^16
  -- QFPx16: d = v*2^8
  -- QFPx24: d = v*2^0

  signal start_1d : std_ulogic;         -- start_i accepted in the previous clock cycle

  signal p1_divisor_mant           : unsigned(28 downto 0);
  signal p1_dividend_mant          : unsigned(28 downto 0);
  signal p1_divisor_zero           : std_ulogic_vector(3 downto 0);  -- leading zero 8-bit blocks of the divisor
  signal p1_allowed_dividend_shift : unsigned(1 downto 0);
  -- if the msb of divisor is set, the possible additional shift cannot happen because
  -- the condition 'dividend_top_bits >= 2*divisor_top_bits' can never be
  -- fulfilled (both vectors are 29 bits) therefore if an additional shift happens
  -- at most the 28th bit of divisor is set and shifted by 8 eg 36bits is enough
  signal p1_divisor                : unsigned(35 downto 0);  -- 28+8 buffer for shifting
  signal p1_dividend               : unsigned(32 downto 0);  -- 29+4
  signal p1_exp                    : unsigned(2 downto 0);
  signal p1_delta_exp              : unsigned(2 downto 0);
  signal p1_adjust_divisor         : unsigned(1 downto 0);
  signal p1_adjust_divisor_final   : unsigned(2 downto 0);
  signal p1_adjust_dividend        : unsigned(1 downto 0);
  signal p1_top_bits               : unsigned(7 downto 0);
  signal p1_sign                   : std_ulogic;
  signal p1_exp_ov                 : std_ulogic;  -- if p1_exp_sum > 7 => result will be maximum
  signal p1_exp_sum                : unsigned(3 downto 0);
  signal p1_rem                    : unsigned(41 downto 0);  -- +1 bit for shift buffer, +5 for division correction, +2 to make size after division correction same as divisor
  signal p1_div_by_zero            : std_ulogic;

  signal p2_busy         : std_ulogic;
  signal p2_divisor      : unsigned(35 downto 0);
  signal p2_exp          : unsigned(2 downto 0);
  signal p2_exp_ov       : std_ulogic;
  signal p2_exp_adjusted : unsigned(2 downto 0);
  signal p2_sign         : std_ulogic;
  signal p2_rem          : unsigned(41 downto 0);
  signal p2_rem_shft     : unsigned(41 downto 0);
  signal p2_rem_next     : unsigned(41 downto 0);

  signal p2_sub          : unsigned(36 downto 0);
  signal p2_quo          : unsigned(28 downto 0);  -- extend for rounding bit calculation!!
  signal p2_quo_adjusted : unsigned(36 downto 0);
  signal p2_quo_shft     : unsigned(28 downto 0);
  signal p2_quo_next     : unsigned(28 downto 0);
  signal p2_cnt          : unsigned(4 downto 0);
  signal p2_complete     : std_ulogic;
  signal p2_complete_1d  : std_ulogic;

begin  -- Rtl

  -- Control and stage-2 registers.
  -- The stage-1 results (combinational from regA_i/regB_i) are captured on
  -- the clock edge where start_1d is '1', i.e. one clock after start_i was
  -- accepted, so the operands have to be valid in that cycle.
  --
  -- NOTE(review): start_i is gated by p2_busy only, not by start_1d. If
  -- start_i is still '1' in the cycle after an accepted start (ready_o is
  -- already '0' then), start_1d is set again and the stage-2 registers are
  -- reloaded one clock later. Confirm that all callers honour ready_o.
  process (clk_i, reset_n_i)
  begin  -- process
    if reset_n_i = '0' then             -- asynchronous reset (active low)
      start_1d       <= '0';
      p2_busy        <= '0';
      p2_rem         <= to_unsigned(0,42);
      p2_exp         <= to_unsigned(0,3);
      p2_exp_ov      <= '0';
      p2_sign        <= '0';
      p2_divisor     <= to_unsigned(0,36);
      p2_quo         <= to_unsigned(0,29);
      p2_cnt         <= to_unsigned(0,5);
      p2_complete_1d <= '0';
    elsif clk_i'event and clk_i = '1' then  -- rising clock edge

      -- start_1d is a one-cycle pulse unless start_i is accepted again
      start_1d <= '0';
      if start_i = '1' and p2_busy = '0' then
        start_1d <= '1';
      end if;

      -- completion flag is a one-cycle pulse
      p2_complete_1d <= '0';
      if start_1d = '1' then
        -- load the iteration registers from stage 1
        p2_rem     <= p1_rem;
        p2_exp     <= p1_exp;
        p2_exp_ov  <= p1_exp_ov;
        p2_sign    <= p1_sign;
        p2_divisor <= p1_divisor;
        p2_quo     <= to_unsigned(0,29);
        p2_cnt     <= to_unsigned(28,5);
        p2_busy    <= '1';
      elsif p2_busy = '1' then
        -- one shift/subtract step per clock
        p2_rem <= p2_rem_next;
        p2_quo <= p2_quo_next;
        p2_cnt <= p2_cnt-1;
        if p2_complete = '1' then
          p2_complete_1d <= '1';
          p2_busy        <= '0';
          -- reset count (overrides the decrement above)
          p2_cnt         <= to_unsigned(28,5);
        end if;
      end if;
    end if;
  end process;

  -- Combinational logic of both stages. Intermediate results are signals
  -- that are read back by this same process, therefore they all appear in
  -- the sensitivity list.
  process (regA_i, regB_i,
           p1_dividend_mant, p1_divisor_mant, p1_divisor_zero, p1_delta_exp,
           p1_allowed_dividend_shift, p1_adjust_dividend, p1_dividend,
           p1_top_bits, p1_adjust_divisor, p1_adjust_divisor_final,
           p1_div_by_zero, p1_exp_sum,
           p2_rem, p2_rem_shft, p2_quo, p2_quo_shft, p2_divisor, p2_sub,
           p2_exp)
  begin  -- process

    -- stage 1

    p1_dividend_mant <= regA_i.mant;
    p1_divisor_mant  <= regB_i.mant;

    -- pad the 29-bit divisor mantissa to 32 bits so that it splits into four 8-bit blocks
    p1_divisor_zero <= zero_blocks(p1_divisor_mant & to_unsigned(0,3),8);
    -- p1_dividend_zero <= zero_blocks(p1_dividend_mant,8);

    p1_delta_exp <= to_unsigned(3,3)+('0' & regA_i.fmt.exp)-('0' & regB_i.fmt.exp);

    -- determine maximum allowed left shift of dividend
    p1_allowed_dividend_shift <= to_unsigned(0,2);

    if p1_divisor_zero(1 downto 0) = "01" or p1_delta_exp = to_unsigned(4,3) then
      p1_allowed_dividend_shift <= to_unsigned(1,2);
    elsif p1_divisor_zero(2 downto 1) = "01" or p1_delta_exp = to_unsigned(5,3) then
      p1_allowed_dividend_shift <= to_unsigned(2,2);
    elsif p1_divisor_zero(3 downto 2) = "01" or p1_delta_exp = to_unsigned(6,3) then
      p1_allowed_dividend_shift <= to_unsigned(3,2);
    end if;

    -- adjust dividend
    p1_adjust_dividend <= to_unsigned(0,2);

    -- p1_allowed_dividend_shift is 2 bits wide; all literals it is compared
    -- with use the same width
    if p1_dividend_mant < to_unsigned(2**25,29) and p1_allowed_dividend_shift > to_unsigned(0,2) then
      if p1_dividend_mant >= to_unsigned(2**17,29) or p1_allowed_dividend_shift = to_unsigned(1,2) then
        p1_adjust_dividend <= to_unsigned(1,2);
      elsif p1_dividend_mant >= to_unsigned(2**9,29) or p1_allowed_dividend_shift = to_unsigned(2,2) then
        p1_adjust_dividend <= to_unsigned(2,2);
      elsif p1_dividend_mant >= to_unsigned(2**1,29) or p1_allowed_dividend_shift = to_unsigned(3,2) then
        p1_adjust_dividend <= to_unsigned(3,2);
      end if;
    end if;

    p1_dividend <= fast_shift(to_unsigned(0,4) & p1_dividend_mant,to_integer(p1_adjust_dividend)*8,fast_shift_left);  -- extend with 4bits

    -- adjust divisor so that divisor >= dividend (when possible)
    p1_div_by_zero    <= '0';
    p1_adjust_divisor <= to_unsigned(0,2);
    p1_top_bits       <= p1_divisor_mant(28 downto 21);

    -- select the first non-zero 8-bit block of the divisor; if all four
    -- blocks are zero the divisor itself is zero
    if p1_divisor_zero(0) = '1' then
      if p1_divisor_zero(1) = '0' then
        p1_top_bits       <= p1_divisor_mant(20 downto 13);
        p1_adjust_divisor <= to_unsigned(1,2);
      elsif p1_divisor_zero(2) = '0' then
        p1_top_bits       <= p1_divisor_mant(12 downto 5);
        p1_adjust_divisor <= to_unsigned(2,2);
      elsif p1_divisor_zero(3) = '0' then
        p1_top_bits       <= p1_divisor_mant(4 downto 0) & "000";
        p1_adjust_divisor <= to_unsigned(3,2);
      else
        p1_div_by_zero <= '1';
      end if;
    end if;

    -- because dividend will be shifted right by 5 and left by 1 (= shifted right by 4) before division, only the
    -- top 4 bits are used for extra shift determination; p1_top_bits will be
    -- shifted left by 1 because only the most significant bit position must be the same; some example
    -- dividend: XXXXXAAA
    -- divisor:  BBBBBBBB
    -- msb position counts eg
    -- XXXXX111
    -- 00000100
    -- is a valid combination therefore the <= operator is not enough

    p1_adjust_divisor_final <= '0' & p1_adjust_divisor;
    if ('0' & p1_dividend(32 downto 25)) >= (p1_top_bits & '0') then
      p1_adjust_divisor_final <= ('0' & p1_adjust_divisor)+1;
    end if;

    p1_divisor <= fast_shift(to_unsigned(0,7) & p1_divisor_mant,to_integer(p1_adjust_divisor_final)*8,fast_shift_left);  -- 8bit overhead for shifting left

    -- build resulting fmt
    p1_sign <= regA_i.fmt.sign xor regB_i.fmt.sign;

    p1_exp_sum <= ('0' & p1_delta_exp)-('0' & p1_adjust_dividend)+('0' & p1_adjust_divisor_final);

    -- division by zero or an exponent sum above 7 saturates the exponent at 7
    -- and raises the overflow flag
    p1_exp_ov <= '0';
    p1_exp    <= to_unsigned(7,3);
    if p1_div_by_zero = '1' or p1_exp_sum >= to_unsigned(8,4) then
      p1_exp_ov <= '1';
    else
      p1_exp <= p1_exp_sum(2 downto 0);
    end if;

    p1_rem <= "000000000" & p1_dividend;

    -- stage 2

    -- shift
    p2_rem_shft <= p2_rem(40 downto 0) & '0';
    p2_quo_shft <= p2_quo(27 downto 0) & '0';

    -- situation when rem and divisor have the same msb position but rem is still greater:
    -- bit 41 of p2_rem_shft is mostly zero, but in the case above
    -- it will be '1' and p2_divisor is less (the zero-extended divisor always has a '0' at this position)
    p2_sub <= p2_rem_shft(41 downto 5)-('0' & p2_divisor(35 downto 0));

    p2_rem_next <= p2_rem_shft;
    p2_quo_next <= p2_quo_shft;

    -- check for sub overflow eg. p2_rem_shft >= p2_divisor
    if p2_sub(36) = '0' then            -- no overflow: therefore do sub
      p2_rem_next(41 downto 5) <= '0' & p2_sub(35 downto 0);
      p2_quo_next(0)           <= '1';
    end if;

    -- if exp > 3 normalize cannot correct it fully, therefore pre-shift left (but losing precision)
    p2_exp_adjusted <= p2_exp;
    p2_quo_adjusted <= "00000000" & p2_quo(28 downto 0);

    if p2_exp >= to_unsigned(4,3) then
      p2_exp_adjusted <= p2_exp-1;
      p2_quo_adjusted <= p2_quo(28 downto 0) & "00000000";
    end if;

  end process;

  -- the counter is loaded with 28 and the last step runs at 0: 29 steps, one per quotient bit
  p2_complete <= '1' when p2_cnt = to_unsigned(0,5) else '0';

  ready_o    <= not p2_busy and not start_1d;
  result_o   <= ((15 downto 0 => '0') & p2_quo_adjusted,to_unsigned(0,4) & p2_exp_ov,p2_exp_adjusted,p2_sign);
  complete_o <= p2_complete_1d;

end Rtl;
|
312 |
|
|
|