1 |
36 |
dgisselq |
////////////////////////////////////////////////////////////////////////////////
|
2 |
|
|
//
|
3 |
|
|
// Filename: ../rtl/longbimpy.v
|
4 |
|
|
//
|
5 |
|
|
// Project: A General Purpose Pipelined FFT Implementation
|
6 |
|
|
//
|
7 |
|
|
// Purpose: A portable shift and add multiply, built with the knowledge
|
8 |
|
|
// of the existence of a six bit LUT and carry chain. That knowledge
|
9 |
|
|
// allows us to multiply two bits from one value at a time against all
|
10 |
|
|
// of the bits of the other value. This sub multiply is called the
|
11 |
|
|
// bimpy.
|
12 |
|
|
//
|
13 |
|
|
// For minimal processing delay, make the first parameter the one with
|
14 |
|
|
// the least bits, so that AWIDTH <= BWIDTH.
|
15 |
|
|
//
|
16 |
|
|
//
|
17 |
|
|
//
|
18 |
|
|
// Creator: Dan Gisselquist, Ph.D.
|
19 |
|
|
// Gisselquist Technology, LLC
|
20 |
|
|
//
|
21 |
|
|
////////////////////////////////////////////////////////////////////////////////
|
22 |
|
|
//
|
23 |
|
|
// Copyright (C) 2015-2018, Gisselquist Technology, LLC
|
24 |
|
|
//
|
25 |
|
|
// This program is free software (firmware): you can redistribute it and/or
|
26 |
|
|
// modify it under the terms of the GNU General Public License as published
|
27 |
|
|
// by the Free Software Foundation, either version 3 of the License, or (at
|
28 |
|
|
// your option) any later version.
|
29 |
|
|
//
|
30 |
|
|
// This program is distributed in the hope that it will be useful, but WITHOUT
|
31 |
|
|
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
32 |
|
|
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
33 |
|
|
// for more details.
|
34 |
|
|
//
|
35 |
|
|
// You should have received a copy of the GNU General Public License along
|
36 |
|
|
// with this program. (It's in the $(ROOT)/doc directory, run make with no
|
37 |
|
|
// target there if the PDF file isn't present.) If not, see
|
38 |
|
|
// <http://www.gnu.org/licenses/> for a copy.
|
39 |
|
|
//
|
40 |
|
|
// License: GPL, v3, as defined and found on www.gnu.org,
|
41 |
|
|
// http://www.gnu.org/licenses/gpl.html
|
42 |
|
|
//
|
43 |
|
|
//
|
44 |
|
|
////////////////////////////////////////////////////////////////////////////////
|
45 |
|
|
//
|
46 |
|
|
//
|
47 |
|
|
`default_nettype none
|
48 |
|
|
//
|
49 |
|
|
// longbimpy
//
// A pipelined, signed, shift-and-add multiplier.  The two operands are
// first converted to magnitudes (remembering the sign of the product),
// the narrower magnitude is then consumed LUTB (two) bits at a time by
// "bimpy" sub-multipliers, the partial products are accumulated one row
// per clock, and finally the sign is re-applied to the result.
//
// Ports:
//	i_clk		Clock; every register below is clocked on its
//			rising edge.
//	i_ce		Clock enable; no register in this module updates
//			while it is low.
//	i_a_unsorted	Signed (two's complement) operand, IAW bits.
//	i_b_unsorted	Signed (two's complement) operand, IBW bits.
//	o_r		Signed product, AW+BW (== IAW+IBW) bits.
//
// Latency: the sign of the product passes through sgn, the TLEN deep
// r_s shift register, and the o_r register, i.e. TLEN+2 enabled clocks.
// NOTE(review): the data path only lines up with that sign path if
// bimpy registers its result exactly once.  bimpy is defined elsewhere,
// so please confirm against its source.
//
module longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);
	parameter	IAW=8,	// The width of i_a, min width is 5
			IBW=12,	// The width of i_b, can be anything
			// The following parameter should not be changed
			// by any implementation, but is based upon hardware
			// and the above values:
			OW=IAW+IBW;	// The output width
	localparam	AW = (IAW<IBW) ? IAW : IBW,	// Narrower operand
			BW = (IAW<IBW) ? IBW : IAW,	// Wider operand
			IW=(AW+1)&(-2),	// Internal width of A (AW rounded up to even)
			LUTB=2,	// How many bits we can multiply by at once
			TLEN=(AW+(LUTB-1))/LUTB; // Number of rows in our tableau
	// The file is compiled with `default_nettype none, so every input
	// is given an explicit net type rather than relying on an implicit
	// one.
	input	wire			i_clk, i_ce;
	input	wire	[(IAW-1):0]	i_a_unsorted;
	input	wire	[(IBW-1):0]	i_b_unsorted;
	output	reg	[(AW+BW-1):0]	o_r;

	//
	// Swap parameter order, so that AW <= BW -- for performance
	// reasons
	wire	[AW-1:0]	i_a;
	wire	[BW-1:0]	i_b;
	generate if (IAW <= IBW)
	begin : NO_PARAM_CHANGE
		assign i_a = i_a_unsorted;
		assign i_b = i_b_unsorted;
	end else begin : SWAP_PARAMETERS
		assign i_a = i_b_unsorted;
		assign i_b = i_a_unsorted;
	end endgenerate

	reg	[(IW-1):0]	u_a;	// |i_a|, zero padded to IW bits
	reg	[(BW-1):0]	u_b;	// |i_b|
	reg			sgn;	// Set if the product is to be negated

	// Pipeline storage.  r_a/r_b carry the not-yet-multiplied bits of
	// u_a and a copy of u_b down the pipeline, r_s carries the sign,
	// and acc[] holds the running sum of partial products.
	reg	[(IW-1-2*(LUTB)):0]	r_a[0:(TLEN-3)];
	reg	[(BW-1):0]		r_b[0:(TLEN-3)];
	reg	[(TLEN-1):0]		r_s;
	reg	[(IW+BW-1):0]		acc[0:(TLEN-2)];
	genvar k;

	// First step:
	// Switch to unsigned arithmetic for our multiply, keeping track
	// of the sign along the way.  We'll then add the sign again later
	// at the end.
	//
	// If we were forced to stay within two's complement arithmetic,
	// taking the absolute value here would require an additional bit.
	// However, because our results are now unsigned, we can stay
	// within the number of bits given (for now).
	generate if (IW > AW)
	begin : ABS_AND_ADD_BIT_TO_A
		// AW is odd: pad the magnitude with a zero bit so that it
		// divides evenly into LUTB-bit slices.
		always @(posedge i_clk)
		if (i_ce)
			u_a <= { 1'b0, (i_a[AW-1])?(-i_a):(i_a) };
	end else begin : ABS_A
		always @(posedge i_clk)
		if (i_ce)
			u_a <= (i_a[AW-1])?(-i_a):(i_a);
	end endgenerate

	always @(posedge i_clk)
	if (i_ce)
	begin
		u_b <= (i_b[BW-1])?(-i_b):(i_b);
		// The product is negative only if the operand signs differ
		sgn <= i_a[AW-1] ^ i_b[BW-1];
	end

	wire	[(BW+LUTB-1):0]	pr_a, pr_b;

	//
	// Second step: First two 2xN products.
	//
	// Since we have no tableau of additions (yet), we can do both
	// of the first two rows at the same time and add them together.
	// For the next round, we'll then have a previous sum to accumulate
	// with new and subsequent product, and so only do one product at
	// a time can follow this--but the first clock can do two at a time.
	bimpy	#(BW)	lmpy_0(i_clk,i_ce,u_a[(  LUTB-1):   0], u_b, pr_a);
	bimpy	#(BW)	lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);

	// Hold on to the bits of u_a that have not yet been multiplied
	always @(posedge i_clk)
	if (i_ce)
		r_a[0] <= u_a[(IW-1):(2*LUTB)];

	always @(posedge i_clk)
	if (i_ce)
		r_b[0] <= u_b;

	// The sign rides along in a shift register, one bit per stage
	always @(posedge i_clk)
	if (i_ce)
		r_s <= { r_s[(TLEN-2):0], sgn };

	// Sum of the first two rows; the second row is weighted by 2^LUTB
	always @(posedge i_clk) // One clk after p[0],p[1] become valid
	if (i_ce)
		acc[0] <= { {(IW-LUTB){1'b0}}, pr_a}
			+{ {(IW-(2*LUTB)){1'b0}}, pr_b, {(LUTB){1'b0}} };

	generate // Keep track of intermediate values, before multiplying them
	if (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)
	begin : gencopies
		always @(posedge i_clk)
		if (i_ce)
		begin
			// Drop the LUTB bits consumed by this stage and
			// shift the remainder down.
			r_a[k+1] <= { {(LUTB){1'b0}},
				r_a[k][(IW-1-(2*LUTB)):LUTB] };
			r_b[k+1] <= r_b[k];
		end
	end endgenerate

	generate // The actual multiply and accumulate stage
	if (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)
	begin : genstages
		// First, the multiply: 2-bits times BW bits
		wire	[(BW+LUTB-1):0] genp;
		bimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);

		// Then the accumulate step -- on the next clock.  The
		// new row is weighted by 2^(LUTB*(k+2)).  For the final
		// stage the leading replication count is zero, which is
		// legal here only because the rest of the concatenation
		// has a positive width.
		always @(posedge i_clk)
		if (i_ce)
			acc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1'b0}},
				genp, {(LUTB*(k+2)){1'b0}} };
	end endgenerate

	// Last step: restore the sign, and drop the padding bit (if any)
	wire	[(IW+BW-1):0]	w_r;
	assign	w_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];

	always @(posedge i_clk)
	if (i_ce)
		o_r <= w_r[(AW+BW-1):0];

	generate if (IW > AW)
	begin : VUNUSED
		// The top IW-AW bits of w_r never reach o_r.  Reference
		// them here so that Verilator doesn't complain.
		// verilator lint_off UNUSED
		wire	[(IW-AW)-1:0]	unused;
		assign	unused = w_r[(IW+BW-1):(AW+BW)];
		// verilator lint_on UNUSED
	end endgenerate

endmodule
|