1 |
36 |
dgisselq |
////////////////////////////////////////////////////////////////////////////////
|
2 |
|
|
//
|
3 |
|
|
// Filename: softmpy.cpp
|
4 |
|
|
//
|
5 |
|
|
// Project: A General Purpose Pipelined FFT Implementation
|
6 |
|
|
//
|
7 |
|
|
// Purpose: If the chip doesn't have any hardware multiplies, you'll need
|
8 |
|
|
// a soft-multiply implementation. This provides that
|
9 |
|
|
// implementation.
|
10 |
|
|
//
|
11 |
|
|
// Creator: Dan Gisselquist, Ph.D.
|
12 |
|
|
// Gisselquist Technology, LLC
|
13 |
|
|
//
|
14 |
|
|
////////////////////////////////////////////////////////////////////////////////
|
15 |
|
|
//
|
16 |
|
|
// Copyright (C) 2015-2018, Gisselquist Technology, LLC
|
17 |
|
|
//
|
18 |
|
|
// This program is free software (firmware): you can redistribute it and/or
|
19 |
|
|
// modify it under the terms of the GNU General Public License as published
|
20 |
|
|
// by the Free Software Foundation, either version 3 of the License, or (at
|
21 |
|
|
// your option) any later version.
|
22 |
|
|
//
|
23 |
|
|
// This program is distributed in the hope that it will be useful, but WITHOUT
|
24 |
|
|
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
25 |
|
|
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
26 |
|
|
// for more details.
|
27 |
|
|
//
|
28 |
|
|
// You should have received a copy of the GNU General Public License along
|
29 |
|
|
// with this program. (It's in the $(ROOT)/doc directory, run make with no
|
30 |
|
|
// target there if the PDF file isn't present.) If not, see
|
31 |
|
|
// <http://www.gnu.org/licenses/> for a copy.
|
32 |
|
|
//
|
33 |
|
|
// License: GPL, v3, as defined and found on www.gnu.org,
|
34 |
|
|
// http://www.gnu.org/licenses/gpl.html
|
35 |
|
|
//
|
36 |
|
|
//
|
37 |
|
|
////////////////////////////////////////////////////////////////////////////////
|
38 |
|
|
//
|
39 |
|
|
//
|
40 |
|
|
#define _CRT_SECURE_NO_WARNINGS // ms vs 2012 doesn't like fopen
#include <stdio.h>
#include <stdlib.h>

#ifdef _MSC_VER // added for ms vs compatibility

#include <io.h>
#include <direct.h>
#define _USE_MATH_DEFINES

#endif

#include <string.h>
#include <string>
#include <math.h>
#include <ctype.h>
#include <assert.h>
#include <errno.h>

#include "defaults.h"
#include "legal.h"
#include "softmpy.h"
|
61 |
|
|
|
62 |
|
|
void build_multiply(const char *fname) {
|
63 |
|
|
FILE *fp = fopen(fname, "w");
|
64 |
|
|
if (NULL == fp) {
|
65 |
|
|
fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
|
66 |
|
|
perror("O/S Err was:");
|
67 |
|
|
return;
|
68 |
|
|
}
|
69 |
|
|
|
70 |
|
|
fprintf(fp,
|
71 |
|
|
SLASHLINE
|
72 |
|
|
"//\n"
|
73 |
|
|
"// Filename:\tshiftaddmpy.v\n"
|
74 |
|
|
"//\n"
|
75 |
|
|
"// Project:\t%s\n"
|
76 |
|
|
"//\n"
|
77 |
|
|
"// Purpose:\tA portable shift and add multiply.\n"
|
78 |
|
|
"//\n"
|
79 |
|
|
"// While both Xilinx and Altera will offer single clock multiplies, this\n"
|
80 |
|
|
"// simple approach will multiply two numbers on any architecture. The\n"
|
81 |
|
|
"// result maintains the full width of the multiply, there are no extra\n"
|
82 |
|
|
"// stuff bits, no rounding, no shifted bits, etc.\n"
|
83 |
|
|
"//\n"
|
84 |
|
|
"// Further, for those applications that can support it, this multiply\n"
|
85 |
|
|
"// is pipelined and will produce one answer per clock.\n"
|
86 |
|
|
"//\n"
|
87 |
|
|
"// For minimal processing delay, make the first parameter the one with\n"
|
88 |
|
|
"// the least bits, so that AWIDTH <= BWIDTH.\n"
|
89 |
|
|
"//\n"
|
90 |
|
|
"// The processing delay in this multiply is (AWIDTH+1) cycles. That is,\n"
|
91 |
|
|
"// if the data is present on the input at clock t=0, the result will be\n"
|
92 |
|
|
"// present on the output at time t=AWIDTH+1;\n"
|
93 |
|
|
"//\n"
|
94 |
|
|
"//\n%s"
|
95 |
|
|
"//\n", prjname, creator);
|
96 |
|
|
|
97 |
|
|
fprintf(fp, "%s", cpyleft);
|
98 |
|
|
fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
|
99 |
|
|
fprintf(fp,
|
100 |
|
|
"module shiftaddmpy(i_clk, i_ce, i_a, i_b, o_r);\n"
|
101 |
|
|
"\tparameter\tAWIDTH=%d,BWIDTH=", TST_SHIFTADDMPY_AW);
|
102 |
|
|
#ifdef TST_SHIFTADDMPY_BW
|
103 |
|
|
fprintf(fp, "%d;\n", TST_SHIFTADDMPY_BW);
|
104 |
|
|
#else
|
105 |
|
|
fprintf(fp, "AWIDTH;\n");
|
106 |
|
|
#endif
|
107 |
|
|
fprintf(fp,
|
108 |
|
|
"\tinput\t\t\t\t\ti_clk, i_ce;\n"
|
109 |
|
|
"\tinput\t\t[(AWIDTH-1):0]\t\ti_a;\n"
|
110 |
|
|
"\tinput\t\t[(BWIDTH-1):0]\t\ti_b;\n"
|
111 |
|
|
"\toutput\treg\t[(AWIDTH+BWIDTH-1):0]\to_r;\n"
|
112 |
|
|
"\n"
|
113 |
|
|
"\treg\t[(AWIDTH-1):0]\tu_a;\n"
|
114 |
|
|
"\treg\t[(BWIDTH-1):0]\tu_b;\n"
|
115 |
|
|
"\treg\t\t\tsgn;\n"
|
116 |
|
|
"\n"
|
117 |
|
|
"\treg\t[(AWIDTH-2):0]\t\tr_a[0:(AWIDTH-1)];\n"
|
118 |
|
|
"\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"
|
119 |
|
|
"\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"
|
120 |
|
|
"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"
|
121 |
|
|
"\tgenvar k;\n"
|
122 |
|
|
"\n"
|
123 |
|
|
"\t// If we were forced to stay within two\'s complement arithmetic,\n"
|
124 |
|
|
"\t// taking the absolute value here would require an additional bit.\n"
|
125 |
|
|
"\t// However, because our results are now unsigned, we can stay\n"
|
126 |
|
|
"\t// within the number of bits given (for now).\n"
|
127 |
|
|
"\talways @(posedge i_clk)\n"
|
128 |
|
|
"\t\tif (i_ce)\n"
|
129 |
|
|
"\t\tbegin\n"
|
130 |
|
|
"\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"
|
131 |
|
|
"\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"
|
132 |
|
|
"\t\t\tsgn <= i_a[AWIDTH-1] ^ i_b[BWIDTH-1];\n"
|
133 |
|
|
"\t\tend\n"
|
134 |
|
|
"\n"
|
135 |
|
|
"\talways @(posedge i_clk)\n"
|
136 |
|
|
"\t\tif (i_ce)\n"
|
137 |
|
|
"\t\tbegin\n"
|
138 |
|
|
"\t\t\tacc[0] <= (u_a[0]) ? { {(AWIDTH){1\'b0}}, u_b }\n"
|
139 |
|
|
"\t\t\t\t\t: {(AWIDTH+BWIDTH){1\'b0}};\n"
|
140 |
|
|
"\t\t\tr_a[0] <= { u_a[(AWIDTH-1):1] };\n"
|
141 |
|
|
"\t\t\tr_b[0] <= { {(AWIDTH-1){1\'b0}}, u_b };\n"
|
142 |
|
|
"\t\t\tr_s[0] <= sgn; // The final sign, needs to be preserved\n"
|
143 |
|
|
"\t\tend\n"
|
144 |
|
|
"\n"
|
145 |
|
|
"\tgenerate\n"
|
146 |
|
|
"\tfor(k=0; k<AWIDTH-1; k=k+1)\n"
|
147 |
|
|
"\tbegin : genstages\n"
|
148 |
|
|
"\t\talways @(posedge i_clk)\n"
|
149 |
|
|
"\t\tif (i_ce)\n"
|
150 |
|
|
"\t\tbegin\n"
|
151 |
|
|
"\t\t\tacc[k+1] <= acc[k] + ((r_a[k][0]) ? {r_b[k],1\'b0}:0);\n"
|
152 |
|
|
"\t\t\tr_a[k+1] <= { 1\'b0, r_a[k][(AWIDTH-2):1] };\n"
|
153 |
|
|
"\t\t\tr_b[k+1] <= { r_b[k][(AWIDTH+BWIDTH-3):0], 1\'b0};\n"
|
154 |
|
|
"\t\t\tr_s[k+1] <= r_s[k];\n"
|
155 |
|
|
"\t\tend\n"
|
156 |
|
|
"\tend\n"
|
157 |
|
|
"\tendgenerate\n"
|
158 |
|
|
"\n"
|
159 |
|
|
"\talways @(posedge i_clk)\n"
|
160 |
|
|
"\t\tif (i_ce)\n"
|
161 |
|
|
"\t\t\to_r <= (r_s[AWIDTH-1]) ? (-acc[AWIDTH-1]) : acc[AWIDTH-1];\n"
|
162 |
|
|
"\n"
|
163 |
|
|
"endmodule\n");
|
164 |
|
|
|
165 |
|
|
fclose(fp);
|
166 |
|
|
}
|
167 |
|
|
|
168 |
|
|
//
// build_bimpy
//
// Writes the Verilog source for the "bimpy" module into the file named by
// fname.  Per the text emitted below, bimpy multiplies a LUTB-bit (default
// 2) value i_a by a BW-bit (default 18) value i_b, registering the result
// in o_r on clocks where i_ce is set.
//
// The output consists of a comment header (filled in from fname, prjname
// and creator), the cpyleft text, a `default_nettype directive, and then
// the module itself.
//
// Arguments:
//	fname	Path of the file to create; it is opened with mode "w".  The
//		same string is also printed on the header's "Filename:" line.
//
// Errors:
//	If the file cannot be opened, a message plus the O/S error text are
//	printed to stderr and nothing is written.  If writing or closing the
//	file fails, a message is printed to stderr.  Nothing is returned to
//	the caller in either case.
//
void build_bimpy(const char *fname) {
	FILE	*fp = fopen(fname, "w");
	if (NULL == fp) {
		// fprintf() is allowed to modify errno, so hold on to the
		// value fopen() left behind until perror() can report it.
		const int	open_errno = errno;

		fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
		errno = open_errno;
		perror("O/S Err was:");
		return;
	}

	// File header: file name, project name and creator line
	fprintf(fp,
SLASHLINE
"//\n"
"// Filename:\t%s\n"
"//\n"
"// Project:\t%s\n"
"//\n"
"// Purpose:\tA simple 2-bit multiply based upon the fact that LUT's allow\n"
"// 6-bits of input. In other words, I could build a 3-bit\n"
"// multiply from 6 LUTs (5 actually, since the first could have two\n"
"// outputs). This would allow multiplication of three bit digits, save\n"
"// only for the fact that you would need two bits of carry. The bimpy\n"
"// approach throttles back a bit and does a 2x2 bit multiply in a LUT,\n"
"// guaranteeing that it will never carry more than one bit. While this\n"
"// multiply is hardware independent (and can still run under Verilator\n"
"// therefore), it is really motivated by trying to optimize for a\n"
"// specific piece of hardware (Xilinx-7 series ...) that has at least\n"
"// 4-input LUT's with carry chains.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);

	// License text, then the nettype directive
	fprintf(fp, "%s", cpyleft);
	fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

	// The module itself: partial products XOR'd into w_r, carries in c
	fprintf(fp,
"module bimpy(i_clk, i_ce, i_a, i_b, o_r);\n"
"\tparameter\tBW=18, // Number of bits in i_b\n"
"\t\t\tLUTB=2; // Number of bits in i_a for our LUT multiply\n"
"\tinput\t\t\t\ti_clk, i_ce;\n"
"\tinput\t\t[(LUTB-1):0]\ti_a;\n"
"\tinput\t\t[(BW-1):0]\ti_b;\n"
"\toutput\treg\t[(BW+LUTB-1):0] o_r;\n"
"\n"
"\twire [(BW+LUTB-2):0] w_r;\n"
"\twire [(BW+LUTB-3):1] c;\n"
"\n"
"\tassign\tw_r = { ((i_a[1])?i_b:{(BW){1\'b0}}), 1\'b0 }\n"
"\t\t\t\t^ { 1\'b0, ((i_a[0])?i_b:{(BW){1\'b0}}) };\n"
"\tassign\tc = { ((i_a[1])?i_b[(BW-2):0]:{(BW-1){1\'b0}}) }\n"
"\t\t\t& ((i_a[0])?i_b[(BW-1):1]:{(BW-1){1\'b0}});\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\t\to_r <= w_r + { c, 2'b0 };\n"
"\n"
"endmodule\n");

	// A failed fprintf() sets the stream's error indicator, and fclose()
	// can itself fail while flushing.  Sample the indicator first, then
	// always close the stream, and report if either step went wrong.
	const int	write_failed = ferror(fp);
	if ((fclose(fp) != 0) || write_failed)
		fprintf(stderr, "Error while writing \'%s\'\n", fname);
}
|
226 |
|
|
|
227 |
|
|
//
// build_longbimpy
//
// Writes the Verilog source for the "longbimpy" module into the file named
// by fname.  Per the text emitted below, longbimpy is a pipelined signed
// multiply that instantiates the bimpy module to consume LUTB (2) bits of
// the narrower operand per stage.
//
// The output consists of a comment header (filled in from fname, prjname
// and creator), the cpyleft text, a `default_nettype directive, and then
// the module itself.  The module's IAW parameter defaults to
// TST_LONGBIMPY_AW; IBW defaults to TST_LONGBIMPY_BW when that macro is
// defined at compile time, and to IAW otherwise.
//
// Arguments:
//	fname	Path of the file to create; it is opened with mode "w".  The
//		same string is also printed on the header's "Filename:" line.
//
// Errors:
//	If the file cannot be opened, a message plus the O/S error text are
//	printed to stderr and nothing is written.  If writing or closing the
//	file fails, a message is printed to stderr.  Nothing is returned to
//	the caller in either case.
//
// NOTE(review): the emitted "First step" comment reads "keeping track of
// the along the way" -- a word (presumably "sign") is missing.  It is left
// untouched here so that the generated file stays byte-identical.
//
void build_longbimpy(const char *fname) {
	FILE	*fp = fopen(fname, "w");
	if (NULL == fp) {
		// fprintf() is allowed to modify errno, so hold on to the
		// value fopen() left behind until perror() can report it.
		const int	open_errno = errno;

		fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
		errno = open_errno;
		perror("O/S Err was:");
		return;
	}

	// File header: file name, project name and creator line
	fprintf(fp,
SLASHLINE
"//\n"
"// Filename: %s\n"
"//\n"
"// Project: %s\n"
"//\n"
"// Purpose: A portable shift and add multiply, built with the knowledge\n"
"// of the existence of a six bit LUT and carry chain. That knowledge\n"
"// allows us to multiply two bits from one value at a time against all\n"
"// of the bits of the other value. This sub multiply is called the\n"
"// bimpy.\n"
"//\n"
"// For minimal processing delay, make the first parameter the one with\n"
"// the least bits, so that AWIDTH <= BWIDTH.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);

	// License text, then the nettype directive
	fprintf(fp, "%s", cpyleft);
	fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

	// Module declaration and the default operand widths
	fprintf(fp,
"module longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);\n"
"\tparameter IAW=%d, // The width of i_a, min width is 5\n"
"\t\t\tIBW=", TST_LONGBIMPY_AW);
#ifdef TST_LONGBIMPY_BW
	fprintf(fp, "%d", TST_LONGBIMPY_BW);
#else
	fprintf(fp, "IAW");
#endif

	// Remainder of the parameter list
	fprintf(fp, ", // The width of i_b, can be anything\n"
"\t\t\t// The following three parameters should not be changed\n"
"\t\t\t// by any implementation, but are based upon hardware\n"
"\t\t\t// and the above values:\n"
"\t\t\tOW=IAW+IBW; // The output width\n");

	// Derived local parameters, ports, and the multiply pipeline
	fprintf(fp,
"\tlocalparam AW = (IAW<IBW) ? IAW : IBW,\n"
"\t\t\tBW = (IAW<IBW) ? IBW : IAW,\n"
"\t\t\tIW=(AW+1)&(-2), // Internal width of A\n"
"\t\t\tLUTB=2, // How many bits we can multiply by at once\n"
"\t\t\tTLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau\n"
"\tinput\t\t\t\ti_clk, i_ce;\n"
"\tinput\t\t[(IAW-1):0]\ti_a_unsorted;\n"
"\tinput\t\t[(IBW-1):0]\ti_b_unsorted;\n"
"\toutput\treg\t[(AW+BW-1):0]\to_r;\n"
"\n"
"\t//\n"
"\t// Swap parameter order, so that AW <= BW -- for performance\n"
"\t// reasons\n"
"\twire [AW-1:0] i_a;\n"
"\twire [BW-1:0] i_b;\n"
"\tgenerate if (IAW <= IBW)\n"
"\tbegin : NO_PARAM_CHANGE\n"
"\t\tassign i_a = i_a_unsorted;\n"
"\t\tassign i_b = i_b_unsorted;\n"
"\tend else begin : SWAP_PARAMETERS\n"
"\t\tassign i_a = i_b_unsorted;\n"
"\t\tassign i_b = i_a_unsorted;\n"
"\tend endgenerate\n"
"\n"
"\treg\t[(IW-1):0]\tu_a;\n"
"\treg\t[(BW-1):0]\tu_b;\n"
"\treg\t\t\tsgn;\n"
"\n"
"\treg\t[(IW-1-2*(LUTB)):0]\tr_a[0:(TLEN-3)];\n"
"\treg\t[(BW-1):0]\t\tr_b[0:(TLEN-3)];\n"
"\treg\t[(TLEN-1):0]\t\tr_s;\n"
"\treg\t[(IW+BW-1):0]\t\tacc[0:(TLEN-2)];\n"
"\tgenvar k;\n"
"\n"
"\t// First step:\n"
"\t// Switch to unsigned arithmetic for our multiply, keeping track\n"
"\t// of the along the way. We'll then add the sign again later at\n"
"\t// the end.\n"
"\t//\n"
"\t// If we were forced to stay within two's complement arithmetic,\n"
"\t// taking the absolute value here would require an additional bit.\n"
"\t// However, because our results are now unsigned, we can stay\n"
"\t// within the number of bits given (for now).\n"
"\tgenerate if (IW > AW)\n"
"\tbegin\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\n"
"\t\t\t\tu_a <= { 1\'b0, (i_a[AW-1])?(-i_a):(i_a) };\n"
"\tend else begin\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\n"
"\t\t\t\tu_a <= (i_a[AW-1])?(-i_a):(i_a);\n"
"\tend endgenerate\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
"\t\t\tu_b <= (i_b[BW-1])?(-i_b):(i_b);\n"
"\t\t\tsgn <= i_a[AW-1] ^ i_b[BW-1];\n"
"\t\tend\n"
"\n"
"\twire [(BW+LUTB-1):0] pr_a, pr_b;\n"
"\n"
"\t//\n"
"\t// Second step: First two 2xN products.\n"
"\t//\n"
"\t// Since we have no tableau of additions (yet), we can do both\n"
"\t// of the first two rows at the same time and add them together.\n"
"\t// For the next round, we'll then have a previous sum to accumulate\n"
"\t// with new and subsequent product, and so only do one product at\n"
"\t// a time can follow this--but the first clock can do two at a time.\n"
"\tbimpy\t#(BW) lmpy_0(i_clk,i_ce,u_a[( LUTB-1): 0], u_b, pr_a);\n"
"\tbimpy\t#(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce) r_b[0] <= u_b;\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };\n"
"\talways @(posedge i_clk) // One clk after p[0],p[1] become valid\n"
"\t\tif (i_ce) acc[0] <= { {(IW-LUTB){1\'b0}}, pr_a}\n"
"\t\t\t +{ {(IW-(2*LUTB)){1\'b0}}, pr_b, {(LUTB){1\'b0}} };\n"
"\n"
"\tgenerate // Keep track of intermediate values, before multiplying them\n"
"\tif (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)\n"
"\tbegin : gencopies\n"
"\t\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
"\t\t\tr_a[k+1] <= { {(LUTB){1\'b0}},\n"
"\t\t\t\tr_a[k][(IW-1-(2*LUTB)):LUTB] };\n"
"\t\t\tr_b[k+1] <= r_b[k];\n"
"\t\tend\n"
"\tend endgenerate\n"
"\n"
"\tgenerate // The actual multiply and accumulate stage\n"
"\tif (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)\n"
"\tbegin : genstages\n"
"\t\t// First, the multiply: 2-bits times BW bits\n"
"\t\twire\t[(BW+LUTB-1):0] genp;\n"
"\t\tbimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);\n"
"\n"
"\t\t// Then the accumulate step -- on the next clock\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\n"
"\t\t\t\tacc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1\'b0}},\n"
"\t\t\t\t\tgenp, {(LUTB*(k+2)){1\'b0}} };\n"
"\tend endgenerate\n"
"\n"
"\twire [(IW+BW-1):0] w_r;\n"
"\tassign\tw_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\t\to_r <= w_r[(AW+BW-1):0];\n"
"\n"
"\tgenerate if (IW > AW)\n"
"\tbegin : VUNUSED\n"
"\t\t// verilator lint_off UNUSED\n"
"\t\twire\t[(IW-AW)-1:0]\tunused;\n"
"\t\tassign\tunused = w_r[(IW+BW-1):(AW+BW)];\n"
"\t\t// verilator lint_on UNUSED\n"
"\tend endgenerate\n"
"\n"
"endmodule\n");

	// A failed fprintf() sets the stream's error indicator, and fclose()
	// can itself fail while flushing.  Sample the indicator first, then
	// always close the stream, and report if either step went wrong.
	const int	write_failed = ferror(fp);
	if ((fclose(fp) != 0) || write_failed)
		fprintf(stderr, "Error while writing \'%s\'\n", fname);
}
|
400 |
|
|
|