URL https://opencores.org/ocsvn/dblclockfft/dblclockfft/trunk

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [sw/] [softmpy.cpp] - Blame information for rev 36

Go to most recent revision | Details | Compare with Previous | View Log


////////////////////////////////////////////////////////////////////////////////
//
// Filename:    softmpy.cpp
//
// Project:     A General Purpose Pipelined FFT Implementation
//
// Purpose:     If the chip doesn't have any hardware multiplies, you'll need
//              a soft-multiply implementation.  This provides that
//      implementation.
//
// Creator:     Dan Gisselquist, Ph.D.
//              Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2018, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program.  (It's in the $(ROOT)/doc directory, run make with no
// target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
//
// License:     GPL, v3, as defined and found on www.gnu.org,
//              http://www.gnu.org/licenses/gpl.html
//
//
////////////////////////////////////////////////////////////////////////////////
//
//
#define _CRT_SECURE_NO_WARNINGS   //  ms vs 2012 doesn't like fopen
#include <stdio.h>
#include <stdlib.h>

#ifdef _MSC_VER //  added for ms vs compatibility

#include <io.h>
#include <direct.h>
#define _USE_MATH_DEFINES

#endif

#include <string.h>
#include <string>
#include <math.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>

#include "defaults.h"
#include "legal.h"
#include "softmpy.h"
 
void    build_multiply(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }
 
        fprintf(fp,
SLASHLINE
"//\n"
"// Filename:\tshiftaddmpy.v\n"
"//\n"
"// Project:\t%s\n"
"//\n"
"// Purpose:\tA portable shift and add multiply.\n"
"//\n"
"//     While both Xilinx and Altera will offer single clock multiplies, this\n"
"//     simple approach will multiply two numbers on any architecture.  The\n"
"//     result maintains the full width of the multiply, there are no extra\n"
"//     stuff bits, no rounding, no shifted bits, etc.\n"
"//\n"
"//     Further, for those applications that can support it, this multiply\n"
"//     is pipelined and will produce one answer per clock.\n"
"//\n"
"//     For minimal processing delay, make the first parameter the one with\n"
"//     the least bits, so that AWIDTH <= BWIDTH.\n"
"//\n"
"//     The processing delay in this multiply is (AWIDTH+1) cycles.  That is,\n"
"//     if the data is present on the input at clock t=0, the result will be\n"
"//     present on the output at time t=AWIDTH+1;\n"
"//\n"
"//\n%s"
"//\n", prjname, creator);
 
        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
        fprintf(fp,
"module shiftaddmpy(i_clk, i_ce, i_a, i_b, o_r);\n"
        "\tparameter\tAWIDTH=%d,BWIDTH=", TST_SHIFTADDMPY_AW);
#ifdef  TST_SHIFTADDMPY_BW
        fprintf(fp, "%d;\n", TST_SHIFTADDMPY_BW);
#else
        fprintf(fp, "AWIDTH;\n");
#endif
        fprintf(fp,
        "\tinput\t\t\t\t\ti_clk, i_ce;\n"
        "\tinput\t\t[(AWIDTH-1):0]\t\ti_a;\n"
        "\tinput\t\t[(BWIDTH-1):0]\t\ti_b;\n"
        "\toutput\treg\t[(AWIDTH+BWIDTH-1):0]\to_r;\n"
"\n"
        "\treg\t[(AWIDTH-1):0]\tu_a;\n"
        "\treg\t[(BWIDTH-1):0]\tu_b;\n"
        "\treg\t\t\tsgn;\n"
"\n"
        "\treg\t[(AWIDTH-2):0]\t\tr_a[0:(AWIDTH-1)];\n"
        "\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"
        "\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"
        "\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"
        "\tgenvar k;\n"
"\n"
        "\t// If we were forced to stay within two\'s complement arithmetic,\n"
        "\t// taking the absolute value here would require an additional bit.\n"
        "\t// However, because our results are now unsigned, we can stay\n"
        "\t// within the number of bits given (for now).\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"
                        "\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"
                        "\t\t\tsgn <= i_a[AWIDTH-1] ^ i_b[BWIDTH-1];\n"
                "\t\tend\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tacc[0] <= (u_a[0]) ? { {(AWIDTH){1\'b0}}, u_b }\n"
                        "\t\t\t\t\t: {(AWIDTH+BWIDTH){1\'b0}};\n"
                        "\t\t\tr_a[0] <= { u_a[(AWIDTH-1):1] };\n"
                        "\t\t\tr_b[0] <= { {(AWIDTH-1){1\'b0}}, u_b };\n"
                        "\t\t\tr_s[0] <= sgn; // The final sign, needs to be preserved\n"
                "\t\tend\n"
"\n"
        "\tgenerate\n"
        "\tfor(k=0; k<AWIDTH-1; k=k+1)\n"
        "\tbegin : genstages\n"
                "\t\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tacc[k+1] <= acc[k] + ((r_a[k][0]) ? {r_b[k],1\'b0}:0);\n"
                        "\t\t\tr_a[k+1] <= { 1\'b0, r_a[k][(AWIDTH-2):1] };\n"
                        "\t\t\tr_b[k+1] <= { r_b[k][(AWIDTH+BWIDTH-3):0], 1\'b0};\n"
                        "\t\t\tr_s[k+1] <= r_s[k];\n"
                "\t\tend\n"
        "\tend\n"
        "\tendgenerate\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                        "\t\t\to_r <= (r_s[AWIDTH-1]) ? (-acc[AWIDTH-1]) : acc[AWIDTH-1];\n"
"\n"
"endmodule\n");
 
        fclose(fp);
}
 
void    build_bimpy(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }
 
        fprintf(fp,
SLASHLINE
"//\n"
"// Filename:\t%s\n"
"//\n"
"// Project:\t%s\n"
"//\n"
"// Purpose:\tA simple 2-bit multiply based upon the fact that LUT's allow\n"
"//             6-bits of input.  In other words, I could build a 3-bit\n"
"//     multiply from 6 LUTs (5 actually, since the first could have two\n"
"//     outputs).  This would allow multiplication of three bit digits, save\n"
"//     only for the fact that you would need two bits of carry.  The bimpy\n"
"//     approach throttles back a bit and does a 2x2 bit multiply in a LUT,\n"
"//     guaranteeing that it will never carry more than one bit.  While this\n"
"//     multiply is hardware independent (and can still run under Verilator\n"
"//     therefore), it is really motivated by trying to optimize for a\n"
"//     specific piece of hardware (Xilinx-7 series ...) that has at least\n"
"//     4-input LUT's with carry chains.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);
 
        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
        fprintf(fp,
"module bimpy(i_clk, i_ce, i_a, i_b, o_r);\n"
"\tparameter\tBW=18, // Number of bits in i_b\n"
"\t\t\tLUTB=2; // Number of bits in i_a for our LUT multiply\n"
"\tinput\t\t\t\ti_clk, i_ce;\n"
"\tinput\t\t[(LUTB-1):0]\ti_a;\n"
"\tinput\t\t[(BW-1):0]\ti_b;\n"
"\toutput\treg\t[(BW+LUTB-1):0] o_r;\n"
"\n"
"\twire [(BW+LUTB-2):0] w_r;\n"
"\twire [(BW+LUTB-3):1] c;\n"
"\n"
"\tassign\tw_r =  { ((i_a[1])?i_b:{(BW){1\'b0}}), 1\'b0 }\n"
"\t\t\t\t^ { 1\'b0, ((i_a[0])?i_b:{(BW){1\'b0}}) };\n"
"\tassign\tc = { ((i_a[1])?i_b[(BW-2):0]:{(BW-1){1\'b0}}) }\n"
"\t\t\t& ((i_a[0])?i_b[(BW-1):1]:{(BW-1){1\'b0}});\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\t\to_r <= w_r + { c, 2'b0 };\n"
"\n"
"endmodule\n");
 
        fclose(fp);
}
 
void    build_longbimpy(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }
 
        fprintf(fp,
SLASHLINE
"//\n"
"// Filename:   %s\n"
"//\n"
"// Project:    %s\n"
"//\n"
"// Purpose:    A portable shift and add multiply, built with the knowledge\n"
"//     of the existence of a six bit LUT and carry chain.  That knowledge\n"
"//     allows us to multiply two bits from one value at a time against all\n"
"//     of the bits of the other value.  This sub multiply is called the\n"
"//     bimpy.\n"
"//\n"
"//     For minimal processing delay, make the first parameter the one with\n"
"//     the least bits, so that AWIDTH <= BWIDTH.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);
 
        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
        fprintf(fp,
"module longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);\n"
        "\tparameter    IAW=%d, // The width of i_a, min width is 5\n"
                        "\t\t\tIBW=", TST_LONGBIMPY_AW);
#ifdef  TST_LONGBIMPY_BW
        fprintf(fp, "%d", TST_LONGBIMPY_BW);
#else
        fprintf(fp, "IAW");
#endif
 
        fprintf(fp, ",  // The width of i_b, can be anything\n"
                        "\t\t\t// The following three parameters should not be changed\n"
                        "\t\t\t// by any implementation, but are based upon hardware\n"
                        "\t\t\t// and the above values:\n"
                        "\t\t\tOW=IAW+IBW;      // The output width\n");
        fprintf(fp,
        "\tlocalparam   AW = (IAW<IBW) ? IAW : IBW,\n"
                        "\t\t\tBW = (IAW<IBW) ? IBW : IAW,\n"
                        "\t\t\tIW=(AW+1)&(-2),  // Internal width of A\n"
                        "\t\t\tLUTB=2,  // How many bits we can multiply by at once\n"
                        "\t\t\tTLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau\n"
        "\tinput\t\t\t\ti_clk, i_ce;\n"
        "\tinput\t\t[(IAW-1):0]\ti_a_unsorted;\n"
        "\tinput\t\t[(IBW-1):0]\ti_b_unsorted;\n"
        "\toutput\treg\t[(AW+BW-1):0]\to_r;\n"
"\n"
        "\t//\n"
        "\t// Swap parameter order, so that AW <= BW -- for performance\n"
        "\t// reasons\n"
        "\twire [AW-1:0]        i_a;\n"
        "\twire [BW-1:0]        i_b;\n"
        "\tgenerate if (IAW <= IBW)\n"
        "\tbegin : NO_PARAM_CHANGE\n"
        "\t\tassign i_a = i_a_unsorted;\n"
        "\t\tassign i_b = i_b_unsorted;\n"
        "\tend else begin : SWAP_PARAMETERS\n"
        "\t\tassign i_a = i_b_unsorted;\n"
        "\t\tassign i_b = i_a_unsorted;\n"
        "\tend endgenerate\n"
"\n"
        "\treg\t[(IW-1):0]\tu_a;\n"
        "\treg\t[(BW-1):0]\tu_b;\n"
        "\treg\t\t\tsgn;\n"
"\n"
        "\treg\t[(IW-1-2*(LUTB)):0]\tr_a[0:(TLEN-3)];\n"
        "\treg\t[(BW-1):0]\t\tr_b[0:(TLEN-3)];\n"
        "\treg\t[(TLEN-1):0]\t\tr_s;\n"
        "\treg\t[(IW+BW-1):0]\t\tacc[0:(TLEN-2)];\n"
        "\tgenvar k;\n"
"\n"
        "\t// First step:\n"
        "\t// Switch to unsigned arithmetic for our multiply, keeping track\n"
        "\t// of the along the way.  We'll then add the sign again later at\n"
        "\t// the end.\n"
        "\t//\n"
        "\t// If we were forced to stay within two's complement arithmetic,\n"
        "\t// taking the absolute value here would require an additional bit.\n"
        "\t// However, because our results are now unsigned, we can stay\n"
        "\t// within the number of bits given (for now).\n"
        "\tgenerate if (IW > AW)\n"
        "\tbegin\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                        "\t\t\t\tu_a <= { 1\'b0, (i_a[AW-1])?(-i_a):(i_a) };\n"
        "\tend else begin\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                        "\t\t\t\tu_a <= (i_a[AW-1])?(-i_a):(i_a);\n"
        "\tend endgenerate\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tu_b <= (i_b[BW-1])?(-i_b):(i_b);\n"
                        "\t\t\tsgn <= i_a[AW-1] ^ i_b[BW-1];\n"
                "\t\tend\n"
"\n"
        "\twire [(BW+LUTB-1):0] pr_a, pr_b;\n"
"\n"
        "\t//\n"
        "\t// Second step: First two 2xN products.\n"
        "\t//\n"
        "\t// Since we have no tableau of additions (yet), we can do both\n"
        "\t// of the first two rows at the same time and add them together.\n"
        "\t// For the next round, we'll then have a previous sum to accumulate\n"
        "\t// with new and subsequent product, and so only do one product at\n"
        "\t// a time can follow this--but the first clock can do two at a time.\n"
        "\tbimpy\t#(BW) lmpy_0(i_clk,i_ce,u_a[(  LUTB-1):   0], u_b, pr_a);\n"
        "\tbimpy\t#(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_b[0] <= u_b;\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };\n"
        "\talways @(posedge i_clk) // One clk after p[0],p[1] become valid\n"
                "\t\tif (i_ce) acc[0] <= { {(IW-LUTB){1\'b0}}, pr_a}\n"
                        "\t\t\t  +{ {(IW-(2*LUTB)){1\'b0}}, pr_b, {(LUTB){1\'b0}} };\n"
"\n"
        "\tgenerate // Keep track of intermediate values, before multiplying them\n"
        "\tif (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)\n"
        "\tbegin : gencopies\n"
                "\t\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tr_a[k+1] <= { {(LUTB){1\'b0}},\n"
                                "\t\t\t\tr_a[k][(IW-1-(2*LUTB)):LUTB] };\n"
                        "\t\t\tr_b[k+1] <= r_b[k];\n"
                        "\t\tend\n"
        "\tend endgenerate\n"
"\n"
        "\tgenerate // The actual multiply and accumulate stage\n"
        "\tif (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)\n"
        "\tbegin : genstages\n"
                "\t\t// First, the multiply: 2-bits times BW bits\n"
                "\t\twire\t[(BW+LUTB-1):0] genp;\n"
                "\t\tbimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);\n"
"\n"
                "\t\t// Then the accumulate step -- on the next clock\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                                "\t\t\t\tacc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1\'b0}},\n"
                                        "\t\t\t\t\tgenp, {(LUTB*(k+2)){1\'b0}} };\n"
        "\tend endgenerate\n"
"\n"
        "\twire [(IW+BW-1):0]   w_r;\n"
        "\tassign\tw_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                        "\t\t\to_r <= w_r[(AW+BW-1):0];\n"
"\n"
        "\tgenerate if (IW > AW)\n"
        "\tbegin : VUNUSED\n"
        "\t\t// verilator lint_off UNUSED\n"
        "\t\twire\t[(IW-AW)-1:0]\tunused;\n"
        "\t\tassign\tunused = w_r[(IW+BW-1):(AW+BW)];\n"
        "\t\t// verilator lint_on UNUSED\n"
        "\tend endgenerate\n"
"\n"
"endmodule\n");
 
        fclose(fp);
}
 

Browse

Tools

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [sw/] [softmpy.cpp] - Blame information for rev 36

Line No.	Rev	Author	Line
1	36	dgisselq	`////////////////////////////////////////////////////////////////////////////////`
2			`//`
3			`// Filename: softmpy.cpp`
4			`//`
5			`// Project: A General Purpose Pipelined FFT Implementation`
6			`//`
7			`// Purpose: If the chip doesn't have any hardware multiplies, you'll need`
8			`// a soft-multiply implementation. This provides that`
9			`// implementation.`
10			`//`
11			`// Creator: Dan Gisselquist, Ph.D.`
12			`// Gisselquist Technology, LLC`
13			`//`
14			`////////////////////////////////////////////////////////////////////////////////`
15			`//`
16			`// Copyright (C) 2015-2018, Gisselquist Technology, LLC`
17			`//`
18			`// This program is free software (firmware): you can redistribute it and/or`
19			`// modify it under the terms of the GNU General Public License as published`
20			`// by the Free Software Foundation, either version 3 of the License, or (at`
21			`// your option) any later version.`
22			`//`
23			`// This program is distributed in the hope that it will be useful, but WITHOUT`
24			`// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or`
25			`// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
26			`// for more details.`
27			`//`
28			`// You should have received a copy of the GNU General Public License along`
29			`// with this program. (It's in the $(ROOT)/doc directory, run make with no`
30			`// target there if the PDF file isn't present.) If not, see`
31			`// <http://www.gnu.org/licenses/> for a copy.`
32			`//`
33			`// License: GPL, v3, as defined and found on www.gnu.org,`
34			`// http://www.gnu.org/licenses/gpl.html`
35			`//`
36			`//`
37			`////////////////////////////////////////////////////////////////////////////////`
38			`//`
39			`//`
40			`#define _CRT_SECURE_NO_WARNINGS // ms vs 2012 doesn't like fopen`
41			`#include <stdio.h>`
42			`#include <stdlib.h>`
43
44			`#ifdef _MSC_VER // added for ms vs compatibility`
45
46			`#include <io.h>`
47			`#include <direct.h>`
48			`#define _USE_MATH_DEFINES`
49
50			`#endif`
51
52			`#include <string.h>`
53			`#include <string>`
54			`#include <math.h>`
55			`#include <ctype.h>`
56			`#include <assert.h>`
57
58			`#include "defaults.h"`
59			`#include "legal.h"`
60			`#include "softmpy.h"`
61
62			`void build_multiply(const char *fname) {`
63			`FILE *fp = fopen(fname, "w");`
64			`if (NULL == fp) {`
65			`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
66			`perror("O/S Err was:");`
67			`return;`
68			`}`
69
70			`fprintf(fp,`
71			`SLASHLINE`
72			`"//\n"`
73			`"// Filename:\tshiftaddmpy.v\n"`
74			`"//\n"`
75			`"// Project:\t%s\n"`
76			`"//\n"`
77			`"// Purpose:\tA portable shift and add multiply.\n"`
78			`"//\n"`
79			`"// While both Xilinx and Altera will offer single clock multiplies, this\n"`
80			`"// simple approach will multiply two numbers on any architecture. The\n"`
81			`"// result maintains the full width of the multiply, there are no extra\n"`
82			`"// stuff bits, no rounding, no shifted bits, etc.\n"`
83			`"//\n"`
84			`"// Further, for those applications that can support it, this multiply\n"`
85			`"// is pipelined and will produce one answer per clock.\n"`
86			`"//\n"`
87			`"// For minimal processing delay, make the first parameter the one with\n"`
88			`"// the least bits, so that AWIDTH <= BWIDTH.\n"`
89			`"//\n"`
90			`"// The processing delay in this multiply is (AWIDTH+1) cycles. That is,\n"`
91			`"// if the data is present on the input at clock t=0, the result will be\n"`
92			`"// present on the output at time t=AWIDTH+1;\n"`
93			`"//\n"`
94			`"//\n%s"`
95			`"//\n", prjname, creator);`
96
97			`fprintf(fp, "%s", cpyleft);`
98			fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
99			`fprintf(fp,`
100			`"module shiftaddmpy(i_clk, i_ce, i_a, i_b, o_r);\n"`
101			`"\tparameter\tAWIDTH=%d,BWIDTH=", TST_SHIFTADDMPY_AW);`
102			`#ifdef TST_SHIFTADDMPY_BW`
103			`fprintf(fp, "%d;\n", TST_SHIFTADDMPY_BW);`
104			`#else`
105			`fprintf(fp, "AWIDTH;\n");`
106			`#endif`
107			`fprintf(fp,`
108			`"\tinput\t\t\t\t\ti_clk, i_ce;\n"`
109			`"\tinput\t\t[(AWIDTH-1):0]\t\ti_a;\n"`
110			`"\tinput\t\t[(BWIDTH-1):0]\t\ti_b;\n"`
111			`"\toutput\treg\t[(AWIDTH+BWIDTH-1):0]\to_r;\n"`
112			`"\n"`
113			`"\treg\t[(AWIDTH-1):0]\tu_a;\n"`
114			`"\treg\t[(BWIDTH-1):0]\tu_b;\n"`
115			`"\treg\t\t\tsgn;\n"`
116			`"\n"`
117			`"\treg\t[(AWIDTH-2):0]\t\tr_a[0:(AWIDTH-1)];\n"`
118			`"\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"`
119			`"\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"`
120			`"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"`
121			`"\tgenvar k;\n"`
122			`"\n"`
123			`"\t// If we were forced to stay within two\'s complement arithmetic,\n"`
124			`"\t// taking the absolute value here would require an additional bit.\n"`
125			`"\t// However, because our results are now unsigned, we can stay\n"`
126			`"\t// within the number of bits given (for now).\n"`
127			`"\talways @(posedge i_clk)\n"`
128			`"\t\tif (i_ce)\n"`
129			`"\t\tbegin\n"`
130			`"\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"`
131			`"\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"`
132			`"\t\t\tsgn <= i_a[AWIDTH-1] ^ i_b[BWIDTH-1];\n"`
133			`"\t\tend\n"`
134			`"\n"`
135			`"\talways @(posedge i_clk)\n"`
136			`"\t\tif (i_ce)\n"`
137			`"\t\tbegin\n"`
138			`"\t\t\tacc[0] <= (u_a[0]) ? { {(AWIDTH){1\'b0}}, u_b }\n"`
139			`"\t\t\t\t\t: {(AWIDTH+BWIDTH){1\'b0}};\n"`
140			`"\t\t\tr_a[0] <= { u_a[(AWIDTH-1):1] };\n"`
141			`"\t\t\tr_b[0] <= { {(AWIDTH-1){1\'b0}}, u_b };\n"`
142			`"\t\t\tr_s[0] <= sgn; // The final sign, needs to be preserved\n"`
143			`"\t\tend\n"`
144			`"\n"`
145			`"\tgenerate\n"`
146			`"\tfor(k=0; k<AWIDTH-1; k=k+1)\n"`
147			`"\tbegin : genstages\n"`
148			`"\t\talways @(posedge i_clk)\n"`
149			`"\t\tif (i_ce)\n"`
150			`"\t\tbegin\n"`
151			`"\t\t\tacc[k+1] <= acc[k] + ((r_a[k][0]) ? {r_b[k],1\'b0}:0);\n"`
152			`"\t\t\tr_a[k+1] <= { 1\'b0, r_a[k][(AWIDTH-2):1] };\n"`
153			`"\t\t\tr_b[k+1] <= { r_b[k][(AWIDTH+BWIDTH-3):0], 1\'b0};\n"`
154			`"\t\t\tr_s[k+1] <= r_s[k];\n"`
155			`"\t\tend\n"`
156			`"\tend\n"`
157			`"\tendgenerate\n"`
158			`"\n"`
159			`"\talways @(posedge i_clk)\n"`
160			`"\t\tif (i_ce)\n"`
161			`"\t\t\to_r <= (r_s[AWIDTH-1]) ? (-acc[AWIDTH-1]) : acc[AWIDTH-1];\n"`
162			`"\n"`
163			`"endmodule\n");`
164
165			`fclose(fp);`
166			`}`
167
168			`void build_bimpy(const char *fname) {`
169			`FILE *fp = fopen(fname, "w");`
170			`if (NULL == fp) {`
171			`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
172			`perror("O/S Err was:");`
173			`return;`
174			`}`
175
176			`fprintf(fp,`
177			`SLASHLINE`
178			`"//\n"`
179			`"// Filename:\t%s\n"`
180			`"//\n"`
181			`"// Project:\t%s\n"`
182			`"//\n"`
183			`"// Purpose:\tA simple 2-bit multiply based upon the fact that LUT's allow\n"`
184			`"// 6-bits of input. In other words, I could build a 3-bit\n"`
185			`"// multiply from 6 LUTs (5 actually, since the first could have two\n"`
186			`"// outputs). This would allow multiplication of three bit digits, save\n"`
187			`"// only for the fact that you would need two bits of carry. The bimpy\n"`
188			`"// approach throttles back a bit and does a 2x2 bit multiply in a LUT,\n"`
189			`"// guaranteeing that it will never carry more than one bit. While this\n"`
190			`"// multiply is hardware independent (and can still run under Verilator\n"`
191			`"// therefore), it is really motivated by trying to optimize for a\n"`
192			`"// specific piece of hardware (Xilinx-7 series ...) that has at least\n"`
193			`"// 4-input LUT's with carry chains.\n"`
194			`"//\n"`
195			`"//\n"`
196			`"//\n%s"`
197			`"//\n", fname, prjname, creator);`
198
199			`fprintf(fp, "%s", cpyleft);`
200			fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
201			`fprintf(fp,`
202			`"module bimpy(i_clk, i_ce, i_a, i_b, o_r);\n"`
203			`"\tparameter\tBW=18, // Number of bits in i_b\n"`
204			`"\t\t\tLUTB=2; // Number of bits in i_a for our LUT multiply\n"`
205			`"\tinput\t\t\t\ti_clk, i_ce;\n"`
206			`"\tinput\t\t[(LUTB-1):0]\ti_a;\n"`
207			`"\tinput\t\t[(BW-1):0]\ti_b;\n"`
208			`"\toutput\treg\t[(BW+LUTB-1):0] o_r;\n"`
209			`"\n"`
210			`"\twire [(BW+LUTB-2):0] w_r;\n"`
211			`"\twire [(BW+LUTB-3):1] c;\n"`
212			`"\n"`
213			`"\tassign\tw_r = { ((i_a[1])?i_b:{(BW){1\'b0}}), 1\'b0 }\n"`
214			`"\t\t\t\t^ { 1\'b0, ((i_a[0])?i_b:{(BW){1\'b0}}) };\n"`
215			`"\tassign\tc = { ((i_a[1])?i_b[(BW-2):0]:{(BW-1){1\'b0}}) }\n"`
216			`"\t\t\t& ((i_a[0])?i_b[(BW-1):1]:{(BW-1){1\'b0}});\n"`
217			`"\n"`
218			`"\talways @(posedge i_clk)\n"`
219			`"\t\tif (i_ce)\n"`
220			`"\t\t\to_r <= w_r + { c, 2'b0 };\n"`
221			`"\n"`
222			`"endmodule\n");`
223
224			`fclose(fp);`
225			`}`
226
227			`void build_longbimpy(const char *fname) {`
228			`FILE *fp = fopen(fname, "w");`
229			`if (NULL == fp) {`
230			`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
231			`perror("O/S Err was:");`
232			`return;`
233			`}`
234
235			`fprintf(fp,`
236			`SLASHLINE`
237			`"//\n"`
238			`"// Filename: %s\n"`
239			`"//\n"`
240			`"// Project: %s\n"`
241			`"//\n"`
242			`"// Purpose: A portable shift and add multiply, built with the knowledge\n"`
243			`"// of the existence of a six bit LUT and carry chain. That knowledge\n"`
244			`"// allows us to multiply two bits from one value at a time against all\n"`
245			`"// of the bits of the other value. This sub multiply is called the\n"`
246			`"// bimpy.\n"`
247			`"//\n"`
248			`"// For minimal processing delay, make the first parameter the one with\n"`
249			`"// the least bits, so that AWIDTH <= BWIDTH.\n"`
250			`"//\n"`
251			`"//\n"`
252			`"//\n%s"`
253			`"//\n", fname, prjname, creator);`
254
255			`fprintf(fp, "%s", cpyleft);`
256			fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
257			`fprintf(fp,`
258			`"module longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);\n"`
259			`"\tparameter IAW=%d, // The width of i_a, min width is 5\n"`
260			`"\t\t\tIBW=", TST_LONGBIMPY_AW);`
261			`#ifdef TST_LONGBIMPY_BW`
262			`fprintf(fp, "%d", TST_LONGBIMPY_BW);`
263			`#else`
264			`fprintf(fp, "IAW");`
265			`#endif`
266
267			`fprintf(fp, ", // The width of i_b, can be anything\n"`
268			`"\t\t\t// The following three parameters should not be changed\n"`
269			`"\t\t\t// by any implementation, but are based upon hardware\n"`
270			`"\t\t\t// and the above values:\n"`
271			`"\t\t\tOW=IAW+IBW; // The output width\n");`
272			`fprintf(fp,`
273			`"\tlocalparam AW = (IAW<IBW) ? IAW : IBW,\n"`
274			`"\t\t\tBW = (IAW<IBW) ? IBW : IAW,\n"`
275			`"\t\t\tIW=(AW+1)&(-2), // Internal width of A\n"`
276			`"\t\t\tLUTB=2, // How many bits we can multiply by at once\n"`
277			`"\t\t\tTLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau\n"`
278			`"\tinput\t\t\t\ti_clk, i_ce;\n"`
279			`"\tinput\t\t[(IAW-1):0]\ti_a_unsorted;\n"`
280			`"\tinput\t\t[(IBW-1):0]\ti_b_unsorted;\n"`
281			`"\toutput\treg\t[(AW+BW-1):0]\to_r;\n"`
282			`"\n"`
283			`"\t//\n"`
284			`"\t// Swap parameter order, so that AW <= BW -- for performance\n"`
285			`"\t// reasons\n"`
286			`"\twire [AW-1:0] i_a;\n"`
287			`"\twire [BW-1:0] i_b;\n"`
288			`"\tgenerate if (IAW <= IBW)\n"`
289			`"\tbegin : NO_PARAM_CHANGE\n"`
290			`"\t\tassign i_a = i_a_unsorted;\n"`
291			`"\t\tassign i_b = i_b_unsorted;\n"`
292			`"\tend else begin : SWAP_PARAMETERS\n"`
293			`"\t\tassign i_a = i_b_unsorted;\n"`
294			`"\t\tassign i_b = i_a_unsorted;\n"`
295			`"\tend endgenerate\n"`
296			`"\n"`
297			`"\treg\t[(IW-1):0]\tu_a;\n"`
298			`"\treg\t[(BW-1):0]\tu_b;\n"`
299			`"\treg\t\t\tsgn;\n"`
300			`"\n"`
301			`"\treg\t[(IW-1-2*(LUTB)):0]\tr_a[0:(TLEN-3)];\n"`
302			`"\treg\t[(BW-1):0]\t\tr_b[0:(TLEN-3)];\n"`
303			`"\treg\t[(TLEN-1):0]\t\tr_s;\n"`
304			`"\treg\t[(IW+BW-1):0]\t\tacc[0:(TLEN-2)];\n"`
305			`"\tgenvar k;\n"`
306			`"\n"`
307			`"\t// First step:\n"`
308			`"\t// Switch to unsigned arithmetic for our multiply, keeping track\n"`
309			`"\t// of the along the way. We'll then add the sign again later at\n"`
310			`"\t// the end.\n"`
311			`"\t//\n"`
312			`"\t// If we were forced to stay within two's complement arithmetic,\n"`
313			`"\t// taking the absolute value here would require an additional bit.\n"`
314			`"\t// However, because our results are now unsigned, we can stay\n"`
315			`"\t// within the number of bits given (for now).\n"`
316			`"\tgenerate if (IW > AW)\n"`
317			`"\tbegin\n"`
318			`"\t\talways @(posedge i_clk)\n"`
319			`"\t\t\tif (i_ce)\n"`
320			`"\t\t\t\tu_a <= { 1\'b0, (i_a[AW-1])?(-i_a):(i_a) };\n"`
321			`"\tend else begin\n"`
322			`"\t\talways @(posedge i_clk)\n"`
323			`"\t\t\tif (i_ce)\n"`
324			`"\t\t\t\tu_a <= (i_a[AW-1])?(-i_a):(i_a);\n"`
325			`"\tend endgenerate\n"`
326			`"\n"`
327			`"\talways @(posedge i_clk)\n"`
328			`"\t\tif (i_ce)\n"`
329			`"\t\tbegin\n"`
330			`"\t\t\tu_b <= (i_b[BW-1])?(-i_b):(i_b);\n"`
331			`"\t\t\tsgn <= i_a[AW-1] ^ i_b[BW-1];\n"`
332			`"\t\tend\n"`
333			`"\n"`
334			`"\twire [(BW+LUTB-1):0] pr_a, pr_b;\n"`
335			`"\n"`
336			`"\t//\n"`
337			`"\t// Second step: First two 2xN products.\n"`
338			`"\t//\n"`
339			`"\t// Since we have no tableau of additions (yet), we can do both\n"`
340			`"\t// of the first two rows at the same time and add them together.\n"`
341			`"\t// For the next round, we'll then have a previous sum to accumulate\n"`
342			`"\t// with new and subsequent product, and so only do one product at\n"`
343			`"\t// a time can follow this--but the first clock can do two at a time.\n"`
344			`"\tbimpy\t#(BW) lmpy_0(i_clk,i_ce,u_a[( LUTB-1): 0], u_b, pr_a);\n"`
345			`"\tbimpy\t#(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);\n"`
346			`"\talways @(posedge i_clk)\n"`
347			`"\t\tif (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];\n"`
348			`"\talways @(posedge i_clk)\n"`
349			`"\t\tif (i_ce) r_b[0] <= u_b;\n"`
350			`"\talways @(posedge i_clk)\n"`
351			`"\t\tif (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };\n"`
352			`"\talways @(posedge i_clk) // One clk after p[0],p[1] become valid\n"`
353			`"\t\tif (i_ce) acc[0] <= { {(IW-LUTB){1\'b0}}, pr_a}\n"`
354			`"\t\t\t +{ {(IW-(2*LUTB)){1\'b0}}, pr_b, {(LUTB){1\'b0}} };\n"`
355			`"\n"`
356			`"\tgenerate // Keep track of intermediate values, before multiplying them\n"`
357			`"\tif (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)\n"`
358			`"\tbegin : gencopies\n"`
359			`"\t\talways @(posedge i_clk)\n"`
360			`"\t\tif (i_ce)\n"`
361			`"\t\tbegin\n"`
362			`"\t\t\tr_a[k+1] <= { {(LUTB){1\'b0}},\n"`
363			`"\t\t\t\tr_a[k][(IW-1-(2*LUTB)):LUTB] };\n"`
364			`"\t\t\tr_b[k+1] <= r_b[k];\n"`
365			`"\t\tend\n"`
366			`"\tend endgenerate\n"`
367			`"\n"`
368			`"\tgenerate // The actual multiply and accumulate stage\n"`
369			`"\tif (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)\n"`
370			`"\tbegin : genstages\n"`
371			`"\t\t// First, the multiply: 2-bits times BW bits\n"`
372			`"\t\twire\t[(BW+LUTB-1):0] genp;\n"`
373			`"\t\tbimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);\n"`
374			`"\n"`
375			`"\t\t// Then the accumulate step -- on the next clock\n"`
376			`"\t\talways @(posedge i_clk)\n"`
377			`"\t\t\tif (i_ce)\n"`
378			`"\t\t\t\tacc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1\'b0}},\n"`
379			`"\t\t\t\t\tgenp, {(LUTB*(k+2)){1\'b0}} };\n"`
380			`"\tend endgenerate\n"`
381			`"\n"`
382			`"\twire [(IW+BW-1):0] w_r;\n"`
383			`"\tassign\tw_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];\n"`
384			`"\talways @(posedge i_clk)\n"`
385			`"\t\tif (i_ce)\n"`
386			`"\t\t\to_r <= w_r[(AW+BW-1):0];\n"`
387			`"\n"`
388			`"\tgenerate if (IW > AW)\n"`
389			`"\tbegin : VUNUSED\n"`
390			`"\t\t// verilator lint_off UNUSED\n"`
391			`"\t\twire\t[(IW-AW)-1:0]\tunused;\n"`
392			`"\t\tassign\tunused = w_r[(IW+BW-1):(AW+BW)];\n"`
393			`"\t\t// verilator lint_on UNUSED\n"`
394			`"\tend endgenerate\n"`
395			`"\n"`
396			`"endmodule\n");`
397
398			`fclose(fp);`
399			`}`
400