OpenCores

Rev 35	Rev 36
Line 1...	Line 1...
`////////////////////////////////////////////////////////////////////////////////`	`////////////////////////////////////////////////////////////////////////////////`
`//`	`//`
`// Filename: fftgen.cpp`	`// Filename: fftgen.cpp`
`//`	`//`
`// Project: A Doubletime Pipelined FFT`	`// Project: A General Purpose Pipelined FFT Implementation`
`//`	`//`
`// Purpose: This is the core generator for the project. Every part`	`// Purpose: This is the core generator for the project. Every part`
`// and piece of this project begins and ends in this program.`	`// and piece of this project begins and ends in this program.`
`// Once built, this program will build an FFT (or IFFT) core of arbitrary`	`// Once built, this program will build an FFT (or IFFT) core of arbitrary`
`// width, precision, etc., that will run at two samples per clock.`	`// width, precision, etc., that will run at two samples per clock.`
Line 25...	Line 25...
`// Creator: Dan Gisselquist, Ph.D.`	`// Creator: Dan Gisselquist, Ph.D.`
`// Gisselquist Technology, LLC`	`// Gisselquist Technology, LLC`
`//`	`//`
`////////////////////////////////////////////////////////////////////////////////`	`////////////////////////////////////////////////////////////////////////////////`
`//`	`//`
`// Copyright (C) 2015-2017, Gisselquist Technology, LLC`	`// Copyright (C) 2015-2018, Gisselquist Technology, LLC`
`//`	`//`
`// This program is free software (firmware): you can redistribute it and/or`	`// This program is free software (firmware): you can redistribute it and/or`
`// modify it under the terms of the GNU General Public License as published`	`// modify it under the terms of the GNU General Public License as published`
`// by the Free Software Foundation, either version 3 of the License, or (at`	`// by the Free Software Foundation, either version 3 of the License, or (at`
`// your option) any later version.`	`// your option) any later version.`
Line 65...	Line 65...
`#define X_OK 0 /* !!!!!! execute permission - unsupported in windows*/`	`#define X_OK 0 /* !!!!!! execute permission - unsupported in windows*/`
`#define F_OK 0 /* Test for existence. */`	`#define F_OK 0 /* Test for existence. */`

`#if _MSC_VER <= 1700`	`#if _MSC_VER <= 1700`

// Fallback llround() for toolchains selected by the surrounding
// _MSC_VER test: rounds d to the nearest integer, with halfway cases
// going away from zero.  The magnitude is rounded first (add one half,
// then truncate via the cast) and the sign is restored afterwards.
long long llround(double d) {
	const double	magnitude = (d < 0) ? -d : d;
	const long long	rounded = (long long)(magnitude + 0.5);

	return (d < 0) ? -rounded : rounded;
}
`int lstat(const char *filename, struct stat *buf) { return 1; };`	`int lstat(const char *filename, struct stat *buf) { return 1; };`
`#define S_ISDIR(A) 0`	`#define S_ISDIR(A) 0`

`#else`	`#else`

Line 95...	Line 92...
`#include <string>`	`#include <string>`
`#include <math.h>`	`#include <math.h>`
`#include <ctype.h>`	`#include <ctype.h>`
`#include <assert.h>`	`#include <assert.h>`

`#define DEF_NBITSIN 16`	`#include "defaults.h"`
`#define DEF_COREDIR "fft-core"`	`#include "legal.h"`
`#define DEF_XTRACBITS 4`	`#include "rounding.h"`
`#define DEF_NMPY 0`	`#include "fftlib.h"`
`#define DEF_XTRAPBITS 0`	`#include "bldstage.h"`
`#define USE_OLD_MULTIPLY false`	`#include "bitreverse.h"`
	`#include "softmpy.h"`
`// To coordinate testing, it helps to have some defines in our header file that`	`#include "butterfly.h"`
`// are common with the default parameters found within the various subroutines.`
`// We'll define those common parameters here. These values, however, have no`
`// effect on anything other than bench testing. They do, though, allow us to`
`// bench test exact copies of what is going on within the FFT when necessary`
`// in order to find problems.`
`// First, parameters for the new multiply based upon the bi-multiply structure`
`// (2-bits/2-tableau rows at a time).`
`#define TST_LONGBIMPY_AW 16`
`#define TST_LONGBIMPY_BW 20 // Leave undefined to match AW`

`// We also include parameters for the shift add multiply`	`void build_dblquarters(const char *fname, ROUND_T rounding, const bool async_reset=false, const bool dbg=false) {`
`#define TST_SHIFTADDMPY_AW 16`
`#define TST_SHIFTADDMPY_BW 20 // Leave undefined to match AW`

`// Now for parameters matching the butterfly`
`#define TST_BUTTERFLY_IWIDTH 16`
`#define TST_BUTTERFLY_CWIDTH 20`
`#define TST_BUTTERFLY_OWIDTH 17`

`// Now for parameters matching the qtrstage`
`#define TST_QTRSTAGE_IWIDTH 16`
`#define TST_QTRSTAGE_LGWIDTH 8`

`// Parameters for the dblstage`
`#define TST_DBLSTAGE_IWIDTH 16`
`#define TST_DBLSTAGE_SHIFT 0`

`// Now for parameters matching the dblreverse stage`
`#define TST_DBLREVERSE_LGSIZE 5`

// Selector for the bit-dropping scheme.  The enumerators keep their
// implicit values 0..3.  NOTE(review): each name presumably pairs with
// one of the truncate / roundfromzero / roundhalfup / convround module
// generators in this file -- confirm against the code that switches on it.
typedef enum {
	RND_TRUNCATE,		// plain truncation
	RND_FROMZERO,		// halfway cases round away from zero
	RND_HALFUP,		// halfway cases round up
	RND_CONVERGENT		// halfway cases round to the nearest even value
} ROUND_T;

// Licence banner written verbatim (via "%s") into every generated
// Verilog file.  The warranty disclaimer previously misspelled
// "MERCHANTABILITY"; it now matches the wording of the GPL notice.
const char	cpyleft[] =
"////////////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Copyright (C) 2015-2017, Gisselquist Technology, LLC\n"
"//\n"
"// This program is free software (firmware): you can redistribute it and/or\n"
"// modify it under the terms of the GNU General Public License as published\n"
"// by the Free Software Foundation, either version 3 of the License, or (at\n"
"// your option) any later version.\n"
"//\n"
"// This program is distributed in the hope that it will be useful, but WITHOUT\n"
"// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
"// for more details.\n"
"//\n"
"// You should have received a copy of the GNU General Public License along\n"
"// with this program. (It's in the $(ROOT)/doc directory, run make with no\n"
"// target there if the PDF file isn\'t present.) If not, see\n"
"// <http://www.gnu.org/licenses/> for a copy.\n"
"//\n"
"// License: GPL, v3, as defined and found on www.gnu.org,\n"
"// http://www.gnu.org/licenses/gpl.html\n"
"//\n"
"//\n"
"////////////////////////////////////////////////////////////////////////////////\n";

// Project title substituted into the "Project:" line of each generated
// file header.
const char	prjname[] = "A Doubletime Pipelined FFT";

// Author credit lines substituted into each generated file header.
const char	creator[] = "// Creator: Dan Gisselquist, Ph.D.\n"
		"// Gisselquist Technology, LLC\n";

// Returns the smallest lg >= 1 for which (1 << lg) >= vl.  The search
// starts at one, so lgval(0), lgval(1) and lgval(2) all return 1.
//
// The power of two is formed as a long long: with a plain int, any vl
// above 2^30 forced the loop to evaluate 1<<31, which overflows a 32-bit
// signed int (undefined behavior).  Results for smaller vl are unchanged.
int	lgval(int vl) {
	int	lg;

	for(lg=1; (1LL<<lg) < vl; lg++)
		;
	return lg;
}

// Returns the smallest power of two that is >= vl; any vl <= 1 yields 1.
// The doubling is done in a signed int, so vl must not exceed the
// largest power of two an int can hold.
int	nextlg(int vl) {
	int	pow2 = 1;

	while (pow2 < vl)
		pow2 <<= 1;
	return pow2;
}

// Computes a delay figure from a data width (nbits) and a count of
// extra bits (xtra); cbits below is their sum.
// NOTE(review): presumably the butterfly's pipeline latency in clocks,
// with cbits the coefficient width -- confirm against the callers.
//
//   USE_OLD_MULTIPLY set:   nbits+4 when nbits+1 < cbits, else cbits+3
//   otherwise:              ceil(n/2)+2 for non-negative n, where n is
//                           the smaller of (nbits+2) and (cbits+1)
int	bflydelay(int nbits, int xtra) {
	const int	cbits = nbits + xtra;

	if (USE_OLD_MULTIPLY)
		return (nbits+1 < cbits) ? (nbits+4) : (cbits+3);

	// Only the narrower of the two operand widths sets the delay
	const int	wa = nbits+2, wb = cbits+1;
	const int	narrow = (wb < wa) ? wb : wa;

	// Half the width, rounded up for odd widths, plus two
	return narrow/2 + (narrow&1) + 2;
}

// Returns lgval() of the butterfly delay plus three.
//
// Rationale carried over from the original note: the butterfly code
// compares a valid address of this many bits against an address a
// couple of counts greater, and the padding guarantees enough bits for
// that comparison.  The cost is somewhat more storage than strictly
// needed, which is accepted short of a redesign.
int	lgdelay(int nbits, int xtra) {
	const int	padded = bflydelay(nbits, xtra) + 3;

	return lgval(padded);
}

`void build_truncator(const char *fname) {`
`printf("TRUNCATING!\n");`
`FILE *fp = fopen(fname, "w");`
`if (NULL == fp) {`
`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
`perror("O/S Err was:");`
`return;`
`}`

`fprintf(fp,`
`"///////////////////////////////////////////////////////////////////////////\n"`
`"//\n"`
`"// Filename: truncate.v\n"`
`"// \n"`
`"// Project: %s\n"`
`"//\n"`
`"// Purpose: Truncation is one of several options that can be used\n"`
`"// internal to the various FFT stages to drop bits from one \n"`
`"// stage to the next. In general, it is the simplest method\n"`
`"// of dropping bits, since it requires only a bit selection.\n"`
`"//\n"`
`"// This form of rounding isn\'t really that great for FFT\'s,\n"`
`"// since it tends to produce a DC bias in the result. (Other\n"`
`"// less pronounced biases may also exist.)\n"`
`"//\n"`
`"// This particular version also registers the output with the\n"`
`"// clock, so there will be a delay of one going through this\n"`
`"// module. This will keep it in line with the other forms of\n"`
`"// rounding that can be used.\n"`
`"//\n"`
`"//\n%s"`
`"//\n",`
`prjname, creator);`

`fprintf(fp, "%s", cpyleft);`
fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
`fprintf(fp,`
`"module truncate(i_clk, i_ce, i_val, o_val);\n"`
`"\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"`
`"\tinput\t\t\t\t\ti_clk, i_ce;\n"`
`"\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"`
`"\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"`
`"\n"`
`"\talways @(posedge i_clk)\n"`
`"\t\tif (i_ce)\n"`
`"\t\t\to_val <= i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\n"`
`"endmodule\n");`
`}`


`void build_roundhalfup(const char *fname) {`
`FILE *fp = fopen(fname, "w");`
`if (NULL == fp) {`
`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
`perror("O/S Err was:");`
`return;`
`}`

`fprintf(fp,`
`"///////////////////////////////////////////////////////////////////////////\n"`
`"//\n"`
`"// Filename: roundhalfup.v\n"`
`"// \n"`
`"// Project: %s\n"`
`"//\n"`
`"// Purpose: Rounding half up is the way I was always taught to round in\n"`
`"// school. A one half value is added to the result, and then\n"`
`"// the result is truncated. When used in an FFT, this produces\n"`
`"// less bias than the truncation method, although a bias still\n"`
`"// tends to remain.\n"`
`"//\n"`
`"//\n%s"`
`"//\n",`
`prjname, creator);`

`fprintf(fp, "%s", cpyleft);`
fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
`fprintf(fp,`
`"module roundhalfup(i_clk, i_ce, i_val, o_val);\n"`
`"\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"`
`"\tinput\t\t\t\t\ti_clk, i_ce;\n"`
`"\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"`
`"\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"`
`"\n"`
`"\t// Let's deal with two cases to be as general as we can be here\n"`
`"\t//\n"`
`"\t// 1. The desired output would lose no bits at all\n"`
`"\t// 2. One or more bits would be dropped, so the rounding is simply\n"`
`"\t//\t\ta matter of adding one to the bit about to be dropped,\n"`
`"\t//\t\tmoving all halfway and above numbers up to the next\n"`
`"\t//\t\tvalue.\n"`
`"\tgenerate\n"`
`"\tif (IWID-SHIFT == OWID)\n"`
`"\tbegin // No truncation or rounding, output drops no bits\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"`
`"\n"`
`"\tend else // if (IWID-SHIFT-1 >= OWID)\n"`
`"\tbegin // Output drops one bit, can only add one or ... not.\n"`
`"\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"`
`"\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"`
`"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"`
`"\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\n"`
`"\t\t\tbegin\n"`
`"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse\n"`
`"\t\t\t\t\to_val <= rounded_up; // even value\n"`
`"\t\t\tend\n"`
`"\n"`
`"\tend\n"`
`"\tendgenerate\n"`
`"\n"`
`"endmodule\n");`
`}`

`void build_roundfromzero(const char *fname) {`
`FILE *fp = fopen(fname, "w");`
`if (NULL == fp) {`
`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
`perror("O/S Err was:");`
`return;`
`}`

`fprintf(fp,`
`"///////////////////////////////////////////////////////////////////////////\n"`
`"//\n"`
`"// Filename: roundfromzero.v\n"`
`"// \n"`
`"// Project: %s\n"`
`"//\n"`
`"// Purpose: Truncation is one of several options that can be used\n"`
`"// internal to the various FFT stages to drop bits from one \n"`
`"// stage to the next. In general, it is the simplest method\n"`
`"// of dropping bits, since it requires only a bit selection.\n"`
`"//\n"`
`"// This form of rounding isn\'t really that great for FFT\'s,\n"`
`"// since it tends to produce a DC bias in the result. (Other\n"`
`"// less pronounced biases may also exist.)\n"`
`"//\n"`
`"// This particular version also registers the output with the\n"`
`"// clock, so there will be a delay of one going through this\n"`
`"// module. This will keep it in line with the other forms of\n"`
`"// rounding that can be used.\n"`
`"//\n"`
`"//\n%s"`
`"//\n",`
`prjname, creator);`

`fprintf(fp, "%s", cpyleft);`
fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
`fprintf(fp,`
`"module roundfromzero(i_clk, i_ce, i_val, o_val);\n"`
`"\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"`
`"\tinput\t\t\t\t\ti_clk, i_ce;\n"`
`"\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"`
`"\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"`
`"\n"`
`"\t// Let's deal with three cases to be as general as we can be here\n"`
`"\t//\n"`
`"\t//\t1. The desired output would lose no bits at all\n"`
`"\t//\t2. One bit would be dropped, so the rounding is simply\n"`
`"\t//\t\tadjusting the value to be the closer to zero in\n"`
`"\t//\t\tcases of being halfway between two. If identically\n"`
`"\t//\t\tequal to a number, we just leave it as is.\n"`
`"\t//\t3. Two or more bits would be dropped. In this case, we round\n"`
`"\t//\t\tnormally unless we are rounding a value of exactly\n"`
`"\t//\t\thalfway between the two. In the halfway case, we\n"`
`"\t//\t\tround away from zero.\n"`
`"\tgenerate\n"`
`"\tif (IWID == OWID) // In this case, the shift is irrelevant and\n"`
`"\tbegin // cannot be applied. No truncation or rounding takes\n"`
`"\t// effect here.\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\to_val <= i_val[(IWID-1):0];\n"`
`"\n"`
`"\tend else if (IWID-SHIFT == OWID)\n"`
`"\tbegin // No truncation or rounding, output drops no bits\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"`
`"\n"`
`"\tend else if (IWID-SHIFT-1 == OWID)\n"`
`"\tbegin // Output drops one bit, can only add one or ... not.\n"`
`"\t\twire\t[(OWID-1):0]\ttruncated_value, rounded_up;\n"`
`"\t\twire\t\t\tsign_bit, first_lost_bit;\n"`
`"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"`
`"\t\tassign\tfirst_lost_bit = i_val[0];\n"`
`"\t\tassign\tsign_bit = i_val[(IWID-1)];\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\n"`
`"\t\t\tbegin\n"`
`"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse if (sign_bit)\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse\n"`
`"\t\t\t\t\to_val <= rounded_up;\n"`
`"\t\t\tend\n"`
`"\n"`
`"\tend else // If there's more than one bit we are dropping\n"`
`"\tbegin\n"`
`"\t\twire\t[(OWID-1):0]\ttruncated_value, rounded_up;\n"`
`"\t\twire\t\t\tsign_bit, first_lost_bit;\n"`
`"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"`
`"\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"`
`"\t\tassign\tsign_bit = i_val[(IWID-1)];\n"`
`"\n"`
`"\t\twire\t[(IWID-SHIFT-OWID-2):0]\tother_lost_bits;\n"`
`"\t\tassign\tother_lost_bits = i_val[(IWID-SHIFT-OWID-2):0];\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\n"`
`"\t\t\tbegin\n"`
`"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse if (\|other_lost_bits) // Round up to\n"`
`"\t\t\t\t\to_val <= rounded_up; // closest value\n"`
`"\t\t\t\telse if (sign_bit)\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse\n"`
`"\t\t\t\t\to_val <= rounded_up;\n"`
`"\t\t\tend\n"`
`"\tend\n"`
`"\tendgenerate\n"`
`"\n"`
`"endmodule\n");`
`}`

`void build_convround(const char *fname) {`
`FILE *fp = fopen(fname, "w");`
`if (NULL == fp) {`
`fprintf(stderr, "Could not open \'%s\' for writing\n", fname);`
`perror("O/S Err was:");`
`return;`
`}`

`fprintf(fp,`
`"///////////////////////////////////////////////////////////////////////////\n"`
`"//\n"`
`"// Filename: convround.v\n"`
`"// \n"`
`"// Project: %s\n"`
`"//\n"`
`"// Purpose: A convergent rounding routine, also known as banker\'s\n"`
`"// rounding, Dutch rounding, Gaussian rounding, unbiased\n"`
`"// rounding, or ... more, at least according to Wikipedia.\n"`
`"//\n"`
`"// This form of rounding works by rounding, when the direction is in\n"`
`"// question, towards the nearest even value.\n"`
`"//\n"`
`"//\n%s"`
`"//\n",`
`prjname, creator);`

`fprintf(fp, "%s", cpyleft);`
fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");
`fprintf(fp,`
`"module convround(i_clk, i_ce, i_val, o_val);\n"`
`"\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"`
`"\tinput\t\t\t\t\ti_clk, i_ce;\n"`
`"\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"`
`"\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"`
`"\n"`
`"\t// Let's deal with three cases to be as general as we can be here\n"`
`"\t//\n"`
`"\t//\t1. The desired output would lose no bits at all\n"`
`"\t//\t2. One bit would be dropped, so the rounding is simply\n"`
`"\t//\t\tadjusting the value to be the nearest even number in\n"`
`"\t//\t\tcases of being halfway between two. If identically\n"`
`"\t//\t\tequal to a number, we just leave it as is.\n"`
`"\t//\t3. Two or more bits would be dropped. In this case, we round\n"`
`"\t//\t\tnormally unless we are rounding a value of exactly\n"`
`"\t//\t\thalfway between the two. In the halfway case we round\n"`
`"\t//\t\tto the nearest even number.\n"`
`"\tgenerate\n"`
`// What if IWID < OWID? We should expand here ... somehow`
`"\tif (IWID == OWID) // In this case, the shift is irrelevant and\n"`
`"\tbegin // cannot be applied. No truncation or rounding takes\n"`
`"\t// effect here.\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\to_val <= i_val[(IWID-1):0];\n"`
`"\n"`
`// What if IWID-SHIFT < OWID? Shouldn't we also shift here as well?`
`"\tend else if (IWID-SHIFT == OWID)\n"`
`"\tbegin // No truncation or rounding, output drops no bits\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"`
`"\n"`
`"\tend else if (IWID-SHIFT-1 == OWID)\n"`
`// Is there any way to limit the number of bits that are examined here, for the`
`// purpose of simplifying/reducing logic? I mean, if we go from 32 to 16 bits,`
`// must we check all 15 bits for equality to zero?`
`"\tbegin // Output drops one bit, can only add one or ... not.\n"`
`"\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"`
`"\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"`
`"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"`
`"\t\tassign\tlast_valid_bit = truncated_value[0];\n"`
`"\t\tassign\tfirst_lost_bit = i_val[0];\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\n"`
`"\t\t\tbegin\n"`
`"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse if (last_valid_bit)// Round up to nearest\n"`
`"\t\t\t\t\to_val <= rounded_up; // even value\n"`
`"\t\t\t\telse // else round down to the nearest\n"`
`"\t\t\t\t\to_val <= truncated_value; // even value\n"`
`"\t\t\tend\n"`
`"\n"`
`"\tend else // If there's more than one bit we are dropping\n"`
`"\tbegin\n"`
`"\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"`
`"\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"`
`"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"`
`"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"`
`"\t\tassign\tlast_valid_bit = truncated_value[0];\n"`
`"\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"`
`"\n"`
`"\t\twire\t[(IWID-SHIFT-OWID-2):0]\tother_lost_bits;\n"`
`"\t\tassign\tother_lost_bits = i_val[(IWID-SHIFT-OWID-2):0];\n"`
`"\n"`
`"\t\talways @(posedge i_clk)\n"`
`"\t\t\tif (i_ce)\n"`
`"\t\t\tbegin\n"`
`"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"`
`"\t\t\t\t\to_val <= truncated_value;\n"`
`"\t\t\t\telse if (\|other_lost_bits) // Round up to\n"`

Line 1...

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////

//

//

// Filename:    fftgen.cpp

// Filename:    fftgen.cpp

//

//

// Project:     A Doubletime Pipelined FFT

// Project:     A General Purpose Pipelined FFT Implementation

//

//

// Purpose:     This is the core generator for the project.  Every part

// Purpose:     This is the core generator for the project.  Every part

//              and piece of this project begins and ends in this program.

//              and piece of this project begins and ends in this program.

//      Once built, this program will build an FFT (or IFFT) core of arbitrary

//      Once built, this program will build an FFT (or IFFT) core of arbitrary

//      width, precision, etc., that will run at two samples per clock.

//      width, precision, etc., that will run at two samples per clock.

Line 25...

// Creator:     Dan Gisselquist, Ph.D.

// Creator:     Dan Gisselquist, Ph.D.

//              Gisselquist Technology, LLC

//              Gisselquist Technology, LLC

//

//

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////

//

//

// Copyright (C) 2015-2017, Gisselquist Technology, LLC

// Copyright (C) 2015-2018, Gisselquist Technology, LLC

//

//

// This program is free software (firmware): you can redistribute it and/or

// This program is free software (firmware): you can redistribute it and/or

// modify it under the terms of  the GNU General Public License as published

// modify it under the terms of  the GNU General Public License as published

// by the Free Software Foundation, either version 3 of the License, or (at

// by the Free Software Foundation, either version 3 of the License, or (at

// your option) any later version.

// your option) any later version.

Line 65...

#define X_OK    0       /* !!!!!! execute permission - unsupported in windows*/

#define X_OK    0       /* !!!!!! execute permission - unsupported in windows*/

#define F_OK    0       /* Test for existence.  */

#define F_OK    0       /* Test for existence.  */

#if _MSC_VER <= 1700

#if _MSC_VER <= 1700

long long llround(double d) {

        if (d<0) return -(long long)(-d+0.5);

        else    return (long long)(d+0.5); }

int lstat(const char *filename, struct stat *buf) { return 1; };

int lstat(const char *filename, struct stat *buf) { return 1; };

#define S_ISDIR(A)      0

#define S_ISDIR(A)      0

#else

#else

Line 95...

Line 92...

#include <string>

#include <string>

#include <math.h>

#include <math.h>

#include <ctype.h>

#include <ctype.h>

#include <assert.h>

#include <assert.h>

#define DEF_NBITSIN     16

#include "defaults.h"

#define DEF_COREDIR     "fft-core"

#include "legal.h"

#define DEF_XTRACBITS   4

#include "rounding.h"

#define DEF_NMPY        0

#include "fftlib.h"

#define DEF_XTRAPBITS   0

#include "bldstage.h"

#define USE_OLD_MULTIPLY        false

#include "bitreverse.h"

#include "softmpy.h"

// To coordinate testing, it helps to have some defines in our header file that

#include "butterfly.h"

// are common with the default parameters found within the various subroutines.

// We'll define those common parameters here.  These values, however, have no

// effect on anything other than bench testing.  They do, though, allow us to

// bench test exact copies of what is going on within the FFT when necessary

// in order to find problems.

// First, parameters for the new multiply based upon the bi-multiply structure

// (2-bits/2-tableau rows at a time).

#define TST_LONGBIMPY_AW        16

#define TST_LONGBIMPY_BW        20      // Leave undefined to match AW

//  We also include parameters for the shift add multiply

void    build_dblquarters(const char *fname, ROUND_T rounding, const bool async_reset=false, const bool dbg=false) {

#define TST_SHIFTADDMPY_AW      16

#define TST_SHIFTADDMPY_BW      20      // Leave undefined to match AW

// Now for parameters matching the butterfly

#define TST_BUTTERFLY_IWIDTH    16

#define TST_BUTTERFLY_CWIDTH    20

#define TST_BUTTERFLY_OWIDTH    17

// Now for parameters matching the qtrstage

#define TST_QTRSTAGE_IWIDTH     16

#define TST_QTRSTAGE_LGWIDTH    8

// Parameters for the dblstage

#define TST_DBLSTAGE_IWIDTH     16

#define TST_DBLSTAGE_SHIFT      0

// Now for parameters matching the dblreverse stage

#define TST_DBLREVERSE_LGSIZE   5

typedef enum {

        RND_TRUNCATE, RND_FROMZERO, RND_HALFUP, RND_CONVERGENT

} ROUND_T;

const char      cpyleft[] =

"////////////////////////////////////////////////////////////////////////////////\n"

"//\n"

"// Copyright (C) 2015-2017, Gisselquist Technology, LLC\n"

"//\n"

"// This program is free software (firmware): you can redistribute it and/or\n"

"// modify it under the terms of  the GNU General Public License as published\n"

"// by the Free Software Foundation, either version 3 of the License, or (at\n"

"// your option) any later version.\n"

"//\n"

"// This program is distributed in the hope that it will be useful, but WITHOUT\n"

"// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or\n"

"// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License\n"

"// for more details.\n"

"//\n"

"// You should have received a copy of the GNU General Public License along\n"

"// with this program.  (It's in the $(ROOT)/doc directory, run make with no\n"

"// target there if the PDF file isn\'t present.)  If not, see\n"

"// <http://www.gnu.org/licenses/> for a copy.\n"

"//\n"

"// License:    GPL, v3, as defined and found on www.gnu.org,\n"

"//             http://www.gnu.org/licenses/gpl.html\n"

"//\n"

"//\n"

"////////////////////////////////////////////////////////////////////////////////\n";

const char      prjname[] = "A Doubletime Pipelined FFT";

const char      creator[] =     "// Creator:    Dan Gisselquist, Ph.D.\n"

                                "//             Gisselquist Technology, LLC\n";

int     lgval(int vl) {

        int     lg;

        for(lg=1; (1<<lg) < vl; lg++)

        return lg;

int     nextlg(int vl) {

        int     r;

        for(r=1; r<vl; r<<=1)

        return r;

int     bflydelay(int nbits, int xtra) {

        int     cbits = nbits + xtra;

        int     delay;

        if (USE_OLD_MULTIPLY) {

                if (nbits+1<cbits)

                        delay = nbits+4;

                else

                        delay = cbits+3;

        } else {

                int     na=nbits+2, nb=cbits+1;

                if (nb<na) {

                        int tmp = nb;

                        nb = na; na = tmp;

                } delay = ((na)/2+(na&1)+2);

        return delay;

int     lgdelay(int nbits, int xtra) {

        // The butterfly code needs to compare a valid address, of this

        // many bits, with an address two greater.  This guarantees we

        // have enough bits for that comparison.  We'll also end up with

        // more storage space to look for these values, but without a

        // redesign that's just what we'll deal with.

        return lgval(bflydelay(nbits, xtra)+3);

void    build_truncator(const char *fname) {

        printf("TRUNCATING!\n");

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                return;

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

"//\n"

"// Filename:   truncate.v\n"

"//             \n"

"// Project:    %s\n"

"//\n"

"// Purpose:    Truncation is one of several options that can be used\n"

"//             internal to the various FFT stages to drop bits from one \n"

"//             stage to the next.  In general, it is the simplest method\n"

"//             of dropping bits, since it requires only a bit selection.\n"

"//\n"

"//             This form of rounding isn\'t really that great for FFT\'s,\n"

"//             since it tends to produce a DC bias in the result.  (Other\n"

"//             less pronounced biases may also exist.)\n"

"//\n"

"//             This particular version also registers the output with the\n"

"//             clock, so there will be a delay of one going through this\n"

"//             module.  This will keep it in line with the other forms of\n"

"//             rounding that can be used.\n"

"//\n"

"//\n%s"

"//\n",

                prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module truncate(i_clk, i_ce, i_val, o_val);\n"

        "\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"

        "\tinput\t\t\t\t\ti_clk, i_ce;\n"

        "\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"

        "\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"

"\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\t\to_val <= i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"

"\n"

"endmodule\n");

void    build_roundhalfup(const char *fname) {

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                return;

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

"//\n"

"// Filename:   roundhalfup.v\n"

"//             \n"

"// Project:    %s\n"

"//\n"

"// Purpose:    Rounding half up is the way I was always taught to round in\n"

"//             school.  A one half value is added to the result, and then\n"

"//             the result is truncated.  When used in an FFT, this produces\n"

"//             less bias than the truncation method, although a bias still\n"

"//             tends to remain.\n"

"//\n"

"//\n%s"

"//\n",

                prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module roundhalfup(i_clk, i_ce, i_val, o_val);\n"

        "\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"

        "\tinput\t\t\t\t\ti_clk, i_ce;\n"

        "\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"

        "\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"

"\n"

        "\t// Let's deal with two cases to be as general as we can be here\n"

        "\t//\n"

        "\t//   1. The desired output would lose no bits at all\n"

        "\t//   2. One or more bits would be dropped, so the rounding is simply\n"

        "\t//\t\ta matter of adding one to the bit about to be dropped,\n"

        "\t//\t\tmoving all halfway and above numbers up to the next\n"

        "\t//\t\tvalue.\n"

        "\tgenerate\n"

        "\tif (IWID-SHIFT == OWID)\n"

        "\tbegin // No truncation or rounding, output drops no bits\n"

"\n"

                "\t\talways @(posedge i_clk)\n"

                        "\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"

"\n"

        "\tend else // if (IWID-SHIFT-1 >= OWID)\n"

        "\tbegin // Output drops one bit, can only add one or ... not.\n"

                "\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"

                "\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"

                "\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"

                "\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"

                "\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"

"\n"

                "\t\talways @(posedge i_clk)\n"

                "\t\t\tif (i_ce)\n"

                "\t\t\tbegin\n"

                        "\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"

                        "\t\t\t\t\to_val <= truncated_value;\n"

                        "\t\t\t\telse\n"

                        "\t\t\t\t\to_val <= rounded_up; // even value\n"

                "\t\t\tend\n"

"\n"

        "\tend\n"

        "\tendgenerate\n"

"\n"

"endmodule\n");

//
// build_roundfromzero
//
// Writes the Verilog source of the "roundfromzero" module to the file named
// by fname (opened for writing, so any previous content is replaced).
//
// The emitted file consists of:
//   1. a comment header naming the file and the project, formatted with the
//      file-scope strings prjname and creator,
//   2. the license text held in cpyleft,
//   3. a `default_nettype none directive, and
//   4. the module itself.
//
// The generated module registers o_val on i_clk whenever i_ce is set, and
// selects one of four implementations at elaboration time (generate/if):
//   - IWID == OWID:         i_val is passed through unchanged.
//   - IWID-SHIFT == OWID:   the low IWID-SHIFT bits are passed through.
//   - IWID-SHIFT-1 == OWID: exactly one bit is dropped; when that bit is set
//                           the value is kept truncated if the sign bit is
//                           set, and incremented otherwise.
//   - otherwise:            more than one bit is dropped; values above the
//                           halfway point are incremented, values below are
//                           truncated, and an exact half follows the same
//                           sign-bit rule as the one-bit case.
//
// Parameters:
//   fname   Path of the Verilog file to create.
//
// Returns nothing.  If the file cannot be opened a message is printed on
// stderr, followed by perror(), and nothing is written.
//
void    build_roundfromzero(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }

        // File header.
        //
        // NOTE(review): the "Purpose" paragraph below talks about truncation
        // and DC bias; it looks like it was carried over from the truncate.v
        // header.  It is left untouched here so the generated output does
        // not change -- confirm and reword separately if desired.
        fprintf(fp,
"///////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename:   roundfromzero.v\n"
"//             \n"
"// Project:    %s\n"
"//\n"
"// Purpose:    Truncation is one of several options that can be used\n"
"//             internal to the various FFT stages to drop bits from one \n"
"//             stage to the next.  In general, it is the simplest method\n"
"//             of dropping bits, since it requires only a bit selection.\n"
"//\n"
"//             This form of rounding isn\'t really that great for FFT\'s,\n"
"//             since it tends to produce a DC bias in the result.  (Other\n"
"//             less pronounced biases may also exist.)\n"
"//\n"
"//             This particular version also registers the output with the\n"
"//             clock, so there will be a delay of one going through this\n"
"//             module.  This will keep it in line with the other forms of\n"
"//             rounding that can be used.\n"
"//\n"
"//\n%s"
"//\n",
                prjname, creator);

        // License text, then forbid implicit nets in the generated file.
        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        // The module proper.
        fprintf(fp,
"module roundfromzero(i_clk, i_ce, i_val, o_val);\n"
        "\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"
        "\tinput\t\t\t\t\ti_clk, i_ce;\n"
        "\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"
        "\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"
"\n"
        "\t// Let's deal with three cases to be as general as we can be here\n"
        "\t//\n"
        "\t//\t1. The desired output would lose no bits at all\n"
        "\t//\t2. One bit would be dropped, so the rounding is simply\n"
        "\t//\t\tadjusting the value to be the closer to zero in\n"
        "\t//\t\tcases of being halfway between two.  If identically\n"
        "\t//\t\tequal to a number, we just leave it as is.\n"
        "\t//\t3. Two or more bits would be dropped.  In this case, we round\n"
        "\t//\t\tnormally unless we are rounding a value of exactly\n"
        "\t//\t\thalfway between the two.  In the halfway case, we\n"
        "\t//\t\tround away from zero.\n"
        "\tgenerate\n"
        // Branch 1: same width in and out, plain register.
        "\tif (IWID == OWID) // In this case, the shift is irrelevant and\n"
        "\tbegin // cannot be applied.  No truncation or rounding takes\n"
        "\t// effect here.\n"
"\n"
        "\t\talways @(posedge i_clk)\n"
        "\t\t\tif (i_ce)\to_val <= i_val[(IWID-1):0];\n"
"\n"
        // Branch 2: the shift alone accounts for the width difference.
        "\tend else if (IWID-SHIFT == OWID)\n"
        "\tbegin // No truncation or rounding, output drops no bits\n"
"\n"
        "\t\talways @(posedge i_clk)\n"
        "\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"
"\n"
        // Branch 3: exactly one bit is lost, so a set lost bit is always a
        // half and only the sign bit decides the direction.
        "\tend else if (IWID-SHIFT-1 == OWID)\n"
        "\tbegin // Output drops one bit, can only add one or ... not.\n"
        "\t\twire\t[(OWID-1):0]\ttruncated_value, rounded_up;\n"
        "\t\twire\t\t\tsign_bit, first_lost_bit;\n"
        "\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"
        "\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"
        "\t\tassign\tfirst_lost_bit = i_val[0];\n"
        "\t\tassign\tsign_bit = i_val[(IWID-1)];\n"
"\n"
        "\t\talways @(posedge i_clk)\n"
        "\t\t\tif (i_ce)\n"
        "\t\t\tbegin\n"
        "\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"
        "\t\t\t\t\to_val <= truncated_value;\n"
        "\t\t\t\telse if (sign_bit)\n"
        "\t\t\t\t\to_val <= truncated_value;\n"
        "\t\t\t\telse\n"
        "\t\t\t\t\to_val <= rounded_up;\n"
        "\t\t\tend\n"
"\n"
        // Branch 4: several bits are lost; the remaining lost bits tell an
        // exact half apart from "more than half".
        "\tend else // If there's more than one bit we are dropping\n"
        "\tbegin\n"
        "\t\twire\t[(OWID-1):0]\ttruncated_value, rounded_up;\n"
        "\t\twire\t\t\tsign_bit, first_lost_bit;\n"
        "\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"
        "\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"
        "\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"
        "\t\tassign\tsign_bit = i_val[(IWID-1)];\n"
"\n"
        "\t\twire\t[(IWID-SHIFT-OWID-2):0]\tother_lost_bits;\n"
        "\t\tassign\tother_lost_bits = i_val[(IWID-SHIFT-OWID-2):0];\n"
"\n"
        "\t\talways @(posedge i_clk)\n"
        "\t\t\tif (i_ce)\n"
        "\t\t\tbegin\n"
        "\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"
        "\t\t\t\t\to_val <= truncated_value;\n"
        "\t\t\t\telse if (|other_lost_bits) // Round up to\n"
        "\t\t\t\t\to_val <= rounded_up; // closest value\n"
        "\t\t\t\telse if (sign_bit)\n"
        "\t\t\t\t\to_val <= truncated_value;\n"
        "\t\t\t\telse\n"
        "\t\t\t\t\to_val <= rounded_up;\n"
        "\t\t\tend\n"
        "\tend\n"
        "\tendgenerate\n"
"\n"
"endmodule\n");

        // Flush the buffered output and release the handle; without this
        // the stream stays open for the rest of the generator's run.
        fclose(fp);
}

//
// build_convround
//
// Writes the Verilog source of the "convround" (convergent rounding) module
// to the file named by fname (opened for writing, so any previous content is
// replaced).
//
// The emitted file consists of:
//   1. a comment header naming the file and the project, formatted with the
//      file-scope strings prjname and creator,
//   2. the license text held in cpyleft,
//   3. a `default_nettype none directive, and
//   4. the module itself.
//
// The generated module registers o_val on i_clk whenever i_ce is set, and
// selects one of four implementations at elaboration time (generate/if):
//   - IWID == OWID:         i_val is passed through unchanged.
//   - IWID-SHIFT == OWID:   the low IWID-SHIFT bits are passed through.
//   - IWID-SHIFT-1 == OWID: exactly one bit is dropped; when that bit is set
//                           the value is incremented only if the lowest kept
//                           bit is set, which makes the result even.
//   - otherwise:            more than one bit is dropped; values above the
//                           halfway point are incremented, values below are
//                           truncated, and an exact half goes to the even
//                           neighbour as in the one-bit case.
//
// Parameters:
//   fname   Path of the Verilog file to create.
//
// Returns nothing.  If the file cannot be opened a message is printed on
// stderr, followed by perror(), and nothing is written.
//
void    build_convround(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }

        // File header.
        fprintf(fp,
"///////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename:   convround.v\n"
"//             \n"
"// Project:    %s\n"
"//\n"
"// Purpose:    A convergent rounding routine, also known as banker\'s\n"
"//             rounding, Dutch rounding, Gaussian rounding, unbiased\n"
"//     rounding, or ... more, at least according to Wikipedia.\n"
"//\n"
"//     This form of rounding works by rounding, when the direction is in\n"
"//     question, towards the nearest even value.\n"
"//\n"
"//\n%s"
"//\n",
                prjname, creator);

        // License text, then forbid implicit nets in the generated file.
        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        // The module proper.
        fprintf(fp,
"module convround(i_clk, i_ce, i_val, o_val);\n"
"\tparameter\tIWID=16, OWID=8, SHIFT=0;\n"
"\tinput\t\t\t\t\ti_clk, i_ce;\n"
"\tinput\t\tsigned\t[(IWID-1):0]\ti_val;\n"
"\toutput\treg\tsigned\t[(OWID-1):0]\to_val;\n"
"\n"
"\t// Let's deal with three cases to be as general as we can be here\n"
"\t//\n"
"\t//\t1. The desired output would lose no bits at all\n"
"\t//\t2. One bit would be dropped, so the rounding is simply\n"
"\t//\t\tadjusting the value to be the nearest even number in\n"
"\t//\t\tcases of being halfway between two.  If identically\n"
"\t//\t\tequal to a number, we just leave it as is.\n"
"\t//\t3. Two or more bits would be dropped.  In this case, we round\n"
"\t//\t\tnormally unless we are rounding a value of exactly\n"
"\t//\t\thalfway between the two.  In the halfway case we round\n"
"\t//\t\tto the nearest even number.\n"
"\tgenerate\n"
// Open question kept from the original author:
// What if IWID < OWID?  We should expand here ... somehow
"\tif (IWID == OWID) // In this case, the shift is irrelevant and\n"
"\tbegin // cannot be applied.  No truncation or rounding takes\n"
"\t// effect here.\n"
"\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\to_val <= i_val[(IWID-1):0];\n"
"\n"
// Open question kept from the original author:
// What if IWID-SHIFT < OWID?  Shouldn't we also shift here as well?
"\tend else if (IWID-SHIFT == OWID)\n"
"\tbegin // No truncation or rounding, output drops no bits\n"
"\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\to_val <= i_val[(IWID-SHIFT-1):0];\n"
"\n"
"\tend else if (IWID-SHIFT-1 == OWID)\n"
// Open question kept from the original author:
// Is there any way to limit the number of bits that are examined here, for the
// purpose of simplifying/reducing logic?  I mean, if we go from 32 to 16 bits,
// must we check all 15 bits for equality to zero?
"\tbegin // Output drops one bit, can only add one or ... not.\n"
"\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"
"\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"
"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"
"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"
"\t\tassign\tlast_valid_bit = truncated_value[0];\n"
"\t\tassign\tfirst_lost_bit = i_val[0];\n"
"\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\n"
"\t\t\tbegin\n"
"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"
"\t\t\t\t\to_val <= truncated_value;\n"
"\t\t\t\telse if (last_valid_bit)// Round up to nearest\n"
"\t\t\t\t\to_val <= rounded_up; // even value\n"
"\t\t\t\telse // else round down to the nearest\n"
"\t\t\t\t\to_val <= truncated_value; // even value\n"
"\t\t\tend\n"
"\n"
// Final branch: several bits are lost; the remaining lost bits tell an exact
// half apart from "more than half".
"\tend else // If there's more than one bit we are dropping\n"
"\tbegin\n"
"\t\twire\t[(OWID-1):0] truncated_value, rounded_up;\n"
"\t\twire\t\t\tlast_valid_bit, first_lost_bit;\n"
"\t\tassign\ttruncated_value=i_val[(IWID-1-SHIFT):(IWID-SHIFT-OWID)];\n"
"\t\tassign\trounded_up=truncated_value + {{(OWID-1){1\'b0}}, 1\'b1 };\n"
"\t\tassign\tlast_valid_bit = truncated_value[0];\n"
"\t\tassign\tfirst_lost_bit = i_val[(IWID-SHIFT-OWID-1)];\n"
"\n"
"\t\twire\t[(IWID-SHIFT-OWID-2):0]\tother_lost_bits;\n"
"\t\tassign\tother_lost_bits = i_val[(IWID-SHIFT-OWID-2):0];\n"
"\n"
"\t\talways @(posedge i_clk)\n"
"\t\t\tif (i_ce)\n"
"\t\t\tbegin\n"
"\t\t\t\tif (!first_lost_bit) // Round down / truncate\n"
"\t\t\t\t\to_val <= truncated_value;\n"
"\t\t\t\telse if (|other_lost_bits) // Round up to\n"
"\t\t\t\t\to_val <= rounded_up; // closest value\n"
"\t\t\t\telse if (last_valid_bit) // Round up to\n"
"\t\t\t\t\to_val <= rounded_up; // nearest even\n"
"\t\t\t\telse   // else round down to nearest even\n"
"\t\t\t\t\to_val <= truncated_value;\n"
"\t\t\tend\n"
"\tend\n"
"\tendgenerate\n"
"\n"
"endmodule\n");

        // Flush the buffered output and release the handle; without this
        // the stream stays open for the rest of the generator's run.
        fclose(fp);
}

void    build_quarters(const char *fname, ROUND_T rounding, bool dbg=false) {

        FILE    *fp = fopen(fname, "w");

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                perror("O/S Err was:");

                return;

                return;

Line 583...

Line 120...

        else

        else

                rnd_string = "convround";

                rnd_string = "convround";

        fprintf(fp,

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

SLASHLINE

"//\n"

"//\n"

"// Filename:   qtrstage%s.v\n"

"// Filename:\tqtrstage%s.v\n"

"//             \n"

"//             \n"

"// Project:    %s\n"

"// Project:\t%s\n"

"//\n"

"//\n"

"// Purpose:    This file encapsulates the 4 point stage of a decimation in\n"

"// Purpose:    This file encapsulates the 4 point stage of a decimation in\n"

"//             frequency FFT.  This particular implementation is optimized\n"

"//             frequency FFT.  This particular implementation is optimized\n"

"//             so that all of the multiplies are accomplished by additions\n"

"//     so that all of the multiplies are accomplished by additions and\n"

"//             and multiplexers only.\n"

"//     multiplexers only.\n"

"//\n"

"//\n"

"//\n%s"

"//\n%s"

"//\n",

"//\n",

                (dbg)?"_dbg":"", prjname, creator);

                (dbg)?"_dbg":"", prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        std::string     resetw("i_reset");

        if (async_reset)

                resetw = std::string("i_areset_n");

        fprintf(fp,

        fprintf(fp,

"module\tqtrstage%s(i_clk, i_rst, i_ce, i_sync, i_data, o_data, o_sync%s);\n"

"module\tqtrstage%s(i_clk, %s, i_ce, i_sync, i_data, o_data, o_sync%s);\n"

        "\tparameter    IWIDTH=%d, OWIDTH=IWIDTH+1;\n"

        "\tparameter    IWIDTH=%d, OWIDTH=IWIDTH+1;\n"

        "\t// Parameters specific to the core that should be changed when this\n"

        "\t// Parameters specific to the core that should be changed when this\n"

        "\t// core is built ... Note that the minimum LGSPAN is 2.  Smaller \n"

        "\t// core is built ... Note that the minimum LGSPAN is 2.  Smaller \n"

        "\t// spans must use the fftdoubles stage.\n"

        "\t// spans must use the fftdoubles stage.\n"

        "\tparameter\tLGWIDTH=%d, ODD=0, INVERSE=0,SHIFT=0;\n"

        "\tparameter\tLGWIDTH=%d, ODD=0, INVERSE=0,SHIFT=0;\n"

        "\tinput\t                              i_clk, i_rst, i_ce, i_sync;\n"

        "\tinput\t                              i_clk, %s, i_ce, i_sync;\n"

        "\tinput\t      [(2*IWIDTH-1):0]        i_data;\n"

        "\tinput\t      [(2*IWIDTH-1):0]        i_data;\n"

        "\toutput\treg  [(2*OWIDTH-1):0]        o_data;\n"

        "\toutput\treg  [(2*OWIDTH-1):0]        o_data;\n"

        "\toutput\treg                          o_sync;\n"

        "\toutput\treg                          o_sync;\n"

        "\t\n", (dbg)?"_dbg":"", (dbg)?", o_dbg":"", TST_QTRSTAGE_IWIDTH,

        "\t\n", (dbg)?"_dbg":"",

        TST_QTRSTAGE_LGWIDTH);

        resetw.c_str(),

        (dbg)?", o_dbg":"", TST_QTRSTAGE_IWIDTH,

        TST_QTRSTAGE_LGWIDTH, resetw.c_str());

        if (dbg) { fprintf(fp, "\toutput\twire\t[33:0]\t\t\to_dbg;\n"

        if (dbg) { fprintf(fp, "\toutput\twire\t[33:0]\t\t\to_dbg;\n"

                "\tassign\to_dbg = { ((o_sync)&&(i_ce)), i_ce, o_data[(2*OWIDTH-1):(2*OWIDTH-16)],\n"

                "\tassign\to_dbg = { ((o_sync)&&(i_ce)), i_ce, o_data[(2*OWIDTH-1):(2*OWIDTH-16)],\n"

                        "\t\t\t\t\to_data[(OWIDTH-1):(OWIDTH-16)] };\n"

                        "\t\t\t\t\to_data[(OWIDTH-1):(OWIDTH-16)] };\n"

"\n");

"\n");

Line 673...

Line 216...

        "\tendgenerate\n"

        "\tendgenerate\n"

"\n"

"\n"

*/

*/

        fprintf(fp,

        fprintf(fp,

        "\tinitial wait_for_sync = 1\'b1;\n"

        "\tinitial wait_for_sync = 1\'b1;\n"

        "\tinitial iaddr = 0;\n"

        "\tinitial iaddr = 0;\n");

        if (async_reset)

                fprintf(fp,

                        "\talways @(posedge i_clk, negedge i_areset_n)\n"

                                "\t\tif (!i_reset)\n");

        else

                fprintf(fp,

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                "\t\tif (i_reset)\n");

        fprintf(fp,

                "\t\tbegin\n"

                "\t\tbegin\n"

                        "\t\t\twait_for_sync <= 1\'b1;\n"

                        "\t\t\twait_for_sync <= 1\'b1;\n"

                        "\t\t\tiaddr <= 0;\n"

                        "\t\t\tiaddr <= 0;\n"

                "\t\tend else if ((i_ce)&&((!wait_for_sync)||(i_sync)))\n"

                "\t\tend else if ((i_ce)&&((!wait_for_sync)||(i_sync)))\n"

                "\t\tbegin\n"

                "\t\tbegin\n"

                        "\t\t\tiaddr <= iaddr + { {(LGWIDTH-1){1\'b0}}, 1\'b1 };\n"

                        "\t\t\tiaddr <= iaddr + { {(LGWIDTH-1){1\'b0}}, 1\'b1 };\n"

                        "\t\t\twait_for_sync <= 1\'b0;\n"

                        "\t\t\twait_for_sync <= 1\'b0;\n"

                "\t\tend\n"

                "\t\tend\n\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tif (i_ce)\n"

                        "\t\t\timem <= i_data;\n"

                        "\t\t\timem <= i_data;\n"

                "\n\n");

                "\n\n");

        fprintf(fp,

        fprintf(fp,

        "\t// Note that we don\'t check on wait_for_sync or i_sync here.\n"

        "\t// Note that we don\'t check on wait_for_sync or i_sync here.\n"

        "\t// Why not?  Because iaddr will always be zero until after the\n"

        "\t// Why not?  Because iaddr will always be zero until after the\n"

        "\t// first i_ce, so we are safe.\n"

        "\t// first i_ce, so we are safe.\n"

        "\tinitial pipeline = 4\'h0;\n"

        "\tinitial pipeline = 4\'h0;\n");

        if (async_reset)

                fprintf(fp,

        "\talways\t@(posedge i_clk, negedge i_areset_n)\n"

                "\t\tif (!i_reset)\n");

        else

                fprintf(fp,

        "\talways\t@(posedge i_clk)\n"

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                "\t\tif (i_reset)\n");

        fprintf(fp,

                        "\t\t\tpipeline <= 4\'h0;\n"

                        "\t\t\tpipeline <= 4\'h0;\n"

                "\t\telse if (i_ce) // is our pipeline process full?  Which stages?\n"

                "\t\telse if (i_ce) // is our pipeline process full?  Which stages?\n"

                        "\t\t\tpipeline <= { pipeline[2:0], iaddr[0] };\n\n");

                        "\t\t\tpipeline <= { pipeline[2:0], iaddr[0] };\n\n");

        fprintf(fp,

        fprintf(fp,

        "\t// This is the pipeline[-1] stage, pipeline[0] will be set next.\n"

        "\t// This is the pipeline[-1] stage, pipeline[0] will be set next.\n"

Line 750...

Line 308...

        fprintf(fp,

        fprintf(fp,

        "\t// Don\'t forget in the sync check that we are running\n"

        "\t// Don\'t forget in the sync check that we are running\n"

        "\t// at two clocks per sample.  Thus we need to\n"

        "\t// at two clocks per sample.  Thus we need to\n"

        "\t// produce a sync every 2^(LGWIDTH-1) clocks.\n"

        "\t// produce a sync every 2^(LGWIDTH-1) clocks.\n"

        "\tinitial\to_sync = 1\'b0;\n"

        "\tinitial\to_sync = 1\'b0;\n");

        if (async_reset)

                fprintf(fp,

        "\talways\t@(posedge i_clk, negedge i_areset_n)\n"

                "\t\tif (!i_areset_n)\n");

        else

                fprintf(fp,

        "\talways\t@(posedge i_clk)\n"

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                "\t\tif (i_reset)\n");

        fprintf(fp,

                "\t\t\to_sync <= 1\'b0;\n"

                "\t\t\to_sync <= 1\'b0;\n"

                "\t\telse if (i_ce)\n"

                "\t\telse if (i_ce)\n"

                        "\t\t\to_sync <= &(~iaddr[(LGWIDTH-2):3]) && (iaddr[2:0] == 3'b101);\n");

                        "\t\t\to_sync <= &(~iaddr[(LGWIDTH-2):3]) && (iaddr[2:0] == 3'b101);\n");

        fprintf(fp, "endmodule\n");

        fprintf(fp, "endmodule\n");

void    build_dblstage(const char *fname, ROUND_T rounding, const bool dbg = false) {

void    build_snglquarters(const char *fname, ROUND_T rounding, const bool async_reset=false, const bool dbg=false) {

        FILE    *fp = fopen(fname, "w");

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                perror("O/S Err was:");

                return;

                return;

        const   char    *rnd_string;

        const   char    *rnd_string;

        if (rounding == RND_TRUNCATE)

        if (rounding == RND_TRUNCATE)

                rnd_string = "truncate";

                rnd_string = "truncate";

        else if (rounding == RND_FROMZERO)

        else if (rounding == RND_FROMZERO)

                rnd_string = "roundfromzero";

                rnd_string = "roundfromzero";

Line 779...

Line 344...

        else

        else

                rnd_string = "convround";

                rnd_string = "convround";

        fprintf(fp,

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

SLASHLINE

"//\n"

"//\n"

"// Filename:   dblstage%s.v\n"

"// Filename:\tqtrstage%s.v\n"

"//\n"

"//\n"

"// Project:    %s\n"

"// Project:\t%s\n"

"//\n"

"//\n"

"// Purpose:    This is part of an FPGA implementation that will process\n"

"// Purpose:    This file encapsulates the 4 point stage of a decimation in\n"

"//             the final stage of a decimate-in-frequency FFT, running\n"

"//             frequency FFT.  This particular implementation is optimized\n"

"//             through the data at two samples per clock.  If you notice\n"

"//     so that all of the multiplies are accomplished by additions and\n"

"//             from the derivation of an FFT, the only time both even and\n"

"//     multiplexers only.\n"

"//             odd samples are used at the same time is in this stage.\n"

"//\n"

"//             Therefore, other than this stage and these twiddles, all of\n"

"// Operation:\n"

"//             the other stages can run two stages at a time at one sample\n"

"//     The operation of this stage is identical to the regular stages of\n"

"//             per clock.\n"

"//     the FFT (see them for details), with one additional and critical\n"

"//\n"

"//     difference: this stage doesn't require any hardware multiplication.\n"

"//             In this implementation, the output is valid one clock after\n"

"//     The multiplies within it may all be accomplished using additions and\n"

"//             the input is valid.  The output also accumulates one bit\n"

"//     subtractions.\n"

"//             above and beyond the number of bits in the input.\n"

"//\n"

"//             \n"

"//     Let's see how this is done.  Given x[n] and x[n+2], cause thats the\n"

"//             i_clk   A system clock\n"

"//     stage we are working on, with i_sync true for x[0] being input,\n"

"//             i_rst   A synchronous reset\n"

"//     produce the output:\n"

"//             i_ce    Circuit enable--nothing happens unless this line is high\n"

"//\n"

"//             i_sync  A synchronization signal, high once per FFT at the start\n"

"//     y[n  ] = x[n] + x[n+2]\n"

"//             i_left  The first (even) complex sample input.  The higher order\n"

"//     y[n+2] = (x[n] - x[n+2]) * e^{-j2pi n/2}        (forward transform)\n"

"//                     bits contain the real portion, low order bits the\n"

"//            = (x[n] - x[n+2]) * -j^n\n"

"//                     imaginary portion, all in two\'s complement.\n"

"//\n"

"//             i_right The next (odd) complex sample input, same format as\n"

"//     y[n].r = x[n].r + x[n+2].r      (This is the easy part)\n"

"//                     i_left.\n"

"//     y[n].i = x[n].i + x[n+2].i\n"

"//             o_left  The first (even) complex output.\n"

"//\n"

"//             o_right The next (odd) complex output.\n"

"//     y[2].r = x[0].r - x[2].r\n"

"//             o_sync  Output synchronization signal.\n"

"//     y[2].i = x[0].i - x[2].i\n"

"//\n"

"//     y[3].r =   (x[1].i - x[3].i)            (forward transform)\n"

"//     y[3].i = - (x[1].r - x[3].r)\n"

"//\n"

"//     y[3].r = - (x[1].i - x[3].i)            (inverse transform)\n"

"//     y[3].i =   (x[1].r - x[3].r)            (INVERSE = 1)\n"

// "//\n"

// "//  When the FFT is run in the two samples per clock mode, this quarter\n"

// "//  stage will operate on either x[0] and x[2] (ODD = 0), or x[1] and\n"

// "//  x[3] (ODD = 1).  In all other cases, it will operate on all four\n"

// "//  values.\n"

"//\n%s"

"//\n%s"

"//\n", (dbg)?"_dbg":"", prjname, creator);

"//\n",

                (dbg)?"_dbg":"", prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module\tdblstage%s(i_clk, i_rst, i_ce, i_sync, i_left, i_right, o_left, o_right, o_sync%s);\n"

        "\tparameter\tIWIDTH=%d,OWIDTH=IWIDTH+1, SHIFT=%d;\n"

        "\tinput\t\ti_clk, i_rst, i_ce, i_sync;\n"

        "\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n"

        "\toutput\treg\t[(2*OWIDTH-1):0]\to_left, o_right;\n"

        "\toutput\treg\t\t\to_sync;\n"

        "\n", (dbg)?"_dbg":"", (dbg)?", o_dbg":"",

        TST_DBLSTAGE_IWIDTH, TST_DBLSTAGE_SHIFT);

        std::string     resetw("i_reset");

        if (async_reset)

                resetw = std::string("i_areset_n");

        fprintf(fp,

"module\tqtrstage%s(i_clk, %s, i_ce, i_sync, i_data, o_data, o_sync%s);\n"

        "\tparameter    IWIDTH=%d, OWIDTH=IWIDTH+1;\n"

        "\tparameter\tLGWIDTH=%d, INVERSE=0,SHIFT=0;\n"

        "\tinput\t                              i_clk, %s, i_ce, i_sync;\n"

        "\tinput\t      [(2*IWIDTH-1):0]        i_data;\n"

        "\toutput\treg  [(2*OWIDTH-1):0]        o_data;\n"

        "\toutput\treg                          o_sync;\n"

                "\t\n", (dbg)?"_dbg":"", resetw.c_str(),

                (dbg)?", o_dbg":"", TST_QTRSTAGE_IWIDTH,

                TST_QTRSTAGE_LGWIDTH, resetw.c_str());

        if (dbg) { fprintf(fp, "\toutput\twire\t[33:0]\t\t\to_dbg;\n"

        if (dbg) { fprintf(fp, "\toutput\twire\t[33:0]\t\t\to_dbg;\n"

                "\tassign\to_dbg = { ((o_sync)&&(i_ce)), i_ce, o_left[(2*OWIDTH-1):(2*OWIDTH-16)],\n"

                "\tassign\to_dbg = { ((o_sync)&&(i_ce)), i_ce, o_data[(2*OWIDTH-1):(2*OWIDTH-16)],\n"

                        "\t\t\t\t\to_left[(OWIDTH-1):(OWIDTH-16)] };\n"

                        "\t\t\t\t\to_data[(OWIDTH-1):(OWIDTH-16)] };\n"

"\n");

"\n");

        fprintf(fp,

        fprintf(fp,

        "\twire\tsigned\t[(IWIDTH-1):0]\ti_in_0r, i_in_0i, i_in_1r, i_in_1i;\n"

        "\treg\t        wait_for_sync;\n"

        "\tassign\ti_in_0r = i_left[(2*IWIDTH-1):(IWIDTH)]; \n"

        "\treg\t[2:0]   pipeline;\n"

        "\tassign\ti_in_0i = i_left[(IWIDTH-1):0]; \n"

        "\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n"

        "\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n"

        "\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n"

                                "\t\t\t\t\to_out_1r, o_out_1i;\n"

"\n"

"\n"

        "\treg\tsigned [(IWIDTH):0]     sum_r, sum_i, diff_r, diff_i;\n"

"\n"

"\n"

        "\t// Handle a potential rounding situation, when IWIDTH>=OWIDTH.\n"

        "\treg\t[(2*OWIDTH-1):0]\tob_a;\n"

        "\twire\t[(2*OWIDTH-1):0]\tob_b;\n"

        "\treg\t[(OWIDTH-1):0]\t\tob_b_r, ob_b_i;\n"

        "\tassign\tob_b = { ob_b_r, ob_b_i };\n"

"\n"

"\n"

"\n");

        "\treg\t[(LGWIDTH-1):0]\t\tiaddr;\n"

        fprintf(fp,

        "\treg\t[(2*IWIDTH-1):0]\timem\t[0:1];\n"

        "\n"

        "\n"

        "\t// As with any register connected to the sync pulse, these must\n"

        "\twire\tsigned\t[(IWIDTH-1):0]\timem_r, imem_i;\n"

        "\t// have initial values and be reset on the i_rst signal.\n"

        "\tassign\timem_r = imem[1][(2*IWIDTH-1):(IWIDTH)];\n"

        "\t// Other data values need only restrict their updates to i_ce\n"

        "\tassign\timem_i = imem[1][(IWIDTH-1):0];\n"

        "\t// enabled clocks, but sync\'s must obey resets and initial\n"

        "\t// conditions as well.\n"

        "\treg\trnd_sync, r_sync;\n"

"\n"

"\n"

        "\tinitial\trnd_sync      = 1\'b0; // Sync into rounding\n"

        "\twire\tsigned\t[(IWIDTH-1):0]\ti_data_r, i_data_i;\n"

        "\tinitial\tr_sync        = 1\'b0; // Sync coming out\n"

        "\tassign\ti_data_r = i_data[(2*IWIDTH-1):(IWIDTH)];\n"

        "\talways @(posedge i_clk)\n"

        "\tassign\ti_data_i = i_data[(IWIDTH-1):0];\n"

                "\t\tif (i_rst)\n"

                "\t\tbegin\n"

                        "\t\t\trnd_sync <= 1\'b0;\n"

                        "\t\t\tr_sync <= 1\'b0;\n"

                "\t\tend else if (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\trnd_sync <= i_sync;\n"

                        "\t\t\tr_sync <= rnd_sync;\n"

                "\t\tend\n"

"\n"

"\n"

        "\t// As with other variables, these are really only updated when in\n"

        "\treg  [(2*OWIDTH-1):0]        omem [0:1];\n"

        "\t// the processing pipeline, after the first i_sync.  However, to\n"

        "\t// eliminate as much unnecessary logic as possible, we toggle\n"

        "\t// these any time the i_ce line is enabled, and don\'t reset.\n"

        "\t// them on i_rst.\n");

        fprintf(fp,

        "\t// Don't forget that we accumulate a bit by adding two values\n"

        "\t// together. Therefore our intermediate value must have one more\n"

        "\t// bit than the two originals.\n"

        "\treg\tsigned\t[(IWIDTH):0]\trnd_in_0r, rnd_in_0i;\n"

        "\treg\tsigned\t[(IWIDTH):0]\trnd_in_1r, rnd_in_1i;\n\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t//\n"

                        "\t\t\trnd_in_0r <= i_in_0r + i_in_1r;\n"

                        "\t\t\trnd_in_0i <= i_in_0i + i_in_1i;\n"

                        "\t\t\t//\n"

                        "\t\t\trnd_in_1r <= i_in_0r - i_in_1r;\n"

                        "\t\t\trnd_in_1i <= i_in_0i - i_in_1i;\n"

                        "\t\t\t//\n"

                "\t\tend\n"

"\n");

"\n");

        fprintf(fp, "\t//\n"

        "\t// Round our output values down to OWIDTH bits\n"

        "\t//\n");

        fprintf(fp,

        "\twire\tsigned\t[(OWIDTH-1):0]\trnd_sum_r, rnd_sum_i,\n"

        "\t\t\trnd_diff_r, rnd_diff_i, n_rnd_diff_r, n_rnd_diff_i;\n"

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT)\tdo_rnd_sum_r(i_clk, i_ce,\n"

        "\t\t\t\tsum_r, rnd_sum_r);\n\n", rnd_string);

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT)\tdo_rnd_sum_i(i_clk, i_ce,\n"

        "\t\t\t\tsum_i, rnd_sum_i);\n\n", rnd_string);

        fprintf(fp,

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_0r(i_clk, i_ce,\n"

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT)\tdo_rnd_diff_r(i_clk, i_ce,\n"

        "\t\t\t\t\t\t\trnd_in_0r, o_out_0r);\n\n", rnd_string);

        "\t\t\t\tdiff_r, rnd_diff_r);\n\n", rnd_string);

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT)\tdo_rnd_diff_i(i_clk, i_ce,\n"

        "\t\t\t\tdiff_i, rnd_diff_i);\n\n", rnd_string);

        fprintf(fp, "\tassign n_rnd_diff_r = - rnd_diff_r;\n"

                "\tassign n_rnd_diff_i = - rnd_diff_i;\n");

        fprintf(fp,

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_0i(i_clk, i_ce,\n"

        "\tinitial wait_for_sync = 1\'b1;\n"

        "\t\t\t\t\t\t\trnd_in_0i, o_out_0i);\n\n", rnd_string);

        "\tinitial iaddr = 0;\n");

        if (async_reset)

        fprintf(fp,

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_1r(i_clk, i_ce,\n"

                        "\talways @(posedge i_clk, negedge i_areset_n)\n"

        "\t\t\t\t\t\t\trnd_in_1r, o_out_1r);\n\n", rnd_string);

                                "\t\tif (!i_reset)\n");

        else

        fprintf(fp,

        fprintf(fp,

        "\t%s #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_1i(i_clk, i_ce,\n"

        "\talways @(posedge i_clk)\n"

        "\t\t\t\t\t\t\trnd_in_1i, o_out_1i);\n\n", rnd_string);

                "\t\tif (i_reset)\n");

        fprintf(fp, "\n"

        fprintf(fp, "\t\tbegin\n"

        "\t// Prior versions of this routine did not include the extra\n"

                        "\t\t\twait_for_sync <= 1\'b1;\n"

        "\t// clock and register/flip-flops that this routine requires.\n"

                        "\t\t\tiaddr <= 0;\n"

        "\t// These are placed in here to correct a bug in Verilator, that\n"

                "\t\tend else if ((i_ce)&&((!wait_for_sync)||(i_sync)))\n"

        "\t// otherwise struggles.  (Hopefully this will fix the problem ...)\n"

                "\t\tbegin\n"

                        "\t\t\tiaddr <= iaddr + 1\'b1;\n"

                        "\t\t\twait_for_sync <= 1\'b0;\n"

                "\t\tend\n\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                "\t\tbegin\n"

                        "\t\t\to_left  <= { o_out_0r, o_out_0i };\n"

                        "\t\t\timem[0] <= i_data;\n"

                        "\t\t\to_right <= { o_out_1r, o_out_1i };\n"

                        "\t\t\timem[1] <= imem[0];\n"

                "\t\tend\n"

                "\t\tend\n"

"\n"

                "\n\n");

        "\tinitial\to_sync = 1'b0; // Final sync coming out of module\n"

        fprintf(fp,

        "\talways @(posedge i_clk)\n"

        "\t// Note that we don\'t check on wait_for_sync or i_sync here.\n"

                "\t\tif (i_rst)\n"

        "\t// Why not?  Because iaddr will always be zero until after the\n"

                "\t\t\to_sync <= 1'b0;\n"

        "\t// first i_ce, so we are safe.\n"

                "\t\telse if (i_ce)\n"

        "\tinitial pipeline = 3\'h0;\n");

                "\t\t\to_sync <= r_sync;\n"

"\n"

"endmodule\n");

        fclose(fp);

void    build_multiply(const char *fname) {

        if (async_reset)

        FILE    *fp = fopen(fname, "w");

                fprintf(fp,

        if (NULL == fp) {

        "\talways\t@(posedge i_clk, negedge i_areset_n)\n"

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                "\t\tif (!i_reset)\n");

                perror("O/S Err was:");

        else

                return;

                fprintf(fp,

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_reset)\n");

        fprintf(fp,

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

                        "\t\t\tpipeline <= 3\'h0;\n"

"//\n"

                "\t\telse if (i_ce) // is our pipeline process full?  Which stages?\n"

"// Filename:   shiftaddmpy.v\n"

                        "\t\t\tpipeline <= { pipeline[1:0], iaddr[1] };\n\n");

"//\n"

"// Project:    %s\n"

"//\n"

"// Purpose:    A portable shift and add multiply.\n"

"//\n"

"//             While both Xilinx and Altera will offer single clock \n"

"//             multiplies, this simple approach will multiply two numbers\n"

"//             on any architecture.  The result maintains the full width\n"

"//             of the multiply, there are no extra stuff bits, no rounding,\n"

"//             no shifted bits, etc.\n"

"//\n"

"//             Further, for those applications that can support it, this\n"

"//             multiply is pipelined and will produce one answer per clock.\n"

"//\n"

"//             For minimal processing delay, make the first parameter\n"

"//             the one with the least bits, so that AWIDTH <= BWIDTH.\n"

"//\n"

"//             The processing delay in this multiply is (AWIDTH+1) cycles.\n"

"//             That is, if the data is present on the input at clock t=0,\n"

"//             the result will be present on the output at time t=AWIDTH+1;\n"

"//\n"

"//\n%s"

"//\n", prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module shiftaddmpy(i_clk, i_ce, i_a, i_b, o_r);\n"

        "\tparameter\tAWIDTH=%d,BWIDTH=", TST_SHIFTADDMPY_AW);

#ifdef  TST_SHIFTADDMPY_BW

        fprintf(fp, "%d;\n", TST_SHIFTADDMPY_BW);

#else

        fprintf(fp, "AWIDTH;\n");

#endif

        fprintf(fp,

        fprintf(fp,

        "\tinput\t\t\t\t\ti_clk, i_ce;\n"

        "\t// This is the pipeline[-1] stage, pipeline[0] will be set next.\n"

        "\tinput\t\t[(AWIDTH-1):0]\t\ti_a;\n"

        "\talways\t@(posedge i_clk)\n"

        "\tinput\t\t[(BWIDTH-1):0]\t\ti_b;\n"

                "\t\tif ((i_ce)&&(iaddr[1]))\n"

        "\toutput\treg\t[(AWIDTH+BWIDTH-1):0]\to_r;\n"

"\n"

        "\treg\t[(AWIDTH-1):0]\tu_a;\n"

        "\treg\t[(BWIDTH-1):0]\tu_b;\n"

        "\treg\t\t\tsgn;\n"

"\n"

        "\treg\t[(AWIDTH-2):0]\t\tr_a[0:(AWIDTH-1)];\n"

        "\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"

        "\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"

        "\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"

        "\tgenvar k;\n"

"\n"

        "\t// If we were forced to stay within two\'s complement arithmetic,\n"

        "\t// taking the absolute value here would require an additional bit.\n"

        "\t// However, because our results are now unsigned, we can stay\n"

        "\t// within the number of bits given (for now).\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"

                        "\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"

                        "\t\t\tsgn <= i_a[AWIDTH-1] ^ i_b[BWIDTH-1];\n"

                "\t\tend\n"

"\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                "\t\tbegin\n"

                        "\t\t\tacc[0] <= (u_a[0]) ? { {(AWIDTH){1\'b0}}, u_b }\n"

                        "\t\t\tsum_r  <= imem_r + i_data_r;\n"

                        "\t\t\t\t\t: {(AWIDTH+BWIDTH){1\'b0}};\n"

                        "\t\t\tsum_i  <= imem_i + i_data_i;\n"

                        "\t\t\tr_a[0] <= { u_a[(AWIDTH-1):1] };\n"

                        "\t\t\tdiff_r <= imem_r - i_data_r;\n"

                        "\t\t\tr_b[0] <= { {(AWIDTH-1){1\'b0}}, u_b };\n"

                        "\t\t\tdiff_i <= imem_i - i_data_i;\n"

                        "\t\t\tr_s[0] <= sgn; // The final sign, needs to be preserved\n"

                "\t\tend\n\n");

                "\t\tend\n"

        fprintf(fp,

"\n"

        "\t// pipeline[1] takes sum_x and diff_x and produces rnd_x\n\n");

        "\tgenerate\n"

        "\tfor(k=0; k<AWIDTH-1; k=k+1)\n"

        fprintf(fp,

        "\tbegin : genstages\n"

        "\t// Now for pipeline[2].  We can actually do this at all i_ce\n"

                "\t\talways @(posedge i_clk)\n"

        "\t// clock times, since nothing will listen unless pipeline[3]\n"

        "\t// on the next clock.  Thus, we simplify this logic and do\n"

        "\t// it independent of pipeline[2].\n"

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                "\t\tbegin\n"

                        "\t\t\tacc[k+1] <= acc[k] + ((r_a[k][0]) ? {r_b[k],1\'b0}:0);\n"

                        "\t\t\tob_a <= { rnd_sum_r, rnd_sum_i };\n"

                        "\t\t\tr_a[k+1] <= { 1\'b0, r_a[k][(AWIDTH-2):1] };\n"

                        "\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n"

                        "\t\t\tr_b[k+1] <= { r_b[k][(AWIDTH+BWIDTH-3):0], 1\'b0};\n"

                        "\t\t\tif (!iaddr[0])\n"

                        "\t\t\tr_s[k+1] <= r_s[k];\n"

                        "\t\t\tbegin\n"

                "\t\tend\n"

                        "\t\t\t\tob_b_r <= rnd_diff_r;\n"

        "\tend\n"

                        "\t\t\t\tob_b_i <= rnd_diff_i;\n"

        "\tendgenerate\n"

                        "\t\t\tend else if (INVERSE==0) begin\n"

"\n"

                        "\t\t\t\t// on Odd, W = e^{-j2pi 1/4} = -j\n"

        "\talways @(posedge i_clk)\n"

                        "\t\t\t\tob_b_r <=   rnd_diff_i;\n"

                        "\t\t\t\tob_b_i <= n_rnd_diff_r;\n"

                        "\t\t\tend else begin\n"

                        "\t\t\t\t// on Odd, W = e^{j2pi 1/4} = j\n"

                        "\t\t\t\tob_b_r <= n_rnd_diff_i;\n"

                        "\t\t\t\tob_b_i <=   rnd_diff_r;\n"

                        "\t\t\tend\n"

                "\t\tend\n\n");

        fprintf(fp,

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tif (i_ce)\n"

                        "\t\t\to_r <= (r_s[AWIDTH-1]) ? (-acc[AWIDTH-1]) : acc[AWIDTH-1];\n"

                "\t\tbegin // In sequence, clock = 3\n"

"\n"

                        "\t\t\tomem[0] <= ob_b;\n"

"endmodule\n");

                        "\t\t\tomem[1] <= omem[0];\n"

                        "\t\t\tif (pipeline[2])\n"

        fclose(fp);

                                "\t\t\t\to_data <= ob_a;\n"

                        "\t\t\telse\n"

                                "\t\t\t\to_data <= omem[1];\n"

                "\t\tend\n\n");

// build_bimpy
//
// Writes the Verilog source for the "bimpy" module to the file named by
// fname.  Per the header text emitted below, bimpy multiplies a LUTB (2) bit
// value, i_a, by a BW bit value, i_b, producing a registered BW+LUTB bit
// result on each clock where i_ce is set.
//
// fname        Path of the Verilog file to create (opened with mode "w").
//              The same name is also printed into the generated file's
//              "Filename:" header line.
//
// On failure to open the file, a message is printed to stderr, perror() is
// called, and the routine returns without writing anything.
//
// prjname, creator, and cpyleft are defined elsewhere in this file.
//
// NOTE(review): this body was recovered from a listing in which lines from a
// different revision had been interleaved and the closing braces dropped.
// Only the statements belonging to this routine have been kept.
void    build_bimpy(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }

        // File header comment: file name, project name, purpose, and creator
        fprintf(fp,
"////////////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename:   %s\n"
"//\n"
"// Project:    %s\n"
"//\n"
"// Purpose:    A simple 2-bit multiply based upon the fact that LUT's allow\n"
"//             6-bits of input.  In other words, I could build a 3-bit\n"
"//             multiply from 6 LUTs (5 actually, since the first could have\n"
"//             two outputs).  This would allow multiplication of three bit\n"
"//             digits, save only for the fact that you would need two bits\n"
"//             of carry.  The bimpy approach throttles back a bit and does\n"
"//             a 2x2 bit multiply in a LUT, guaranteeing that it will never\n"
"//             carry more than one bit.  While this multiply is hardware\n"
"//             independent (and can still run under Verilator therefore),\n"
"//             it is really motivated by trying to optimize for a specific\n"
"//             piece of hardware (Xilinx-7 series ...) that has at least\n"
"//             4-input LUT's with carry chains.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);

        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        // The module body: w_r holds the XOR (carry-less sum) of the two
        // partial products, c holds their AND (the carries), and o_r
        // registers w_r plus the carries shifted left by two bits.
        fprintf(fp,
"module bimpy(i_clk, i_ce, i_a, i_b, o_r);\n"
"\tparameter\tBW=18, // Number of bits in i_b\n"
"\t\t\tLUTB=2; // Number of bits in i_a for our LUT multiply\n"
"\tinput\t\t\t\ti_clk, i_ce;\n"
"\tinput\t\t[(LUTB-1):0]\ti_a;\n"
"\tinput\t\t[(BW-1):0]\ti_b;\n"
"\toutput\treg\t[(BW+LUTB-1):0] o_r;\n"
"\n"
"\twire [(BW+LUTB-2):0] w_r;\n"
"\twire [(BW+LUTB-3):1] c;\n"
"\n"
"\tassign\tw_r =  { ((i_a[1])?i_b:{(BW){1'b0}}), 1'b0 }\n"
"\t\t\t\t^ { 1'b0, ((i_a[0])?i_b:{(BW){1'b0}}) };\n"
"\tassign\tc = { ((i_a[1])?i_b[(BW-2):0]:{(BW-1){1'b0}}) }\n"
"\t\t\t& ((i_a[0])?i_b[(BW-1):1]:{(BW-1){1'b0}});\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\t\to_r <= w_r + { c, 2'b0 };\n"
"\n"
"endmodule\n");

        fclose(fp);
}


// build_longbimpy
//
// Writes the Verilog source for the "longbimpy" module to the file named by
// fname.  Per the header text emitted below, longbimpy is a shift-and-add
// multiply that consumes two bits (LUTB) of i_a per stage, using the bimpy
// module for each 2xBW partial product and accumulating the products in a
// tableau of TLEN rows.
//
// fname        Path of the Verilog file to create (opened with mode "w").
//              The same name is also printed into the generated file's
//              "Filename:" header line.
//
// On failure to open the file, a message is printed to stderr, perror() is
// called, and the routine returns without writing anything.
//
// prjname, creator, cpyleft, and the TST_LONGBIMPY_* macros are defined
// elsewhere in this file.
//
// NOTE(review): this body was recovered from a listing in which lines from a
// different revision had been interleaved and the closing braces dropped.
// Only the statements belonging to this routine have been kept.
void    build_longbimpy(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }

        // File header comment: file name, project name, purpose, and creator
        fprintf(fp,
"////////////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename:   %s\n"
"//\n"
"// Project:    %s\n"
"//\n"
"// Purpose:    A portable shift and add multiply, built with the knowledge\n"
"//             of the existence of a six bit LUT and carry chain.  That\n"
"//             knowledge allows us to multiply two bits from one value\n"
"//             at a time against all of the bits of the other value.  This\n"
"//             sub multiply is called the bimpy.\n"
"//\n"
"//             For minimal processing delay, make the first parameter\n"
"//             the one with the least bits, so that AWIDTH <= BWIDTH.\n"
"//\n"
"//\n"
"//\n%s"
"//\n", fname, prjname, creator);

        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        // Module declaration.  BW defaults to AW unless the test macro
        // TST_LONGBIMPY_BW supplies an explicit width.
        fprintf(fp,
"module longbimpy(i_clk, i_ce, i_a, i_b, o_r);\n"
        "\tparameter    AW=%d,  // The width of i_a, min width is 5\n"
                        "\t\t\tBW=", TST_LONGBIMPY_AW);
#ifdef  TST_LONGBIMPY_BW
        fprintf(fp, "%d", TST_LONGBIMPY_BW);
#else
        fprintf(fp, "AW");
#endif

        // Remaining parameters, ports, and the multiply logic itself
        fprintf(fp, ",  // The width of i_b, can be anything\n"
                        "\t\t\t// The following three parameters should not be changed\n"
                        "\t\t\t// by any implementation, but are based upon hardware\n"
                        "\t\t\t// and the above values:\n"
                        "\t\t\tOW=AW+BW,        // The output width\n"
                        "\t\t\tIW=(AW+1)&(-2),  // Internal width of A\n"
                        "\t\t\tLUTB=2,  // How many bits we can multiply by at once\n"
                        "\t\t\tTLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau\n"
        "\tinput\t\t\t\ti_clk, i_ce;\n"
        "\tinput\t\t[(AW-1):0]\ti_a;\n"
        "\tinput\t\t[(BW-1):0]\ti_b;\n"
        "\toutput\treg\t[(AW+BW-1):0]\to_r;\n"
"\n"
        "\treg\t[(IW-1):0]\tu_a;\n"
        "\treg\t[(BW-1):0]\tu_b;\n"
        "\treg\t\t\tsgn;\n"
"\n"
        "\treg\t[(IW-1-2*(LUTB)):0]\tr_a[0:(TLEN-3)];\n"
        "\treg\t[(BW-1):0]\t\tr_b[0:(TLEN-3)];\n"
        "\treg\t[(TLEN-1):0]\t\tr_s;\n"
        "\treg\t[(IW+BW-1):0]\t\tacc[0:(TLEN-2)];\n"
        "\tgenvar k;\n"
"\n"
        "\t// First step:\n"
        "\t// Switch to unsigned arithmetic for our multiply, keeping track\n"
        "\t// of the along the way.  We'll then add the sign again later at\n"
        "\t// the end.\n"
        "\t//\n"
        "\t// If we were forced to stay within two's complement arithmetic,\n"
        "\t// taking the absolute value here would require an additional bit.\n"
        "\t// However, because our results are now unsigned, we can stay\n"
        "\t// within the number of bits given (for now).\n"
        "\tgenerate if (IW > AW)\n"
        "\tbegin\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                        "\t\t\t\tu_a <= { 1'b0, (i_a[AW-1])?(-i_a):(i_a) };\n"
        "\tend else begin\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                        "\t\t\t\tu_a <= (i_a[AW-1])?(-i_a):(i_a);\n"
        "\tend endgenerate\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tu_b <= (i_b[BW-1])?(-i_b):(i_b);\n"
                        "\t\t\tsgn <= i_a[AW-1] ^ i_b[BW-1];\n"
                "\t\tend\n"
"\n"
        "\twire [(BW+LUTB-1):0] pr_a, pr_b;\n"
"\n"
        "\t//\n"
        "\t// Second step: First two 2xN products.\n"
        "\t//\n"
        "\t// Since we have no tableau of additions (yet), we can do both\n"
        "\t// of the first two rows at the same time and add them together.\n"
        "\t// For the next round, we'll then have a previous sum to accumulate\n"
        "\t// with new and subsequent product, and so only do one product at\n"
        "\t// a time can follow this--but the first clock can do two at a time.\n"
        "\tbimpy\t#(BW) lmpy_0(i_clk,i_ce,u_a[(  LUTB-1):   0], u_b, pr_a);\n"
        "\tbimpy\t#(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_b[0] <= u_b;\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };\n"
        "\talways @(posedge i_clk) // One clk after p[0],p[1] become valid\n"
                "\t\tif (i_ce) acc[0] <= { {(IW-LUTB){1'b0}}, pr_a}\n"
                        "\t\t\t  +{ {(IW-(2*LUTB)){1'b0}}, pr_b, {(LUTB){1'b0}} };\n"
"\n"
        "\tgenerate // Keep track of intermediate values, before multiplying them\n"
        "\tif (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)\n"
        "\tbegin : gencopies\n"
                "\t\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tr_a[k+1] <= { {(LUTB){1'b0}},\n"
                                "\t\t\t\tr_a[k][(IW-1-(2*LUTB)):LUTB] };\n"
                        "\t\t\tr_b[k+1] <= r_b[k];\n"
                        "\t\tend\n"
        "\tend endgenerate\n"
"\n"
        "\tgenerate // The actual multiply and accumulate stage\n"
        "\tif (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)\n"
        "\tbegin : genstages\n"
                "\t\t// First, the multiply: 2-bits times BW bits\n"
                "\t\twire\t[(BW+LUTB-1):0] genp;\n"
                "\t\tbimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);\n"
"\n"
                "\t\t// Then the accumulate step -- on the next clock\n"
                "\t\talways @(posedge i_clk)\n"
                        "\t\t\tif (i_ce)\n"
                                "\t\t\t\tacc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1'b0}},\n"
                                        "\t\t\t\t\tgenp, {(LUTB*(k+2)){1'b0}} };\n"
        "\tend endgenerate\n"
"\n"
        "\twire [(IW+BW-1):0]   w_r;\n"
        "\tassign\tw_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n"
                        "\t\t\to_r <= w_r[(AW+BW-1):0];\n"
"\n"
        "\tgenerate if (IW > AW)\n"
        "\tbegin : VUNUSED\n"
        "\t\t// verilator lint_off UNUSED\n"
        "\t\twire\t[(IW-AW)-1:0]\tunused;\n"
        "\t\tassign\tunused = w_r[(IW+BW-1):(AW+BW)];\n"
        "\t\t// verilator lint_on UNUSED\n"
        "\tend endgenerate\n"
"\n"
"endmodule\n");

        fclose(fp);
}


// build_dblreverse
//
// Writes the Verilog source for the "dblreverse" module to the file named by
// fname.  Per the header text emitted below, dblreverse bit-reverses the
// output ordering of a pipelined FFT that handles two samples per clock,
// using an even memory (mem_e) and an odd memory (mem_o).
//
// fname        Path of the Verilog file to create (opened with mode "w").
//
// On failure to open the file, a message is printed to stderr, perror() is
// called, and the routine returns without writing anything.
//
// prjname, creator, cpyleft, and TST_DBLREVERSE_LGSIZE are defined elsewhere
// in this file.
//
// NOTE(review): this body was recovered from a listing in which lines from a
// different revision had been interleaved and the closing braces dropped.
// Only the statements belonging to this routine have been kept.  The number
// of blank "\n" separator lines in the generated Verilog should be confirmed
// against the repository copy.
void    build_dblreverse(const char *fname) {
        FILE    *fp = fopen(fname, "w");
        if (NULL == fp) {
                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);
                perror("O/S Err was:");
                return;
        }

        // File header comment: project name, purpose, port list, and creator
        fprintf(fp,
"///////////////////////////////////////////////////////////////////////////\n"
"//\n"
"// Filename:   dblreverse.v\n"
"//\n"
"// Project:    %s\n"
"//\n"
"// Purpose:    This module bitreverses a pipelined FFT input.  Operation is\n"
"//             expected as follows:\n"
"//\n"
"//             i_clk   A running clock at whatever system speed is offered.\n"
"//             i_rst   A synchronous reset signal, that resets all internals\n"
"//             i_ce    If this is one, one input is consumed and an output\n"
"//                     is produced.\n"
"//             i_in_0, i_in_1\n"
"//                     Two inputs to be consumed, each of width WIDTH.\n"
"//             o_out_0, o_out_1\n"
"//                     Two of the bitreversed outputs, also of the same\n"
"//                     width, WIDTH.  Of course, there is a delay from the\n"
"//                     first input to the first output.  For this purpose,\n"
"//                     o_sync is present.\n"
"//             o_sync  This will be a 1\'b1 for the first value in any block.\n"
"//                     Following a reset, this will only become 1\'b1 once\n"
"//                     the data has been loaded and is now valid.  After that,\n"
"//                     all outputs will be valid.\n"
"//\n"
"//     20150602 -- This module has undergone massive rework in order to\n"
"//             ensure that it uses resources efficiently.  As a result, \n"
"//             it now optimizes nicely into block RAMs.  As an unfortunately\n"
"//             side effect, it now passes it\'s bench test (dblrev_tb) but\n"
"//             fails the integration bench test (fft_tb).\n"
"//\n"
"//\n%s"
"//\n", prjname, creator);

        fprintf(fp, "%s", cpyleft);
        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        // Design notes on the memory banking scheme, copied into the output
        fprintf(fp,
"\n\n"
"//\n"
"// How do we do bit reversing at two smples per clock?  Can we separate out\n"
"// our work into eight memory banks, writing two banks at once and reading\n"
"// another two banks in the same clock?\n"
"//\n"
"//     mem[00xxx0] = s_0[n]\n"
"//     mem[00xxx1] = s_1[n]\n"
"//     o_0[n] = mem[10xxx0]\n"
"//     o_1[n] = mem[11xxx0]\n"
"//     ...\n"
"//     mem[01xxx0] = s_0[m]\n"
"//     mem[01xxx1] = s_1[m]\n"
"//     o_0[m] = mem[10xxx1]\n"
"//     o_1[m] = mem[11xxx1]\n"
"//     ...\n"
"//     mem[10xxx0] = s_0[n]\n"
"//     mem[10xxx1] = s_1[n]\n"
"//     o_0[n] = mem[00xxx0]\n"
"//     o_1[n] = mem[01xxx0]\n"
"//     ...\n"
"//     mem[11xxx0] = s_0[m]\n"
"//     mem[11xxx1] = s_1[m]\n"
"//     o_0[m] = mem[00xxx1]\n"
"//     o_1[m] = mem[01xxx1]\n"
"//     ...\n"
"//\n"
"//     The answer is that, yes we can but: we need to use four memory banks\n"
"//     to do it properly.  These four banks are defined by the two bits\n"
"//     that determine the top and bottom of the correct address.  Larger\n"
"//     FFT\'s would require more memories.\n"
"//\n"
"//\n");

        // Module declaration and ports; LGSIZE comes from the test macro
        fprintf(fp,
"module dblreverse(i_clk, i_rst, i_ce, i_in_0, i_in_1,\n"
        "\t\to_out_0, o_out_1, o_sync);\n"
        "\tparameter\t\t\tLGSIZE=%d, WIDTH=24;\n"
        "\tinput\t\t\t\ti_clk, i_rst, i_ce;\n"
        "\tinput\t\t[(2*WIDTH-1):0]\ti_in_0, i_in_1;\n"
        "\toutput\twire\t[(2*WIDTH-1):0]\to_out_0, o_out_1;\n"
        "\toutput\treg\t\t\to_sync;\n", TST_DBLREVERSE_LGSIZE);

        // Address counter, bit-reversed read address, the two memories,
        // and the output selection logic
        fprintf(fp,
"\n"
        "\treg\t\t\tin_reset;\n"
        "\treg\t[(LGSIZE-1):0]\tiaddr;\n"
        "\twire\t[(LGSIZE-3):0]\tbraddr;\n"
"\n"
        "\tgenvar\tk;\n"
        "\tgenerate for(k=0; k<LGSIZE-2; k=k+1)\n"
        "\tbegin : gen_a_bit_reversed_value\n"
                "\t\tassign braddr[k] = iaddr[LGSIZE-3-k];\n"
        "\tend endgenerate\n"
"\n"
        "\tinitial iaddr = 0;\n"
        "\tinitial in_reset = 1\'b1;\n"
        "\tinitial o_sync = 1\'b0;\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_rst)\n"
                "\t\tbegin\n"
                        "\t\t\tiaddr <= 0;\n"
                        "\t\t\tin_reset <= 1\'b1;\n"
                        "\t\t\to_sync <= 1\'b0;\n"
                "\t\tend else if (i_ce)\n"
                "\t\tbegin\n"
                        "\t\t\tiaddr <= iaddr + { {(LGSIZE-1){1\'b0}}, 1\'b1 };\n"
                        "\t\t\tif (&iaddr[(LGSIZE-2):0])\n"
                                "\t\t\t\tin_reset <= 1\'b0;\n"
                        "\t\t\tif (in_reset)\n"
                                "\t\t\t\to_sync <= 1\'b0;\n"
                        "\t\t\telse\n"
                                "\t\t\t\to_sync <= ~(|iaddr[(LGSIZE-2):0]);\n"
                "\t\tend\n"
"\n"
        "\treg\t[(2*WIDTH-1):0]\tmem_e [0:((1<<(LGSIZE))-1)];\n"
        "\treg\t[(2*WIDTH-1):0]\tmem_o [0:((1<<(LGSIZE))-1)];\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\tmem_e[iaddr] <= i_in_0;\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\tmem_o[iaddr] <= i_in_1;\n"
"\n"
"\n"
        "\treg [(2*WIDTH-1):0] evn_out_0, evn_out_1, odd_out_0, odd_out_1;\n"
"\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n\t\t\tevn_out_0 <= mem_e[{~iaddr[LGSIZE-1],1\'b0,braddr}];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n\t\t\tevn_out_1 <= mem_e[{~iaddr[LGSIZE-1],1\'b1,braddr}];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n\t\t\todd_out_0 <= mem_o[{~iaddr[LGSIZE-1],1\'b0,braddr}];\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce)\n\t\t\todd_out_1 <= mem_o[{~iaddr[LGSIZE-1],1\'b1,braddr}];\n"
"\n"
        "\treg\tadrz;\n"
        "\talways @(posedge i_clk)\n"
                "\t\tif (i_ce) adrz <= iaddr[LGSIZE-2];\n"
"\n"
        "\tassign\to_out_0 = (adrz)?odd_out_0:evn_out_0;\n"
        "\tassign\to_out_1 = (adrz)?odd_out_1:evn_out_1;\n"
"\n"
"endmodule\n");

        fclose(fp);
}


void    build_butterfly(const char *fname, int xtracbits, ROUND_T rounding) {

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                return;

        const   char    *rnd_string;

        if (rounding == RND_TRUNCATE)

                rnd_string = "truncate";

        else if (rounding == RND_FROMZERO)

                rnd_string = "roundfromzero";

        else if (rounding == RND_HALFUP)

                rnd_string = "roundhalfup";

        else

                rnd_string = "convround";

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

"//\n"

"// Filename:   butterfly.v\n"

"//\n"

"// Project:    %s\n"

"//\n"

"// Purpose:    This routine caculates a butterfly for a decimation\n"

"//             in frequency version of an FFT.  Specifically, given\n"

"//             complex Left and Right values together with a \n"

"//             coefficient, the output of this routine is given\n"

"//             by:\n"

"//\n"

"//             L' = L + R\n"

"//             R' = (L - R)*C\n"

"//\n"

"//             The rest of the junk below handles timing (mostly),\n"

"//             to make certain that L' and R' reach the output at\n"

"//             the same clock.  Further, just to make certain\n"

"//             that is the case, an 'aux' input exists.  This\n"

"//             aux value will come out of this routine synchronized\n"

"//             to the values it came in with.  (i.e., both L', R',\n"

"//             and aux all have the same delay.)  Hence, a caller\n"

"//             of this routine may set aux on the first input with\n"

"//             valid data, and then wait to see aux set on the output\n"

"//             to know when to find the first output with valid data.\n"

"//\n"

"//             All bits are preserved until the very last clock,\n"

"//             where any more bits than OWIDTH will be quietly\n"

"//             discarded.\n"

"//\n"

"//             This design features no overflow checking.\n"

"// \n"

"// Notes:\n"

"//             CORDIC:\n"

"//             Much as we would like, we can't use a cordic here.\n"

"//             The goal is to accomplish an FFT, as defined, and a\n"

"//             CORDIC places a scale factor onto the data.  Removing\n"

"//             the scale factor would cost a two multiplies, which\n"

"//             is precisely what we are trying to avoid.\n"

"//\n"

"//\n"

"//             3-MULTIPLIES:\n"

"//             It should also be possible to do this with three \n"

"//             multiplies and an extra two addition cycles.  \n"

"//\n"

"//             We want\n"

"//                     R+I = (a + jb) * (c + jd)\n"

"//                     R+I = (ac-bd) + j(ad+bc)\n"

"//             We multiply\n"

"//                     P1 = ac\n"

"//                     P2 = bd\n"

"//                     P3 = (a+b)(c+d)\n"

"//             Then \n"

"//                     R+I=(P1-P2)+j(P3-P2-P1)\n"

"//\n"

"//             WIDTHS:\n"

"//             On multiplying an X width number by an\n"

"//             Y width number, X>Y, the result should be (X+Y)\n"

"//             bits, right?\n"

"//             -2^(X-1) <= a <= 2^(X-1) - 1\n"

"//             -2^(Y-1) <= b <= 2^(Y-1) - 1\n"

"//             (2^(Y-1)-1)*(-2^(X-1)) <= ab <= 2^(X-1)2^(Y-1)\n"

"//             -2^(X+Y-2)+2^(X-1) <= ab <= 2^(X+Y-2) <= 2^(X+Y-1) - 1\n"

"//             -2^(X+Y-1) <= ab <= 2^(X+Y-1)-1\n"

"//             YUP!  But just barely.  Do this and you'll really want\n"

"//             to drop a bit, although you will risk overflow in so\n"

"//             doing.\n"

"//\n"

"//     20150602 -- The sync logic lines have been completely redone.  The\n"

"//             synchronization lines no longer go through the FIFO with the\n"

"//             left hand sum, but are kept out of memory.  This allows the\n"

"//             butterfly to use more optimal memory resources, while also\n"

"//             guaranteeing that the sync lines can be properly reset upon\n"

"//             any reset signal.\n"

"//\n"

"//\n%s"

"//\n", prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module\tbutterfly(i_clk, i_rst, i_ce, i_coef, i_left, i_right, i_aux,\n"

                "\t\to_left, o_right, o_aux);\n"

        "\t// Public changeable parameters ...\n"

        "\tparameter IWIDTH=%d,", TST_BUTTERFLY_IWIDTH);

#ifdef  TST_BUTTERFLY_CWIDTH

        fprintf(fp, "CWIDTH=%d,", TST_BUTTERFLY_CWIDTH);

#else

        fprintf(fp, "CWIDTH=IWIDTH+%d,", xtracbits);

#endif

#ifdef  TST_BUTTERFLY_OWIDTH

        fprintf(fp, "OWIDTH=%d;\n", TST_BUTTERFLY_OWIDTH);

#else

        fprintf(fp, "OWIDTH=IWIDTH+1;\n");

#endif

        fprintf(fp,

        "\t// Parameters specific to the core that should not be changed.\n"

        "\tparameter    MPYDELAY=%d'd%d,\n"

                        "\t\t\tSHIFT=0, AUXLEN=(MPYDELAY+3);\n"

        "\t// The LGDELAY should be the base two log of the MPYDELAY.  If\n"

        "\t// this value is fractional, then round up to the nearest\n"

        "\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n"

        "\tparameter\tLGDELAY=%d;\n"

        "\tinput\t\ti_clk, i_rst, i_ce;\n"

        "\tinput\t\t[(2*CWIDTH-1):0] i_coef;\n"

        "\tinput\t\t[(2*IWIDTH-1):0] i_left, i_right;\n"

        "\tinput\t\ti_aux;\n"

        "\toutput\twire [(2*OWIDTH-1):0] o_left, o_right;\n"

        "\toutput\treg\to_aux;\n"

        "\n", lgdelay(16,xtracbits), bflydelay(16, xtracbits),

                lgdelay(16,xtracbits));

        fprintf(fp,

        "\treg\t[(2*IWIDTH-1):0]\tr_left, r_right;\n"

        "\treg\t[(2*CWIDTH-1):0]\tr_coef, r_coef_2;\n"

        "\twire\tsigned\t[(IWIDTH-1):0]\tr_left_r, r_left_i, r_right_r, r_right_i;\n"

        "\tassign\tr_left_r  = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"

        "\tassign\tr_left_i  = r_left[ (IWIDTH-1):0];\n"

        "\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"

        "\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"

"\n"

        "\treg\tsigned\t[(IWIDTH):0]\tr_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"

"\n"

        "\treg  [(LGDELAY-1):0] fifo_addr;\n"

        "\twire [(LGDELAY-1):0] fifo_read_addr;\n"

        "\tassign\tfifo_read_addr = fifo_addr - MPYDELAY;\n"

        "\treg  [(2*IWIDTH+1):0]        fifo_left [ 0:((1<<LGDELAY)-1)];\n"

"\n");

        fprintf(fp,

        "\t// Set up the input to the multiply\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

        "\tif ((f_state == 2'b00)&&((f_syncd)||(iaddr >= 4)))\n"

                "\t\tbegin\n"

        "\tbegin\n"

                        "\t\t\t// One clock just latches the inputs\n"

        "\t     assert(rnd_sum_r  == f_piped_real[3]+f_piped_real[1]);\n"

                        "\t\t\tr_left <= i_left;        // No change in # of bits\n"

        "\t     assert(rnd_sum_i  == f_piped_imag[3]+f_piped_imag[1]);\n"

                        "\t\t\tr_right <= i_right;\n"

        "\t     assert(rnd_diff_r == f_piped_real[3]-f_piped_real[1]);\n"

                        "\t\t\tr_coef  <= i_coef;\n"

        "\t     assert(rnd_diff_i == f_piped_imag[3]-f_piped_imag[1]);\n"

                        "\t\t\t// Next clock adds/subtracts\n"

        "\tend\n"

                        "\t\t\tr_sum_r <= r_left_r + r_right_r; // Now IWIDTH+1 bits\n"

"\n"

                        "\t\t\tr_sum_i <= r_left_i + r_right_i;\n"

                        "\t\t\tr_dif_r <= r_left_r - r_right_r;\n"

                        "\t\t\tr_dif_i <= r_left_i - r_right_i;\n"

                        "\t\t\t// Other inputs are simply delayed on second clock\n"

                        "\t\t\tr_coef_2<= r_coef;\n"

        "\t\tend\n"

"\n");

        fprintf(fp,

        "\t// Don\'t forget to record the even side, since it doesn\'t need\n"

        "\t// to be multiplied, but yet we still need the results in sync\n"

        "\t// with the answer when it is ready.\n"

        "\tinitial fifo_addr = 0;\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

        "\tif ((f_state == 2'b10)&&(f_syncd))\n"

                        "\t\t\tfifo_addr <= 0;\n"

        "\tbegin\n"

                "\t\telse if (i_ce)\n"

        "\t     // assert(o_sync);\n"

                        "\t\t\t// Need to delay the sum side--nothing else happens\n"

        "\t     assert(f_o_real == f_piped_real[5] + f_piped_real[3]);\n"

                        "\t\t\t// to it, but it needs to stay synchronized with the\n"

        "\t     assert(f_o_imag == f_piped_imag[5] + f_piped_imag[3]);\n"

                        "\t\t\t// right side.\n"

        "\tend\n"

                        "\t\t\tfifo_addr <= fifo_addr + 1;\n"

"\n"

"\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

        "\tif ((f_state == 2'b11)&&(f_syncd))\n"

                        "\t\t\tfifo_left[fifo_addr] <= { r_sum_r, r_sum_i };\n"

        "\tbegin\n"

        "\t     assert(!o_sync);\n"

        "\t     assert(f_o_real == f_piped_real[5] + f_piped_real[3]);\n"

        "\t     assert(f_o_imag == f_piped_imag[5] + f_piped_imag[3]);\n"

        "\tend\n"

"\n"

"\n"

        "\twire\tsigned\t[(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n"

        "\talways @(posedge i_clk)\n"

        "\tassign\tir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"

        "\tif ((f_state == 2'b00)&&(f_syncd))\n"

        "\tassign\tir_coef_i = r_coef_2[(CWIDTH-1):0];\n"

        "\tbegin\n"

        "\twire\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\tp_one, p_two, p_three;\n"

        "\t     assert(!o_sync);\n"

        "\t     assert(f_o_real == f_piped_real[7] - f_piped_real[5]);\n"

        "\t     assert(f_o_imag == f_piped_imag[7] - f_piped_imag[5]);\n"

        "\tend\n"

"\n"

"\n"

"\n");

        "\talways @(*)\n"

        fprintf(fp,

        "\tif ((iaddr[2:0] == 0)&&(!wait_for_sync))\n"

        "\t// Multiply output is always a width of the sum of the widths of\n"

        "\t     assume(i_sync);\n"

        "\t// the two inputs.  ALWAYS.  This is independent of the number of\n"

        "\t// bits in p_one, p_two, or p_three.  These values needed to \n"

        "\t// accumulate a bit (or two) each.  However, this approach to a\n"

        "\t// three multiply complex multiply cannot increase the total\n"

        "\t// number of bits in our final output.  We\'ll take care of\n"

        "\t// dropping back down to the proper width, OWIDTH, in our routine\n"

        "\t// below.\n"

"\n"

"\n"

"\n");

        "\talways @(*)\n"

        fprintf(fp,

        "\tif (wait_for_sync)\n"

        "\t// We accomplish here \"Karatsuba\" multiplication.  That is,\n"

        "\t     assert((iaddr == 0)&&(f_state == 2'b00)&&(!o_sync)&&(!f_rsyncd));\n"

        "\t// by doing three multiplies we accomplish the work of four.\n"

        "\t// Let\'s prove to ourselves that this works ... We wish to\n"

        "\t// multiply: (a+jb) * (c+jd), where a+jb is given by\n"

        "\t//\ta + jb = r_dif_r + j r_dif_i, and\n"

        "\t//\tc + jd = ir_coef_r + j ir_coef_i.\n"

        "\t// We do this by calculating the intermediate products P1, P2,\n"

        "\t// and P3 as\n"

        "\t//\tP1 = ac\n"

        "\t//\tP2 = bd\n"

        "\t//\tP3 = (a + b) * (c + d)\n"

        "\t// and then complete our final answer with\n"

        "\t//\tac - bd = P1 - P2 (this checks)\n"

        "\t//\tad + bc = P3 - P2 - P1\n"

        "\t//\t        = (ac + bc + ad + bd) - bd - ac\n"

        "\t//\t        = bc + ad (this checks)\n"

"\n"

"\n"

"\n");

        "\talways @(posedge i_clk)\n"

        fprintf(fp,

        "\tif ((f_past_valid)&&($past(i_ce))&&($past(i_sync))&&(!$past(i_reset)))\n"

        "\t// This should really be based upon an IF, such as in\n"

        "\t     assert(!wait_for_sync);\n"

        "\t// if (IWIDTH < CWIDTH) then ...\n"

"\n"

        "\t// However, this is the only (other) way I know to do it.\n"

        "\talways @(posedge i_clk)\n"

        "\tgenerate if (CWIDTH < IWIDTH+1)\n"

        "\tif ((f_state == 2'b01)&&(f_syncd))\n"

        "\tbegin\n"

        "\tbegin\n"

                "\t\twire\t[(CWIDTH):0]\tp3c_in;\n"

        "\t     assert(!o_sync);\n"

                "\t\twire\t[(IWIDTH+1):0]\tp3d_in;\n"

        "\t     if (INVERSE)\n"

                "\t\tassign\tp3c_in = ir_coef_i + ir_coef_r;\n"

        "\t     begin\n"

                "\t\tassign\tp3d_in = r_dif_r + r_dif_i;\n"

        "\t             assert(f_o_real == -f_piped_imag[7]+f_piped_imag[5]);\n"

                "\n"

        "\t             assert(f_o_imag ==  f_piped_real[7]-f_piped_real[5]);\n"

                "\t\t// We need to pad these first two multiplies by an extra\n"

                "\t\t// bit just to keep them aligned with the third,\n"

                "\t\t// simpler, multiply.\n"

                "\t\t%s #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n"

                                "\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"

                                "\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one);\n"

                "\t\t%s #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n"

                                "\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i},\n"

                                "\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two);\n"

                "\t\t%s #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n"

                        "\t\t\t\tp3c_in, p3d_in, p_three);\n"

        "\tend else begin\n"

        "\tend else begin\n"

                "\t\twire\t[(CWIDTH):0]\tp3c_in;\n"

        "\t             assert(f_o_real ==  f_piped_imag[7]-f_piped_imag[5]);\n"

                "\t\twire\t[(IWIDTH+1):0]\tp3d_in;\n"

        "\t             assert(f_o_imag == -f_piped_real[7]+f_piped_real[5]);\n"

                "\t\tassign\tp3c_in = ir_coef_i + ir_coef_r;\n"

        "\t     end\n"

                "\t\tassign\tp3d_in = r_dif_r + r_dif_i;\n"

                "\n"

                "\t\t%s #(IWIDTH+2,CWIDTH+1) p1a(i_clk, i_ce,\n"

                                "\t\t\t\t{r_dif_r[IWIDTH],r_dif_r},\n"

                                "\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_one);\n"

                "\t\t%s #(IWIDTH+2,CWIDTH+1) p2a(i_clk, i_ce,\n"

                                "\t\t\t\t{r_dif_i[IWIDTH], r_dif_i},\n"

                                "\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i}, p_two);\n"

                "\t\t%s #(IWIDTH+2,CWIDTH+1) p3a(i_clk, i_ce,\n"

                                "\t\t\t\tp3d_in, p3c_in, p_three);\n"

        "\tend\n"

        "\tend\n"

        "\tendgenerate\n"

"\n",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy",

                (USE_OLD_MULTIPLY)?"shiftaddmpy":"longbimpy");

        fprintf(fp,

        "\t// These values are held in memory and delayed during the\n"

        "\t// multiply.  Here, we recover them.  During the multiply,\n"

        "\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"

        "\t// therefore, the left_x values need to be right shifted by\n"

        "\t// CWIDTH-2 as well.  The additional bits come from a sign\n"

        "\t// extension.\n"

        "\twire\tsigned\t[(IWIDTH+CWIDTH):0]    fifo_i, fifo_r;\n"

        "\treg\t\t[(2*IWIDTH+1):0]      fifo_read;\n"

        "\tassign\tfifo_r = { {2{fifo_read[2*(IWIDTH+1)-1]}}, fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1\'b0}} };\n"

        "\tassign\tfifo_i = { {2{fifo_read[(IWIDTH+1)-1]}}, fifo_read[((IWIDTH+1)-1):0], {(CWIDTH-2){1\'b0}} };\n"

"\n"

"\n"

"\n"

        "\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] mpy_r, mpy_i;\n"

"`endif\n");

"\n");

        fprintf(fp,

        "\t// Let's do some rounding and remove unnecessary bits.\n"

        "\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to\n"

        "\t// OWIDTH, and SHIFT by SHIFT bits in the process.  The trick is\n"

        "\t// that we don\'t need (IWIDTH+CWIDTH+3) bits.  We\'ve accumulated\n"

        "\t// them, but the actual values will never fill all these bits.\n"

        "\t// In particular, we only need:\n"

        "\t//\t IWIDTH bits for the input\n"

        "\t//\t     +1 bit for the add/subtract\n"

        "\t//\t+CWIDTH bits for the coefficient multiply\n"

        "\t//\t     +1 bit for the add/subtract in the complex multiply\n"

        "\t//\t ------\n"

        "\t//\t (IWIDTH+CWIDTH+2) bits at full precision.\n"

        "\t//\n"

        "\t// However, the coefficient multiply multiplied by a maximum value\n"

        "\t// of 2^(CWIDTH-2).  Thus, we only have\n"

        "\t//\t   IWIDTH bits for the input\n"

        "\t//\t       +1 bit for the add/subtract\n"

        "\t//\t+CWIDTH-2 bits for the coefficient multiply\n"

        "\t//\t       +1 (optional) bit for the add/subtract in the cpx mpy.\n"

        "\t//\t -------- ... multiply.  (This last bit may be shifted out.)\n"

        "\t//\t (IWIDTH+CWIDTH) valid output bits. \n"

        "\t// Now, if the user wants to keep any extras of these (via OWIDTH),\n"

        "\t// or if he wishes to arbitrarily shift some of these off (via\n"

        "\t// SHIFT) we accomplish that here.\n"

"\n");

        fprintf(fp,

        "\twire\tsigned\t[(OWIDTH-1):0]\trnd_left_r, rnd_left_i, rnd_right_r, rnd_right_i;\n\n");

        fprintf(fp,

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_left_r(i_clk, i_ce,\n"

        "\t\t\t\t{ {2{fifo_r[(IWIDTH+CWIDTH)]}}, fifo_r }, rnd_left_r);\n\n",

                rnd_string);

        fprintf(fp,

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_left_i(i_clk, i_ce,\n"

        "\t\t\t\t{ {2{fifo_i[(IWIDTH+CWIDTH)]}}, fifo_i }, rnd_left_i);\n\n",

                rnd_string);

        fprintf(fp,

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_r(i_clk, i_ce,\n"

        "\t\t\t\tmpy_r, rnd_right_r);\n\n", rnd_string);

        fprintf(fp,

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_i(i_clk, i_ce,\n"

        "\t\t\t\tmpy_i, rnd_right_i);\n\n", rnd_string);

        fprintf(fp,

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// First clock, recover all values\n"

                        "\t\t\tfifo_read <= fifo_left[fifo_read_addr];\n"

                        "\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"

                        "\t\t\t// although they only need to be (IWIDTH+1)\n"

                        "\t\t\t// + (CWIDTH) bits wide.  (We\'ve got two\n"

                        "\t\t\t// extra bits we need to get rid of.)\n"

                        "\t\t\tmpy_r <= p_one - p_two;\n"

                        "\t\t\tmpy_i <= p_three - p_one - p_two;\n"

                "\t\tend\n"

"\n");

        fprintf(fp,

        "\treg\t[(AUXLEN-1):0]\taux_pipeline;\n"

        "\tinitial\taux_pipeline = 0;\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_rst)\n"

        "\t\t\taux_pipeline <= 0;\n"

        "\t\telse if (i_ce)\n"

        "\t\t\taux_pipeline <= { aux_pipeline[(AUXLEN-2):0], i_aux };\n"

"\n");

        fprintf(fp,

        "\tinitial o_aux = 1\'b0;\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                "\t\t\to_aux <= 1\'b0;\n"

                "\t\telse if (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// Second clock, latch for final clock\n"

                        "\t\t\to_aux <= aux_pipeline[AUXLEN-1];\n"

                "\t\tend\n"

"\n");

        fprintf(fp,

        fprintf(fp, "endmodule\n");

        "\t// As a final step, we pack our outputs into two packed two\'s\n"

        "\t// complement numbers per output word, so that each output word\n"

        "\t// has (2*OWIDTH) bits in it, with the top half being the real\n"

        "\t// portion and the bottom half being the imaginary portion.\n"

        "\tassign       o_left = { rnd_left_r, rnd_left_i };\n"

        "\tassign       o_right= { rnd_right_r,rnd_right_i};\n"

"\n"

"endmodule\n");

        fclose(fp);

void    build_hwbfly(const char *fname, int xtracbits, ROUND_T rounding) {

void    build_sngllast(const char *fname, const bool async_reset = false) {

        FILE    *fp = fopen(fname, "w");

        FILE    *fp = fopen(fname, "w");

        if (NULL == fp) {

        if (NULL == fp) {

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                fprintf(stderr, "Could not open \'%s\' for writing\n", fname);

                perror("O/S Err was:");

                perror("O/S Err was:");

                return;

                return;

        const   char    *rnd_string;

        std::string     resetw("i_reset");

        if (rounding == RND_TRUNCATE)

        if (async_reset)

                rnd_string = "truncate";

                resetw = std::string("i_areset_n");

        else if (rounding == RND_FROMZERO)

                rnd_string = "roundfromzero";

        else if (rounding == RND_HALFUP)

                rnd_string = "roundhalfup";

        else

                rnd_string = "convround";

        fprintf(fp,

        fprintf(fp,

"///////////////////////////////////////////////////////////////////////////\n"

SLASHLINE

"//\n"

"//\n"

"// Filename:   hwbfly.v\n"

"// Filename:\tlaststage.v\n"

"//\n"

"//\n"

"// Project:    %s\n"

"// Project:    %s\n"

"//\n"

"//\n"

"// Purpose:    This routine is identical to the butterfly.v routine found\n"

"// Purpose:    This is part of an FPGA implementation that will process\n"

"//             in 'butterfly.v', save only that it uses the verilog \n"

"//             the final stage of a decimate-in-frequency FFT, running\n"

"//     operator '*' in hopes that the synthesizer would be able to optimize\n"

"//     through the data at one sample per clock.\n"

"//     it with hardware resources.\n"

"//\n"

"//     It is understood that a hardware multiply can complete its operation in\n"

"//     a single clock.\n"

"//\n"

"//\n"

"//\n%s"

"//\n%s"

"//\n", prjname, creator);

"//\n", prjname, creator);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "%s", cpyleft);

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(fp,

"module hwbfly(i_clk, i_rst, i_ce, i_coef, i_left, i_right, i_aux,\n"

                "\t\to_left, o_right, o_aux);\n"

        "\t// Public changeable parameters ...\n"

        "\tparameter IWIDTH=16,CWIDTH=IWIDTH+%d,OWIDTH=IWIDTH+1;\n"

        "\t// Parameters specific to the core that should not be changed.\n"

        "\tparameter\tSHIFT=0;\n"

        "\tinput\t\ti_clk, i_rst, i_ce;\n"

        "\tinput\t\t[(2*CWIDTH-1):0]\ti_coef;\n"

        "\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n"

        "\tinput\t\ti_aux;\n"

        "\toutput\twire\t[(2*OWIDTH-1):0]\to_left, o_right;\n"

        "\toutput\treg\to_aux;\n"

"\n", xtracbits);

        fprintf(fp,

        "\treg\t[(2*IWIDTH-1):0]        r_left, r_right;\n"

        "\treg\t                        r_aux, r_aux_2;\n"

        "\treg\t[(2*CWIDTH-1):0]        r_coef;\n"

        "\twire signed  [(IWIDTH-1):0]  r_left_r, r_left_i, r_right_r, r_right_i;\n"

        "\tassign\tr_left_r  = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"

        "\tassign\tr_left_i  = r_left[ (IWIDTH-1):0];\n"

        "\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"

        "\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"

        "\treg  signed  [(CWIDTH-1):0]  ir_coef_r, ir_coef_i;\n"

"\n"

        "\treg  signed  [(IWIDTH):0]    r_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"

"\n"

        "\treg  [(2*IWIDTH+2):0]        leftv, leftvv;\n"

"\n"

        "\t// Set up the input to the multiply\n"

        "\tinitial r_aux   = 1\'b0;\n"

        "\tinitial r_aux_2 = 1\'b0;\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                "\t\tbegin\n"

                        "\t\t\tr_aux <= 1\'b0;\n"

                        "\t\t\tr_aux_2 <= 1\'b0;\n"

                "\t\tend else if (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// One clock just latches the inputs\n"

                        "\t\t\tr_aux <= i_aux;\n"

                        "\t\t\t// Next clock adds/subtracts\n"

                        "\t\t\t// Other inputs are simply delayed on second clock\n"

                        "\t\t\tr_aux_2 <= r_aux;\n"

                "\t\tend\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// One clock just latches the inputs\n"

                        "\t\t\tr_left <= i_left;        // No change in # of bits\n"

                        "\t\t\tr_right <= i_right;\n"

                        "\t\t\tr_coef  <= i_coef;\n"

                        "\t\t\t// Next clock adds/subtracts\n"

                        "\t\t\tr_sum_r <= r_left_r + r_right_r; // Now IWIDTH+1 bits\n"

                        "\t\t\tr_sum_i <= r_left_i + r_right_i;\n"

                        "\t\t\tr_dif_r <= r_left_r - r_right_r;\n"

                        "\t\t\tr_dif_i <= r_left_i - r_right_i;\n"

                        "\t\t\t// Other inputs are simply delayed on second clock\n"

                        "\t\t\tir_coef_r <= r_coef[(2*CWIDTH-1):CWIDTH];\n"

                        "\t\t\tir_coef_i <= r_coef[(CWIDTH-1):0];\n"

                "\t\tend\n"

        "\n\n");

        fprintf(fp,

"\t// See comments in the butterfly.v source file for a discussion of\n"

"\t// these operations and the appropriate bit widths.\n\n");

        fprintf(fp,

        "\treg\tsigned  [((IWIDTH+1)+(CWIDTH)-1):0]     p_one, p_two;\n"

        "\treg\tsigned  [((IWIDTH+2)+(CWIDTH+1)-1):0]   p_three;\n"

"\n"

        "\treg\tsigned  [(CWIDTH-1):0]  p1c_in, p2c_in; // Coefficient multiply inputs\n"

        "\treg\tsigned  [(IWIDTH):0]    p1d_in, p2d_in; // Data multiply inputs\n"

        "\treg\tsigned  [(CWIDTH):0]    p3c_in; // Product 3, coefficient input\n"

        "\treg\tsigned  [(IWIDTH+1):0]  p3d_in; // Product 3, data input\n"

"\n"

        "\tinitial leftv    = 0;\n"

        "\tinitial leftvv   = 0;\n"

        "\talways @(posedge i_clk)\n"

        "\tbegin\n"

                "\t\tif (i_rst)\n"

                "\t\tbegin\n"

                        "\t\t\tleftv <= 0;\n"

                        "\t\t\tleftvv <= 0;\n"

                "\t\tend else if (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// Second clock, pipeline = 1\n"

                        "\t\t\tleftv <= { r_aux_2, r_sum_r, r_sum_i };\n"

"\n"

                        "\t\t\t// Third clock, pipeline = 3\n"

                        "\t\t\t//   As desired, each of these lines infers a DSP48\n"

                        "\t\t\tleftvv <= leftv;\n"

                "\t\tend\n"

        "\tend\n"

"\n"

        "\talways @(posedge i_clk)\n"

                "\t\tif (i_ce)\n"

                "\t\tbegin\n"

                        "\t\t\t// Second clock, pipeline = 1\n"

                        "\t\t\tp1c_in <= ir_coef_r;\n"

                        "\t\t\tp2c_in <= ir_coef_i;\n"

                        "\t\t\tp1d_in <= r_dif_r;\n"

                        "\t\t\tp2d_in <= r_dif_i;\n"

                        "\t\t\tp3c_in <= ir_coef_i + ir_coef_r;\n"

                        "\t\t\tp3d_in <= r_dif_r + r_dif_i;\n"

"\n"

"\n"

                        "\t\t\t// Third clock, pipeline = 3\n"

                        "\t\t\t//   As desired, each of these lines infers a DSP48\n"

                        "\t\t\tp_one   <= p1c_in * p1d_in;\n"

                        "\t\t\tp_two   <= p2c_in * p2d_in;\n"

                        "\t\t\tp_three <= p3c_in * p3d_in;\n"

                "\t\tend\n"

"\n"

        "\twire\tsigned [((IWIDTH+2)+(CWIDTH+1)-1):0]   w_one, w_two;\n"

        "\tassign\tw_one = { {(2){p_one[((IWIDTH+1)+(CWIDTH)-1)]}}, p_one };\n"

        "\tassign\tw_two = { {(2){p_two[((IWIDTH+1)+(CWIDTH)-1)]}}, p_two };\n"

"\n");

        fprintf(fp,

        fprintf(fp,

        "\t// These values are held in memory and delayed during the\n"

"module laststage(i_clk, %s, i_ce, i_sync, i_val, o_val, o_sync);\n"

        "\t// multiply.  Here, we recover them.  During the multiply,\n"

"       parameter       IWIDTH=16,OWIDTH=IWIDTH+1, SHIFT=0;\n"

        "\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"

"       input                                   i_clk, %s, i_ce, i_sync;\n"

        "\t// therefore, the left_x values need to be right shifted by\n"

"       input           [(2*IWIDTH-1):0]        i_val;\n"

        "\t// CWIDTH-2 as well.  The additional bits come from a sign\n"

"       output  wire    [(2*OWIDTH-1):0]        o_val;\n"

        "\t// extension.\n"

"       output  reg                             o_sync;\n\n",

        "\twire\taux_s;\n"

                resetw.c_str(), resetw.c_str());

        "\twire\tsigned\t[(IWIDTH+CWIDTH):0]    left_si, left_sr;\n"

        "\treg\t\t[(2*IWIDTH+2):0]      left_saved;\n"

        fprintf(fp,

        "\tassign\tleft_sr = { {2{left_saved[2*(IWIDTH+1)-1]}}, left_saved[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1\'b0}} };\n"

"       reg     signed  [(IWIDTH-1):0]  m_r, m_i;\n"

        "\tassign\tleft_si = { {2{left_saved[(IWIDTH+1)-1]}}, left_saved[((IWIDTH+1)-1):0], {(CWIDTH-2){1\'b0}} };\n"

"       wire    signed  [(IWIDTH-1):0]  i_r, i_i;\n"

        "\tassign\taux_s = left_saved[2*IWIDTH+2];\n"

"\n"

"\n"

"       assign  i_r = i_val[(2*IWIDTH-1):(IWIDTH)]; \n"

"       assign  i_i = i_val[(IWIDTH-1):0]; \n"

"\n"

"       // Don't forget that we accumulate a bit by adding two values\n"

"       // together. Therefore our intermediate value must have one more\n"

"       // bit than the two originals.\n"

"       reg     signed  [(IWIDTH):0]    rnd_r, rnd_i, sto_r, sto_i;\n"

"       reg                             wait_for_sync, stage;\n"

"       reg             [1:0]           sync_pipe;\n"

"\n"

"\n"

        "\t(* use_dsp48=\"no\" *)\n"

"       initial wait_for_sync = 1'b1;\n"

        "\treg  signed  [(CWIDTH+IWIDTH+3-1):0] mpy_r, mpy_i;\n");

"       initial stage         = 1'b0;\n");

        fprintf(fp,

        "\twire\tsigned\t[(OWIDTH-1):0]\trnd_left_r, rnd_left_i, rnd_right_r, rnd_right_i;\n\n");

        if (async_reset)

                fprintf(fp, "\talways @(posedge i_clk, negedge i_areset_n)\n\t\tif (!i_areset_n)\n");

        else

                fprintf(fp, "\talways @(posedge i_clk)\n\t\tif (i_reset)\n");

        fprintf(fp,

        fprintf(fp,

        "\t%s #(CWIDTH+IWIDTH+1,OWIDTH,SHIFT+2) do_rnd_left_r(i_clk, i_ce,\n"

"               begin\n"

        "\t\t\t\tleft_sr, rnd_left_r);\n\n",

"                       wait_for_sync <= 1'b1;\n"

                rnd_string);

"                       stage         <= 1'b0;\n"

        fprintf(fp,

"               end else if ((i_ce)&&((!wait_for_sync)||(i_sync))&&(!stage))\n"

        "\t%s #(CWIDTH+IWIDTH+1,OWIDTH,SHIFT+2) do_rnd_left_i(i_clk, i_ce,\n"

"               begin\n"

        "\t\t\t\tleft_si, rnd_left_i);\n\n",

"                       wait_for_sync <= 1'b0;\n"

                rnd_string);

"                       //\n"

        fprintf(fp,

"                       stage <= 1'b1;\n"

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_r(i_clk, i_ce,\n"

"                       //\n"

        "\t\t\t\tmpy_r, rnd_right_r);\n\n", rnd_string);

"               end else if (i_ce)\n"

        fprintf(fp,

"                       stage <= 1'b0;\n\n");

        "\t%s #(CWIDTH+IWIDTH+3,OWIDTH,SHIFT+4) do_rnd_right_i(i_clk, i_ce,\n"

        "\t\t\t\tmpy_i, rnd_right_i);\n\n", rnd_string);

        fprintf(fp, "\tinitial\tsync_pipe = 0;\n");

        if (async_reset)

                fprintf(fp,

                "\talways @(posedge i_clk, negedge i_areset_n)\n"

                "\tif (!i_areset_n)\n");

        else

        fprintf(fp,

        fprintf(fp,

        "\tinitial left_saved = 0;\n"

        "\tinitial o_aux      = 1\'b0;\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_rst)\n"

        "\t\tbegin\n"

                "\t\t\tleft_saved <= 0;\n"

                "\t\t\to_aux <= 1\'b0;\n"

        "\t\tend else if (i_ce)\n"

        "\t\tbegin\n"

                "\t\t\t// First clock, recover all values\n"

                "\t\t\tleft_saved <= leftvv;\n"

"\n"

                "\t\t\t// Second clock, round and latch for final clock\n"

                "\t\t\to_aux <= aux_s;\n"

        "\t\tend\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_ce)\n"

                "\tif (i_reset)\n");

        "\t\tbegin\n"

                "\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"

                "\t\t\t// although they only need to be (IWIDTH+1)\n"

                "\t\t\t// + (CWIDTH) bits wide.  (We've got two\n"

                "\t\t\t// extra bits we need to get rid of.)\n"

                "\n"

                "\t\t\t// These two lines also infer DSP48\'s.\n"

                "\t\t\t// To keep from using extra DSP48 resources,\n"

                "\t\t\t// they are prevented from using DSP48\'s\n"

                "\t\t\t// by the (* use_dsp48 ... *) comment above.\n"

                "\t\t\tmpy_r <= w_one - w_two;\n"

                "\t\t\tmpy_i <= p_three - w_one - w_two;\n"

        "\t\tend\n"

        "\n");

        fprintf(fp,

        fprintf(fp,

        "\t// As a final step, we pack our outputs into two packed two's\n"

                "\t\tsync_pipe <= 0;\n"

        "\t// complement numbers per output word, so that each output word\n"

                "\telse if (i_ce)\n"

        "\t// has (2*OWIDTH) bits in it, with the top half being the real\n"

                "\t\tsync_pipe <= { sync_pipe[0], i_sync };\n\n");

        "\t// portion and the bottom half being the imaginary portion.\n"

        "\tassign\to_left = { rnd_left_r, rnd_left_i };\n"

        "\tassign\to_right= { rnd_right_r,rnd_right_i};\n"

"\n"

"endmodule\n");

        fprintf(fp, "\tinitial\to_sync = 1\'b0;\n");

        if (async_reset)

void    build_stage(const char *fname, const char *coredir, int stage, bool odd, int nbits, bool inv, int xtra, bool hwmpy=false, bool dbg=false) {

                fprintf(fp,

        FILE    *fstage = fopen(fname, "w");

                "\talways @(posedge i_clk, negedge i_areset_n)\n"

        int     cbits = nbits + xtra;

                "\tif (!i_areset_n)\n");

        else

        if ((cbits * 2) >= sizeof(long long)*8) {

                fprintf(fp,

                fprintf(stderr, "ERROR: CMEM Coefficient precision requested overflows long long data type.\n");

                "\talways @(posedge i_clk)\n"

                exit(-1);

                "\tif (i_reset)\n");

        fprintf(fp,

        if (fstage == NULL) {

                "\t\to_sync <= 1\'b0;\n"

                fprintf(stderr, "ERROR: Could not open %s for writing!\n", fname);

                "\telse if (i_ce)\n"

                perror("O/S Err was:");

                "\t\to_sync <= sync_pipe[1];\n\n");

                fprintf(stderr, "Attempting to continue, but this file will be missing.\n");

                return;

        fprintf(fstage,

        fprintf(fp,

"////////////////////////////////////////////////////////////////////////////\n"

"       always @(posedge i_clk)\n"

"       if (i_ce)\n"

"       begin\n"

"               if (!stage)\n"

"               begin\n"

"                       // Clock 1\n"

"                       m_r <= i_r;\n"

"                       m_i <= i_i;\n"

"                       // Clock 3\n"

"                       rnd_r <= sto_r;\n"

"                       rnd_i <= sto_i;\n"

"//\n"

"//\n"

"// Filename:   %sfftstage_%c%d%s.v\n"

"               end else begin\n"

"                       // Clock 2\n"

"                       rnd_r <= m_r + i_r;\n"

"                       rnd_i <= m_i + i_i;\n"

"//\n"

"//\n"

"// Project:    %s\n"

"                       sto_r <= m_r - i_r;\n"

"                       sto_i <= m_i - i_i;\n"

"//\n"

"//\n"

"// Purpose:    This file is (almost) a Verilog source file.  It is meant to\n"

"               end\n"

"//             be used by a FFT core compiler to generate FFTs which may be\n"

"       end\n"

"//             used as part of an FFT core.  Specifically, this file \n"

"\n"

"//             encapsulates the options of an FFT-stage.  For any 2^N length\n"

"       // Now that we have our results, let's round them and report them\n"

"//             FFT, there shall be (N-1) of these stages.  \n"

"       wire    signed  [(OWIDTH-1):0]  o_r, o_i;\n"

"//\n%s"

"\n"

"//\n",

"       convround #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_r(i_clk, i_ce, rnd_r, o_r);\n"

                (inv)?"i":"", (odd)?'o':'e', stage*2, (dbg)?"_dbg":"", prjname, creator);

"       convround #(IWIDTH+1,OWIDTH,SHIFT) do_rnd_i(i_clk, i_ce, rnd_i, o_i);\n"

        fprintf(fstage, "%s", cpyleft);

"\n"

        fprintf(fstage, "//\n//\n`default_nettype\tnone\n//\n");

"       assign  o_val  = { o_r, o_i };\n"

        // Emit the Verilog module declaration.  The module name is built as
        // [i]fftstage_{o|e}<2*stage>[_dbg]: an "i" prefix when inv is set, 'o'
        // or 'e' selected by odd, and a "_dbg" suffix--together with an extra
        // o_dbg port at the end of the port list--when dbg is set.
        fprintf(fstage, "module\t%sfftstage_%c%d%s(i_clk, i_rst, i_ce, i_sync, i_data, o_data, o_sync%s);\n",

                (inv)?"i":"", (odd)?'o':'e', stage*2, (dbg)?"_dbg":"",

                (dbg)?", o_dbg":"");

        // These parameter values are useless at this point--they are to be

        // replaced by the parameter values in the calling program.  Only

        // problem is, the CWIDTH needs to match exactly!

        //
        // The defaults written here are IWIDTH=nbits, CWIDTH=cbits and
        // OWIDTH=nbits+1.
        fprintf(fstage, "\tparameter\tIWIDTH=%d,CWIDTH=%d,OWIDTH=%d;\n",

                nbits, cbits, nbits+1);

        // A second parameter line, written with fixed default values that do
        // not depend on any argument of this function.
        fprintf(fstage,

"\t// Parameters specific to the core that should be changed when this\n"

"\t// core is built ... Note that the minimum LGSPAN (the base two log\n"

"\t// of the span, or the base two log of the current FFT size) is 3.\n"

"\t// Smaller spans (i.e. the span of 2) must use the dblstage module.\n"

"\tparameter\tLGWIDTH=11, LGSPAN=9, LGBDLY=5, BFLYSHIFT=0;\n");

        // Port declarations.  i_data is 2*IWIDTH bits wide and o_data is
        // 2*OWIDTH bits wide; o_data and o_sync are declared as registers.
        fprintf(fstage,

"\tinput                                        i_clk, i_rst, i_ce, i_sync;\n"

"\tinput                [(2*IWIDTH-1):0]        i_data;\n"

"\toutput       reg     [(2*OWIDTH-1):0]        o_data;\n"

"\toutput       reg                             o_sync;\n"

"\n");

        if (dbg) { fprintf(fstage, "\toutput\twire\t[33:0]\t\t\to_dbg;\n"

                "\tassign\to_dbg = { ((o_sync)&&(i_ce)), i_ce, o_data[(2*OWIDTH-1):(2*OWIDTH-16)],\n"

                        "\t\t\t\t\to_data[(OWIDTH-1):(OWIDTH-16)] };\n"

"\n");

"\n");

        // Declare the stage's internal signals: the ib_* registers are the
        // values handed to the butterfly instance, and the ob_* wires are the
        // values it hands back.
        fprintf(fstage,

"\treg  wait_for_sync;\n"

"\treg  [(2*IWIDTH-1):0]        ib_a, ib_b;\n"

"\treg  [(2*CWIDTH-1):0]        ib_c;\n"

"\treg  ib_sync;\n"

"\n"

"\treg  b_started;\n"

"\twire ob_sync;\n"

"\twire [(2*OWIDTH-1):0]\tob_a, ob_b;\n");

        // Declare the coefficient memory ([i]cmem) and load it with $readmemh.
        // The hex file named here, [i]cmem_{o|e}<2*stage>.hex, uses the same
        // name pattern as the coefficient file this function writes further
        // down (stage<<1 equals stage*2).
        fprintf(fstage,

"\n"

"\t// %scmem is defined as an array of real and complex values,\n"

"\t// where the top CWIDTH bits are the real value and the bottom\n"

"\t// CWIDTH bits are the imaginary value.\n"

"\t//\n"

"\t// %scmem[i] = { (2^(CWIDTH-2)) * cos(2*pi*i/(2^LGWIDTH)),\n"

"\t//           (2^(CWIDTH-2)) * sin(2*pi*i/(2^LGWIDTH)) };\n"

"\t//\n"

"\treg  [(2*CWIDTH-1):0]        %scmem [0:((1<<LGSPAN)-1)];\n"

"\tinitial\t$readmemh(\"%scmem_%c%d.hex\",%scmem);\n\n",

                // Three "i" prefixes for the comment and the declaration ...
                (inv)?"i":"", (inv)?"i":"", (inv)?"i":"",

                // ... then the hex file name and the memory being loaded
                (inv)?"i":"", (odd)?'o':'e',stage<<1, (inv)?"i":"");

                FILE    *cmem;

                        char    *memfile, *ptr;

                        memfile = new char[strlen(fname)+128];

                        strcpy(memfile, fname);

                        if ((NULL != (ptr = strrchr(memfile, '/')))&&(ptr>memfile)) {

                                ptr++;

                                sprintf(ptr, "%scmem_%c%d.hex", (inv)?"i":"", (odd)?'o':'e', stage*2);

                        } else {

                                sprintf(memfile, "%s/%scmem_%c%d.hex",

                                        coredir, (inv)?"i":"",

                                        (odd)?'o':'e', stage*2);

                        // strcpy(&memfile[strlen(memfile)-2], ".hex");

                        cmem = fopen(memfile, "w");

                        if (NULL == cmem) {

                                fprintf(stderr, "Could not open/write \'%s\' with FFT coefficients.\n", memfile);

                                perror("Err from O/S:");

                                exit(-2);

                        delete[] memfile;

                // fprintf(cmem, "// CBITS = %d, inv = %s\n", cbits, (inv)?"true":"false");

                for(int i=0; i<stage/2; i++) {

                        int k = 2*i+odd;

                        double  W = ((inv)?1:-1)*2.0*M_PI*k/(double)(2*stage);

                        double  c, s;

                        long long ic, is, vl;

                        c = cos(W); s = sin(W);

                        ic = (long long)llround((1ll<<(cbits-2)) * c);

                        is = (long long)llround((1ll<<(cbits-2)) * s);

                        vl = (ic & (~(-1ll << (cbits))));

                        vl <<= (cbits);

                        vl |= (is & (~(-1ll << (cbits))));

                        fprintf(cmem, "%0*llx\n", ((cbits*2+3)/4), vl);

/*

                        fprintf(cmem, "%0*llx\t\t// %f+j%f -> %llx +j%llx\n",

                                ((cbits*2+3)/4), vl, c, s,

                                ic & (~(-1ll<<(((cbits+3)/4)*4))),

                                is & (~(-1ll<<(((cbits+3)/4)*4))));

*/

                } fclose(cmem);

        fprintf(fstage,

        if (formal_property_flag) {

"\treg  [(LGWIDTH-2):0]         iaddr;\n"

                fprintf(fp,

"\treg  [(2*IWIDTH-1):0]        imem    [0:((1<<LGSPAN)-1)];\n"

        "`ifdef FORMAL\n"

                "\treg  f_past_valid;\n"

                "\tinitial      f_past_valid = 1'b0;\n"

                "\talways @(posedge i_clk)\n"

                "\t     f_past_valid <= 1'b1;\n"

        "\n"

        "`ifdef LASTSTAGE\n"

                "\talways @(posedge i_clk)\n"

                "\t     assume((i_ce)||($past(i_ce))||($past(i_ce,2)));\n"

        "`endif\n"

"\n"

"\n"

"\treg  [LGSPAN:0]              oB;\n"

                "\tinitial      assert(IWIDTH+1 == OWIDTH);\n"

"\treg  [(2*OWIDTH-1):0]        omem    [0:((1<<LGSPAN)-1)];\n"

"\n"

"\n"

"\tinitial wait_for_sync = 1\'b1;\n"

                "\treg  signed  [IWIDTH-1:0]    f_piped_real    [0:3];\n"

"\tinitial iaddr = 0;\n"

                "\treg  signed  [IWIDTH-1:0]    f_piped_imag    [0:3];\n"

"\talways @(posedge i_clk)\n"

"\talways @(posedge i_clk)\n"

        "\t\tif (i_rst)\n"

                "\tif (i_ce)\n"

        "\t\tbegin\n"

                "\tbegin\n"

                "\t\t\twait_for_sync <= 1\'b1;\n"

                "\t     f_piped_real[0] <= i_val[2*IWIDTH-1:IWIDTH];\n"

                "\t\t\tiaddr <= 0;\n"

                "\t     f_piped_imag[0] <= i_val[  IWIDTH-1:0];\n"

        "\t\tend\n"

        "\n"

        "\t\telse if ((i_ce)&&((!wait_for_sync)||(i_sync)))\n"

                "\t     f_piped_real[1] <= f_piped_real[0];\n"

        "\t\tbegin\n"

                "\t     f_piped_imag[1] <= f_piped_imag[0];\n"

                "\t\t\t//\n"

        "\n"

                "\t\t\t// First step: Record what we\'re not ready to use yet\n"

                "\t     f_piped_real[2] <= f_piped_real[1];\n"

                "\t\t\t//\n"

                "\t     f_piped_imag[2] <= f_piped_imag[1];\n"

                "\t\t\tiaddr <= iaddr + { {(LGWIDTH-2){1\'b0}}, 1\'b1 };\n"

        "\n"

                "\t\t\twait_for_sync <= 1\'b0;\n"

                "\t     f_piped_real[3] <= f_piped_real[2];\n"

        "\t\tend\n"

                "\t     f_piped_imag[3] <= f_piped_imag[2];\n"

"\talways @(posedge i_clk) // Need to make certain here that we don\'t read\n"

                "\tend\n"

        "\t\tif ((i_ce)&&(!iaddr[LGSPAN])) // and write the same address on\n"

        "\n"

                "\t\t\timem[iaddr[(LGSPAN-1):0]] <= i_data; // the same clk\n"

                "\twire f_syncd;\n"

        "\n");

                "\treg  f_rsyncd;\n"

        "\n"

        fprintf(fstage,

                "\tinitial      f_rsyncd        = 0;\n"

        "\t//\n"

        "\t// Now, we have all the inputs, so let\'s feed the butterfly\n"

        "\t//\n"

        "\tinitial ib_sync = 1\'b0;\n"

        "\talways\t@(posedge i_clk)\n"

                "\t\tif (i_rst)\n"

                        "\t\t\tib_sync <= 1\'b0;\n"

                "\t\telse if ((i_ce)&&(iaddr[LGSPAN]))\n"

                        "\t\t\tbegin\n"

                                "\t\t\t\t// Set the sync to true on the very first\n"

                                "\t\t\t\t// valid input in, and hence on the very\n"

                                "\t\t\t\t// first valid data out per FFT.\n"

                                "\t\t\t\tib_sync <= (iaddr==(1<<(LGSPAN)));\n"

                        "\t\t\tend\n"

        "\talways\t@(posedge i_clk)\n"

                "\t\tif ((i_ce)&&(iaddr[LGSPAN]))\n"

                "\t\t\tbegin\n"

                        "\t\t\t\t// One input from memory, ...\n"

                        "\t\t\t\tib_a <= imem[iaddr[(LGSPAN-1):0]];\n"

                        "\t\t\t\t// One input clocked in from the top\n"

                        "\t\t\t\tib_b <= i_data;\n"

                        "\t\t\t\t// and the coefficient or twiddle factor\n"

                        "\t\t\t\tib_c <= %scmem[iaddr[(LGSPAN-1):0]];\n"

                "\t\t\tend\n\n", (inv)?"i":"");

        // Instantiate the butterfly for this stage.  With hwmpy set the hwbfly
        // module is instantiated; otherwise the butterfly module is, and its
        // MPYDELAY parameter is written as a sized Verilog decimal constant
        // whose width comes from lgdelay() and whose value comes from
        // bflydelay().  Both instances are named bfly and share one port list.
        if (hwmpy) {
                fprintf(fstage,
        "\thwbfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n"
                        "\t\t\t.SHIFT(BFLYSHIFT))\n"
                "\t\tbfly(i_clk, i_rst, i_ce, ib_c,\n"
                        "\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n");
        } else {
                fprintf(fstage,
        "\tbutterfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n"
                "\t\t\t.MPYDELAY(%d\'d%d),.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n"
        "\t\tbfly(i_clk, i_rst, i_ce, ib_c,\n"
                "\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n",
                        lgdelay(nbits, xtra), bflydelay(nbits, xtra));
        }

        // Emit the logic that tracks the butterfly's outputs.  In the Verilog
        // written here:
        //   - i_rst clears oB, o_sync and b_started;
        //   - on i_ce, o_sync follows ob_sync while oB[LGSPAN] is clear and is
        //     zero otherwise;
        //   - oB counts up by one once ob_sync or b_started is set;
        //   - b_started is set when ob_sync arrives with oB[LGSPAN] clear.
        fprintf(fstage,

        "\t//\n"

        "\t// Next step: recover the outputs from the butterfly\n"

        "\t//\n"

        "\tinitial oB        = 0;\n"

        "\tinitial o_sync    = 0;\n"

        "\tinitial b_started = 0;\n"

        "\talways\t@(posedge i_clk)\n"

        "\t\tif (i_rst)\n"

        "\t\tbegin\n"

                "\t\t\toB <= 0;\n"

                "\t\t\to_sync <= 0;\n"

                "\t\t\tb_started <= 0;\n"

        "\t\tend else if (i_ce)\n"

        "\t\tbegin\n"

        "\t\t\to_sync <= (!oB[LGSPAN])?ob_sync : 1\'b0;\n"

        "\t\t\tif (ob_sync||b_started)\n"

                "\t\t\t\toB <= oB + { {(LGSPAN){1\'b0}}, 1\'b1 };\n"

        "\t\t\tif ((ob_sync)&&(!oB[LGSPAN]))\n"

                "\t\t\t// A butterfly output is available\n"

                        "\t\t\t\tb_started <= 1\'b1;\n"

        "\t\tend\n\n");

        fprintf(fstage,

        "\treg  [(LGSPAN-1):0]\t\tdly_addr;\n"

        "\treg  [(2*OWIDTH-1):0]\tdly_value;\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_ce)\n"

                "\tif (i_reset)\n"

        "\t\tbegin\n"

                "\t     f_rsyncd <= 1'b0;\n"

        "\t\t\tdly_addr <= oB[(LGSPAN-1):0];\n"

                "\telse if (!f_rsyncd)\n"

        "\t\t\tdly_value <= ob_b;\n"

                "\t     f_rsyncd <= o_sync;\n"

        "\t\tend\n"

                "\tassign       f_syncd = (f_rsyncd)||(o_sync);\n"

        "\n"

                "\treg  f_state;\n"

                "\tinitial      f_state = 0;\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_ce)\n"

                "\tif (i_reset)\n"

                "\t\t\tomem[dly_addr] <= dly_value;\n"

                "\t     f_state <= 0;\n"

"\n");

                "\telse if ((i_ce)&&((!wait_for_sync)||(i_sync)))\n"

        fprintf(fstage,

                "\t     f_state <= f_state + 1;\n"

        "\n"

                "\talways @(*)\n"

                "\tif (f_state != 0)\n"

                "\t     assume(!i_sync);\n"

        "\n"

                "\talways @(*)\n"

                "\t     assert(stage == f_state[0]);\n"

        "\n"

        "\talways @(posedge i_clk)\n"

        "\talways @(posedge i_clk)\n"

        "\t\tif (i_ce)\n"

                "\tif ((f_state == 1'b1)&&(f_syncd))\n"

        "\t\t\to_data <= (!oB[LGSPAN])?ob_a : omem[oB[(LGSPAN-1):0]];\n"

                "\tbegin\n"

"\n");

                "\t     assert(o_r == f_piped_real[2] + f_piped_real[1]);\n"

        fprintf(fstage, "endmodule\n");

                "\t     assert(o_i == f_piped_imag[2] + f_piped_imag[1]);\n"

                "\tend\n"

        "\n"

                "\talways @(posedge i_clk)\n"

                "\tif ((f_state == 1'b0)&&(f_syncd))\n"

                "\tbegin\n"

                "\t     assert(!o_sync);\n"

                "\t     assert(o_r == f_piped_real[3] - f_piped_real[2]);\n"

                "\t     assert(o_i == f_piped_imag[3] - f_piped_imag[2]);\n"

                "\tend\n"

        "\n"

                "\talways @(*)\n"

                "\tif (wait_for_sync)\n"

                "\tbegin\n"

                "\t     assert(!f_rsyncd);\n"

                "\t     assert(!o_sync);\n"

                "\t     assert(f_state == 0);\n"

                "\tend\n\n");

        fprintf(fp,

"`endif // FORMAL\n"

"endmodule\n");

        fclose(fp);

void    usage(void) {

void    usage(void) {

        fprintf(stderr,

        fprintf(stderr,

"USAGE:\tfftgen [-f <size>] [-d dir] [-c cbits] [-n nbits] [-m mxbits] [-s]\n"

"USAGE:\tfftgen [-f <size>] [-d dir] [-c cbits] [-n nbits] [-m mxbits] [-s]\n"

// "\tfftgen -i\n"

// "\tfftgen -i\n"

"\t-1\tBuild a normal FFT, running at one clock per complex sample, or (for\n"

"\t-1\tBuild a normal FFT, running at one clock per complex sample, or\n"

"\t\ta real FFT) at one clock per two real input samples.\n"

"\t\t(for a real FFT) at one clock per two real input samples.\n"

"\t-a <hdrname>  Create a header of information describing the built-in\n"

"\t\tparameters, useful for module-level testing with Verilator\n"

"\t-c <cbits>\tCauses all internal complex coefficients to be\n"

"\t-c <cbits>\tCauses all internal complex coefficients to be\n"

"\t\tlonger than the corresponding data bits, to help avoid\n"

"\t\tlonger than the corresponding data bits, to help avoid\n"

"\t\tcoefficient truncation errors.  The default is %d bits longer\n"

"\t\tcoefficient truncation errors.  The default is %d bits longer\n"

"\t\tthan the data bits.\n"

"\t\tthan the data bits.\n"

"\t-d <dir>\tPlaces all of the generated verilog files into <dir>.\n"

"\t-d <dir>  Places all of the generated verilog files into <dir>.\n"

"\t\tThe default is a subdirectory of the current directory named %s.\n"

"\t\tThe default is a subdirectory of the current directory\n"

"\t-f <size>\tSets the size of the FFT as the number of complex\n"

"\t\tnamed %s.\n"

"\t-f <size>  Sets the size of the FFT as the number of complex\n"

"\t\tsamples input to the transform.  (No default value, this is\n"

"\t\tsamples input to the transform.  (No default value, this is\n"

"\t\ta required parameter.)\n"

"\t\ta required parameter.)\n"

"\t-i\tAn inverse FFT, meaning that the coefficients are\n"

"\t-i\tAn inverse FFT, meaning that the coefficients are\n"

"\t\tgiven by e^{ j 2 pi k/N n }.  The default is a forward FFT, with\n"

"\t\tgiven by e^{ j 2 pi k/N n }.  The default is a forward FFT, with\n"

"\t\tcoefficients given by e^{ -j 2 pi k/N n }.\n"

"\t\tcoefficients given by e^{ -j 2 pi k/N n }.\n"

"\t-k #\tSets # clocks per sample, used to minimize multiplies.  Also\n"

"\t\tsets one sample in per i_ce clock (opt -1)\n"

"\t-m <mxbits>\tSets the maximum bit width that the FFT should ever\n"

"\t-m <mxbits>\tSets the maximum bit width that the FFT should ever\n"

"\t\tproduce.  Internal values greater than this value will be\n"

"\t\tproduce.  Internal values greater than this value will be\n"

"\t\ttruncated to this value.  (The default value grows the input\n"

"\t\ttruncated to this value.  (The default value grows the input\n"

"\t\tsize by one bit for every two FFT stages.)\n"

"\t\tsize by one bit for every two FFT stages.)\n"

"\t-n <nbits>\tSets the bitwidth for values coming into the (i)FFT.\n"

"\t-n <nbits>\tSets the bitwidth for values coming into the (i)FFT.\n"

"\t\tThe default is %d bits input for each component of the two\n"

"\t\tThe default is %d bits input for each component of the two\n"

"\t\tcomplex values into the FFT.\n"

"\t\tcomplex values into the FFT.\n"

"\t-p <nmpy>\tSets the number of stages that will use any hardware \n"

"\t-p <nmpy>  Sets the number of hardware multiplies (DSPs) to use, versus\n"

"\t\tmultiplication facility, instead of shift-add emulation.\n"

"\t\tshift-add emulation.  The default is not to use any hardware\n"

"\t\tThree multiplies per butterfly, or six multiplies per stage will\n"

"\t\tmultipliers.\n"

"\t\tbe accelerated in this fashion.  The default is not to use any\n"

"\t\thardware multipliers.\n"

"\t-r\tBuild a real-FFT at four input points per sample, rather than a\n"

"\t-r\tBuild a real-FFT at four input points per sample, rather than a\n"

"\t\tcomplex FFT.  (Default is a Complex FFT.)\n"

"\t\tcomplex FFT.  (Default is a Complex FFT.)\n"

"\t-s\tSkip the final bit reversal stage.  This is useful in\n"

"\t-s\tSkip the final bit reversal stage.  This is useful in\n"

"\t\talgorithms that need to apply a filter without needing to do\n"

"\t\talgorithms that need to apply a filter without needing to do\n"

"\t\tbin shifting, as these algorithms can, with this option, just\n"

"\t\tbin shifting, as these algorithms can, with this option, just\n"

Line 2284...

Line 999...

"\t\tinverse FFT the (still bit reversed) result.  (You would need\n"

"\t\tinverse FFT the (still bit reversed) result.  (You would need\n"

"\t\ta decimation in time inverse to do this, which this program does\n"

"\t\ta decimation in time inverse to do this, which this program does\n"

"\t\tnot yet provide.)\n"

"\t\tnot yet provide.)\n"

"\t-S\tInclude the final bit reversal stage (default).\n"

"\t-S\tInclude the final bit reversal stage (default).\n"

"\t-x <xtrabits>\tUse this many extra bits internally, before any final\n"

"\t-x <xtrabits>\tUse this many extra bits internally, before any final\n"

"\t\trounding or truncation of the answer to the final number of bits.\n"

"\t\trounding or truncation of the answer to the final number of\n"

"\t\tThe default is to use %d extra bits internally.\n",

"\t\tbits.  The default is to use %d extra bits internally.\n",

/*

/*

"\t-0\tA forward FFT (default), meaning that the coefficients are\n"

"\t-0\tA forward FFT (default), meaning that the coefficients are\n"

"\t\tgiven by e^{-j 2 pi k/N n }.\n"

"\t\tgiven by e^{-j 2 pi k/N n }.\n"

"\t-1\tAn inverse FFT, meaning that the coefficients are\n"

"\t-1\tAn inverse FFT, meaning that the coefficients are\n"

"\t\tgiven by e^{ j 2 pi k/N n }.\n",

"\t\tgiven by e^{ j 2 pi k/N n }.\n",

Line 2300...

Line 1015...

// Features still needed:

// Features still needed:

//      Interactivity.

//      Interactivity.

int main(int argc, char **argv) {

int main(int argc, char **argv) {

        int     fftsize = -1, lgsize = -1;

        int     fftsize = -1, lgsize = -1;

        int     nbitsin = DEF_NBITSIN, xtracbits = DEF_XTRACBITS,

        int     nbitsin = DEF_NBITSIN, xtracbits = DEF_XTRACBITS,

                        nummpy=DEF_NMPY, nonmpy=2;

                        nummpy=DEF_NMPY, nmpypstage=6, mpy_stages;

        int     nbitsout, maxbitsout = -1, xtrapbits=DEF_XTRAPBITS;

        int     nbitsout, maxbitsout = -1, xtrapbits=DEF_XTRAPBITS, ckpce = 0;

        const char *EMPTYSTR = "";

        bool    bitreverse = true, inverse=false,

        bool    bitreverse = true, inverse=false,

                verbose_flag = false, single_clock = false,

                verbose_flag = false,

                real_fft = false;

                single_clock = false,

                real_fft = false,

                async_reset = false;

        FILE    *vmain;

        FILE    *vmain;

        std::string     coredir = DEF_COREDIR, cmdline = "", hdrname = "";

        std::string     coredir = DEF_COREDIR, cmdline = "", hdrname = "";

        ROUND_T rounding = RND_CONVERGENT;

        ROUND_T rounding = RND_CONVERGENT;

        // ROUND_T      rounding = RND_HALFUP;

        // ROUND_T      rounding = RND_HALFUP;

Line 2316...

Line 1034...

        int     dbgstage = 128;

        int     dbgstage = 128;

        if (argc <= 1)

        if (argc <= 1)

                usage();

                usage();

        // Copy the original command line before we mess with it

        cmdline = argv[0];

        cmdline = argv[0];

        for(int argn=1; argn<argc; argn++) {

        for(int argn=1; argn<argc; argn++) {

                cmdline += " ";

                cmdline += " ";

                cmdline += argv[argn];

                cmdline += argv[argn];

        for(int argn=1; argn<argc; argn++) {

        { int c;

                if ('-' == argv[argn][0]) {

        while((c = getopt(argc, argv, "12Aa:c:d:D:f:hik:m:n:p:rsSx:v")) != -1) {

                        for(int j=1; (argv[argn][j])&&(j<100); j++) {

                switch(c) {

                                switch(argv[argn][j]) {

                case '1':       single_clock = true;  break;

/*

                case '2':       single_clock = false; break;

                                        case '0':

                case 'A':       async_reset  = true;  break;

                                                inverse = false;

                case 'a':       hdrname = strdup(optarg);       break;

                                                break;

                case 'c':       xtracbits = atoi(optarg);       break;

*/

                case 'd':       coredir = std::string(optarg);  break;

                                        case '1':

                case 'D':       dbgstage = atoi(optarg);        break;

                                                single_clock = true;

                case 'f':       fftsize = atoi(optarg);

                                                break;

                                { int sln = strlen(optarg);

                                        case 'a':

                                if (!isdigit(optarg[sln-1])){

                                                if (argn+1 >= argc) {

                                        switch(optarg[sln-1]) {

                                                        printf("ERR: No header filename given\n\n");

                                                        usage(); exit(-1);

                                                hdrname = argv[++argn];

                                                j+= 200;

                                                break;

                                        case 'c':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No extra number of coefficient bits given!\n\n");

                                                        usage(); exit(-1);

                                                xtracbits = atoi(argv[++argn]);

                                                j+= 200;

                                                break;

                                        case 'd':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No directory given into which to place the core!\n\n");

                                                        usage(); exit(-1);

                                                coredir = argv[++argn];

                                                j += 200;

                                                break;

                                        case 'D':

                                                dbg = true;

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No debug stage number given!\n\n");

                                                        usage(); exit(-1);

                                                dbgstage = atoi(argv[++argn]);

                                                j+= 200;

                                                break;

                                        case 'f':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No FFT Size given!\n\n");

                                                        usage(); exit(-1);

                                                fftsize = atoi(argv[++argn]);

                                                { int sln = strlen(argv[argn]);

                                                if (!isdigit(argv[argn][sln-1])){

                                                        switch(argv[argn][sln-1]) {

                                                        case 'k': case 'K':

                                                        case 'k': case 'K':

                                                                fftsize <<= 10;

                                                                fftsize <<= 10;

                                                                break;

                                                                break;

                                                        case 'm': case 'M':

                                                        case 'm': case 'M':

                                                                fftsize <<= 20;

                                                                fftsize <<= 20;

                                                                break;

                                                                break;

                                                        case 'g': case 'G':

                                                        case 'g': case 'G':

                                                                fftsize <<= 30;

                                                                fftsize <<= 30;

                                                                break;

                                                                break;

                                                        default:

                                                        default:

                                                                printf("ERR: Unknown FFT size, %s!\n", argv[argn]);

                                                printf("ERR: Unknown FFT size, %s!\n", optarg);

                                                                exit(-1);

                                                exit(EXIT_FAILURE);

}}

                                                j += 200;

                                                break;

                                        case 'h':

                                                usage();

                                                exit(0);

                                                break;

                                        case 'i':

                                                inverse = true;

                                                break;

                                        case 'm':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No maximum output bit value given!\n\n");

                                                        exit(-1);

                                                maxbitsout = atoi(argv[++argn]);

                                }} break;

                                                j += 200;

                case 'h':       usage(); exit(EXIT_SUCCESS);    break;

                                                break;

                case 'i':       inverse = true;                 break;

                                        case 'n':

                case 'k':       ckpce = atoi(optarg);

                                                if (argn+1 >= argc) {

                                single_clock = true;

                                                        printf("ERR: No input bit size given!\n\n");

                                                        exit(-1);

                                                nbitsin = atoi(argv[++argn]);

                                                j += 200;

                                                break;

                                        case 'p':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No number given for number of hardware multiply stages!\n\n");

                                                        exit(-1);

                                                nummpy = atoi(argv[++argn]);

                                                j += 200;

                                                break;

                                        case 'r':

                                                real_fft = true;

                                                break;

                                        case 'S':

                                                bitreverse = true;

                                                break;

                                        case 's':

                                                bitreverse = false;

                                                break;

                                        case 'x':

                                                if (argn+1 >= argc) {

                                                        printf("ERR: No extra number of bits given!\n\n");

                                                        usage(); exit(-1);

                                                } j+= 200;

                                                xtrapbits = atoi(argv[++argn]);

                                                break;

                                        case 'v':

                                                verbose_flag = true;

                                                break;

                                                break;

                case 'm':       maxbitsout = atoi(optarg);      break;

                case 'n':       nbitsin = atoi(optarg);         break;

                case 'p':       nummpy = atoi(optarg);          break;

                case 'r':       real_fft = true;                break;

                case 'S':       bitreverse = true;              break;

                case 's':       bitreverse = false;             break;

                case 'x':       xtrapbits = atoi(optarg);       break;

                case 'v':       verbose_flag = true;            break;

                // case 'z':    variable_size = true;           break;

                                        default:

                                        default:

                                                printf("Unknown argument, -%c\n", argv[argn][j]);

                        printf("Unknown argument, -%c\n", c);

                                                usage();

                                                exit(-1);

                } else {

                        printf("Unrecognized argument, %s\n", argv[argn]);

                        usage();

                        usage();

                        exit(-1);

                        exit(EXIT_FAILURE);

}}

        if (verbose_flag) {

                if (inverse)

                        printf("Building a %d point inverse FFT module, with %s outputs\n",

                                fftsize,

                                (real_fft)?"real ":"complex");

                else

                        printf("Building a %d point %sforward FFT module\n",

                                fftsize,

                                (real_fft)?"real ":"");

                if (!single_clock)

                        printf("  that accepts two inputs per clock\n");

                if (async_reset)

                        printf("  using a negative logic ASYNC reset\n");

                printf("The core will be placed into the %s/ directory\n", coredir.c_str());

                if (hdrname[0])

                        printf("A C header file, %s, will be written capturing these\n"

                                "options for a Verilator testbench\n",

                                        hdrname.c_str());

                // nummpy

                // xtrapbits

        if (real_fft) {

        if (real_fft) {

                printf("The real FFT option is not implemented yet, but still on\nmy to do list.  Please try again later.\n");

                printf("The real FFT option is not implemented yet, but still on\nmy to do list.  Please try again later.\n");

                exit(0);

                exit(EXIT_FAILURE);

        } if (single_clock) {

                printf("The single clock FFT option is not implemented yet, but still on\nmy to do list.  Please try again later.\n");

                exit(0);

        if (ckpce < 1)

        } if (!bitreverse) {

                ckpce = 1;

        if (!bitreverse) {

                printf("WARNING: While I can skip the bit reverse stage, the code to do\n");

                printf("WARNING: While I can skip the bit reverse stage, the code to do\n");

                printf("an inverse FFT on a bit--reversed input has not yet been\n");

                printf("an inverse FFT on a bit--reversed input has not yet been\n");

                printf("built.\n");

                printf("built.\n");

Line 2474...

Line 1134...

        if ((fftsize <= 0)||(nbitsin < 1)||(nbitsin>48)) {

        if ((fftsize <= 0)||(nbitsin < 1)||(nbitsin>48)) {

                printf("INVALID PARAMETERS!!!!\n");

                printf("INVALID PARAMETERS!!!!\n");

                exit(-1);

                exit(EXIT_FAILURE);

        if (nextlg(fftsize) != fftsize) {

        if (nextlg(fftsize) != fftsize) {

                fprintf(stderr, "ERR: FFTSize (%d) *must* be a power of two\n",

                fprintf(stderr, "ERR: FFTSize (%d) *must* be a power of two\n",

                                fftsize);

                                fftsize);

                exit(-1);

                exit(EXIT_FAILURE);

        } else if (fftsize < 2) {

        } else if (fftsize < 2) {

                fprintf(stderr, "ERR: Minimum FFTSize is 2, not %d\n",

                fprintf(stderr, "ERR: Minimum FFTSize is 2, not %d\n",

                                fftsize);

                                fftsize);

                if (fftsize == 1) {

                if (fftsize == 1) {

                        fprintf(stderr, "You do realize that a 1 point FFT makes very little sense\n");

                        fprintf(stderr, "You do realize that a 1 point FFT makes very little sense\n");

Line 2494...

Line 1154...

                        fprintf(stderr, "can be connected straight to the input.\n");

                        fprintf(stderr, "can be connected straight to the input.\n");

                } else {

                } else {

                        fprintf(stderr, "Indeed, a size of %d doesn\'t make much sense to me at all.\n", fftsize);

                        fprintf(stderr, "Indeed, a size of %d doesn\'t make much sense to me at all.\n", fftsize);

                        fprintf(stderr, "Is such an operation even defined?\n");

                        fprintf(stderr, "Is such an operation even defined?\n");

                exit(-1);

                exit(EXIT_FAILURE);

        // Calculate how many output bits we'll have, and what the log

        // Calculate how many output bits we'll have, and what the log

        // based two size of our FFT is.

        // based two size of our FFT is.

Line 2520...

Line 1180...

                if (fftsize <= 2)

                if (fftsize <= 2)

                        bitreverse = false;

                        bitreverse = false;

        } if ((maxbitsout > 0)&&(nbitsout > maxbitsout))

        } if ((maxbitsout > 0)&&(nbitsout > maxbitsout))

                nbitsout = maxbitsout;

                nbitsout = maxbitsout;

        if (verbose_flag) {

                printf("Output samples will be %d bits wide\n", nbitsout);

                printf("This %sFFT will take %d-bit samples in, and produce %d samples out\n", (inverse)?"i":"", nbitsin, nbitsout);

                if (maxbitsout > 0)

                        printf("  Internally, it will allow items to accumulate to %d bits\n", maxbitsout);

                printf("  Twiddle-factors of %d bits will be used\n",

                        nbitsin+xtracbits);

                if (!bitreverse)

                printf("  The output will be left in bit-reversed order\n");

        // Figure out how many multiply stages to use, and how many to skip

        // Figure out how many multiply stages to use, and how many to skip

        if (!single_clock) {

                int     lgv = lgval(fftsize);

                nmpypstage = 6;

        } else if (ckpce <= 1) {

                nmpypstage = 3;

        } else if (ckpce == 2) {

                nmpypstage = 2;

        } else

                nmpypstage = 1;

                nonmpy = lgv - nummpy;

        mpy_stages = nummpy / nmpypstage;

                if (nonmpy < 2) nonmpy = 2;

        if (mpy_stages > lgval(fftsize)-2)

                nummpy = lgv - nonmpy;

                mpy_stages = lgval(fftsize)-2;

                struct stat     sbuf;

                struct stat     sbuf;

                if (lstat(coredir.c_str(), &sbuf)==0) {

                if (lstat(coredir.c_str(), &sbuf)==0) {

                        if (!S_ISDIR(sbuf.st_mode)) {

                        if (!S_ISDIR(sbuf.st_mode)) {

                                fprintf(stderr, "\'%s\' already exists, and is not a directory!\n", coredir.c_str());

                                fprintf(stderr, "\'%s\' already exists, and is not a directory!\n", coredir.c_str());

                                fprintf(stderr, "I will stop now, lest I overwrite something you care about.\n");

                                fprintf(stderr, "I will stop now, lest I overwrite something you care about.\n");

                                fprintf(stderr, "To try again, please remove this file.\n");

                                fprintf(stderr, "To try again, please remove this file.\n");

                                exit(-1);

                                exit(EXIT_FAILURE);

                } else

                } else

                        mkdir(coredir.c_str(), 0755);

                        mkdir(coredir.c_str(), 0755);

                if (access(coredir.c_str(), X_OK|W_OK) != 0) {

                if (access(coredir.c_str(), X_OK|W_OK) != 0) {

                        fprintf(stderr, "I have no access to the directory \'%s\'.\n", coredir.c_str());

                        fprintf(stderr, "I have no access to the directory \'%s\'.\n", coredir.c_str());

                        exit(-1);

                        exit(EXIT_FAILURE);

        if (hdrname.length() > 0) {

        if (hdrname.length() > 0) {

                FILE    *hdr = fopen(hdrname.c_str(), "w");

                FILE    *hdr = fopen(hdrname.c_str(), "w");

                if (hdr == NULL) {

                if (hdr == NULL) {

                        fprintf(stderr, "ERROR: Cannot open %s to create header file\n", hdrname.c_str());

                        fprintf(stderr, "ERROR: Cannot open %s to create header file\n", hdrname.c_str());

                        perror("O/S Err:");

                        perror("O/S Err:");

                        exit(-2);

                        exit(EXIT_FAILURE);

                fprintf(hdr, "/////////////////////////////////////////////////////////////////////////////\n");

                fprintf(hdr,

                fprintf(hdr, "//\n");

SLASHLINE

                fprintf(hdr, "// Filename:      %s\n", hdrname.c_str());

"//\n"

                fprintf(hdr, "//\n");

"// Filename:\t%s\n"

                fprintf(hdr, "// Project:       %s\n", prjname);

"//\n"

                fprintf(hdr, "//\n");

"// Project:\t%s\n"

                fprintf(hdr, "// Purpose:       This simple header file captures the internal constants\n");

"//\n"

                fprintf(hdr, "//                within the FFT that were used to build it, for the purpose\n");

"// Purpose:    This simple header file captures the internal constants\n"

                fprintf(hdr, "//                of making C++ integration (and test bench testing) simpler.  That\n");

"//             within the FFT that were used to build it, for the purpose\n"

                fprintf(hdr, "//                is, should the FFT change size, this will note that size change\n");

"//     of making C++ integration (and test bench testing) simpler.  That is,\n"

                fprintf(hdr, "//                and thus any test bench or other C++ program dependent upon\n");

"//     should the FFT change size, this will note that size change and thus\n"

                fprintf(hdr, "//                either the size of the FFT, the number of bits in or out of\n");

"//     any test bench or other C++ program dependent upon either the size of\n"

                fprintf(hdr, "//                it, etc., can pick up the changes in the defines found within\n");

"//     the FFT, the number of bits in or out of it, etc., can pick up the\n"

                fprintf(hdr, "//                this file.\n");

"//     changes in the defines found within this file.\n"

                fprintf(hdr, "//\n");

"//\n",

                hdrname.c_str(), prjname);

                fprintf(hdr, "%s", creator);

                fprintf(hdr, "%s", creator);

                fprintf(hdr, "//\n");

                fprintf(hdr, "//\n");

                fprintf(hdr, "%s", cpyleft);

                fprintf(hdr, "%s", cpyleft);

                fprintf(hdr, "//\n"

                fprintf(hdr, "//\n"

                "//\n"

                "//\n"

Line 2586...

Line 1263...

                        (inverse)?"I":"", (inverse)?"I":"",

                        (inverse)?"I":"", (inverse)?"I":"",

                        (inverse)?"I":"", nbitsin,

                        (inverse)?"I":"", nbitsin,

                        (inverse)?"I":"", nbitsout,

                        (inverse)?"I":"", nbitsout,

                        (inverse)?"I":"", lgsize,

                        (inverse)?"I":"", lgsize,

                        (inverse)?"I":"", (inverse)?"I":"");

                        (inverse)?"I":"", (inverse)?"I":"");

                if (ckpce > 0)

                        fprintf(hdr, "#define\t%sFFT_CKPCE\t%d\t// Clocks per CE\n",

                                (inverse)?"I":"", ckpce);

                else

                        fprintf(hdr, "// Two samples per i_ce\n");

                if (!bitreverse)

                if (!bitreverse)

                        fprintf(hdr, "#define\t%sFFT_SKIPS_BIT_REVERSE\n",

                        fprintf(hdr, "#define\t%sFFT_SKIPS_BIT_REVERSE\n",

                                (inverse)?"I":"");

                                (inverse)?"I":"");

                if (real_fft)

                if (real_fft)

                        fprintf(hdr, "#define\tRL%sFFT\n\n", (inverse)?"I":"");

                        fprintf(hdr, "#define\tRL%sFFT\n\n", (inverse)?"I":"");

                if (!single_clock)

                if (!single_clock)

                        fprintf(hdr, "#define\tDBLCLK%sFFT\n\n", (inverse)?"I":"");

                        fprintf(hdr, "#define\tDBLCLK%sFFT\n\n", (inverse)?"I":"");

                else

                        fprintf(hdr, "// #define\tDBLCLK%sFFT // this FFT takes one input sample per clock\n\n", (inverse)?"I":"");

                if (USE_OLD_MULTIPLY)

                if (USE_OLD_MULTIPLY)

                        fprintf(hdr, "#define\tUSE_OLD_MULTIPLY\n\n");

                        fprintf(hdr, "#define\tUSE_OLD_MULTIPLY\n\n");

                fprintf(hdr, "// Parameters for testing the longbimpy\n");

                fprintf(hdr, "// Parameters for testing the longbimpy\n");

                fprintf(hdr, "#define\tTST_LONGBIMPY_AW\t%d\n", TST_LONGBIMPY_AW);

                fprintf(hdr, "#define\tTST_LONGBIMPY_AW\t%d\n", TST_LONGBIMPY_AW);

Line 2648...

Line 1332...

                vmain = fopen(fname_string.c_str(), "w");

                vmain = fopen(fname_string.c_str(), "w");

                if (NULL == vmain) {

                if (NULL == vmain) {

                        fprintf(stderr, "Could not open \'%s\' for writing\n", fname_string.c_str());

                        fprintf(stderr, "Could not open \'%s\' for writing\n", fname_string.c_str());

                        perror("Err from O/S:");

                        perror("Err from O/S:");

                        exit(-1);

                        exit(EXIT_FAILURE);

                if (verbose_flag)

                        printf("Opened %s\n", fname_string.c_str());

        fprintf(vmain, "/////////////////////////////////////////////////////////////////////////////\n");

        fprintf(vmain,

        fprintf(vmain, "//\n");

SLASHLINE

        fprintf(vmain, "// Filename:    %sfftmain.v\n", (inverse)?"i":"");

"//\n"

        fprintf(vmain, "//\n");

"// Filename:\t%sfftmain.v\n"

        fprintf(vmain, "// Project:     %s\n", prjname);

"//\n"

        fprintf(vmain, "//\n");

"// Project:    %s\n"

        fprintf(vmain, "// Purpose:     This is the main module in the Doubletime FPGA FFT project.\n");

"//\n"

        fprintf(vmain, "//              As such, all other modules are subordinate to this one.\n");

"// Purpose:    This is the main module in the General Purpose FPGA FFT\n"

        fprintf(vmain, "//              (I have been reading too much legalese this week ...)\n");

"//             implementation.  As such, all other modules are subordinate\n"

        fprintf(vmain, "//              This module accomplish a fixed size Complex FFT on %d data\n", fftsize);

"//     to this one.  This module accomplish a fixed size Complex FFT on\n"

        fprintf(vmain, "//              points.  The FFT is fully pipelined, and accepts as inputs\n");

"//     %d data points.\n",

        fprintf(vmain, "//              two complex two\'s complement samples per clock.\n");

                (inverse)?"i":"",prjname, fftsize);

        fprintf(vmain, "//\n");

        if (single_clock) {

        fprintf(vmain, "// Parameters:\n");

        fprintf(vmain,

        fprintf(vmain, "//      i_clk\tThe clock.  All operations are synchronous with this clock.\n");

"//     The FFT is fully pipelined, and accepts as inputs one complex two\'s\n"

        fprintf(vmain, "//\ti_rst\tSynchronous reset, active high.  Setting this line will\n");

"//     complement sample per clock.\n");

        fprintf(vmain, "//\t\t\tforce the reset of all of the internals to this routine.\n");

        } else {

        fprintf(vmain, "//\t\t\tFurther, following a reset, the o_sync line will go\n");

        fprintf(vmain,

        fprintf(vmain, "//\t\t\thigh the same time the first output sample is valid.\n");

"//     The FFT is fully pipelined, and accepts as inputs two complex two\'s\n"

        fprintf(vmain, "//\ti_ce\tA clock enable line.  If this line is set, this module\n");

"//     complement samples per clock.\n");

        fprintf(vmain, "//\t\t\twill accept two complex values as inputs, and produce\n");

        fprintf(vmain, "//\t\t\ttwo (possibly empty) complex values as outputs.\n");

        fprintf(vmain, "//\ti_left\tThe first of two complex input samples.  This value is split\n");

        fprintf(vmain,

        fprintf(vmain, "//\t\t\tinto two two\'s complement numbers, %d bits each, with\n", nbitsin);

"//\n"

        fprintf(vmain, "//\t\t\tthe real portion in the high order bits, and the\n");

"// Parameters:\n"

        fprintf(vmain, "//\t\t\timaginary portion taking the bottom %d bits.\n", nbitsin);

"//     i_clk\tThe clock.  All operations are synchronous with this clock.\n"

        fprintf(vmain, "//\ti_right\tThis is the same thing as i_left, only this is the second of\n");

"//     i_%sreset%s\tSynchronous reset, active high.  Setting this line will\n"

        fprintf(vmain, "//\t\t\ttwo such samples.  Hence, i_left would contain input\n");

"//     \t\tforce the reset of all of the internals to this routine.\n"

        fprintf(vmain, "//\t\t\tsample zero, i_right would contain sample one.  On the\n");

"//     \t\tFurther, following a reset, the o_sync line will go\n"

        fprintf(vmain, "//\t\t\tnext clock i_left would contain input sample two,\n");

"//     \t\thigh the same time the first output sample is valid.\n",

        fprintf(vmain, "//\t\t\ti_right number three and so forth.\n");

                (async_reset)?"a":"", (async_reset)?"_n":"");

        fprintf(vmain, "//\to_left\tThe first of two output samples, of the same format as i_left,\n");

        if (single_clock) {

        fprintf(vmain, "//\t\t\tonly having %d bits for each of the real and imaginary\n", nbitsout);

                fprintf(vmain,

        fprintf(vmain, "//\t\t\tcomponents, leading to %d bits total.\n", nbitsout*2);

"//     i_ce\tA clock enable line.  If this line is set, this module\n"

        fprintf(vmain, "//\to_right\tThe second of two output samples produced each clock.  This has\n");

"//     \t\twill accept one complex input value, and produce\n"

        fprintf(vmain, "//\t\t\tthe same format as o_left.\n");

"//     \t\tone (possibly empty) complex output value.\n"

        fprintf(vmain, "//\to_sync\tA one bit output indicating the first valid sample produced by\n");

"//     i_sample\tThe complex input sample.  This value is split\n"

        fprintf(vmain, "//\t\t\tthis FFT following a reset.  Ever after, this will\n");

"//     \t\tinto two two\'s complement numbers, %d bits each, with\n"

        fprintf(vmain, "//\t\t\tindicate the first sample of an FFT frame.\n");

"//     \t\tthe real portion in the high order bits, and the\n"

        fprintf(vmain, "//\n");

"//     \t\timaginary portion taking the bottom %d bits.\n"

        fprintf(vmain, "// Arguments:\tThis file was computer generated using the\n");

"//     o_result\tThe output result, of the same format as i_sample,\n"

        fprintf(vmain, "//\t\tfollowing command line:\n");

"//     \t\tonly having %d bits for each of the real and imaginary\n"

        fprintf(vmain, "//\n");

"//     \t\tcomponents, leading to %d bits total.\n"

"//     o_sync\tA one bit output indicating the first sample of the FFT frame.\n"

"//     \t\tIt also indicates the first valid sample out of the FFT\n"

"//     \t\ton the first frame.\n", nbitsin, nbitsin, nbitsout, nbitsout*2);

        } else {

                fprintf(vmain,

"//     i_ce\tA clock enable line.  If this line is set, this module\n"

"//     \t\twill accept two complex values as inputs, and produce\n"

"//     \t\ttwo (possibly empty) complex values as outputs.\n"

"//     i_left\tThe first of two complex input samples.  This value is split\n"

"//     \t\tinto two two\'s complement numbers, %d bits each, with\n"

"//     \t\tthe real portion in the high order bits, and the\n"

"//     \t\timaginary portion taking the bottom %d bits.\n"

"//     i_right\tThis is the same thing as i_left, only this is the second of\n"

"//     \t\ttwo such samples.  Hence, i_left would contain input\n"

"//     \t\tsample zero, i_right would contain sample one.  On the\n"

"//     \t\tnext clock i_left would contain input sample two,\n"

"//     \t\ti_right number three and so forth.\n"

"//     o_left\tThe first of two output samples, of the same format as i_left,\n"

"//     \t\tonly having %d bits for each of the real and imaginary\n"

"//     \t\tcomponents, leading to %d bits total.\n"

"//     o_right\tThe second of two output samples produced each clock.  This has\n"

"//     \t\tthe same format as o_left.\n"

"//     o_sync\tA one bit output indicating the first valid sample produced by\n"

"//     \t\tthis FFT following a reset.  Ever after, this will\n"

"//     \t\tindicate the first sample of an FFT frame.\n",

        nbitsin, nbitsin, nbitsout, nbitsout*2);

        fprintf(vmain,

"//\n"

"// Arguments:\tThis file was computer generated using the following command\n"

"//\t\tline:\n"

"//\n");

        fprintf(vmain, "//\t\t%% %s\n", cmdline.c_str());

        fprintf(vmain, "//\t\t%% %s\n", cmdline.c_str());

        fprintf(vmain, "//\n");

        fprintf(vmain, "//\n");

        fprintf(vmain, "%s", creator);

        fprintf(vmain, "%s", creator);

        fprintf(vmain, "//\n");

        fprintf(vmain, "//\n");

        fprintf(vmain, "%s", cpyleft);

        fprintf(vmain, "%s", cpyleft);

        fprintf(vmain, "//\n//\n`default_nettype\tnone\n//\n");

        fprintf(vmain, "//\n//\n`default_nettype\tnone\n//\n");

        std::string     resetw("i_reset");

        if (async_reset)

                resetw = "i_areset_n";

        fprintf(vmain, "//\n");

        fprintf(vmain, "//\n");

        fprintf(vmain, "//\n");

        fprintf(vmain, "//\n");

        fprintf(vmain, "module %sfftmain(i_clk, i_rst, i_ce,\n", (inverse)?"i":"");

        fprintf(vmain, "module %sfftmain(i_clk, %s, i_ce,\n",

                (inverse)?"i":"", resetw.c_str());

        if (single_clock) {

                fprintf(vmain, "\t\ti_sample, o_result, o_sync%s);\n",

                        (dbg)?", o_dbg":"");

        } else {

        fprintf(vmain, "\t\ti_left, i_right,\n");

        fprintf(vmain, "\t\ti_left, i_right,\n");

        fprintf(vmain, "\t\to_left, o_right, o_sync%s);\n",

        fprintf(vmain, "\t\to_left, o_right, o_sync%s);\n",

                        (dbg)?", o_dbg":"");

                        (dbg)?", o_dbg":"");

        fprintf(vmain, "\tparameter\tIWIDTH=%d, OWIDTH=%d, LGWIDTH=%d;\n", nbitsin, nbitsout, lgsize);

        fprintf(vmain, "\tparameter\tIWIDTH=%d, OWIDTH=%d, LGWIDTH=%d;\n\t//\n", nbitsin, nbitsout, lgsize);

        assert(lgsize > 0);

        assert(lgsize > 0);

        fprintf(vmain, "\tinput\t\ti_clk, i_rst, i_ce;\n");

        fprintf(vmain, "\tinput\t\t\t\t\ti_clk, %s, i_ce;\n\t//\n",

                resetw.c_str());

        if (single_clock) {

        fprintf(vmain, "\tinput\t\t[(2*IWIDTH-1):0]\ti_sample;\n");

        fprintf(vmain, "\toutput\treg\t[(2*OWIDTH-1):0]\to_result;\n");

        } else {

        fprintf(vmain, "\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n");

        fprintf(vmain, "\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n");

        fprintf(vmain, "\toutput\treg\t[(2*OWIDTH-1):0]\to_left, o_right;\n");

        fprintf(vmain, "\toutput\treg\t[(2*OWIDTH-1):0]\to_left, o_right;\n");

        fprintf(vmain, "\toutput\treg\t\t\to_sync;\n");

        fprintf(vmain, "\toutput\treg\t\t\t\to_sync;\n");

        if (dbg)

        if (dbg)

                fprintf(vmain, "\toutput\twire\t[33:0]\t\to_dbg;\n");

                fprintf(vmain, "\toutput\twire\t[33:0]\t\to_dbg;\n");

        fprintf(vmain, "\n\n");

        fprintf(vmain, "\n\n");

        fprintf(vmain, "\t// Outputs of the FFT, ready for bit reversal.\n");

        fprintf(vmain, "\t// Outputs of the FFT, ready for bit reversal.\n");

        if (single_clock)

                fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_sample;\n");

        else

        fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_left, br_right;\n");

        fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_left, br_right;\n");

        fprintf(vmain, "\n\n");

        int     tmp_size = fftsize, lgtmp = lgsize;

        int     tmp_size = fftsize, lgtmp = lgsize;

        if (fftsize == 2) {

        if (fftsize == 2) {

                if (bitreverse) {

                if (bitreverse) {

                        fprintf(vmain, "\treg\tbr_start;\n");

                        fprintf(vmain, "\treg\tbr_start;\n");

                        fprintf(vmain, "\tinitial br_start = 1\'b0;\n");

                        fprintf(vmain, "\tinitial br_start = 1\'b0;\n");

                        if (async_reset) {

                                // The sensitivity list must name the same active-low
                                // asynchronous reset that the generated "if" tests
                                // (i_areset_n); a misspelled net is undeclared in the
                                // emitted Verilog.
                                fprintf(vmain, "\talways @(posedge i_clk, negedge i_areset_n)\n");

                                fprintf(vmain, "\t\tif (!i_areset_n)\n");

                        } else {

                        fprintf(vmain, "\talways @(posedge i_clk)\n");

                        fprintf(vmain, "\talways @(posedge i_clk)\n");

                        fprintf(vmain, "\t\tif (i_rst)\n");

                                fprintf(vmain, "\t\tif (i_reset)\n");

                        fprintf(vmain, "\t\t\tbr_start <= 1\'b0;\n");

                        fprintf(vmain, "\t\t\tbr_start <= 1\'b0;\n");

                        fprintf(vmain, "\t\telse if (i_ce)\n");

                        fprintf(vmain, "\t\telse if (i_ce)\n");

                        fprintf(vmain, "\t\t\tbr_start <= 1\'b1;\n");

                        fprintf(vmain, "\t\t\tbr_start <= 1\'b1;\n");

                fprintf(vmain, "\n\n");

                fprintf(vmain, "\n\n");

                fprintf(vmain, "\tdblstage\t#(IWIDTH)\tstage_2(i_clk, i_rst, i_ce,\n");

                fprintf(vmain, "\tlaststage\t#(IWIDTH)\tstage_2(i_clk, %s, i_ce,\n", resetw.c_str());

                fprintf(vmain, "\t\t\t(!i_rst), i_left, i_right, br_left, br_right);\n");

                fprintf(vmain, "\t\t\t(%s%s), i_left, i_right, br_left, br_right);\n",

                        (async_reset)?"":"!", resetw.c_str());

                fprintf(vmain, "\n\n");

                fprintf(vmain, "\n\n");

        } else {

        } else {

                int     nbits = nbitsin, dropbit=0;

                int     nbits = nbitsin, dropbit=0;

                int     obits = nbits+1+xtrapbits;

                int     obits = nbits+1+xtrapbits;

                std::string     cmem;

                FILE    *cmemfp;

                if ((maxbitsout > 0)&&(obits > maxbitsout))

                if ((maxbitsout > 0)&&(obits > maxbitsout))

                        obits = maxbitsout;

                        obits = maxbitsout;

                // Always do a first stage

                // Always do a first stage

                        bool    mpystage;

                        bool    mpystage;

                        // Last two stages are always non-multiply stages

                        // Last two stages are always non-multiply stages

                        // since the multiplies can be done by adds

                        // since the multiplies can be done by adds

                        mpystage = ((lgtmp-2) <= nummpy);

                        mpystage = ((lgtmp-2) <= mpy_stages);

                        if (mpystage)

                        if (mpystage)

                                fprintf(vmain, "\t// A hardware optimized FFT stage\n");

                                fprintf(vmain, "\t// A hardware optimized FFT stage\n");

                        fprintf(vmain, "\n\n");

                        fprintf(vmain, "\n\n");

                        fprintf(vmain, "\twire\t\tw_s%d;\n", fftsize);

                        fprintf(vmain, "\twire\t\tw_s%d;\n", fftsize);

                        if (single_clock) {

                                fprintf(vmain, "\twire\t[%d:0]\tw_d%d;\n", 2*(obits+xtrapbits)-1, fftsize);

                                cmem = gen_coeff_fname(EMPTYSTR, fftsize, 1, 0, inverse);

                                cmemfp = gen_coeff_open(cmem.c_str());

                                gen_coeffs(cmemfp, fftsize,  nbitsin+xtracbits, 1, 0, inverse);

                                fprintf(vmain, "\tfftstage%s\t#(IWIDTH,IWIDTH+%d,%d,%d,%d,0,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_%d(i_clk, %s, i_ce,\n",

                                        ((dbg)&&(dbgstage == fftsize))?"_dbg":"",

                                        xtracbits, obits+xtrapbits,

                                        lgsize, lgtmp-1,

                                        (mpystage)?1:0,

                                        ckpce, cmem.c_str(),

                                        fftsize, resetw.c_str());

                                fprintf(vmain, "\t\t\t(%s%s), i_sample, w_d%d, w_s%d%s);\n",

                                        (async_reset)?"":"!", resetw.c_str(),

                                        fftsize, fftsize,

                                        ((dbg)&&(dbgstage == fftsize))

                                                ? ", o_dbg":"");

                        } else {

                        fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os%d;\n\t// verilator lint_on  UNUSED\n", fftsize);

                        fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os%d;\n\t// verilator lint_on  UNUSED\n", fftsize);

                        fprintf(vmain, "\twire\t[%d:0]\tw_e%d, w_o%d;\n", 2*(obits+xtrapbits)-1, fftsize, fftsize);

                        fprintf(vmain, "\twire\t[%d:0]\tw_e%d, w_o%d;\n", 2*(obits+xtrapbits)-1, fftsize, fftsize);

                        fprintf(vmain, "\t%sfftstage_e%d%s\t#(IWIDTH,IWIDTH+%d,%d,%d,%d,%d,0)\tstage_e%d(i_clk, i_rst, i_ce,\n",

                                cmem = gen_coeff_fname(EMPTYSTR, fftsize, 2, 0, inverse);

                                (inverse)?"i":"", fftsize,

                                cmemfp = gen_coeff_open(cmem.c_str());

                                gen_coeffs(cmemfp, fftsize,  nbitsin+xtracbits, 2, 0, inverse);

                                fprintf(vmain, "\tfftstage%s\t#(IWIDTH,IWIDTH+%d,%d,%d,%d,0,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_e%d(i_clk, %s, i_ce,\n",

                                        ((dbg)&&(dbgstage == fftsize))?"_dbg":"",

                                        ((dbg)&&(dbgstage == fftsize))?"_dbg":"",

                                xtracbits, obits+xtrapbits,

                                xtracbits, obits+xtrapbits,

                                lgsize, lgtmp-2, lgdelay(nbits,xtracbits),

                                        lgsize, lgtmp-2,

                                fftsize);

                                        (mpystage)?1:0,

                        fprintf(vmain, "\t\t\t(!i_rst), i_left, w_e%d, w_s%d%s);\n", fftsize, fftsize, ((dbg)&&(dbgstage == fftsize))?", o_dbg":"");

                                        ckpce, cmem.c_str(),

                        fprintf(vmain, "\t%sfftstage_o%d\t#(IWIDTH,IWIDTH+%d,%d,%d,%d,%d,0)\tstage_o%d(i_clk, i_rst, i_ce,\n",

                                        fftsize, resetw.c_str());

                                (inverse)?"i":"", fftsize,

                                fprintf(vmain, "\t\t\t(%s%s), i_left, w_e%d, w_s%d%s);\n",

                                        (async_reset)?"":"!", resetw.c_str(),

                                        fftsize, fftsize,

                                        ((dbg)&&(dbgstage == fftsize))?", o_dbg":"");

                                cmem = gen_coeff_fname(EMPTYSTR, fftsize, 2, 1, inverse);

                                cmemfp = gen_coeff_open(cmem.c_str());

                                gen_coeffs(cmemfp, fftsize,  nbitsin+xtracbits, 2, 1, inverse);

                                fprintf(vmain, "\tfftstage\t#(IWIDTH,IWIDTH+%d,%d,%d,%d,0,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_o%d(i_clk, %s, i_ce,\n",

                                xtracbits, obits+xtrapbits,

                                xtracbits, obits+xtrapbits,

                                lgsize, lgtmp-2, lgdelay(nbits,xtracbits),

                                        lgsize, lgtmp-2,

                                fftsize);

                                        (mpystage)?1:0,

                        fprintf(vmain, "\t\t\t(!i_rst), i_right, w_o%d, w_os%d);\n", fftsize, fftsize);

                                        ckpce, cmem.c_str(),

                        fprintf(vmain, "\n\n");

                                        fftsize, resetw.c_str());

                                fprintf(vmain, "\t\t\t(%s%s), i_right, w_o%d, w_os%d);\n",

                                        (async_reset)?"":"!",resetw.c_str(),

                                        fftsize, fftsize);

                        std::string     fname;

                        std::string     fname;

                        char    numstr[12];

                        fname = coredir + "/";

                        fname = coredir + "/";

                        if (inverse) fname += "i";

                        if (inverse)

                        fname += "fftstage_e";

                                fname += "i";

                        sprintf(numstr, "%d", fftsize);

                        fname += "fftstage";

                        fname += numstr;

                        if (dbg) {

                        if ((dbg)&&(dbgstage == fftsize))

                                std::string     dbgname(fname);

                                fname += "_dbg";

                                dbgname += "_dbg";

                        fname += ".v";

                                dbgname += ".v";

                        build_stage(fname.c_str(), coredir.c_str(), fftsize/2, 0, nbits, inverse, xtracbits, mpystage, (dbg)&&(dbgstage == fftsize));    // Even stage

                                if (single_clock)

                                        build_stage(fname.c_str(), fftsize, 1, 0, nbits, xtracbits, ckpce, async_reset, true);

                                else

                                        build_stage(fname.c_str(), fftsize/2, 2, 1, nbits, xtracbits, ckpce, async_reset, true);

                        fname = coredir + "/";

                        if (inverse) fname += "i";

                        fname += "fftstage_o";

                        sprintf(numstr, "%d", fftsize);

                        fname += numstr;

                        fname += ".v";

                        fname += ".v";

                        build_stage(fname.c_str(), coredir.c_str(), fftsize/2, 1, nbits, inverse, xtracbits, mpystage, false);  // Odd  stage

                        if (single_clock) {

                                build_stage(fname.c_str(), fftsize, 1, 0,

                                        nbits, xtracbits, ckpce, async_reset,

                                        false);

                        } else {

                                // All stages use the same Verilog, so we only

                                // need to build one

                                build_stage(fname.c_str(), fftsize/2, 2, 1,

                                        nbits, xtracbits, ckpce, async_reset, false);

                nbits = obits;  // New number of input bits

                nbits = obits;  // New number of input bits

                tmp_size >>= 1; lgtmp--;

                tmp_size >>= 1; lgtmp--;

                dropbit = 0;

                dropbit = 0;

Line 2810...

Line 1592...

                                obits = maxbitsout;

                                obits = maxbitsout;

                                bool            mpystage;

                                bool            mpystage;

                                mpystage = ((lgtmp-2) <= nummpy);

                                mpystage = ((lgtmp-2) <= mpy_stages);

                                if (mpystage)

                                if (mpystage)

                                        fprintf(vmain, "\t// A hardware optimized FFT stage\n");

                                        fprintf(vmain, "\t// A hardware optimized FFT stage\n");

                                fprintf(vmain, "\twire\t\tw_s%d;\n",

                                fprintf(vmain, "\twire\t\tw_s%d;\n",

                                        tmp_size);

                                        tmp_size);

                                if (single_clock) {

                                        fprintf(vmain,"\twire\t[%d:0]\tw_d%d;\n",

                                                2*(obits+xtrapbits)-1,

                                                tmp_size);

                                        cmem = gen_coeff_fname(EMPTYSTR, tmp_size, 1, 0, inverse);

                                        cmemfp = gen_coeff_open(cmem.c_str());

                                        gen_coeffs(cmemfp, tmp_size,

                                                nbits+xtracbits+xtrapbits, 1, 0, inverse);

                                        fprintf(vmain, "\tfftstage%s\t#(%d,%d,%d,%d,%d,%d,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_%d(i_clk, %s, i_ce,\n",

                                                ((dbg)&&(dbgstage==tmp_size))?"_dbg":"",

                                                nbits+xtrapbits,

                                                nbits+xtracbits+xtrapbits,

                                                obits+xtrapbits,

                                                lgsize, lgtmp-1,

                                                (dropbit)?0:0, (mpystage)?1:0,

                                                ckpce,

                                                cmem.c_str(), tmp_size,

                                                resetw.c_str());

                                        fprintf(vmain, "\t\t\tw_s%d, w_d%d, w_d%d, w_s%d%s);\n",

                                                tmp_size<<1, tmp_size<<1,

                                                tmp_size, tmp_size,

                                                ((dbg)&&(dbgstage == tmp_size))

                                                        ?", o_dbg":"");

                                } else {

                                fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os%d;\n\t// verilator lint_on  UNUSED\n",

                                fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os%d;\n\t// verilator lint_on  UNUSED\n",

                                        tmp_size);

                                        tmp_size);

                                fprintf(vmain,"\twire\t[%d:0]\tw_e%d, w_o%d;\n",

                                fprintf(vmain,"\twire\t[%d:0]\tw_e%d, w_o%d;\n",

                                        2*(obits+xtrapbits)-1,

                                        2*(obits+xtrapbits)-1,

                                        tmp_size, tmp_size);

                                        tmp_size, tmp_size);

                                fprintf(vmain, "\t%sfftstage_e%d%s\t#(%d,%d,%d,%d,%d,%d,%d)\tstage_e%d(i_clk, i_rst, i_ce,\n",

                                        cmem = gen_coeff_fname(EMPTYSTR, tmp_size, 2, 0, inverse);

                                        (inverse)?"i":"", tmp_size,

                                        cmemfp = gen_coeff_open(cmem.c_str());

                                        gen_coeffs(cmemfp, tmp_size,

                                                nbits+xtracbits+xtrapbits, 2, 0, inverse);

                                        fprintf(vmain, "\tfftstage%s\t#(%d,%d,%d,%d,%d,%d,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_e%d(i_clk, %s, i_ce,\n",

                                        ((dbg)&&(dbgstage==tmp_size))?"_dbg":"",

                                        ((dbg)&&(dbgstage==tmp_size))?"_dbg":"",

                                        nbits+xtrapbits,

                                        nbits+xtrapbits,

                                        nbits+xtracbits+xtrapbits,

                                        nbits+xtracbits+xtrapbits,

                                        obits+xtrapbits,

                                        obits+xtrapbits,

                                        lgsize, lgtmp-2,

                                        lgsize, lgtmp-2,

                                        lgdelay(nbits+xtrapbits,xtracbits),

                                                (dropbit)?0:0, (mpystage)?1:0,

                                        (dropbit)?0:0, tmp_size);

                                                ckpce,

                                fprintf(vmain, "\t\t\t\t\t\tw_s%d, w_e%d, w_e%d, w_s%d%s);\n",

                                                cmem.c_str(), tmp_size,

                                                resetw.c_str());

                                        fprintf(vmain, "\t\t\tw_s%d, w_e%d, w_e%d, w_s%d%s);\n",

                                        tmp_size<<1, tmp_size<<1,

                                        tmp_size<<1, tmp_size<<1,

                                        tmp_size, tmp_size,

                                        tmp_size, tmp_size,

                                        ((dbg)&&(dbgstage == tmp_size))

                                        ((dbg)&&(dbgstage == tmp_size))

                                                ?", o_dbg":"");

                                                ?", o_dbg":"");

                                fprintf(vmain, "\t%sfftstage_o%d\t#(%d,%d,%d,%d,%d,%d,%d)\tstage_o%d(i_clk, i_rst, i_ce,\n",

                                        cmem = gen_coeff_fname(EMPTYSTR,

                                        (inverse)?"i":"", tmp_size,

                                                tmp_size, 2, 1, inverse);

                                        cmemfp = gen_coeff_open(cmem.c_str());

                                        gen_coeffs(cmemfp, tmp_size,

                                                nbits+xtracbits+xtrapbits,

                                                2, 1, inverse);

                                        fprintf(vmain, "\tfftstage\t#(%d,%d,%d,%d,%d,%d,\n\t\t\t%d, %d, \"%s\")\n\t\tstage_o%d(i_clk, %s, i_ce,\n",

                                        nbits+xtrapbits,

                                        nbits+xtrapbits,

                                        nbits+xtracbits+xtrapbits,

                                        nbits+xtracbits+xtrapbits,

                                        obits+xtrapbits,

                                        obits+xtrapbits,

                                        lgsize, lgtmp-2,

                                        lgsize, lgtmp-2,

                                        lgdelay(nbits+xtrapbits,xtracbits),

                                                (dropbit)?0:0, (mpystage)?1:0,

                                        (dropbit)?0:0, tmp_size);

                                                ckpce, cmem.c_str(), tmp_size,

                                fprintf(vmain, "\t\t\t\t\t\tw_s%d, w_o%d, w_o%d, w_os%d);\n",

                                                resetw.c_str());

                                        fprintf(vmain, "\t\t\tw_s%d, w_o%d, w_o%d, w_os%d);\n",

                                        tmp_size<<1, tmp_size<<1,

                                        tmp_size<<1, tmp_size<<1,

                                        tmp_size, tmp_size);

                                        tmp_size, tmp_size);

                                fprintf(vmain, "\n\n");

                                fprintf(vmain, "\n");

                                std::string     fname;

                                char            numstr[12];

                                fname = coredir + "/";

                                if (inverse) fname += "i";

                                fname += "fftstage_e";

                                sprintf(numstr, "%d", tmp_size);

                                fname += numstr;

                                if ((dbg)&&(dbgstage == tmp_size))

                                        fname += "_dbg";

                                fname += ".v";

                                build_stage(fname.c_str(), coredir.c_str(), tmp_size/2, 0,

                                        nbits+xtrapbits, inverse, xtracbits,

                                        mpystage, ((dbg)&&(dbgstage == tmp_size)));     // Even stage

                                fname = coredir + "/";

                                if (inverse) fname += "i";

                                fname += "fftstage_o";

                                sprintf(numstr, "%d", tmp_size);

                                fname += numstr;

                                fname += ".v";

                                build_stage(fname.c_str(), coredir.c_str(), tmp_size/2, 1,

                                        nbits+xtrapbits, inverse, xtracbits,

                                        mpystage, false);       // Odd  stage

                        dropbit ^= 1;

                        dropbit ^= 1;

                        nbits = obits;

                        nbits = obits;

Line 2887...

Line 1680...

                        if ((maxbitsout > 0)&&(obits > maxbitsout))

                        if ((maxbitsout > 0)&&(obits > maxbitsout))

                                obits = maxbitsout;

                                obits = maxbitsout;

                        fprintf(vmain, "\twire\t\tw_s4;\n");

                        fprintf(vmain, "\twire\t\tw_s4;\n");

                        if (single_clock) {

                                fprintf(vmain, "\twire\t[%d:0]\tw_d4;\n",

                                        2*(obits+xtrapbits)-1);

                                fprintf(vmain, "\tqtrstage%s\t#(%d,%d,%d,%d,%d)\tstage_4(i_clk, %s, i_ce,\n",

                                        ((dbg)&&(dbgstage==4))?"_dbg":"",

                                        nbits+xtrapbits, obits+xtrapbits, lgsize,

                                        (inverse)?1:0, (dropbit)?0:0,

                                        resetw.c_str());

                                fprintf(vmain, "\t\t\t\t\t\tw_s8, w_d8, w_d4, w_s4%s);\n",

                                        ((dbg)&&(dbgstage==4))?", o_dbg":"");

                        } else {

                        fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os4;\n\t// verilator lint_on  UNUSED\n");

                        fprintf(vmain, "\t// verilator lint_off UNUSED\n\twire\t\tw_os4;\n\t// verilator lint_on  UNUSED\n");

                        fprintf(vmain, "\twire\t[%d:0]\tw_e4, w_o4;\n", 2*(obits+xtrapbits)-1);

                        fprintf(vmain, "\twire\t[%d:0]\tw_e4, w_o4;\n", 2*(obits+xtrapbits)-1);

                        fprintf(vmain, "\tqtrstage%s\t#(%d,%d,%d,0,%d,%d)\tstage_e4(i_clk, i_rst, i_ce,\n",

                                fprintf(vmain, "\tqtrstage%s\t#(%d,%d,%d,0,%d,%d)\tstage_e4(i_clk, %s, i_ce,\n",

                                ((dbg)&&(dbgstage==4))?"_dbg":"",

                                ((dbg)&&(dbgstage==4))?"_dbg":"",

                                nbits+xtrapbits, obits+xtrapbits, lgsize,

                                nbits+xtrapbits, obits+xtrapbits, lgsize,

                                (inverse)?1:0, (dropbit)?0:0);

                                        (inverse)?1:0, (dropbit)?0:0,

                                        resetw.c_str());

                        fprintf(vmain, "\t\t\t\t\t\tw_s8, w_e8, w_e4, w_s4%s);\n",

                        fprintf(vmain, "\t\t\t\t\t\tw_s8, w_e8, w_e4, w_s4%s);\n",

                                ((dbg)&&(dbgstage==4))?", o_dbg":"");

                                ((dbg)&&(dbgstage==4))?", o_dbg":"");

                        fprintf(vmain, "\tqtrstage\t#(%d,%d,%d,1,%d,%d)\tstage_o4(i_clk, i_rst, i_ce,\n",

                                fprintf(vmain, "\tqtrstage\t#(%d,%d,%d,1,%d,%d)\tstage_o4(i_clk, %s, i_ce,\n",

                                nbits+xtrapbits, obits+xtrapbits, lgsize, (inverse)?1:0, (dropbit)?0:0);

                                        nbits+xtrapbits, obits+xtrapbits, lgsize, (inverse)?1:0, (dropbit)?0:0,

                                        resetw.c_str());

                        fprintf(vmain, "\t\t\t\t\t\tw_s8, w_o8, w_o4, w_os4);\n");

                        fprintf(vmain, "\t\t\t\t\t\tw_s8, w_o8, w_o4, w_os4);\n");

                        dropbit ^= 1;

                        dropbit ^= 1;

                        nbits = obits;

                        nbits = obits;

                        tmp_size >>= 1; lgtmp--;

                        tmp_size >>= 1; lgtmp--;

Line 2910...

Line 1717...

                        if (obits > nbitsout)

                        if (obits > nbitsout)

                                obits = nbitsout;

                                obits = nbitsout;

                        if ((maxbitsout>0)&&(obits > maxbitsout))

                        if ((maxbitsout>0)&&(obits > maxbitsout))

                                obits = maxbitsout;

                                obits = maxbitsout;

                        fprintf(vmain, "\twire\t\tw_s2;\n");

                        fprintf(vmain, "\twire\t\tw_s2;\n");

                        fprintf(vmain, "\twire\t[%d:0]\tw_e2, w_o2;\n", 2*obits-1);

                        if (single_clock) {

                                fprintf(vmain, "\twire\t[%d:0]\tw_d2;\n",

                                        2*obits-1);

                        } else {

                                fprintf(vmain, "\twire\t[%d:0]\tw_e2, w_o2;\n",

                                        2*obits-1);

                        if ((nbits+xtrapbits+1 == obits)&&(!dropbit))

                        if ((nbits+xtrapbits+1 == obits)&&(!dropbit))

                                printf("WARNING: SCALING OFF BY A FACTOR OF TWO--should\'ve dropped a bit in the last stage.\n");

                                printf("WARNING: SCALING OFF BY A FACTOR OF TWO--should\'ve dropped a bit in the last stage.\n");

                        fprintf(vmain, "\tdblstage\t#(%d,%d,%d)\tstage_2(i_clk, i_rst, i_ce,\n", nbits+xtrapbits, obits,(dropbit)?0:1);

                        if (single_clock) {

                                fprintf(vmain, "\tlaststage\t#(%d,%d,%d)\tstage_2(i_clk, %s, i_ce,\n",

                                        nbits+xtrapbits, obits,(dropbit)?0:1,

                                        resetw.c_str());

                                fprintf(vmain, "\t\t\t\t\tw_s4, w_d4, w_d2, w_s2);\n");

                        } else {

                                fprintf(vmain, "\tlaststage\t#(%d,%d,%d)\tstage_2(i_clk, %s, i_ce,\n",

                                        nbits+xtrapbits, obits,(dropbit)?0:1,

                                        resetw.c_str());

                        fprintf(vmain, "\t\t\t\t\tw_s4, w_e4, w_o4, w_e2, w_o2, w_s2);\n");

                        fprintf(vmain, "\t\t\t\t\tw_s4, w_e4, w_o4, w_e2, w_o2, w_s2);\n");

                        fprintf(vmain, "\n\n");

                        fprintf(vmain, "\n\n");

                        nbits = obits;

                        nbits = obits;

                fprintf(vmain, "\t// Prepare for a (potential) bit-reverse stage.\n");

                fprintf(vmain, "\t// Prepare for a (potential) bit-reverse stage.\n");

                if (single_clock)

                        fprintf(vmain, "\tassign\tbr_sample= w_d2;\n");

                else {

                fprintf(vmain, "\tassign\tbr_left  = w_e2;\n");

                fprintf(vmain, "\tassign\tbr_left  = w_e2;\n");

                fprintf(vmain, "\tassign\tbr_right = w_o2;\n");

                fprintf(vmain, "\tassign\tbr_right = w_o2;\n");

                fprintf(vmain, "\n");

                fprintf(vmain, "\n");

                if (bitreverse) {

                if (bitreverse) {

                        fprintf(vmain, "\twire\tbr_start;\n");

                        fprintf(vmain, "\twire\tbr_start;\n");

                        fprintf(vmain, "\treg\tr_br_started;\n");

                        fprintf(vmain, "\treg\tr_br_started;\n");

                        fprintf(vmain, "\tinitial\tr_br_started = 1\'b0;\n");

                        fprintf(vmain, "\tinitial\tr_br_started = 1\'b0;\n");

                        if (async_reset) {

                                fprintf(vmain, "\talways @(posedge i_clk, negedge i_areset_n)\n");

                                fprintf(vmain, "\t\tif (!i_areset_n)\n");

                        } else {

                        fprintf(vmain, "\talways @(posedge i_clk)\n");

                        fprintf(vmain, "\talways @(posedge i_clk)\n");

                        fprintf(vmain, "\t\tif (i_rst)\n");

                                fprintf(vmain, "\t\tif (i_reset)\n");

                        fprintf(vmain, "\t\t\tr_br_started <= 1\'b0;\n");

                        fprintf(vmain, "\t\t\tr_br_started <= 1\'b0;\n");

                        fprintf(vmain, "\t\telse if (i_ce)\n");

                        fprintf(vmain, "\t\telse if (i_ce)\n");

                        fprintf(vmain, "\t\t\tr_br_started <= r_br_started || w_s2;\n");

                        fprintf(vmain, "\t\t\tr_br_started <= r_br_started || w_s2;\n");

                        fprintf(vmain, "\tassign\tbr_start = r_br_started || w_s2;\n");

                        fprintf(vmain, "\tassign\tbr_start = r_br_started || w_s2;\n");

        fprintf(vmain, "\n");

        fprintf(vmain, "\n");

        fprintf(vmain, "\t// Now for the bit-reversal stage.\n");

        fprintf(vmain, "\t// Now for the bit-reversal stage.\n");

        fprintf(vmain, "\twire\tbr_sync;\n");

        fprintf(vmain, "\twire\tbr_sync;\n");

        fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_o_left, br_o_right;\n");

        if (bitreverse) {

        if (bitreverse) {

                fprintf(vmain, "\tdblreverse\t#(%d,%d)\trevstage(i_clk, i_rst,\n", lgsize, nbitsout);

                if (single_clock) {

                        fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_o_result;\n");

                        fprintf(vmain, "\tbitreverse\t#(%d,%d)\n\t\trevstage(i_clk, %s,\n", lgsize, nbitsout, resetw.c_str());

                        fprintf(vmain, "\t\t\t(i_ce & br_start), br_sample,\n");

                        fprintf(vmain, "\t\t\tbr_o_result, br_sync);\n");

                } else {

                        fprintf(vmain, "\twire\t[(2*OWIDTH-1):0]\tbr_o_left, br_o_right;\n");

                        fprintf(vmain, "\tbitreverse\t#(%d,%d)\n\t\trevstage(i_clk, %s,\n", lgsize, nbitsout, resetw.c_str());

                fprintf(vmain, "\t\t\t(i_ce & br_start), br_left, br_right,\n");

                fprintf(vmain, "\t\t\t(i_ce & br_start), br_left, br_right,\n");

                fprintf(vmain, "\t\t\tbr_o_left, br_o_right, br_sync);\n");

                fprintf(vmain, "\t\t\tbr_o_left, br_o_right, br_sync);\n");

        } else if (single_clock) {

                fprintf(vmain, "\tassign\tbr_o_result = br_result;\n");

                fprintf(vmain, "\tassign\tbr_sync     = w_s2;\n");

        } else {

        } else {

                fprintf(vmain, "\tassign\tbr_o_left  = br_left;\n");

                fprintf(vmain, "\tassign\tbr_o_left  = br_left;\n");

                fprintf(vmain, "\tassign\tbr_o_right = br_right;\n");

                fprintf(vmain, "\tassign\tbr_o_right = br_right;\n");

                fprintf(vmain, "\tassign\tbr_sync    = w_s2;\n");

                fprintf(vmain, "\tassign\tbr_sync    = w_s2;\n");

        fprintf(vmain, "\n\n");

        fprintf(vmain,

        fprintf(vmain, "\t// Last clock: Register our outputs, we\'re done.\n");

"\n\n"

        fprintf(vmain, "\tinitial\to_sync  = 1\'b0;\n");

"\t// Last clock: Register our outputs, we\'re done.\n"

        fprintf(vmain, "\talways @(posedge i_clk)\n");

"\tinitial\to_sync  = 1\'b0;\n");

        fprintf(vmain, "\t\tif (i_rst)\n");

        if (async_reset)

        fprintf(vmain, "\t\t\to_sync  <= 1\'b0;\n");

                fprintf(vmain,

        fprintf(vmain, "\t\telse if (i_ce)\n");

"\talways @(posedge i_clk, negedge i_areset_n)\n\t\tif (!i_areset_n)\n");

        fprintf(vmain, "\t\t\to_sync  <= br_sync;\n");

        else {

        fprintf(vmain, "\n");

                fprintf(vmain,

        fprintf(vmain, "\talways @(posedge i_clk)\n");

"\talways @(posedge i_clk)\n\t\tif (i_reset)\n");

        fprintf(vmain, "\t\tif (i_ce)\n");

        fprintf(vmain, "\t\tbegin\n");

        fprintf(vmain, "\t\t\to_left  <= br_o_left;\n");

        fprintf(vmain,

        fprintf(vmain, "\t\t\to_right <= br_o_right;\n");

"\t\t\to_sync  <= 1\'b0;\n"

        fprintf(vmain, "\t\tend\n");

"\t\telse if (i_ce)\n"

        fprintf(vmain, "\n\n");

"\t\t\to_sync  <= br_sync;\n"

        fprintf(vmain, "endmodule\n");

"\n"

"\talways @(posedge i_clk)\n"

"\t\tif (i_ce)\n");

        if (single_clock) {

                fprintf(vmain, "\t\t\to_result  <= br_o_result;\n");

        } else {

                fprintf(vmain,

"\t\tbegin\n"

"\t\t\to_left  <= br_o_left;\n"

"\t\t\to_right <= br_o_right;\n"

"\t\tend\n");

        fprintf(vmain,

"\n\n"

"endmodule\n");

        fclose(vmain);

        fclose(vmain);

                std::string     fname;

                std::string     fname;

                fname = coredir + "/butterfly.v";

                fname = coredir + "/butterfly.v";

                build_butterfly(fname.c_str(), xtracbits, rounding);

                build_butterfly(fname.c_str(), xtracbits, rounding,

                        ckpce, async_reset);

                if (nummpy > 0) {

                        fname = coredir + "/hwbfly.v";

                        fname = coredir + "/hwbfly.v";

                        build_hwbfly(fname.c_str(), xtracbits, rounding);

                build_hwbfly(fname.c_str(), xtracbits, rounding,

                        ckpce, async_reset);

                        // To make debugging easier, we build both of these

                        // To make debugging easier, we build both of these

                        fname = coredir + "/shiftaddmpy.v";

                        fname = coredir + "/shiftaddmpy.v";

                        build_multiply(fname.c_str());

                        build_multiply(fname.c_str());

Line 2994...

Line 1853...

                        build_bimpy(fname.c_str());

                        build_bimpy(fname.c_str());

                if ((dbg)&&(dbgstage == 4)) {

                if ((dbg)&&(dbgstage == 4)) {

                        fname = coredir + "/qtrstage_dbg.v";

                        fname = coredir + "/qtrstage_dbg.v";

                        build_quarters(fname.c_str(), rounding, true);

                        if (single_clock)

                                build_snglquarters(fname.c_str(), rounding,

                                        async_reset, true);

                        else

                                build_dblquarters(fname.c_str(), rounding,

                                        async_reset, true);

                fname = coredir + "/qtrstage.v";

                fname = coredir + "/qtrstage.v";

                build_quarters(fname.c_str(), rounding, false);

                if (single_clock)

                        build_snglquarters(fname.c_str(), rounding,

                                        async_reset, false);

                else

                        build_dblquarters(fname.c_str(), rounding,

                                        async_reset, false);

                if (single_clock) {

                        fname = coredir + "/laststage.v";

                        build_sngllast(fname.c_str(), async_reset);

                } else {

                if ((dbg)&&(dbgstage == 2))

                if ((dbg)&&(dbgstage == 2))

                        fname = coredir + "/dblstage_dbg.v";

                                fname = coredir + "/laststage_dbg.v";

                else

                else

                        fname = coredir + "/dblstage.v";

                                fname = coredir + "/laststage.v";

                build_dblstage(fname.c_str(), rounding, (dbg)&&(dbgstage==2));

                        build_dblstage(fname.c_str(), rounding,

                                async_reset, (dbg)&&(dbgstage==2));

                if (bitreverse) {

                if (bitreverse) {

                        fname = coredir + "/dblreverse.v";

                        fname = coredir + "/bitreverse.v";

                        build_dblreverse(fname.c_str());

                        if (single_clock)

                                build_snglbrev(fname.c_str(), async_reset);

                        else

                                build_dblreverse(fname.c_str(), async_reset);

                const   char    *rnd_string = "";

                const   char    *rnd_string = "";

                switch(rounding) {

                switch(rounding) {

                        case RND_TRUNCATE:      rnd_string = "/truncate.v"; break;

                        case RND_TRUNCATE:      rnd_string = "/truncate.v"; break;

Line 3027...

Line 1906...

                        default:

                        default:

                                build_convround(fname.c_str()); break;

                                build_convround(fname.c_str()); break;

        if (verbose_flag)

                printf("All done -- success\n");

 No newline at end of file

 No newline at end of file

Browse

Tools

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [sw/] [fftgen.cpp] - Diff between revs 35 and 36