Line 23... |
Line 23... |
"// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or\n"
|
"// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or\n"
|
"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
|
"// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n"
|
"// for more details.\n"
|
"// for more details.\n"
|
"//\n"
|
"//\n"
|
"// You should have received a copy of the GNU General Public License along\n"
|
"// You should have received a copy of the GNU General Public License along\n"
|
"// with this program. If not, see <http://www.gnu.org/licenses/>.\n"
|
"// with this program. (It's in the $(ROOT)/doc directory, run make with no\n"
|
|
"// target there if the PDF file isn\'t present.) If not, see\n"
|
|
"// <http://www.gnu.org/licenses/> for a copy.\n"
|
|
"//\n"
|
"// License: GPL, v3, as defined and found on www.gnu.org,\n"
|
"// License: GPL, v3, as defined and found on www.gnu.org,\n"
|
"// http://www.gnu.org/licenses/gpl.html\n"
|
"// http://www.gnu.org/licenses/gpl.html\n"
|
"//\n"
|
"//\n"
|
"//\n"
|
"//\n"
|
"///////////////////////////////////////////////////////////////////////////\n";
|
"///////////////////////////////////////////////////////////////////////////\n";
|
Line 53... |
Line 56... |
|
|
int lgdelay(int nbits, int xtra) {
|
int lgdelay(int nbits, int xtra) {
|
int cbits = nbits + xtra;
|
int cbits = nbits + xtra;
|
int delay = nbits + 2;
|
int delay = nbits + 2;
|
if (nbits+1<cbits)
|
if (nbits+1<cbits)
|
delay = nbits+2;
|
delay = nbits+4;
|
else
|
else
|
delay = cbits+1;
|
delay = cbits+3;
|
return lgval(delay);
|
return lgval(delay);
|
}
|
}
|
|
|
void build_quarters(const char *fname) {
|
void build_quarters(const char *fname) {
|
FILE *fp = fopen(fname, "w");
|
FILE *fp = fopen(fname, "w");
|
Line 74... |
Line 77... |
"//\n"
|
"//\n"
|
"// Filename: qtrstage.v\n"
|
"// Filename: qtrstage.v\n"
|
"// \n"
|
"// \n"
|
"// Project: %s\n"
|
"// Project: %s\n"
|
"//\n"
|
"//\n"
|
"// Purpose: This file is (almost) a Verilog source file. It is meant to\n"
|
"// Purpose: This file encapsulates the 4 point stage of a decimation in\n"
|
"// be used by a FFT core compiler to generate FFTs which may be\n"
|
"// frequency FFT. This particular implementation is optimized\n"
|
"// used as part of an FFT core. Specifically, this file \n"
|
"// so that all of the multiplies are accomplished by additions\n"
|
"// encapsulates the options of a 4 point, decimation in\n"
|
"// and multiplexers only.\n"
|
"// frequency FFT-stage. This particular stage is optimized so\n"
|
"//\n"
|
"// that all of the multiplies are accomplished by additions and\n"
|
|
"// mux'es.\n"
|
|
"//\n%s"
|
"//\n%s"
|
"//\n",
|
"//\n",
|
prjname, creator);
|
prjname, creator);
|
fprintf(fp, "%s", cpyleft);
|
fprintf(fp, "%s", cpyleft);
|
|
|
Line 155... |
Line 156... |
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n"
|
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n"
|
"\n"
|
"\n"
|
"\t\t\t// In sequence, clock = 1\n"
|
"\t\t\t// In sequence, clock = 1\n"
|
"\t\t\tif (pipeline[1])\n"
|
"\t\t\tif (pipeline[1])\n"
|
"\t\t\tbegin\n"
|
"\t\t\tbegin\n"
|
"\t\t\t ob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n"
|
"\t\t\t\tob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n"
|
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n"
|
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n"
|
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n"
|
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n"
|
"\t\t\t\tif (~ODD)\n"
|
"\t\t\t\tif (~ODD)\n"
|
"\t\t\t\tbegin\n"
|
"\t\t\t\tbegin\n"
|
"\t\t\t\t\tob_b_r <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\t\t\t\t\tob_b_r <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
Line 201... |
Line 202... |
"// Filename: dblstage.v\n"
|
"// Filename: dblstage.v\n"
|
"//\n"
|
"//\n"
|
"// Project: %s\n"
|
"// Project: %s\n"
|
"//\n"
|
"//\n"
|
"// Purpose: This is part of an FPGA implementation that will process\n"
|
"// Purpose: This is part of an FPGA implementation that will process\n"
|
"// data at two samples per clock. If you notice from the\n"
|
"// the final stage of a decimate-in-frequency FFT, running\n"
|
"// derivation of an FFT, the only time both even and odd\n"
|
"// through the data at two samples per clock. If you notice\n"
|
"// samples are used at the same time is the first stage.\n"
|
"// from the derivation of an FFT, the only time both even and\n"
|
"// Therefore, after this stage and these twiddles, all of the\n"
|
"// odd samples are used at the same time is in this stage.\n"
|
"// other stages can run two stages at a time at one sample per\n"
|
"// Therefore, other than this stage and these twiddles, all of\n"
|
"// clock.\n"
|
"// the other stages can run two stages at a time at one sample\n"
|
|
"// per clock.\n"
|
"//\n"
|
"//\n"
|
"// In this implementation, the output is valid one clock after\n"
|
"// In this implementation, the output is valid one clock after\n"
|
"// the input is valid. The output also accumulates one bit\n"
|
"// the input is valid. The output also accumulates one bit\n"
|
"// above and beyond the number of bits in the input.\n"
|
"// above and beyond the number of bits in the input.\n"
|
"// \n"
|
"// \n"
|
Line 240... |
Line 242... |
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n"
|
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n"
|
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n"
|
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n"
|
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n"
|
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n"
|
"\t\t\t\t\to_out_1r, o_out_1i;\n"
|
"\t\t\t\t\to_out_1r, o_out_1i;\n"
|
"\n"
|
"\n"
|
"\t// Don't forget that we accumulate a bit by adding two values together.\n"
|
"\t// Don't forget that we accumulate a bit by adding two values\n"
|
"\t// Therefore our intermediate value must have one more bit than the\n"
|
"\t// together. Therefore our intermediate value must have one more\n"
|
"\t// two originals.\n"
|
"\t// bit than the two originals.\n"
|
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n"
|
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n"
|
"\n"
|
"\n"
|
"\talways @(posedge i_clk)\n"
|
"\talways @(posedge i_clk)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tbegin\n"
|
"\t\tbegin\n"
|
Line 255... |
Line 257... |
"\t\t\t//\n"
|
"\t\t\t//\n"
|
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n"
|
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n"
|
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n"
|
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n"
|
"\t\tend\n"
|
"\t\tend\n"
|
"\n"
|
"\n"
|
"\t// Now, if the master control program doesn't want to keep all of our\n"
|
"\t// Now, if the master control program doesn't want to keep all of\n"
|
"\t// bits, we can shift down to OWIDTH bits here.\n"
|
"\t// our bits, we can shift down to OWIDTH bits here.\n"
|
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
|
"\n"
|
"\n"
|
Line 324... |
Line 326... |
"\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"
|
"\treg\t[(AWIDTH+BWIDTH-2):0]\tr_b[0:(AWIDTH-1)];\n"
|
"\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"
|
"\treg\t\t\t\tr_s[0:(AWIDTH-1)];\n"
|
"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"
|
"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"
|
"\tgenvar k;\n"
|
"\tgenvar k;\n"
|
"\n"
|
"\n"
|
|
"\t// If we were forced to stay within two\'s complement arithmetic,\n"
|
|
"\t// taking the absolute value here would require an additional bit.\n"
|
|
"\t// However, because our results are now unsigned, we can stay\n"
|
|
"\t// within the number of bits given (for now).\n"
|
"\talways @(posedge i_clk)\n"
|
"\talways @(posedge i_clk)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tbegin\n"
|
"\t\tbegin\n"
|
"\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"
|
"\t\t\tu_a <= (i_a[AWIDTH-1])?(-i_a):(i_a);\n"
|
"\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"
|
"\t\t\tu_b <= (i_b[BWIDTH-1])?(-i_b):(i_b);\n"
|
Line 429... |
Line 435... |
"// mem[11xxx1] = s_1[m]\n"
|
"// mem[11xxx1] = s_1[m]\n"
|
"// o_0[m] = mem[00xxx1]\n"
|
"// o_0[m] = mem[00xxx1]\n"
|
"// o_1[m] = mem[01xxx1]\n"
|
"// o_1[m] = mem[01xxx1]\n"
|
"// ...\n"
|
"// ...\n"
|
"//\n"
|
"//\n"
|
|
"// The answer is that, yes we can but: we need to use four memory banks\n"
|
|
"// to do it properly. These four banks are defined by the two bits\n"
|
|
"// that determine the top and bottom of the correct address. Larger\n"
|
|
"// FFT\'s would require more memories.\n"
|
|
"//\n"
|
"//\n");
|
"//\n");
|
fprintf(fp,
|
fprintf(fp,
|
"module dblreverse(i_clk, i_rst, i_ce, i_in_0, i_in_1,\n"
|
"module dblreverse(i_clk, i_rst, i_ce, i_in_0, i_in_1,\n"
|
"\t\to_out_0, o_out_1, o_sync);\n"
|
"\t\to_out_0, o_out_1, o_sync);\n"
|
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n"
|
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n"
|
Line 578... |
Line 589... |
|
|
fprintf(fp,
|
fprintf(fp,
|
"module\tbutterfly(i_clk, i_ce, i_coef, i_left, i_right, i_aux,\n"
|
"module\tbutterfly(i_clk, i_ce, i_coef, i_left, i_right, i_aux,\n"
|
"\t\to_left, o_right, o_aux);\n"
|
"\t\to_left, o_right, o_aux);\n"
|
"\t// Public changeable parameters ...\n"
|
"\t// Public changeable parameters ...\n"
|
"\tparameter IWIDTH=16,CWIDTH=IWIDTH,OWIDTH=IWIDTH;\n"
|
"\tparameter IWIDTH=16,CWIDTH=IWIDTH+4,OWIDTH=IWIDTH+1;\n"
|
"\t// Parameters specific to the core that should not be changed.\n"
|
"\t// Parameters specific to the core that should not be changed.\n"
|
"\tparameter MPYDELAY=(IWIDTH+1 < CWIDTH)?(IWIDTH+2):(CWIDTH+1),\n"
|
"\tparameter MPYDELAY=5'd20, // (IWIDTH+1 < CWIDTH)?(IWIDTH+4):(CWIDTH+3),\n"
|
"\t\t\tSHIFT=0, ROUND=1;\n"
|
"\t\t\tSHIFT=0, ROUND=0;\n"
|
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n"
|
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n"
|
"\t// this value is fractional, then round up to the nearest\n"
|
"\t// this value is fractional, then round up to the nearest\n"
|
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n"
|
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n"
|
"\tparameter LGDELAY=5;\n"
|
"\tparameter\tLGDELAY=5;\n"
|
"\tinput i_clk, i_ce;\n"
|
"\tinput\t\ti_clk, i_ce;\n"
|
"\tinput [(2*CWIDTH-1):0] i_coef;\n"
|
"\tinput\t\t[(2*CWIDTH-1):0] i_coef;\n"
|
"\tinput [(2*IWIDTH-1):0] i_left, i_right;\n"
|
"\tinput\t\t[(2*IWIDTH-1):0] i_left, i_right;\n"
|
"\tinput i_aux;\n"
|
"\tinput\t\ti_aux;\n"
|
"\toutput wire [(2*OWIDTH-1):0] o_left, o_right;\n"
|
"\toutput\twire [(2*OWIDTH-1):0] o_left, o_right;\n"
|
"\toutput wire o_aux;\n"
|
"\toutput\twire o_aux;\n"
|
"\n"
|
"\n"
|
"\twire [(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n"
|
"\twire\t[(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n"
|
"\n"
|
"\n"
|
"\treg [(2*IWIDTH-1):0] r_left, r_right;\n"
|
"\treg\t[(2*IWIDTH-1):0]\tr_left, r_right;\n"
|
"\treg r_aux, r_aux_2;\n"
|
"\treg\t\t\t\tr_aux, r_aux_2;\n"
|
"\treg [(2*CWIDTH-1):0] r_coef, r_coef_2;\n"
|
"\treg\t[(2*CWIDTH-1):0]\tr_coef, r_coef_2;\n"
|
"\twire [(CWIDTH-1):0] r_coef_r, r_coef_i;\n"
|
"\twire\tsigned\t[(CWIDTH-1):0]\tr_coef_r, r_coef_i;\n"
|
"\tassign r_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n"
|
"\tassign\tr_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n"
|
"\tassign r_coef_i = r_coef_2[ (CWIDTH-1):0];\n"
|
"\tassign\tr_coef_i = r_coef_2[ ( CWIDTH-1):0];\n"
|
"\twire [(IWIDTH-1):0] r_left_r, r_left_i, r_right_r, r_right_i;\n"
|
"\twire\tsigned\t[(IWIDTH-1):0]\tr_left_r, r_left_i, r_right_r, r_right_i;\n"
|
"\tassign r_left_r = i_left[ (2*IWIDTH-1):(IWIDTH)];\n"
|
"\tassign\tr_left_r = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"
|
"\tassign r_left_i = i_left[ (IWIDTH-1):0];\n"
|
"\tassign\tr_left_i = r_left[ (IWIDTH-1):0];\n"
|
"\tassign r_right_r = i_right[(2*IWIDTH-1):(IWIDTH)];\n"
|
"\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"
|
"\tassign r_right_i = i_right[(IWIDTH-1):0];\n"
|
"\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"
|
"\n"
|
"\n"
|
"\treg [(IWIDTH):0] r_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
|
"\treg\tsigned\t[(IWIDTH):0]\tr_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
|
"\n"
|
"\n"
|
"\treg [(LGDELAY-1):0] fifo_addr;\n"
|
"\treg [(LGDELAY-1):0] fifo_addr;\n"
|
"\twire [(LGDELAY-1):0] fifo_read_addr;\n"
|
"\twire [(LGDELAY-1):0] fifo_read_addr;\n"
|
"\t/* verilator lint_off WIDTH */\n"
|
|
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n"
|
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n"
|
"\t/* verilator lint_on WIDTH */\n"
|
|
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n"
|
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n"
|
"\n"
|
"\n");
|
|
fprintf(fp,
|
|
"\t// Set up the input to the multiply\n"
|
"\talways @(posedge i_clk)\n"
|
"\talways @(posedge i_clk)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tbegin\n"
|
"\t\tbegin\n"
|
"\t\t\t// One clock just latches the inputs\n"
|
"\t\t\t// One clock just latches the inputs\n"
|
"\t\t\tr_left <= i_left; // No change in # of bits\n"
|
"\t\t\tr_left <= i_left; // No change in # of bits\n"
|
Line 633... |
Line 644... |
"\t\t\tr_dif_i <= r_left_i - r_right_i;\n"
|
"\t\t\tr_dif_i <= r_left_i - r_right_i;\n"
|
"\t\t\t// Other inputs are simply delayed on second clock\n"
|
"\t\t\t// Other inputs are simply delayed on second clock\n"
|
"\t\t\tr_aux_2 <= r_aux;\n"
|
"\t\t\tr_aux_2 <= r_aux;\n"
|
"\t\t\tr_coef_2<= r_coef;\n"
|
"\t\t\tr_coef_2<= r_coef;\n"
|
"\t\tend\n"
|
"\t\tend\n"
|
"\n"
|
"\n");
|
|
fprintf(fp,
|
|
"\t// Don\'t forget to record the even side, since it doesn\'t need\n"
|
|
"\t// to be multiplied, but yet we still need the results in sync\n"
|
|
"\t// with the answer when it is ready.\n"
|
"\talways @(posedge i_clk)\n"
|
"\talways @(posedge i_clk)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tbegin\n"
|
"\t\tbegin\n"
|
"\t\t\t// Need to delay the sum side--nothing else happens\n"
|
"\t\t\t// Need to delay the sum side--nothing else happens\n"
|
"\t\t\t// to it, but it needs to stay synchronized with the\n"
|
"\t\t\t// to it, but it needs to stay synchronized with the\n"
|
"\t\t\t// right side.\n"
|
"\t\t\t// right side.\n"
|
"\t\t\tfifo_left[fifo_addr] <= { r_aux_2, r_sum_r, r_sum_i };\n"
|
"\t\t\tfifo_left[fifo_addr] <= { r_aux_2, r_sum_r, r_sum_i };\n"
|
"\t\t\tfifo_addr <= fifo_addr + 1;\n"
|
"\t\t\tfifo_addr <= fifo_addr + 1;\n"
|
"\t\tend\n"
|
"\t\tend\n"
|
"\n"
|
"\n"
|
"\twire [(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n"
|
"\twire\tsigned\t[(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n"
|
"\tassign ir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"
|
"\tassign\tir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"
|
"\tassign ir_coef_i = r_coef_2[(CWIDTH-1):0];\n"
|
"\tassign\tir_coef_i = r_coef_2[(CWIDTH-1):0];\n"
|
"\twire [(IWIDTH+CWIDTH+1+2-1):0] p_one, p_two, p_three;\n"
|
"\twire\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\tp_one, p_two, p_three;\n"
|
"\n"
|
"\n"
|
"\t// Multiply output is always a width of IWIDTH+CWIDTH-1. ALWAYS.\n"
|
"\n");
|
"\t// We take care of dropping the width to OWIDTH in our routine\n"
|
fprintf(fp,
|
"\t// below, but this is the definition of a multiply.\n"
|
"\t// Multiply output is always a width of the sum of the widths of\n"
|
"\n"
|
"\t// the two inputs. ALWAYS. This is independent of the number of\n"
|
"\n"
|
"\t// bits in p_one, p_two, or p_three. These values needed to \n"
|
"\n"
|
"\t// accumulate a bit (or two) each. However, this approach to a\n"
|
"// This should really be based upon an IF\n"
|
"\t// three multiply complex multiply cannot increase the total\n"
|
"// if (IWIDTH < CWIDTH) then ...\n"
|
"\t// number of bits in our final output. We\'ll take care of\n"
|
|
"\t// dropping back down to the proper width, OWIDTH, in our routine\n"
|
|
"\t// below.\n"
|
|
"\n"
|
|
"\n");
|
|
fprintf(fp,
|
|
"\t// We accomplish here \"Karatsuba\" multiplication. That is,\n"
|
|
"\t// by doing three multiplies we accomplish the work of four.\n"
|
|
"\t// Let\'s prove to ourselves that this works ... We wish to\n"
|
|
"\t// multiply: (a+jb) * (c+jd), where a+jb is given by\n"
|
|
"\t//\ta + jb = r_dif_r + j r_dif_i, and\n"
|
|
"\t//\tc + jd = ir_coef_r + j ir_coef_i.\n"
|
|
"\t// We do this by calculating the intermediate products P1, P2,\n"
|
|
"\t// and P3 as\n"
|
|
"\t//\tP1 = ac\n"
|
|
"\t//\tP2 = bd\n"
|
|
"\t//\tP3 = (a + b) * (c + d)\n"
|
|
"\t// and then complete our final answer with\n"
|
|
"\t//\tac - bd = P1 - P2 (this checks)\n"
|
|
"\t//\tad + bc = P3 - P2 - P1\n"
|
|
"\t//\t = (ac + bc + ad + bd) - bd - ac\n"
|
|
"\t//\t = bc + ad (this checks)\n"
|
|
"\n"
|
|
"\n");
|
|
fprintf(fp,
|
|
"\t// This should really be based upon an IF, such as in\n"
|
|
"\t// if (IWIDTH < CWIDTH) then ...\n"
|
|
"\t// However, this is the only (other) way I know to do it.\n"
|
"\tgenerate\n"
|
"\tgenerate\n"
|
"\tif (CWIDTH < IWIDTH+1)\n"
|
"\tif (CWIDTH < IWIDTH+1)\n"
|
"\tbegin\n"
|
"\tbegin\n"
|
"\t\t// We need to pad these first two multiplies by an extra\n"
|
"\t\t// We need to pad these first two multiplies by an extra\n"
|
"\t\t// just to keep them aligned with the third, simpler,\n"
|
"\t\t// bit just to keep them aligned with the third,\n"
|
"\t\t// multiply.\n"
|
"\t\t// simpler, multiply.\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
|
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one);\n"
|
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one);\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
|
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i},\n"
|
"\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two);\n"
|
"\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two);\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n"
|
"\t\t\t\tir_coef_i+ir_coef_r, r_dif_r + r_dif_i, p_three);\n"
|
"\t\t\t\tir_coef_i+ir_coef_r,\n"
|
|
"\t\t\t\tr_dif_r + r_dif_i,\n"
|
|
"\t\t\t\tp_three);\n"
|
"\tend else begin\n"
|
"\tend else begin\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p1a(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p1a(i_clk, i_ce,\n"
|
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r},\n"
|
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r},\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_one);\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_one);\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p2a(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p2a(i_clk, i_ce,\n"
|
"\t\t\t\t{r_dif_i[IWIDTH], r_dif_i},\n"
|
"\t\t\t\t{r_dif_i[IWIDTH], r_dif_i},\n"
|
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_two);\n"
|
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i}, p_two);\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p3a(i_clk, i_ce,\n"
|
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p3a(i_clk, i_ce,\n"
|
"\t\t\t\tr_dif_r+r_dif_i,\n"
|
"\t\t\t\tr_dif_r+r_dif_i,\n"
|
"\t\t\t\tir_coef_i+ir_coef_r,\n"
|
"\t\t\t\tir_coef_i+ir_coef_r,\n"
|
"\t\t\t\tp_three);\n"
|
"\t\t\t\tp_three);\n"
|
"\tend\n"
|
"\tend\n"
|
"\tendgenerate\n"
|
"\tendgenerate\n"
|
"\n"
|
"\n");
|
|
fprintf(fp,
|
|
"\t// These values are held in memory and delayed during the\n"
|
|
"\t// multiply. Here, we recover them. During the multiply,\n"
|
|
"\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"
|
|
"\t// therefore, the left_x values need to be right shifted by\n"
|
|
"\t// CWIDTH-2 as well. The additional bits come from a sign\n"
|
|
"\t// extension.\n"
|
"\twire aux;\n"
|
"\twire aux;\n"
|
"\twire [(IWIDTH+CWIDTH):0] left_i, left_r;\n"
|
"\twire\tsigned\t[(IWIDTH+CWIDTH):0] fifo_i, fifo_r;\n"
|
"\treg [(2*IWIDTH+2):0] fifo_read;\n"
|
"\treg\t\t[(2*IWIDTH+2):0] fifo_read;\n"
|
"\tassign left_r = { fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH){1'b0}} };\n"
|
"\tassign\tfifo_r = { {2{fifo_read[2*(IWIDTH+1)-1]}}, fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1'b0}} };\n"
|
"\tassign left_i = { fifo_read[((IWIDTH+1)-1):0], {(CWIDTH){1'b0}} };\n"
|
"\tassign\tfifo_i = { {2{fifo_read[(IWIDTH+1)-1]}}, fifo_read[((IWIDTH+1)-1):0], {(CWIDTH-2){1'b0}} };\n"
|
"\tassign aux = fifo_read[2*IWIDTH+2];\n"
|
"\tassign\taux = fifo_read[2*IWIDTH+2];\n"
|
"\n"
|
"\n"
|
"\n"
|
"\n"
|
"\treg [(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i, b_right_r, b_right_i, mpy_r, mpy_i;\n"
|
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i,\n"
|
"\treg [(CWIDTH+IWIDTH+3-1):0] rnd;\n"
|
"\t\t\t\t\t\tb_right_r, b_right_i;\n"
|
"\tassign rnd = ((~ROUND)||(SHIFT==0))?\n"
|
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] mpy_r, mpy_i;\n"
|
"\t\t\t({(CWIDTH+IWIDTH+3){1'b0}})\n"
|
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] rnd;\n"
|
"\t\t\t: ({ {(OWIDTH+1+SHIFT){1'b0}},1'b1,{(CWIDTH+IWIDTH+3-2-OWIDTH-SHIFT){1'b0}} });\n"
|
"\tgenerate\n"
|
|
"\tif ((~ROUND)||(CWIDTH+IWIDTH-OWIDTH-SHIFT<1))\n"
|
|
"\t\tassign rnd = ({(CWIDTH+IWIDTH+3){1'b0}});\n"
|
|
"\telse\n"
|
|
"\t\tassign rnd = ({ {(OWIDTH+3+SHIFT){1'b0}},1'b1,\n"
|
|
"\t\t\t\t{(CWIDTH+IWIDTH-OWIDTH-SHIFT-1){1'b0}} });\n"
|
|
"\tendgenerate\n"
|
|
"\n");
|
|
fprintf(fp,
|
"\talways @(posedge i_clk)\n"
|
"\talways @(posedge i_clk)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tif (i_ce)\n"
|
"\t\tbegin\n"
|
"\t\tbegin\n"
|
"\t\t\t// First clock, recover all values\n"
|
"\t\t\t// First clock, recover all values\n"
|
"\t\t\tfifo_read <= fifo_left[fifo_read_addr];\n"
|
"\t\t\tfifo_read <= fifo_left[fifo_read_addr];\n"
|
"\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"
|
"\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"
|
|
"\t\t\t// although they only need to be (IWIDTH+1)\n"
|
|
"\t\t\t// + (CWIDTH) bits wide. (We\'ve got two\n"
|
|
"\t\t\t// extra bits we need to get rid of.)\n"
|
"\t\t\tmpy_r <= p_one - p_two;\n"
|
"\t\t\tmpy_r <= p_one - p_two;\n"
|
"\t\t\tmpy_i <= p_three - p_one - p_two;\n"
|
"\t\t\tmpy_i <= p_three - p_one - p_two;\n"
|
"\n"
|
"\n"
|
"\t\t\t// Second clock, round and latch for final clock\n"
|
"\t\t\t// Second clock, round and latch for final clock\n"
|
"\t\t\tb_right_r <= mpy_r + rnd;\n"
|
"\t\t\tb_right_r <= mpy_r + rnd;\n"
|
"\t\t\tb_right_i <= mpy_i + rnd;\n"
|
"\t\t\tb_right_i <= mpy_i + rnd;\n"
|
"\t\t\tb_left_r <= { {2{left_r[(IWIDTH+CWIDTH)]}},left_r } + rnd;\n"
|
"\t\t\tb_left_r <= { {2{fifo_r[(IWIDTH+CWIDTH)]}},fifo_r } + rnd;\n"
|
"\t\t\tb_left_i <= { {2{left_i[(IWIDTH+CWIDTH)]}},left_i } + rnd;\n"
|
"\t\t\tb_left_i <= { {2{fifo_i[(IWIDTH+CWIDTH)]}},fifo_i } + rnd;\n"
|
"\t\t\to_aux <= aux;\n"
|
"\t\t\to_aux <= aux;\n"
|
"\t\tend\n"
|
"\t\tend\n"
|
"\n"
|
"\n");
|
|
fprintf(fp,
|
"\t// Final clock--clock and remove unnecessary bits.\n"
|
"\t// Final clock--clock and remove unnecessary bits.\n"
|
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to OWIDTH,\n"
|
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to\n"
|
"\t// and SHIFT by SHIFT bits in the process.\n"
|
"\t// OWIDTH, and SHIFT by SHIFT bits in the process. The trick is\n"
|
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
|
"\t// that we don\'t need (IWIDTH+CWIDTH+3) bits. We\'ve accumulated\n"
|
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
|
"\t// them, but the actual values will never fill all these bits.\n"
|
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
|
"\t// In particular, we only need:\n"
|
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
|
"\t//\t IWIDTH bits for the input\n"
|
"\n"
|
"\t//\t +1 bit for the add/subtract\n"
|
|
"\t//\t+CWIDTH bits for the coefficient multiply\n"
|
|
"\t//\t +1 bit for the add/subtract in the complex multiply\n"
|
|
"\t//\t ------\n"
|
|
"\t//\t (IWIDTH+CWIDTH+2) bits at full precision.\n"
|
|
"\t//\n"
|
|
"\t// However, the coefficient multiply multiplied by a maximum value\n"
|
|
"\t// of 2^(CWIDTH-2). Thus, we only have\n"
|
|
"\t//\t IWIDTH bits for the input\n"
|
|
"\t//\t +1 bit for the add/subtract\n"
|
|
"\t//\t+CWIDTH-2 bits for the coefficient multiply\n"
|
|
"\t//\t +1 (optional) bit for the add/subtract in the cpx mpy.\n"
|
|
"\t//\t -------- ... multiply. (This last bit may be shifted out.)\n"
|
|
"\t//\t (IWIDTH+CWIDTH) valid output bits. \n"
|
|
"\t// Now, if the user wants to keep any extras of these (via OWIDTH),\n"
|
|
"\t// or if he wishes to arbitrarily shift some of these off (via\n"
|
|
"\t// SHIFT) we accomplish that here.\n"
|
|
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
|
|
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
|
|
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
|
|
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
|
|
"\n"
|
|
"\t// As a final step, we pack our outputs into two packed two\'s\n"
|
|
"\t// complement numbers per output word, so that each output word\n"
|
|
"\t// has (2*OWIDTH) bits in it, with the top half being the real\n"
|
|
"\t// portion and the bottom half being the imaginary portion.\n"
|
"\tassign o_left = { o_left_r, o_left_i };\n"
|
"\tassign o_left = { o_left_r, o_left_i };\n"
|
"\tassign o_right= { o_right_r,o_right_i};\n"
|
"\tassign o_right= { o_right_r,o_right_i};\n"
|
"\n"
|
"\n"
|
"endmodule\n");
|
"endmodule\n");
|
fclose(fp);
|
fclose(fp);
|
Line 905... |
Line 993... |
"\t\t\t\toB <= oB + 1;\n"
|
"\t\t\t\toB <= oB + 1;\n"
|
"\t\t\t\to_sync <= 1'b0;\n"
|
"\t\t\t\to_sync <= 1'b0;\n"
|
"\t\t\tend else\n"
|
"\t\t\tend else\n"
|
"\t\t\t\to_sync <= 1'b0;\n"
|
"\t\t\t\to_sync <= 1'b0;\n"
|
"\t\tend\n"
|
"\t\tend\n"
|
"\n"
|
"\n", (inv)?"i":"");
|
|
fprintf(fstage,
|
"\tbutterfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n"
|
"\tbutterfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n"
|
"\t\t\t.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n"
|
"\t\t\t.MPYDELAY(%d\'d%d),.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n"
|
"\t\tbfly(i_clk, (b_ce&i_ce), ib_c,\n"
|
"\t\tbfly(i_clk, (b_ce&i_ce), ib_c,\n"
|
"\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n"
|
"\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n"
|
"endmodule;\n",
|
"endmodule;\n",
|
(inv)?"i":"");
|
lgdelay(nbits, xtra), (1<xtra)?(nbits+4):(nbits+xtra+3));
|
}
|
}
|
|
|
void usage(void) {
|
void usage(void) {
|
fprintf(stderr,
|
fprintf(stderr,
|
"USAGE:\tfftgen [-f <size>] [-d dir] [-c cbits] [-n nbits] [-m mxbits] [-s01]\n"
|
"USAGE:\tfftgen [-f <size>] [-d dir] [-c cbits] [-n nbits] [-m mxbits] [-s01]\n"
|