OpenCores
URL https://opencores.org/ocsvn/dblclockfft/dblclockfft/trunk

Subversion Repositories dblclockfft

Compare Revisions

  • This comparison shows the changes necessary to convert path
    /dblclockfft/trunk
    from Rev 4 to Rev 5
    Reverse comparison

Rev 4 → Rev 5

/bench/cpp/butterfly_tb.cpp
4,81 → 4,256
#include "Vbutterfly.h"
#include "verilated.h"
 
// Advance the simulated design by one full clock cycle: evaluate the
// model once with i_clk low and once more with i_clk high.
void tick(Vbutterfly *bfly) {
	for (int level = 0; level <= 1; level++) {
		bfly->i_clk = level;
		bfly->eval();
	}
}
class BFLY_TB {
public:
Vbutterfly *m_bfly;
unsigned long m_left[64], m_right[64];
int m_addr, m_lastaux;
 
// Clock the design twice with i_ce held low: first with i_rst asserted,
// then with i_rst released.
void reset(Vbutterfly *bfly) {
	const int rst_level[2] = { 1, 0 };

	for (int step = 0; step < 2; step++) {
		bfly->i_ce  = 0;
		bfly->i_rst = rst_level[step];
		tick(bfly);
	}
}
BFLY_TB(void) {
m_bfly = new Vbutterfly;
m_addr = 0;
}
 
// Run the model through one full clock cycle (i_clk low, then high).
void tick(void) {
	// Keep the value o_aux had before this clock; test() compares it
	// with the new o_aux to detect a rising edge.
	m_lastaux = m_bfly->o_aux;

	for (int level = 0; level <= 1; level++) {
		m_bfly->i_clk = level;
		m_bfly->eval();
	}
}
 
// Bring the model to a known state before a test sequence: one clock
// with the clock enable low, forty enabled clocks with i_aux high and
// all data inputs zero, then one enabled clock with i_aux low.
void reset(void) {
	// Disabled clock with zeroed data and coefficient inputs.
	m_bfly->i_left  = 0;
	m_bfly->i_right = 0;
	m_bfly->i_coef  = 0l;
	m_bfly->i_aux   = 1;
	m_bfly->i_ce    = 0;
	tick();

	// Forty enabled clocks with i_aux still high.
	m_bfly->i_aux = 1;
	m_bfly->i_ce  = 1;
	int remaining = 40;
	while (remaining-- > 0)
		tick();

	// One final clock with i_aux dropped.
	m_bfly->i_aux = 0;
	tick();
}
 
void test(const int n, const int k, const unsigned long cof,
const unsigned lft, const unsigned rht, const int aux) {
 
m_bfly->i_coef = cof & (~(-1l << 40));
m_bfly->i_left = lft;
m_bfly->i_right = rht;
m_bfly->i_aux = aux & 1;
 
tick();
 
if ((m_bfly->o_aux)&&(!m_lastaux))
printf("\n");
printf("n,k=%d,%3d: COEF=%010lx, LFT=%08x, RHT=%08x, A=%d, OLFT =%09lx, ORHT=%09lx, AUX=%d\n",
n,k,
m_bfly->i_coef & (~(-1l<<40)),
m_bfly->i_left,
m_bfly->i_right,
m_bfly->i_aux,
m_bfly->o_left,
m_bfly->o_right,
m_bfly->o_aux);
/*
printf("\tFI=%010lx",
((((long)m_bfly->v__DOT__r_aux_2)&1l)<<34)
|((((long)m_bfly->v__DOT__r_sum_r)&0x01ffffl)<<17)
|(((long)m_bfly->v__DOT__r_sum_i)&0x01ffffl));
printf("\tFO=%010lx SUMR=%05x SUMI=%05x A=%d",
m_bfly->v__DOT__fifo_read,
m_bfly->v__DOT__r_sum_r,
m_bfly->v__DOT__r_sum_i,
m_bfly->v__DOT__r_aux_2);
printf("\tML=%09lx, MR=%09lx, ",
m_left[ (m_addr-23)&(64-1)],
m_right[(m_addr-23)&(64-1)]);
*/
/*
printf("\tBLFTR=%10lx BLFTI=%10lx",
m_bfly->v__DOT__b_left_r & (~(-1l<<40)),
m_bfly->v__DOT__b_left_i & (~(-1l<<40)));
printf("\tMPYR=%10lx MPYI=%10lx",
m_bfly->v__DOT__mpy_r & (~(-1l<<40)),
m_bfly->v__DOT__mpy_i & (~(-1l<<40)));
printf("\n");
*/
 
if (m_left[(m_addr-23)&(64-1)] != m_bfly->o_left) {
fprintf(stderr, "WRONG O_LEFT!\n");
exit(-1);
}
 
if (m_right[(m_addr-23)&(64-1)] != m_bfly->o_right) {
fprintf(stderr, "WRONG O_RIGHT!\n");
exit(-1);
}
 
// Now, let's calculate an "expected" result ...
long rlft, ilft;
 
// Extract left and right values ...
rlft = (m_bfly->i_left >> 16) & 0x0ffff;
ilft = (m_bfly->i_left ) & 0x0ffff;
// Make certain they are properly sign extended ...
if (rlft & 0x8000) rlft |= (-1<<16);
if (ilft & 0x8000) ilft |= (-1<<16);
 
// Now repeat for the right hand value ...
long rrht, irht;
// Extract left and right values ...
rrht = (m_bfly->i_right >> 16) & 0x0ffff;
irht = (m_bfly->i_right ) & 0x0ffff;
// Make certain they are properly sign extended ...
if (rrht & 0x8000) rrht |= (-1<<16);
if (irht & 0x8000) irht |= (-1<<16);
 
 
// and again for the coefficients
long rcof, icof;
// Extract left and right values ...
rcof = (m_bfly->i_coef >> 20) & 0x0fffff;
icof = (m_bfly->i_coef ) & 0x0fffff;
// Make certain they are properly sign extended ...
if (rcof & 0x80000) rcof |= (-1<<20);
if (icof & 0x80000) icof |= (-1<<20);
 
 
// Now, let's do the butterfly ourselves ...
long sumi, sumr, difi, difr;
sumr = rlft + rrht;
sumi = ilft + irht;
difr = rlft - rrht;
difi = ilft - irht;
 
/*
printf("L=%5lx+%5lx,R=%5lx+%5lx,S=%5lx+%5lx,D=%5lx+%5lx, ",
rlft & 0x02ffffl,
ilft & 0x02ffffl,
rrht & 0x02ffffl,
irht & 0x02ffffl,
sumr & 0x02ffffl,
sumi & 0x02ffffl,
difr & 0x02ffffl,
difi & 0x02ffffl);
*/
long p1, p2, p3, mpyr, mpyi;
p1 = difr * rcof;
p2 = difi * icof;
p3 = (difr + difi) * (rcof + icof);
 
mpyr = p1-p2;
mpyi = p3-p1-p2;
 
/*
printf("RC=%lx, IC=%lx, ", rcof, icof);
printf("P1=%lx,P2=%lx,P3=%lx, ", p1,p2,p3);
printf("MPYr = %lx, ", mpyr);
printf("MPYi = %lx, ", mpyi);
*/
 
long o_left_r, o_left_i, o_right_r, o_right_i;
unsigned long o_left, o_right;
 
o_left_r = sumr & 0x01ffff; o_left_i = sumi & 0x01ffff;
o_left = (o_left_r << 17) | (o_left_i);
 
o_right_r = (mpyr>>18) & 0x01ffff;
o_right_i = (mpyi>>18) & 0x01ffff;
o_right = (o_right_r << 17) | (o_right_i);
/*
printf("oR_r = %lx, ", o_right_r);
printf("oR_i = %lx\n", o_right_i);
*/
 
m_left[ m_addr&(64-1)] = o_left;
m_right[m_addr&(64-1)] = o_right;
 
m_addr++;
}
};
 
int main(int argc, char **argv, char **envp) {
Verilated::commandArgs(argc, argv);
Vbutterfly *bfly = new Vbutterfly;
BFLY_TB *bfly = new BFLY_TB;
int16_t ir0, ii0, lstr, lsti;
int32_t sumr, sumi, difr, difi;
int32_t smr, smi, dfr, dfi;
int rnd = 0;
 
reset(bfly);
const int TESTSZ = 256;
 
for(int k=0; k<270; k++) {
int32_t or0, oi0, or1, oi1;
bfly->reset();
 
bfly->i_ce = 1;
bfly->i_sync = ((k&0x0ff)==0);
// Let's pick some random values, ...
ir0 = rand(); if (ir0&4) ir0 = -ir0;
ii0 = rand(); if (ii0&2) ii0 = -ii0;
bfly->test(9,0,0x4000000000l,0x7fff0000,0x7fff0000, 1);
bfly->test(9,1,0x4000000000l,0x7fff0000,0x80010000, 0);
bfly->test(9,2,0x4000000000l,0x00007fff,0x00008001, 0);
bfly->test(9,3,0x4000000000l,0x00007fff,0x00007fff, 0);
 
bfly->i_data = ((ir0&0x0ffff) << 16) | (ii0 & 0x0ffff);
tick(bfly);
bfly->test(8,0,0x4000000000l,0x80010000,0x80010000, 1);
bfly->test(8,1,0x4000000000l,0x00008001,0x00008001, 0);
 
printf("k=%3d: COEF=%08x, LFT=%08x, RHT=%08x, AUX=%d, OLFT =%09lx, ORHT=%09lx, AUX=%d\n",
k, bfly->i_coef, bfly->i_left, bfly->i_right, bfly->i_aux,
bfly->o_left, bfly->o_right, bfly->o_sync);
bfly->test(9,0,0x4000000000l,0x40000000,0xc0000000, 1);
bfly->test(9,1,0x4000000000l,0x40000000,0x40000000, 0);
bfly->test(9,2,0x4000000000l,0x00004000,0x0000c000, 0);
bfly->test(9,3,0x4000000000l,0x00004000,0x00004000, 0);
 
or0 = (bfly->o_data >> 17) & 0x01ffff;
oi0 = bfly->o_data & 0x01ffff;
if (or0 & 0x010000) or0 |= (-1<<16);
if (oi0 & 0x010000) oi0 |= (-1<<16);
bfly->test(9,0,0x4000000000l,0x20000000,0xe0000000, 1);
bfly->test(9,1,0x4000000000l,0x20000000,0x20000000, 0);
bfly->test(9,2,0x4000000000l,0x00002000,0x0000e000, 0);
bfly->test(9,3,0x4000000000l,0x00002000,0x00002000, 0);
 
if (k>3) {
/*
printf("\tOR0 = %6x, OI0 = %6x, SUM = %6x + %6x, DIF = %6x + %6x\n",
or0, oi0, sumr, sumi, difr, difi);
*/
if (0==(k&1)) {
if (or0 != sumr) {fprintf(stderr, "FAIL 1\n"); exit(-1);}
if (oi0 != sumi) {fprintf(stderr, "FAIL 2\n"); exit(-1);}
} else if (1==(k&1)) {
if (or0 != difr) {fprintf(stderr, "FAIL 3\n"); exit(-1);}
if (oi0 != difi) {fprintf(stderr, "FAIL 4\n"); exit(-1);}
}
}
bfly->test(9,0,0x4000000000l,0x00080000,0xfff80000, 1);
bfly->test(9,1,0x4000000000l,0x00080000,0x00080000, 0);
bfly->test(9,2,0x4000000000l,0x00000008,0x0000fff8, 0);
bfly->test(9,3,0x4000000000l,0x00000008,0x00000008, 0);
 
if (((4==(k&0x0ff))?1:0) != bfly->o_sync) { fprintf(stderr, "BAD O-SYNC\n"); exit(-1); }
bfly->test(9,0,0x4000000000l,0x00010000,0xffff0000, 1);
bfly->test(9,1,0x4000000000l,0x00010000,0x00010000, 0);
bfly->test(9,2,0x4000000000l,0x00000001,0x0000ffff, 0);
bfly->test(9,3,0x4000000000l,0x00000001,0x00000001, 0);
 
if (1 == (k&1)) {
sumr = smr; sumi = smi; difr=dfr, difi= dfi;
for(int n=0; n<4; n++) for(int k=0; k<TESTSZ; k++) {
long iv, rv;
unsigned long lft, rht, cof;
double c, s, W;
bool inv = 1;
int aux;
 
smr = lstr + ir0 + rnd;
smi = lsti + ii0 + rnd;
W = ((inv)?-1:1) * 2.0 * M_PI * (2*k) / TESTSZ * 64;
c = cos(W); s = sin(W);
rv = (long)((double)(1l<<(16-2-n))*c+0.5);
iv = (long)((double)(1l<<(16-2-n))*s+0.5);
 
dfr = lstr - ir0 + rnd;
dfi = lsti - ii0 + rnd;
}
rv = (rv << 16) | (iv & (~(-1<<16)));
lft = rv;
 
lstr = ir0;
lsti = ii0;
W = ((inv)?-1:1) * 2.0 * M_PI * (2*k+1) / TESTSZ * 64;
c = cos(W); s = sin(W);
rv = (long)((double)(1l<<(16-2-n))*c+0.5);
iv = (long)((double)(1l<<(16-2-n))*s+0.5);
 
rv = (rv << 16) | (iv & (~(-1<<16)));
rht = rv;
 
 
// Switch the sign of W
W = ((inv)?1:-1) * 2.0 * M_PI * (2*k) / TESTSZ;
c = cos(W); s = sin(W);
rv = (long)((double)(1l<<(20-2))*c+0.5); // Keep 20-2 bits for
iv = (long)((double)(1l<<(20-2))*s+0.5); // coefficients
 
rv = (rv << 20) | (iv & (~(-1<<20)));
cof = rv;
 
aux = ((k&(TESTSZ-1))==0);
 
bfly->test(n,k, cof, lft, rht, aux);
}
 
delete bfly;
/bench/cpp/Makefile
1,4 → 1,4
all: mpy_tb dblrev_tb dblstage_tb qtrstage_tb
all: mpy_tb dblrev_tb dblstage_tb qtrstage_tb test
 
OBJDR:= ../../sw/fft-core/obj_dir
VINC := -I/usr/share/verilator/include -I$(OBJDR)/
6,6 → 6,7
DBLRV:= $(OBJDR)/Vdblreverse__ALL.a
DBLSG:= $(OBJDR)/Vdblstage__ALL.a
QTRSG:= $(OBJDR)/Vqtrstage__ALL.a
BFLYL:= $(OBJDR)/Vbutterfly__ALL.a
VERILATOR_ROOT := /usr/share/verilator
 
mpy_tb: mpy_tb.cpp $(MPYLB)
20,6 → 21,18
qtrstage_tb: qtrstage_tb.cpp $(QTRSG)
g++ -g $(VINC) $< $(QTRSG) $(VERILATOR_ROOT)/include/verilated.cpp -o $@
 
# Build the butterfly test bench: compile butterfly_tb.cpp ($<) and link it
# with the Vbutterfly archive named by BFLYL and with verilated.cpp from
# $(VERILATOR_ROOT)/include.
butterfly_tb: butterfly_tb.cpp $(BFLYL)
g++ -g $(VINC) $< $(BFLYL) $(VERILATOR_ROOT)/include/verilated.cpp -o $@
 
# Build every test bench listed as a prerequisite, then run them one after
# another.  By default make stops at the first one that exits non-zero.
.PHONY: test
test: mpy_tb dblrev_tb dblstage_tb qtrstage_tb butterfly_tb
./mpy_tb
./dblrev_tb
./dblstage_tb
./qtrstage_tb
./butterfly_tb
 
# Remove every test-bench executable this Makefile builds, including
# butterfly_tb.  -f keeps "make clean" from failing when some of them
# were never built.
.PHONY: clean
clean:
	rm -f mpy_tb dblrev_tb dblstage_tb qtrstage_tb butterfly_tb
 
/sw/fftgen.cpp
25,7 → 25,10
"// for more details.\n"
"//\n"
"// You should have received a copy of the GNU General Public License along\n"
"// with this program. If not, see <http://www.gnu.org/licenses/>.\n"
"// with this program. (It's in the $(ROOT)/doc directory, run make with no\n"
"// target there if the PDF file isn\'t present.) If not, see\n"
"// <http://www.gnu.org/licenses/> for a copy.\n"
"//\n"
"// License: GPL, v3, as defined and found on www.gnu.org,\n"
"// http://www.gnu.org/licenses/gpl.html\n"
"//\n"
55,9 → 58,9
int cbits = nbits + xtra;
int delay = nbits + 2;
if (nbits+1<cbits)
delay = nbits+2;
delay = nbits+4;
else
delay = cbits+1;
delay = cbits+3;
return lgval(delay);
}
 
76,13 → 79,11
"// \n"
"// Project: %s\n"
"//\n"
"// Purpose: This file is (almost) a Verilog source file. It is meant to\n"
"// be used by a FFT core compiler to generate FFTs which may be\n"
"// used as part of an FFT core. Specifically, this file \n"
"// encapsulates the options of a 4 point, decimation in\n"
"// frequency FFT-stage. This particular stage is optimized so\n"
"// that all of the multiplies are accomplished by additions and\n"
"// mux'es.\n"
"// Purpose: This file encapsulates the 4 point stage of a decimation in\n"
"// frequency FFT. This particular implementation is optimized\n"
"// so that all of the multiplies are accomplished by additions\n"
"// and multiplexers only.\n"
"//\n"
"//\n%s"
"//\n",
prjname, creator);
90,100 → 91,100
 
fprintf(fp,
"module\tqtrstage(i_clk, i_rst, i_ce, i_sync, i_data, o_data, o_sync);\n"
"\tparameter IWIDTH=16, OWIDTH=IWIDTH+1;\n"
"\t// Parameters specific to the core that should be changed when this\n"
"\t// core is built ... Note that the minimum LGSPAN is 2. Smaller \n"
"\t// spans must use the fftdoubles stage.\n"
"\tparameter\tLGWIDTH=8, ODD=0, INVERSE=0,SHIFT=0;\n"
"\tinput\t i_clk, i_rst, i_ce, i_sync;\n"
"\tinput\t [(2*IWIDTH-1):0] i_data;\n"
"\toutput\treg [(2*OWIDTH-1):0] o_data;\n"
"\toutput\treg o_sync;\n"
"\t\n"
"\treg\t wait_for_sync;\n"
"\treg\t[2:0] pipeline;\n"
"\tparameter IWIDTH=16, OWIDTH=IWIDTH+1;\n"
"\t// Parameters specific to the core that should be changed when this\n"
"\t// core is built ... Note that the minimum LGSPAN is 2. Smaller \n"
"\t// spans must use the fftdoubles stage.\n"
"\tparameter\tLGWIDTH=8, ODD=0, INVERSE=0,SHIFT=0;\n"
"\tinput\t i_clk, i_rst, i_ce, i_sync;\n"
"\tinput\t [(2*IWIDTH-1):0] i_data;\n"
"\toutput\treg [(2*OWIDTH-1):0] o_data;\n"
"\toutput\treg o_sync;\n"
"\t\n"
"\treg\t wait_for_sync;\n"
"\treg\t[2:0] pipeline;\n"
"\n"
"\treg\t[(IWIDTH):0] sum_r, sum_i, diff_r, diff_i;\n"
"\twire\t[(IWIDTH):0] n_diff_i;\n"
"\tassign n_diff_i = -diff_i;\n"
"\treg\t[(IWIDTH):0] sum_r, sum_i, diff_r, diff_i;\n"
"\twire\t[(IWIDTH):0] n_diff_i;\n"
"\tassign n_diff_i = -diff_i;\n"
"\n"
"\treg\t[(2*OWIDTH-1):0] ob_a;\n"
"\twire\t[(2*OWIDTH-1):0] ob_b;\n"
"\treg\t[(OWIDTH-1):0] ob_b_r, ob_b_i;\n"
"\tassign ob_b = { ob_b_r, ob_b_i };\n"
"\treg\t[(2*OWIDTH-1):0] ob_a;\n"
"\twire\t[(2*OWIDTH-1):0] ob_b;\n"
"\treg\t[(OWIDTH-1):0] ob_b_r, ob_b_i;\n"
"\tassign ob_b = { ob_b_r, ob_b_i };\n"
"\n"
"\treg\t[(LGWIDTH-1):0] iaddr;\n"
"\treg\t[(2*IWIDTH-1):0] imem;\n"
"\treg\t[(LGWIDTH-1):0] iaddr;\n"
"\treg\t[(2*IWIDTH-1):0] imem;\n"
"\n"
"\twire\tsigned\t[(IWIDTH-1):0]\timem_r, imem_i;\n"
"\tassign\timem_r = imem[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\timem_i = imem[(IWIDTH-1):0];\n"
"\twire\tsigned\t[(IWIDTH-1):0]\timem_r, imem_i;\n"
"\tassign\timem_r = imem[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\timem_i = imem[(IWIDTH-1):0];\n"
"\n"
"\twire\tsigned\t[(IWIDTH-1):0]\ti_data_r, i_data_i;\n"
"\tassign\ti_data_r = i_data[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\ti_data_i = i_data[(IWIDTH-1):0];\n"
"\twire\tsigned\t[(IWIDTH-1):0]\ti_data_r, i_data_i;\n"
"\tassign\ti_data_r = i_data[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\ti_data_i = i_data[(IWIDTH-1):0];\n"
"\n"
"\treg [(2*OWIDTH-1):0] omem;\n"
"\treg [(2*OWIDTH-1):0] omem;\n"
"\n"
"\twire [(IWIDTH-1):0] rnd;\n"
"\tassign rnd = ((IWIDTH+1-OWIDTH-SHIFT)!=0) ? { {(IWIDTH-1){1'b0}}, (OWIDTH<IWIDTH+1)? 1'b1:1'b0 } : {{(IWIDTH){1'b0}}};\n"
"\twire [(IWIDTH-1):0] rnd;\n"
"\tassign rnd = ((IWIDTH+1-OWIDTH-SHIFT)!=0) ? { {(IWIDTH-1){1'b0}}, (OWIDTH<IWIDTH+1)? 1'b1:1'b0 } : {{(IWIDTH){1'b0}}};\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_rst)\n"
"\t\tbegin\n"
"\t\t\twait_for_sync <= 1'b1;\n"
"\t\t\tiaddr <= 0;\n"
"\t\t\tpipeline <= 3'b000;\n"
"\t\tend\n"
"\t\telse if ((i_ce)&&((~wait_for_sync)||(i_sync)))\n"
"\t\tbegin\n"
"\t\t\t// Always\n"
"\t\t\timem <= i_data;\n"
"\t\t\tiaddr <= iaddr + 1;\n"
"\t\t\twait_for_sync <= 1'b0;\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_rst)\n"
"\t\tbegin\n"
"\t\t\twait_for_sync <= 1'b1;\n"
"\t\t\tiaddr <= 0;\n"
"\t\t\tpipeline <= 3'b000;\n"
"\t\tend\n"
"\t\telse if ((i_ce)&&((~wait_for_sync)||(i_sync)))\n"
"\t\tbegin\n"
"\t\t\t// Always\n"
"\t\t\timem <= i_data;\n"
"\t\t\tiaddr <= iaddr + 1;\n"
"\t\t\twait_for_sync <= 1'b0;\n"
"\n"
"\t\t\t// In sequence, clock = 0\n"
"\t\t\tif (iaddr[0])\n"
"\t\t\tbegin\n"
"\t\t\t\tsum_r <= imem_r + i_data_r + rnd;\n"
"\t\t\t\tsum_i <= imem_i + i_data_i + rnd;\n"
"\t\t\t\tdiff_r <= imem_r - i_data_r + rnd;\n"
"\t\t\t\tdiff_i <= imem_i - i_data_i + rnd;\n"
"\t\t\t// In sequence, clock = 0\n"
"\t\t\tif (iaddr[0])\n"
"\t\t\tbegin\n"
"\t\t\t\tsum_r <= imem_r + i_data_r + rnd;\n"
"\t\t\t\tsum_i <= imem_i + i_data_i + rnd;\n"
"\t\t\t\tdiff_r <= imem_r - i_data_r + rnd;\n"
"\t\t\t\tdiff_i <= imem_i - i_data_i + rnd;\n"
"\n"
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b1 };\n"
"\t\t\tend else\n"
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n"
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b1 };\n"
"\t\t\tend else\n"
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n"
"\n"
"\t\t\t// In sequence, clock = 1\n"
"\t\t\tif (pipeline[1])\n"
"\t\t\tbegin\n"
"\t\t\t ob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n"
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n"
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n"
"\t\t\t\tif (~ODD)\n"
"\t\t\t\tbegin\n"
"\t\t\t// In sequence, clock = 1\n"
"\t\t\tif (pipeline[1])\n"
"\t\t\tbegin\n"
"\t\t\t\tob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n"
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n"
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n"
"\t\t\t\tif (~ODD)\n"
"\t\t\t\tbegin\n"
"\t\t\t\t\tob_b_r <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\t\tob_b_i <= diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\tend else if (~INVERSE) begin\n"
"\t\t\t\tend else if (~INVERSE) begin\n"
"\t\t\t\t\t// on Odd, W = e^{-j2pi 1/4} = -j\n"
"\t\t\t\t\tob_b_r <= diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\t\tob_b_i <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\tend else begin\n"
"\t\t\t\tend else begin\n"
"\t\t\t\t\t// on Odd, W = e^{j2pi 1/4} = j\n"
"\t\t\t\t\tob_b_r <= n_diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\t\tob_b_i <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t\t\t\tend\n"
"\t\t\t\t// (wire) ob_b <= { ob_b_r, ob_b_i };\n"
"\t\t\tend\n"
"\t\t\t// In sequence, clock = 2\n"
"\t\t\tif (pipeline[2])\n"
"\t\t\tbegin\n"
"\t\t\t\tomem <= ob_b;\n"
"\t\t\t\to_data <= ob_a;\n"
"\t\t\tend else\n"
"\t\t\t\to_data <= omem;\n"
"\t\t\to_sync <= &(~iaddr[(LGWIDTH-1):3]) && (iaddr[2:0] == 3'b100);\n"
"\t\tend\n"
"\t\t\t\tend\n"
"\t\t\t\t// (wire) ob_b <= { ob_b_r, ob_b_i };\n"
"\t\t\tend\n"
"\t\t\t// In sequence, clock = 2\n"
"\t\t\tif (pipeline[2])\n"
"\t\t\tbegin\n"
"\t\t\t\tomem <= ob_b;\n"
"\t\t\t\to_data <= ob_a;\n"
"\t\t\tend else\n"
"\t\t\t\to_data <= omem;\n"
"\t\t\to_sync <= &(~iaddr[(LGWIDTH-1):3]) && (iaddr[2:0] == 3'b100);\n"
"\t\tend\n"
"endmodule\n");
}
 
203,12 → 204,13
"// Project: %s\n"
"//\n"
"// Purpose: This is part of an FPGA implementation that will process\n"
"// data at two samples per clock. If you notice from the\n"
"// derivation of an FFT, the only time both even and odd\n"
"// samples are used at the same time is the first stage.\n"
"// Therefore, after this stage and these twiddles, all of the\n"
"// other stages can run two stages at a time at one sample per\n"
"// clock.\n"
"// the final stage of a decimate-in-frequency FFT, running\n"
"// through the data at two samples per clock. If you notice\n"
"// from the derivation of an FFT, the only time both even and\n"
"// odd samples are used at the same time is in this stage.\n"
"// Therefore, other than this stage and these twiddles, all of\n"
"// the other stages can run two stages at a time at one sample\n"
"// per clock.\n"
"//\n"
"// In this implementation, the output is valid one clock after\n"
"// the input is valid. The output also accumulates one bit\n"
229,43 → 231,43
fprintf(fp, "%s", cpyleft);
fprintf(fp,
"module dblstage(i_clk, i_ce, i_left, i_right, o_left, o_right);\n"
"\tparameter\tIWIDTH=16,OWIDTH=IWIDTH+1, SHIFT=0;\n"
"\tinput\t\ti_clk, i_ce;\n"
"\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n"
"\toutput\twire [(2*OWIDTH-1):0]\to_left, o_right;\n"
"\tparameter\tIWIDTH=16,OWIDTH=IWIDTH+1, SHIFT=0;\n"
"\tinput\t\ti_clk, i_ce;\n"
"\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n"
"\toutput\twire [(2*OWIDTH-1):0]\to_left, o_right;\n"
"\n"
"\twire\tsigned\t[(IWIDTH-1):0]\ti_in_0r, i_in_0i, i_in_1r, i_in_1i;\n"
"\tassign\ti_in_0r = i_left[(2*IWIDTH-1):(IWIDTH)]; \n"
"\tassign\ti_in_0i = i_left[(IWIDTH-1):0]; \n"
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n"
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n"
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n"
"\t\t\t\t\to_out_1r, o_out_1i;\n"
"\twire\tsigned\t[(IWIDTH-1):0]\ti_in_0r, i_in_0i, i_in_1r, i_in_1i;\n"
"\tassign\ti_in_0r = i_left[(2*IWIDTH-1):(IWIDTH)]; \n"
"\tassign\ti_in_0i = i_left[(IWIDTH-1):0]; \n"
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n"
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n"
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n"
"\t\t\t\t\to_out_1r, o_out_1i;\n"
"\n"
"\t// Don't forget that we accumulate a bit by adding two values together.\n"
"\t// Therefore our intermediate value must have one more bit than the\n"
"\t// two originals.\n"
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n"
"\t// Don't forget that we accumulate a bit by adding two values\n"
"\t// together. Therefore our intermediate value must have one more\n"
"\t// bit than the two originals.\n"
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
"\t\t\tout_0r <= i_in_0r + i_in_1r;\n"
"\t\t\tout_0i <= i_in_0i + i_in_1i;\n"
"\t\t\t//\n"
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n"
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n"
"\t\tend\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
"\t\t\tout_0r <= i_in_0r + i_in_1r;\n"
"\t\t\tout_0i <= i_in_0i + i_in_1i;\n"
"\t\t\t//\n"
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n"
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n"
"\t\tend\n"
"\n"
"\t// Now, if the master control program doesn't want to keep all of our\n"
"\t// bits, we can shift down to OWIDTH bits here.\n"
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\t// Now, if the master control program doesn't want to keep all of\n"
"\t// our bits, we can shift down to OWIDTH bits here.\n"
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n"
"\n"
"\tassign\to_left = { o_out_0r, o_out_0i };\n"
"\tassign\to_right = { o_out_1r, o_out_1i };\n"
"\tassign\to_left = { o_out_0r, o_out_0i };\n"
"\tassign\to_right = { o_out_1r, o_out_1i };\n"
"\n"
"endmodule\n");
fclose(fp);
326,6 → 328,10
"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n"
"\tgenvar k;\n"
"\n"
"\t// If we were forced to stay within two\'s complement arithmetic,\n"
"\t// taking the absolute value here would require an additional bit.\n"
"\t// However, because our results are now unsigned, we can stay\n"
"\t// within the number of bits given (for now).\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
431,65 → 437,70
"// o_1[m] = mem[01xxx1]\n"
"// ...\n"
"//\n"
"// The answer is that, yes we can but: we need to use four memory banks\n"
"// to do it properly. These four banks are defined by the two bits\n"
"// that determine the top and bottom of the correct address. Larger\n"
"// FFT\'s would require more memories.\n"
"//\n"
"//\n");
fprintf(fp,
"module dblreverse(i_clk, i_rst, i_ce, i_in_0, i_in_1,\n"
"\t\to_out_0, o_out_1, o_sync);\n"
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n"
"\tinput\t\t\t\ti_clk, i_rst, i_ce;\n"
"\tinput\t\t[(2*WIDTH-1):0]\ti_in_0, i_in_1;\n"
"\toutput\treg\t[(2*WIDTH-1):0]\to_out_0, o_out_1;\n"
"\toutput\treg\t\t\to_sync;\n"
"\t\to_out_0, o_out_1, o_sync);\n"
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n"
"\tinput\t\t\t\ti_clk, i_rst, i_ce;\n"
"\tinput\t\t[(2*WIDTH-1):0]\ti_in_0, i_in_1;\n"
"\toutput\treg\t[(2*WIDTH-1):0]\to_out_0, o_out_1;\n"
"\toutput\treg\t\t\to_sync;\n"
"\n"
"\treg\tin_reset;\n"
"\treg\t[(LGSIZE):0]\tiaddr;\n"
"\treg\t[(2*WIDTH-1):0]\tmem_0e [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_0o [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_1e [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_1o [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\tin_reset;\n"
"\treg\t[(LGSIZE):0]\tiaddr;\n"
"\treg\t[(2*WIDTH-1):0]\tmem_0e [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_0o [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_1e [0:((1<<(LGSIZE-1))-1)];\n"
"\treg\t[(2*WIDTH-1):0]\tmem_1o [0:((1<<(LGSIZE-1))-1)];\n"
"\n"
"\twire\t[(2*LGSIZE-1):0] braddr;\n"
"\tgenvar\tk;\n"
"\tgenerate for(k=0; k<LGSIZE; k++)\n"
"\t\tassign braddr[k] = iaddr[LGSIZE-1-k];\n"
"\tendgenerate\n"
"\twire\t[(2*LGSIZE-1):0] braddr;\n"
"\tgenvar\tk;\n"
"\tgenerate for(k=0; k<LGSIZE; k++)\n"
"\t\tassign braddr[k] = iaddr[LGSIZE-1-k];\n"
"\tendgenerate\n"
"\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_rst)\n"
"\t\tbegin\n"
"\t\t\tiaddr <= 0;\n"
"\t\t\tin_reset <= 1'b1;\n"
"\t\tend else if (i_ce)\n"
"\t\tbegin\n"
"\t\t\tif (iaddr[(LGSIZE-1)])\n"
"\t\t\tbegin\n"
"\t\t\t\tmem_1e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n"
"\t\t\t\tmem_1o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n"
"\t\t\tend else begin\n"
"\t\t\t\tmem_0e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n"
"\t\t\t\tmem_0o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n"
"\t\t\tend\n"
"\t\t\tiaddr <= iaddr + 2;\n"
"\t\t\tif (&iaddr[(LGSIZE-1):1])\n"
"\t\t\t\tin_reset <= 1'b0;\n"
"\t\t\tif (in_reset)\n"
"\t\t\tbegin\n"
"\t\t\t\to_out_0 <= {(2*WIDTH){1'b0}};\n"
"\t\t\t\to_out_1 <= {(2*WIDTH){1'b0}};\n"
"\t\t\t\to_sync <= 1'b0;\n"
"\t\t\tend else\n"
"\t\t\tbegin\n"
"\t\t\t\tif (braddr[0])\n"
"\t\t\t\tbegin\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_rst)\n"
"\t\tbegin\n"
"\t\t\tiaddr <= 0;\n"
"\t\t\tin_reset <= 1'b1;\n"
"\t\tend else if (i_ce)\n"
"\t\tbegin\n"
"\t\t\tif (iaddr[(LGSIZE-1)])\n"
"\t\t\tbegin\n"
"\t\t\t\tmem_1e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n"
"\t\t\t\tmem_1o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n"
"\t\t\tend else begin\n"
"\t\t\t\tmem_0e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n"
"\t\t\t\tmem_0o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n"
"\t\t\tend\n"
"\t\t\tiaddr <= iaddr + 2;\n"
"\t\t\tif (&iaddr[(LGSIZE-1):1])\n"
"\t\t\t\tin_reset <= 1'b0;\n"
"\t\t\tif (in_reset)\n"
"\t\t\tbegin\n"
"\t\t\t\to_out_0 <= {(2*WIDTH){1'b0}};\n"
"\t\t\t\to_out_1 <= {(2*WIDTH){1'b0}};\n"
"\t\t\t\to_sync <= 1'b0;\n"
"\t\t\tend else\n"
"\t\t\tbegin\n"
"\t\t\t\tif (braddr[0])\n"
"\t\t\t\tbegin\n"
"\t\t\t\t\to_out_0 <= mem_0o[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n"
"\t\t\t\t\to_out_1 <= mem_1o[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n"
"\t\t\t\tend else begin\n"
"\t\t\t\tend else begin\n"
"\t\t\t\t\to_out_0 <= mem_0e[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n"
"\t\t\t\t\to_out_1 <= mem_1e[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n"
"\t\t\t\tend\n"
"\t\t\t\to_sync <= ~(|iaddr[(LGSIZE-1):0]);\n"
"\t\t\tend\n"
"\t\tend\n"
"\t\t\t\tend\n"
"\t\t\t\to_sync <= ~(|iaddr[(LGSIZE-1):0]);\n"
"\t\t\tend\n"
"\t\tend\n"
"\n"
"endmodule;\n");
 
578,46 → 589,46
 
fprintf(fp,
"module\tbutterfly(i_clk, i_ce, i_coef, i_left, i_right, i_aux,\n"
"\t\to_left, o_right, o_aux);\n"
"\t// Public changeable parameters ...\n"
"\tparameter IWIDTH=16,CWIDTH=IWIDTH,OWIDTH=IWIDTH;\n"
"\t// Parameters specific to the core that should not be changed.\n"
"\tparameter MPYDELAY=(IWIDTH+1 < CWIDTH)?(IWIDTH+2):(CWIDTH+1),\n"
"\t\t\tSHIFT=0, ROUND=1;\n"
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n"
"\t// this value is fractional, then round up to the nearest\n"
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n"
"\tparameter LGDELAY=5;\n"
"\tinput i_clk, i_ce;\n"
"\tinput [(2*CWIDTH-1):0] i_coef;\n"
"\tinput [(2*IWIDTH-1):0] i_left, i_right;\n"
"\tinput i_aux;\n"
"\toutput wire [(2*OWIDTH-1):0] o_left, o_right;\n"
"\toutput wire o_aux;\n"
"\t\to_left, o_right, o_aux);\n"
"\t// Public changeable parameters ...\n"
"\tparameter IWIDTH=16,CWIDTH=IWIDTH+4,OWIDTH=IWIDTH+1;\n"
"\t// Parameters specific to the core that should not be changed.\n"
"\tparameter MPYDELAY=5'd20, // (IWIDTH+1 < CWIDTH)?(IWIDTH+4):(CWIDTH+3),\n"
"\t\t\tSHIFT=0, ROUND=0;\n"
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n"
"\t// this value is fractional, then round up to the nearest\n"
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n"
"\tparameter\tLGDELAY=5;\n"
"\tinput\t\ti_clk, i_ce;\n"
"\tinput\t\t[(2*CWIDTH-1):0] i_coef;\n"
"\tinput\t\t[(2*IWIDTH-1):0] i_left, i_right;\n"
"\tinput\t\ti_aux;\n"
"\toutput\twire [(2*OWIDTH-1):0] o_left, o_right;\n"
"\toutput\twire o_aux;\n"
"\n"
"\twire [(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n"
"\twire\t[(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n"
"\n"
"\treg [(2*IWIDTH-1):0] r_left, r_right;\n"
"\treg r_aux, r_aux_2;\n"
"\treg [(2*CWIDTH-1):0] r_coef, r_coef_2;\n"
"\twire [(CWIDTH-1):0] r_coef_r, r_coef_i;\n"
"\tassign r_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n"
"\tassign r_coef_i = r_coef_2[ (CWIDTH-1):0];\n"
"\twire [(IWIDTH-1):0] r_left_r, r_left_i, r_right_r, r_right_i;\n"
"\tassign r_left_r = i_left[ (2*IWIDTH-1):(IWIDTH)];\n"
"\tassign r_left_i = i_left[ (IWIDTH-1):0];\n"
"\tassign r_right_r = i_right[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign r_right_i = i_right[(IWIDTH-1):0];\n"
"\treg\t[(2*IWIDTH-1):0]\tr_left, r_right;\n"
"\treg\t\t\t\tr_aux, r_aux_2;\n"
"\treg\t[(2*CWIDTH-1):0]\tr_coef, r_coef_2;\n"
"\twire\tsigned\t[(CWIDTH-1):0]\tr_coef_r, r_coef_i;\n"
"\tassign\tr_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n"
"\tassign\tr_coef_i = r_coef_2[ ( CWIDTH-1):0];\n"
"\twire\tsigned\t[(IWIDTH-1):0]\tr_left_r, r_left_i, r_right_r, r_right_i;\n"
"\tassign\tr_left_r = r_left[ (2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\tr_left_i = r_left[ (IWIDTH-1):0];\n"
"\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n"
"\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n"
"\n"
"\treg [(IWIDTH):0] r_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
"\treg\tsigned\t[(IWIDTH):0]\tr_sum_r, r_sum_i, r_dif_r, r_dif_i;\n"
"\n"
"\treg [(LGDELAY-1):0] fifo_addr;\n"
"\twire [(LGDELAY-1):0] fifo_read_addr;\n"
"\t/* verilator lint_off WIDTH */\n"
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n"
"\t/* verilator lint_on WIDTH */\n"
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n"
"\n"
"\treg [(LGDELAY-1):0] fifo_addr;\n"
"\twire [(LGDELAY-1):0] fifo_read_addr;\n"
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n"
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n"
"\n");
fprintf(fp,
"\t// Set up the input to the multiply\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
635,7 → 646,11
"\t\t\tr_aux_2 <= r_aux;\n"
"\t\t\tr_coef_2<= r_coef;\n"
"\t\tend\n"
"\n"
"\n");
fprintf(fp,
"\t// Don\'t forget to record the even side, since it doesn\'t need\n"
"\t// to be multiplied, but yet we still need the results in sync\n"
"\t// with the answer when it is ready.\n"
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
646,33 → 661,62
"\t\t\tfifo_addr <= fifo_addr + 1;\n"
"\t\tend\n"
"\n"
"\twire [(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n"
"\tassign ir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"
"\tassign ir_coef_i = r_coef_2[(CWIDTH-1):0];\n"
"\twire [(IWIDTH+CWIDTH+1+2-1):0] p_one, p_two, p_three;\n"
"\twire\tsigned\t[(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n"
"\tassign\tir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n"
"\tassign\tir_coef_i = r_coef_2[(CWIDTH-1):0];\n"
"\twire\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\tp_one, p_two, p_three;\n"
"\n"
"\t// Multiply output is always a width of IWIDTH+CWIDTH-1. ALWAYS.\n"
"\t// We take care of dropping the width to OWIDTH in our routine\n"
"\t// below, but this is the definition of a multiply.\n"
"\n");
fprintf(fp,
"\t// Multiply output is always a width of the sum of the widths of\n"
"\t// the two inputs. ALWAYS. This is independent of the number of\n"
"\t// bits in p_one, p_two, or p_three. These values needed to \n"
"\t// accumulate a bit (or two) each. However, this approach to a\n"
"\t// three multiply complex multiply cannot increase the total\n"
"\t// number of bits in our final output. We\'ll take care of\n"
"\t// dropping back down to the proper width, OWIDTH, in our routine\n"
"\t// below.\n"
"\n"
"\n");
fprintf(fp,
"\t// We accomplish here \"Karatsuba\" multiplication. That is,\n"
"\t// by doing three multiplies we accomplish the work of four.\n"
"\t// Let\'s prove to ourselves that this works ... We wish to\n"
"\t// multiply: (a+jb) * (c+jd), where a+jb is given by\n"
"\t//\ta + jb = r_dif_r + j r_dif_i, and\n"
"\t//\tc + jd = ir_coef_r + j ir_coef_i.\n"
"\t// We do this by calculating the intermediate products P1, P2,\n"
"\t// and P3 as\n"
"\t//\tP1 = ac\n"
"\t//\tP2 = bd\n"
"\t//\tP3 = (a + b) * (c + d)\n"
"\t// and then complete our final answer with\n"
"\t//\tac - bd = P1 - P2 (this checks)\n"
"\t//\tad + bc = P3 - P2 - P1\n"
"\t//\t = (ac + bc + ad + bd) - bd - ac\n"
"\t//\t = bc + ad (this checks)\n"
"\n"
"\n"
"// This should really be based upon an IF\n"
"// if (IWIDTH < CWIDTH) then ...\n"
"\n");
fprintf(fp,
"\t// This should really be based upon an IF, such as in\n"
"\t// if (IWIDTH < CWIDTH) then ...\n"
"\t// However, this is the only (other) way I know to do it.\n"
"\tgenerate\n"
"\tif (CWIDTH < IWIDTH+1)\n"
"\tbegin\n"
"\t\t// We need to pad these first two multiplies by an extra\n"
"\t\t// just to keep them aligned with the third, simpler,\n"
"\t\t// multiply.\n"
"\t\t// bit just to keep them aligned with the third,\n"
"\t\t// simpler, multiply.\n"
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n"
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one);\n"
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n"
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n"
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i},\n"
"\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two);\n"
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n"
"\t\t\t\tir_coef_i+ir_coef_r, r_dif_r + r_dif_i, p_three);\n"
"\t\t\t\tir_coef_i+ir_coef_r,\n"
"\t\t\t\tr_dif_r + r_dif_i,\n"
"\t\t\t\tp_three);\n"
"\tend else begin\n"
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p1a(i_clk, i_ce,\n"
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r},\n"
679,7 → 723,7
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_one);\n"
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p2a(i_clk, i_ce,\n"
"\t\t\t\t{r_dif_i[IWIDTH], r_dif_i},\n"
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_two);\n"
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i}, p_two);\n"
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p3a(i_clk, i_ce,\n"
"\t\t\t\tr_dif_r+r_dif_i,\n"
"\t\t\t\tir_coef_i+ir_coef_r,\n"
686,20 → 730,35
"\t\t\t\tp_three);\n"
"\tend\n"
"\tendgenerate\n"
"\n"
"\n");
fprintf(fp,
"\t// These values are held in memory and delayed during the\n"
"\t// multiply. Here, we recover them. During the multiply,\n"
"\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n"
"\t// therefore, the left_x values need to be right shifted by\n"
"\t// CWIDTH-2 as well. The additional bits come from a sign\n"
"\t// extension.\n"
"\twire aux;\n"
"\twire [(IWIDTH+CWIDTH):0] left_i, left_r;\n"
"\treg [(2*IWIDTH+2):0] fifo_read;\n"
"\tassign left_r = { fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH){1'b0}} };\n"
"\tassign left_i = { fifo_read[((IWIDTH+1)-1):0], {(CWIDTH){1'b0}} };\n"
"\tassign aux = fifo_read[2*IWIDTH+2];\n"
"\twire\tsigned\t[(IWIDTH+CWIDTH):0] fifo_i, fifo_r;\n"
"\treg\t\t[(2*IWIDTH+2):0] fifo_read;\n"
"\tassign\tfifo_r = { {2{fifo_read[2*(IWIDTH+1)-1]}}, fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1'b0}} };\n"
"\tassign\tfifo_i = { {2{fifo_read[(IWIDTH+1)-1]}}, fifo_read[((IWIDTH+1)-1):0], {(CWIDTH-2){1'b0}} };\n"
"\tassign\taux = fifo_read[2*IWIDTH+2];\n"
"\n"
"\n"
"\treg [(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i, b_right_r, b_right_i, mpy_r, mpy_i;\n"
"\treg [(CWIDTH+IWIDTH+3-1):0] rnd;\n"
"\tassign rnd = ((~ROUND)||(SHIFT==0))?\n"
"\t\t\t({(CWIDTH+IWIDTH+3){1'b0}})\n"
"\t\t\t: ({ {(OWIDTH+1+SHIFT){1'b0}},1'b1,{(CWIDTH+IWIDTH+3-2-OWIDTH-SHIFT){1'b0}} });\n"
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i,\n"
"\t\t\t\t\t\tb_right_r, b_right_i;\n"
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] mpy_r, mpy_i;\n"
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] rnd;\n"
"\tgenerate\n"
"\tif ((~ROUND)||(CWIDTH+IWIDTH-OWIDTH-SHIFT<1))\n"
"\t\tassign rnd = ({(CWIDTH+IWIDTH+3){1'b0}});\n"
"\telse\n"
"\t\tassign rnd = ({ {(OWIDTH+3+SHIFT){1'b0}},1'b1,\n"
"\t\t\t\t{(CWIDTH+IWIDTH-OWIDTH-SHIFT-1){1'b0}} });\n"
"\tendgenerate\n"
"\n");
fprintf(fp,
"\talways @(posedge i_clk)\n"
"\t\tif (i_ce)\n"
"\t\tbegin\n"
706,6 → 765,9
"\t\t\t// First clock, recover all values\n"
"\t\t\tfifo_read <= fifo_left[fifo_read_addr];\n"
"\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n"
"\t\t\t// although they only need to be (IWIDTH+1)\n"
"\t\t\t// + (CWIDTH) bits wide. (We\'ve got two\n"
"\t\t\t// extra bits we need to get rid of.)\n"
"\t\t\tmpy_r <= p_one - p_two;\n"
"\t\t\tmpy_i <= p_three - p_one - p_two;\n"
"\n"
712,19 → 774,45
"\t\t\t// Second clock, round and latch for final clock\n"
"\t\t\tb_right_r <= mpy_r + rnd;\n"
"\t\t\tb_right_i <= mpy_i + rnd;\n"
"\t\t\tb_left_r <= { {2{left_r[(IWIDTH+CWIDTH)]}},left_r } + rnd;\n"
"\t\t\tb_left_i <= { {2{left_i[(IWIDTH+CWIDTH)]}},left_i } + rnd;\n"
"\t\t\tb_left_r <= { {2{fifo_r[(IWIDTH+CWIDTH)]}},fifo_r } + rnd;\n"
"\t\t\tb_left_i <= { {2{fifo_i[(IWIDTH+CWIDTH)]}},fifo_i } + rnd;\n"
"\t\t\to_aux <= aux;\n"
"\t\tend\n"
"\n"
"\n");
fprintf(fp,
"\t// Final clock--clock and remove unnecessary bits.\n"
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to OWIDTH,\n"
"\t// and SHIFT by SHIFT bits in the process.\n"
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n"
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to\n"
"\t// OWIDTH, and SHIFT by SHIFT bits in the process. The trick is\n"
"\t// that we don\'t need (IWIDTH+CWIDTH+3) bits. We\'ve accumulated\n"
"\t// them, but the actual values will never fill all these bits.\n"
"\t// In particular, we only need:\n"
"\t//\t IWIDTH bits for the input\n"
"\t//\t +1 bit for the add/subtract\n"
"\t//\t+CWIDTH bits for the coefficient multiply\n"
"\t//\t +1 bit for the add/subtract in the complex multiply\n"
"\t//\t ------\n"
"\t//\t (IWIDTH+CWIDTH+2) bits at full precision.\n"
"\t//\n"
"\t// However, the coefficient multiply multiplied by a maximum value\n"
"\t// of 2^(CWIDTH-2). Thus, we only have\n"
"\t//\t IWIDTH bits for the input\n"
"\t//\t +1 bit for the add/subtract\n"
"\t//\t+CWIDTH-2 bits for the coefficient multiply\n"
"\t//\t +1 (optional) bit for the add/subtract in the cpx mpy.\n"
"\t//\t -------- ... multiply. (This last bit may be shifted out.)\n"
"\t//\t (IWIDTH+CWIDTH) valid output bits. \n"
"\t// Now, if the user wants to keep any extras of these (via OWIDTH),\n"
"\t// or if he wishes to arbitrarily shift some of these off (via\n"
"\t// SHIFT) we accomplish that here.\n"
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n"
"\n"
"\t// As a final step, we pack our outputs into two packed two\'s\n"
"\t// complement numbers per output word, so that each output word\n"
"\t// has (2*OWIDTH) bits in it, with the top half being the real\n"
"\t// portion and the bottom half being the imaginary portion.\n"
"\tassign o_left = { o_left_r, o_left_i };\n"
"\tassign o_right= { o_right_r,o_right_i};\n"
"\n"
907,13 → 995,14
"\t\t\tend else\n"
"\t\t\t\to_sync <= 1'b0;\n"
"\t\tend\n"
"\n"
"\n", (inv)?"i":"");
fprintf(fstage,
"\tbutterfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n"
"\t\t\t.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n"
"\t\t\t.MPYDELAY(%d\'d%d),.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n"
"\t\tbfly(i_clk, (b_ce&i_ce), ib_c,\n"
"\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n"
"endmodule;\n",
(inv)?"i":"");
lgdelay(nbits, xtra), (1<xtra)?(nbits+4):(nbits+xtra+3));
}
 
void usage(void) {
/sw/Makefile
9,17 → 9,17
%.o: %.cpp
$(CXX) -c $< -o $@
 
.PHONY: alltest
alltest: test itest shiftaddmpy butterfly dblreverse qtrstage dblstage
.PHONY: test
test: fft ifft shiftaddmpy butterfly dblreverse qtrstage dblstage
 
.PHONY: test
test: fftgen
.PHONY: fft
fft: fftgen
./fftgen -f 2048 -n 16
cd $(CORED)/; verilator -cc fftmain.v
cd $(OBJDR); make -f Vfftmain.mk
 
.PHONY: itest
itest: fftgen
.PHONY: ifft
ifft: fftgen
./fftgen -f 2048 -1 -n 24 -m 24
cd $(CORED)/; verilator -cc ifftmain.v
cd $(OBJDR); make -f Vifftmain.mk
37,7 → 37,7
.PHONY: butterfly
butterfly: $(OBJDR)/Vbutterfly__ALL.a
 
$(CORED)/butterfly.v: test
$(CORED)/butterfly.v: fft
$(OBJDR)/Vbutterfly.cpp $(OBJDR)/Vbutterfly.h: $(CORED)/butterfly.v
cd $(CORED)/; verilator -cc butterfly.v
$(OBJDR)/Vbutterfly__ALL.a: $(OBJDR)/Vbutterfly.h
47,7 → 47,7
.PHONY: dblreverse
dblreverse: $(OBJDR)/Vdblreverse__ALL.a
 
$(CORED)/dblreverse.v: test
$(CORED)/dblreverse.v: fft
$(OBJDR)/Vdblreverse.cpp $(OBJDR)/Vdblreverse.h: $(CORED)/dblreverse.v
cd $(CORED)/; verilator -cc dblreverse.v
$(OBJDR)/Vdblreverse__ALL.a: $(OBJDR)/Vdblreverse.h
57,7 → 57,7
.PHONY: qtrstage
qtrstage: $(OBJDR)/Vqtrstage__ALL.a
 
$(CORED)/qtrstage.v: test
$(CORED)/qtrstage.v: fft
$(OBJDR)/Vqtrstage.cpp $(OBJDR)/Vqtrstage.h: $(CORED)/qtrstage.v
cd $(CORED)/; verilator -cc qtrstage.v
$(OBJDR)/Vqtrstage__ALL.a: $(OBJDR)/Vqtrstage.h
67,7 → 67,7
.PHONY: dblstage
dblstage: $(OBJDR)/Vdblstage__ALL.a
 
$(CORED)/dblstage.v: test
$(CORED)/dblstage.v: fft
$(OBJDR)/Vdblstage.cpp $(OBJDR)/Vdblstage.h: $(CORED)/dblstage.v
cd $(CORED)/; verilator -cc dblstage.v
$(OBJDR)/Vdblstage__ALL.a: $(OBJDR)/Vdblstage.h
74,6 → 74,7
$(OBJDR)/Vdblstage__ALL.a: $(OBJDR)/Vdblstage.cpp
cd $(OBJDR)/; make -f Vdblstage.mk
 
# Remove the fftgen executable, its object file, and the $(CORED) directory.
# -f keeps a missing fftgen or fftgen.o from failing the recipe before
# $(CORED) is removed, so `make clean` also succeeds on an already-clean tree.
.PHONY: clean
clean:
	rm -f fftgen fftgen.o
	rm -rf $(CORED)

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.