URL
https://opencores.org/ocsvn/dblclockfft/dblclockfft/trunk
Subversion Repositories dblclockfft
Compare Revisions
- This comparison shows the changes necessary to convert path
/dblclockfft/trunk
- from Rev 4 to Rev 5
- ↔ Reverse comparison
Rev 4 → Rev 5
/bench/cpp/butterfly_tb.cpp
4,81 → 4,256
#include "Vbutterfly.h" |
#include "verilated.h" |
|
// Advance the simulated design by one full clock cycle.
//
// Drives i_clk low and evaluates the model, then drives i_clk high and
// evaluates it again, so the second eval() sees a rising clock edge.
//
// bfly: the model to clock (Vbutterfly comes from "Vbutterfly.h";
//       presumably the Verilator-generated class for the butterfly
//       module -- confirm against the build).
void tick(Vbutterfly *bfly) { |
bfly->i_clk = 0; |
bfly->eval(); |
bfly->i_clk = 1; |
bfly->eval(); |
} |
// Test bench wrapper around the butterfly model.
//
// Each call to test() applies one set of inputs, clocks the design once,
// checks the design's current outputs against the values this class
// computed 23 test() calls earlier, and then computes and stores the
// expected outputs for the inputs just applied.
//
// NOTE(review): the constructor does not initialise m_left, m_right or
// m_lastaux, so the first comparisons in test() read whatever those arrays
// happen to contain -- confirm this is intended.
// NOTE(review): m_bfly is allocated with new in the constructor and no
// destructor is declared in this class, so it is never released here.
class BFLY_TB { |
public: |
// The model under test, allocated in the constructor.
Vbutterfly *m_bfly; |
// 64-entry circular histories of the expected o_left / o_right values,
// indexed by (m_addr & 63).
unsigned long m_left[64], m_right[64]; |
// m_addr:    incremented once at the end of every test() call.
// m_lastaux: value of o_aux sampled at the start of the most recent tick().
int m_addr, m_lastaux; |
|
// Pulse i_rst high for one clock with i_ce low, then release i_rst and
// clock once more.
// NOTE(review): this passes an argument to tick(), whereas the member
// tick() declared below takes none -- verify this overload is meant to
// live inside the class.
void reset(Vbutterfly *bfly) { |
bfly->i_ce = 0; |
bfly->i_rst = 1; |
tick(bfly); |
bfly->i_ce = 0; |
bfly->i_rst = 0; |
tick(bfly); |
} |
// Allocate the model and start the expected-value history at index 0.
BFLY_TB(void) { |
m_bfly = new Vbutterfly; |
m_addr = 0; |
} |
|
// Advance the model by one clock cycle (i_clk low, eval, i_clk high,
// eval). o_aux is captured into m_lastaux first so test() can detect a
// 0 -> 1 transition of o_aux across the tick.
void tick(void) { |
m_lastaux = m_bfly->o_aux; |
m_bfly->i_clk = 0; |
m_bfly->eval(); |
m_bfly->i_clk = 1; |
m_bfly->eval(); |
} |
|
// Bring the model to a known state: one clock with i_ce low and all data
// inputs zero (i_aux high), then 40 clocks with i_ce and i_aux high, then
// one final clock with i_aux low.
// (The 40-clock run is presumably long enough to flush the design's
// internal pipeline -- confirm against the core's delay.)
void reset(void) { |
m_bfly->i_ce = 0; |
m_bfly->i_aux = 1; |
m_bfly->i_coef = 0l; |
m_bfly->i_left = 0; |
m_bfly->i_right = 0; |
tick(); |
m_bfly->i_ce = 1; |
m_bfly->i_aux = 1; |
|
for(int i=0; i<40; i++) |
tick(); |
m_bfly->i_aux = 0; |
tick(); |
} |
|
// Apply one input set, clock the design, verify and record.
//
//   n, k : labels only; they appear in the printed trace and nowhere else.
//   cof  : coefficient word; only the low 40 bits are driven into i_coef.
//          test() later splits it into two signed 20-bit halves.
//   lft  : value driven into i_left  (two packed 16-bit halves).
//   rht  : value driven into i_right (two packed 16-bit halves).
//   aux  : only bit 0 is driven into i_aux.
//
// Terminates the process with exit(-1) if either output differs from the
// expectation stored 23 calls earlier.
void test(const int n, const int k, const unsigned long cof, |
const unsigned lft, const unsigned rht, const int aux) { |
|
m_bfly->i_coef = cof & (~(-1l << 40)); |
m_bfly->i_left = lft; |
m_bfly->i_right = rht; |
m_bfly->i_aux = aux & 1; |
|
tick(); |
|
// Separate the trace with a blank line whenever o_aux has just gone
// from 0 to 1.
if ((m_bfly->o_aux)&&(!m_lastaux)) |
printf("\n"); |
printf("n,k=%d,%3d: COEF=%010lx, LFT=%08x, RHT=%08x, A=%d, OLFT =%09lx, ORHT=%09lx, AUX=%d\n", |
n,k, |
m_bfly->i_coef & (~(-1l<<40)), |
m_bfly->i_left, |
m_bfly->i_right, |
m_bfly->i_aux, |
m_bfly->o_left, |
m_bfly->o_right, |
m_bfly->o_aux); |
/* |
printf("\tFI=%010lx",
((((long)m_bfly->v__DOT__r_aux_2)&1l)<<34) |
|((((long)m_bfly->v__DOT__r_sum_r)&0x01ffffl)<<17) |
|(((long)m_bfly->v__DOT__r_sum_i)&0x01ffffl)); |
printf("\tFO=%010lx SUMR=%05x SUMI=%05x A=%d", |
m_bfly->v__DOT__fifo_read, |
m_bfly->v__DOT__r_sum_r, |
m_bfly->v__DOT__r_sum_i, |
m_bfly->v__DOT__r_aux_2); |
printf("\tML=%09lx, MR=%09lx, ", |
m_left[ (m_addr-23)&(64-1)], |
m_right[(m_addr-23)&(64-1)]); |
*/ |
/* |
printf("\tBLFTR=%10lx BLFTI=%10lx", |
m_bfly->v__DOT__b_left_r & (~(-1l<<40)), |
m_bfly->v__DOT__b_left_i & (~(-1l<<40))); |
printf("\tMPYR=%10lx MPYI=%10lx", |
m_bfly->v__DOT__mpy_r & (~(-1l<<40)), |
m_bfly->v__DOT__mpy_i & (~(-1l<<40))); |
printf("\n"); |
*/ |
|
// Compare against the expectation recorded 23 test() calls ago
// (presumably the latency of the design under test -- confirm).
if (m_left[(m_addr-23)&(64-1)] != m_bfly->o_left) { |
fprintf(stderr, "WRONG O_LEFT!\n"); |
exit(-1); |
} |
|
if (m_right[(m_addr-23)&(64-1)] != m_bfly->o_right) { |
fprintf(stderr, "WRONG O_RIGHT!\n"); |
exit(-1); |
} |
|
// Now, let's calculate an "expected" result ... |
long rlft, ilft; |
|
// Split the left input into its upper (real) and lower (imaginary) 16 bits ... |
rlft = (m_bfly->i_left >> 16) & 0x0ffff; |
ilft = (m_bfly->i_left ) & 0x0ffff; |
// Make certain they are properly sign extended ... |
// NOTE(review): (-1<<16) left-shifts a negative int, which is undefined
// behaviour before C++20 even though common compilers give the expected mask.
if (rlft & 0x8000) rlft |= (-1<<16); |
if (ilft & 0x8000) ilft |= (-1<<16); |
|
// Now repeat for the right hand value ... |
long rrht, irht; |
// Split the right input into its upper (real) and lower (imaginary) 16 bits ... |
rrht = (m_bfly->i_right >> 16) & 0x0ffff; |
irht = (m_bfly->i_right ) & 0x0ffff; |
// Make certain they are properly sign extended ... |
if (rrht & 0x8000) rrht |= (-1<<16); |
if (irht & 0x8000) irht |= (-1<<16); |
|
|
// and again for the coefficients |
long rcof, icof; |
// Split the coefficient into its upper (real) and lower (imaginary) 20 bits ... |
rcof = (m_bfly->i_coef >> 20) & 0x0fffff; |
icof = (m_bfly->i_coef ) & 0x0fffff; |
// Make certain they are properly sign extended ... |
if (rcof & 0x80000) rcof |= (-1<<20); |
if (icof & 0x80000) icof |= (-1<<20); |
|
|
// Now, let's do the butterfly ourselves ... |
long sumi, sumr, difi, difr; |
sumr = rlft + rrht; |
sumi = ilft + irht; |
difr = rlft - rrht; |
difi = ilft - irht; |
|
/* |
printf("L=%5lx+%5lx,R=%5lx+%5lx,S=%5lx+%5lx,D=%5lx+%5lx, ", |
rlft & 0x02ffffl, |
ilft & 0x02ffffl, |
rrht & 0x02ffffl, |
irht & 0x02ffffl, |
sumr & 0x02ffffl, |
sumi & 0x02ffffl, |
difr & 0x02ffffl, |
difi & 0x02ffffl); |
*/ |
// Complex multiply of the difference by the coefficient using three
// products instead of four:
//   real = p1 - p2      = difr*rcof - difi*icof
//   imag = p3 - p1 - p2 = difr*icof + difi*rcof
long p1, p2, p3, mpyr, mpyi; |
p1 = difr * rcof; |
p2 = difi * icof; |
p3 = (difr + difi) * (rcof + icof); |
|
mpyr = p1-p2; |
mpyi = p3-p1-p2; |
|
/* |
printf("RC=%lx, IC=%lx, ", rcof, icof); |
printf("P1=%lx,P2=%lx,P3=%lx, ", p1,p2,p3); |
printf("MPYr = %lx, ", mpyr); |
printf("MPYi = %lx, ", mpyi); |
*/ |
|
long o_left_r, o_left_i, o_right_r, o_right_i; |
unsigned long o_left, o_right; |
|
// Expected left output: the two sums, each truncated to 17 bits and
// packed as (real << 17) | imag.
o_left_r = sumr & 0x01ffff; o_left_i = sumi & 0x01ffff; |
o_left = (o_left_r << 17) | (o_left_i); |
|
// Expected right output: the product shifted right by 18 bits, each
// part truncated to 17 bits and packed the same way.
o_right_r = (mpyr>>18) & 0x01ffff; |
o_right_i = (mpyi>>18) & 0x01ffff; |
o_right = (o_right_r << 17) | (o_right_i); |
/* |
printf("oR_r = %lx, ", o_right_r); |
printf("oR_i = %lx\n", o_right_i); |
*/ |
|
// Record the expectation for this input; it is compared against the
// design's outputs 23 calls from now.
m_left[ m_addr&(64-1)] = o_left; |
m_right[m_addr&(64-1)] = o_right; |
|
m_addr++; |
} |
}; |
|
int main(int argc, char **argv, char **envp) { |
Verilated::commandArgs(argc, argv); |
Vbutterfly *bfly = new Vbutterfly; |
BFLY_TB *bfly = new BFLY_TB; |
int16_t ir0, ii0, lstr, lsti; |
int32_t sumr, sumi, difr, difi; |
int32_t smr, smi, dfr, dfi; |
int rnd = 0; |
|
reset(bfly); |
const int TESTSZ = 256; |
|
for(int k=0; k<270; k++) { |
int32_t or0, oi0, or1, oi1; |
bfly->reset(); |
|
bfly->i_ce = 1; |
bfly->i_sync = ((k&0x0ff)==0); |
// Let's pick some random values, ... |
ir0 = rand(); if (ir0&4) ir0 = -ir0; |
ii0 = rand(); if (ii0&2) ii0 = -ii0; |
bfly->test(9,0,0x4000000000l,0x7fff0000,0x7fff0000, 1); |
bfly->test(9,1,0x4000000000l,0x7fff0000,0x80010000, 0); |
bfly->test(9,2,0x4000000000l,0x00007fff,0x00008001, 0); |
bfly->test(9,3,0x4000000000l,0x00007fff,0x00007fff, 0); |
|
bfly->i_data = ((ir0&0x0ffff) << 16) | (ii0 & 0x0ffff); |
tick(bfly); |
bfly->test(8,0,0x4000000000l,0x80010000,0x80010000, 1); |
bfly->test(8,1,0x4000000000l,0x00008001,0x00008001, 0); |
|
printf("k=%3d: COEF=%08x, LFT=%08x, RHT=%08x, AUX=%d, OLFT =%09lx, ORHT=%09lx, AUX=%d\n", |
k, bfly->i_coef, bfly->i_left, bfly->i_right, bfly->i_aux, |
bfly->o_left, bfly->o_right, bfly->o_sync); |
bfly->test(9,0,0x4000000000l,0x40000000,0xc0000000, 1); |
bfly->test(9,1,0x4000000000l,0x40000000,0x40000000, 0); |
bfly->test(9,2,0x4000000000l,0x00004000,0x0000c000, 0); |
bfly->test(9,3,0x4000000000l,0x00004000,0x00004000, 0); |
|
or0 = (bfly->o_data >> 17) & 0x01ffff; |
oi0 = bfly->o_data & 0x01ffff; |
if (or0 & 0x010000) or0 |= (-1<<16); |
if (oi0 & 0x010000) oi0 |= (-1<<16); |
bfly->test(9,0,0x4000000000l,0x20000000,0xe0000000, 1); |
bfly->test(9,1,0x4000000000l,0x20000000,0x20000000, 0); |
bfly->test(9,2,0x4000000000l,0x00002000,0x0000e000, 0); |
bfly->test(9,3,0x4000000000l,0x00002000,0x00002000, 0); |
|
if (k>3) { |
/* |
printf("\tOR0 = %6x, OI0 = %6x, SUM = %6x + %6x, DIF = %6x + %6x\n", |
or0, oi0, sumr, sumi, difr, difi); |
*/ |
if (0==(k&1)) { |
if (or0 != sumr) {fprintf(stderr, "FAIL 1\n"); exit(-1);} |
if (oi0 != sumi) {fprintf(stderr, "FAIL 2\n"); exit(-1);} |
} else if (1==(k&1)) { |
if (or0 != difr) {fprintf(stderr, "FAIL 3\n"); exit(-1);} |
if (oi0 != difi) {fprintf(stderr, "FAIL 4\n"); exit(-1);} |
} |
} |
bfly->test(9,0,0x4000000000l,0x00080000,0xfff80000, 1); |
bfly->test(9,1,0x4000000000l,0x00080000,0x00080000, 0); |
bfly->test(9,2,0x4000000000l,0x00000008,0x0000fff8, 0); |
bfly->test(9,3,0x4000000000l,0x00000008,0x00000008, 0); |
|
if (((4==(k&0x0ff))?1:0) != bfly->o_sync) { fprintf(stderr, "BAD O-SYNC\n"); exit(-1); } |
bfly->test(9,0,0x4000000000l,0x00010000,0xffff0000, 1); |
bfly->test(9,1,0x4000000000l,0x00010000,0x00010000, 0); |
bfly->test(9,2,0x4000000000l,0x00000001,0x0000ffff, 0); |
bfly->test(9,3,0x4000000000l,0x00000001,0x00000001, 0); |
|
if (1 == (k&1)) { |
sumr = smr; sumi = smi; difr=dfr, difi= dfi; |
for(int n=0; n<4; n++) for(int k=0; k<TESTSZ; k++) { |
long iv, rv; |
unsigned long lft, rht, cof; |
double c, s, W; |
bool inv = 1; |
int aux; |
|
smr = lstr + ir0 + rnd; |
smi = lsti + ii0 + rnd; |
W = ((inv)?-1:1) * 2.0 * M_PI * (2*k) / TESTSZ * 64; |
c = cos(W); s = sin(W); |
rv = (long)((double)(1l<<(16-2-n))*c+0.5); |
iv = (long)((double)(1l<<(16-2-n))*s+0.5); |
|
dfr = lstr - ir0 + rnd; |
dfi = lsti - ii0 + rnd; |
} |
rv = (rv << 16) | (iv & (~(-1<<16))); |
lft = rv; |
|
lstr = ir0; |
lsti = ii0; |
W = ((inv)?-1:1) * 2.0 * M_PI * (2*k+1) / TESTSZ * 64; |
c = cos(W); s = sin(W); |
rv = (long)((double)(1l<<(16-2-n))*c+0.5); |
iv = (long)((double)(1l<<(16-2-n))*s+0.5); |
|
rv = (rv << 16) | (iv & (~(-1<<16))); |
rht = rv; |
|
|
// Switch the sign of W |
W = ((inv)?1:-1) * 2.0 * M_PI * (2*k) / TESTSZ; |
c = cos(W); s = sin(W); |
rv = (long)((double)(1l<<(20-2))*c+0.5); // Keep 20-2 bits for |
iv = (long)((double)(1l<<(20-2))*s+0.5); // coefficients |
|
rv = (rv << 20) | (iv & (~(-1<<20))); |
cof = rv; |
|
aux = ((k&(TESTSZ-1))==0); |
|
bfly->test(n,k, cof, lft, rht, aux); |
} |
|
delete bfly; |
/bench/cpp/Makefile
1,4 → 1,4
all: mpy_tb dblrev_tb dblstage_tb qtrstage_tb |
all: mpy_tb dblrev_tb dblstage_tb qtrstage_tb test |
|
OBJDR:= ../../sw/fft-core/obj_dir |
VINC := -I/usr/share/verilator/include -I$(OBJDR)/ |
6,6 → 6,7
DBLRV:= $(OBJDR)/Vdblreverse__ALL.a |
DBLSG:= $(OBJDR)/Vdblstage__ALL.a |
QTRSG:= $(OBJDR)/Vqtrstage__ALL.a |
BFLYL:= $(OBJDR)/Vbutterfly__ALL.a |
VERILATOR_ROOT := /usr/share/verilator |
|
mpy_tb: mpy_tb.cpp $(MPYLB) |
20,6 → 21,18
qtrstage_tb: qtrstage_tb.cpp $(QTRSG) |
g++ -g $(VINC) $< $(QTRSG) $(VERILATOR_ROOT)/include/verilated.cpp -o $@ |
|
butterfly_tb: butterfly_tb.cpp $(BFLYL) |
g++ -g $(VINC) $< $(BFLYL) $(VERILATOR_ROOT)/include/verilated.cpp -o $@ |
|
.PHONY: test |
test: mpy_tb dblrev_tb dblstage_tb qtrstage_tb butterfly_tb |
./mpy_tb |
./dblrev_tb |
./dblstage_tb |
./qtrstage_tb |
./butterfly_tb |
|
.PHONY: clean |
clean: |
rm mpy_tb dblrev_tb dblstage_tb qtrstage_tb |
|
/sw/fftgen.cpp
25,7 → 25,10
"// for more details.\n" |
"//\n" |
"// You should have received a copy of the GNU General Public License along\n" |
"// with this program. If not, see <http://www.gnu.org/licenses/>.\n" |
"// with this program. (It's in the $(ROOT)/doc directory, run make with no\n" |
"// target there if the PDF file isn\'t present.) If not, see\n" |
"// <http://www.gnu.org/licenses/> for a copy.\n" |
"//\n" |
"// License: GPL, v3, as defined and found on www.gnu.org,\n" |
"// http://www.gnu.org/licenses/gpl.html\n" |
"//\n" |
55,9 → 58,9
int cbits = nbits + xtra; |
int delay = nbits + 2; |
if (nbits+1<cbits) |
delay = nbits+2; |
delay = nbits+4; |
else |
delay = cbits+1; |
delay = cbits+3; |
return lgval(delay); |
} |
|
76,13 → 79,11
"// \n" |
"// Project: %s\n" |
"//\n" |
"// Purpose: This file is (almost) a Verilog source file. It is meant to\n" |
"// be used by a FFT core compiler to generate FFTs which may be\n" |
"// used as part of an FFT core. Specifically, this file \n" |
"// encapsulates the options of a 4 point, decimation in\n" |
"// frequency FFT-stage. This particular stage is optimized so\n" |
"// that all of the multiplies are accomplished by additions and\n" |
"// mux'es.\n" |
"// Purpose: This file encapsulates the 4 point stage of a decimation in\n" |
"// frequency FFT. This particular implementation is optimized\n" |
"// so that all of the multiplies are accomplished by additions\n" |
"// and multiplexers only.\n" |
"//\n" |
"//\n%s" |
"//\n", |
prjname, creator); |
90,100 → 91,100
|
fprintf(fp, |
"module\tqtrstage(i_clk, i_rst, i_ce, i_sync, i_data, o_data, o_sync);\n" |
"\tparameter IWIDTH=16, OWIDTH=IWIDTH+1;\n" |
"\t// Parameters specific to the core that should be changed when this\n" |
"\t// core is built ... Note that the minimum LGSPAN is 2. Smaller \n" |
"\t// spans must use the fftdoubles stage.\n" |
"\tparameter\tLGWIDTH=8, ODD=0, INVERSE=0,SHIFT=0;\n" |
"\tinput\t i_clk, i_rst, i_ce, i_sync;\n" |
"\tinput\t [(2*IWIDTH-1):0] i_data;\n" |
"\toutput\treg [(2*OWIDTH-1):0] o_data;\n" |
"\toutput\treg o_sync;\n" |
"\t\n" |
"\treg\t wait_for_sync;\n" |
"\treg\t[2:0] pipeline;\n" |
"\tparameter IWIDTH=16, OWIDTH=IWIDTH+1;\n" |
"\t// Parameters specific to the core that should be changed when this\n" |
"\t// core is built ... Note that the minimum LGSPAN is 2. Smaller \n" |
"\t// spans must use the fftdoubles stage.\n" |
"\tparameter\tLGWIDTH=8, ODD=0, INVERSE=0,SHIFT=0;\n" |
"\tinput\t i_clk, i_rst, i_ce, i_sync;\n" |
"\tinput\t [(2*IWIDTH-1):0] i_data;\n" |
"\toutput\treg [(2*OWIDTH-1):0] o_data;\n" |
"\toutput\treg o_sync;\n" |
"\t\n" |
"\treg\t wait_for_sync;\n" |
"\treg\t[2:0] pipeline;\n" |
"\n" |
"\treg\t[(IWIDTH):0] sum_r, sum_i, diff_r, diff_i;\n" |
"\twire\t[(IWIDTH):0] n_diff_i;\n" |
"\tassign n_diff_i = -diff_i;\n" |
"\treg\t[(IWIDTH):0] sum_r, sum_i, diff_r, diff_i;\n" |
"\twire\t[(IWIDTH):0] n_diff_i;\n" |
"\tassign n_diff_i = -diff_i;\n" |
"\n" |
"\treg\t[(2*OWIDTH-1):0] ob_a;\n" |
"\twire\t[(2*OWIDTH-1):0] ob_b;\n" |
"\treg\t[(OWIDTH-1):0] ob_b_r, ob_b_i;\n" |
"\tassign ob_b = { ob_b_r, ob_b_i };\n" |
"\treg\t[(2*OWIDTH-1):0] ob_a;\n" |
"\twire\t[(2*OWIDTH-1):0] ob_b;\n" |
"\treg\t[(OWIDTH-1):0] ob_b_r, ob_b_i;\n" |
"\tassign ob_b = { ob_b_r, ob_b_i };\n" |
"\n" |
"\treg\t[(LGWIDTH-1):0] iaddr;\n" |
"\treg\t[(2*IWIDTH-1):0] imem;\n" |
"\treg\t[(LGWIDTH-1):0] iaddr;\n" |
"\treg\t[(2*IWIDTH-1):0] imem;\n" |
"\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\timem_r, imem_i;\n" |
"\tassign\timem_r = imem[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\timem_i = imem[(IWIDTH-1):0];\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\timem_r, imem_i;\n" |
"\tassign\timem_r = imem[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\timem_i = imem[(IWIDTH-1):0];\n" |
"\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\ti_data_r, i_data_i;\n" |
"\tassign\ti_data_r = i_data[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\ti_data_i = i_data[(IWIDTH-1):0];\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\ti_data_r, i_data_i;\n" |
"\tassign\ti_data_r = i_data[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\ti_data_i = i_data[(IWIDTH-1):0];\n" |
"\n" |
"\treg [(2*OWIDTH-1):0] omem;\n" |
"\treg [(2*OWIDTH-1):0] omem;\n" |
"\n" |
"\twire [(IWIDTH-1):0] rnd;\n" |
"\tassign rnd = ((IWIDTH+1-OWIDTH-SHIFT)!=0) ? { {(IWIDTH-1){1'b0}}, (OWIDTH<IWIDTH+1)? 1'b1:1'b0 } : {{(IWIDTH){1'b0}}};\n" |
"\twire [(IWIDTH-1):0] rnd;\n" |
"\tassign rnd = ((IWIDTH+1-OWIDTH-SHIFT)!=0) ? { {(IWIDTH-1){1'b0}}, (OWIDTH<IWIDTH+1)? 1'b1:1'b0 } : {{(IWIDTH){1'b0}}};\n" |
"\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_rst)\n" |
"\t\tbegin\n" |
"\t\t\twait_for_sync <= 1'b1;\n" |
"\t\t\tiaddr <= 0;\n" |
"\t\t\tpipeline <= 3'b000;\n" |
"\t\tend\n" |
"\t\telse if ((i_ce)&&((~wait_for_sync)||(i_sync)))\n" |
"\t\tbegin\n" |
"\t\t\t// Always\n" |
"\t\t\timem <= i_data;\n" |
"\t\t\tiaddr <= iaddr + 1;\n" |
"\t\t\twait_for_sync <= 1'b0;\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_rst)\n" |
"\t\tbegin\n" |
"\t\t\twait_for_sync <= 1'b1;\n" |
"\t\t\tiaddr <= 0;\n" |
"\t\t\tpipeline <= 3'b000;\n" |
"\t\tend\n" |
"\t\telse if ((i_ce)&&((~wait_for_sync)||(i_sync)))\n" |
"\t\tbegin\n" |
"\t\t\t// Always\n" |
"\t\t\timem <= i_data;\n" |
"\t\t\tiaddr <= iaddr + 1;\n" |
"\t\t\twait_for_sync <= 1'b0;\n" |
"\n" |
"\t\t\t// In sequence, clock = 0\n" |
"\t\t\tif (iaddr[0])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tsum_r <= imem_r + i_data_r + rnd;\n" |
"\t\t\t\tsum_i <= imem_i + i_data_i + rnd;\n" |
"\t\t\t\tdiff_r <= imem_r - i_data_r + rnd;\n" |
"\t\t\t\tdiff_i <= imem_i - i_data_i + rnd;\n" |
"\t\t\t// In sequence, clock = 0\n" |
"\t\t\tif (iaddr[0])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tsum_r <= imem_r + i_data_r + rnd;\n" |
"\t\t\t\tsum_i <= imem_i + i_data_i + rnd;\n" |
"\t\t\t\tdiff_r <= imem_r - i_data_r + rnd;\n" |
"\t\t\t\tdiff_i <= imem_i - i_data_i + rnd;\n" |
"\n" |
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b1 };\n" |
"\t\t\tend else\n" |
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n" |
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b1 };\n" |
"\t\t\tend else\n" |
"\t\t\t\tpipeline[2:0] <= { pipeline[1:0], 1'b0 };\n" |
"\n" |
"\t\t\t// In sequence, clock = 1\n" |
"\t\t\tif (pipeline[1])\n" |
"\t\t\tbegin\n" |
"\t\t\t ob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n" |
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n" |
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n" |
"\t\t\t\tif (~ODD)\n" |
"\t\t\t\tbegin\n" |
"\t\t\t// In sequence, clock = 1\n" |
"\t\t\tif (pipeline[1])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tob_a <= { sum_r[(IWIDTH):(IWIDTH+1-OWIDTH)],\n" |
"\t\t\t\t\t\tsum_i[(IWIDTH):(IWIDTH+1-OWIDTH)] };\n" |
"\t\t\t\t// on Even, W = e^{-j2pi 1/4 0} = 1\n" |
"\t\t\t\tif (~ODD)\n" |
"\t\t\t\tbegin\n" |
"\t\t\t\t\tob_b_r <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\t\tob_b_i <= diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\tend else if (~INVERSE) begin\n" |
"\t\t\t\tend else if (~INVERSE) begin\n" |
"\t\t\t\t\t// on Odd, W = e^{-j2pi 1/4} = -j\n" |
"\t\t\t\t\tob_b_r <= diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\t\tob_b_i <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\tend else begin\n" |
"\t\t\t\tend else begin\n" |
"\t\t\t\t\t// on Odd, W = e^{j2pi 1/4} = j\n" |
"\t\t\t\t\tob_b_r <= n_diff_i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\t\tob_b_i <= diff_r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t\t\t\tend\n" |
"\t\t\t\t// (wire) ob_b <= { ob_b_r, ob_b_i };\n" |
"\t\t\tend\n" |
"\t\t\t// In sequence, clock = 2\n" |
"\t\t\tif (pipeline[2])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tomem <= ob_b;\n" |
"\t\t\t\to_data <= ob_a;\n" |
"\t\t\tend else\n" |
"\t\t\t\to_data <= omem;\n" |
"\t\t\to_sync <= &(~iaddr[(LGWIDTH-1):3]) && (iaddr[2:0] == 3'b100);\n" |
"\t\tend\n" |
"\t\t\t\tend\n" |
"\t\t\t\t// (wire) ob_b <= { ob_b_r, ob_b_i };\n" |
"\t\t\tend\n" |
"\t\t\t// In sequence, clock = 2\n" |
"\t\t\tif (pipeline[2])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tomem <= ob_b;\n" |
"\t\t\t\to_data <= ob_a;\n" |
"\t\t\tend else\n" |
"\t\t\t\to_data <= omem;\n" |
"\t\t\to_sync <= &(~iaddr[(LGWIDTH-1):3]) && (iaddr[2:0] == 3'b100);\n" |
"\t\tend\n" |
"endmodule\n"); |
} |
|
203,12 → 204,13
"// Project: %s\n" |
"//\n" |
"// Purpose: This is part of an FPGA implementation that will process\n" |
"// data at two samples per clock. If you notice from the\n" |
"// derivation of an FFT, the only time both even and odd\n" |
"// samples are used at the same time is the first stage.\n" |
"// Therefore, after this stage and these twiddles, all of the\n" |
"// other stages can run two stages at a time at one sample per\n" |
"// clock.\n" |
"// the final stage of a decimate-in-frequency FFT, running\n" |
"// through the data at two samples per clock. If you notice\n" |
"// from the derivation of an FFT, the only time both even and\n" |
"// odd samples are used at the same time is in this stage.\n" |
"// Therefore, other than this stage and these twiddles, all of\n" |
"// the other stages can run two stages at a time at one sample\n" |
"// per clock.\n" |
"//\n" |
"// In this implementation, the output is valid one clock after\n" |
"// the input is valid. The output also accumulates one bit\n" |
229,43 → 231,43
fprintf(fp, "%s", cpyleft); |
fprintf(fp, |
"module dblstage(i_clk, i_ce, i_left, i_right, o_left, o_right);\n" |
"\tparameter\tIWIDTH=16,OWIDTH=IWIDTH+1, SHIFT=0;\n" |
"\tinput\t\ti_clk, i_ce;\n" |
"\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n" |
"\toutput\twire [(2*OWIDTH-1):0]\to_left, o_right;\n" |
"\tparameter\tIWIDTH=16,OWIDTH=IWIDTH+1, SHIFT=0;\n" |
"\tinput\t\ti_clk, i_ce;\n" |
"\tinput\t\t[(2*IWIDTH-1):0]\ti_left, i_right;\n" |
"\toutput\twire [(2*OWIDTH-1):0]\to_left, o_right;\n" |
"\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\ti_in_0r, i_in_0i, i_in_1r, i_in_1i;\n" |
"\tassign\ti_in_0r = i_left[(2*IWIDTH-1):(IWIDTH)]; \n" |
"\tassign\ti_in_0i = i_left[(IWIDTH-1):0]; \n" |
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n" |
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n" |
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n" |
"\t\t\t\t\to_out_1r, o_out_1i;\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\ti_in_0r, i_in_0i, i_in_1r, i_in_1i;\n" |
"\tassign\ti_in_0r = i_left[(2*IWIDTH-1):(IWIDTH)]; \n" |
"\tassign\ti_in_0i = i_left[(IWIDTH-1):0]; \n" |
"\tassign\ti_in_1r = i_right[(2*IWIDTH-1):(IWIDTH)]; \n" |
"\tassign\ti_in_1i = i_right[(IWIDTH-1):0]; \n" |
"\twire\t[(OWIDTH-1):0]\t\to_out_0r, o_out_0i,\n" |
"\t\t\t\t\to_out_1r, o_out_1i;\n" |
"\n" |
"\t// Don't forget that we accumulate a bit by adding two values together.\n" |
"\t// Therefore our intermediate value must have one more bit than the\n" |
"\t// two originals.\n" |
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n" |
"\t// Don't forget that we accumulate a bit by adding two values\n" |
"\t// together. Therefore our intermediate value must have one more\n" |
"\t// bit than the two originals.\n" |
"\treg\t[IWIDTH:0]\tout_0r, out_0i, out_1r, out_1i;\n" |
"\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
"\t\t\tout_0r <= i_in_0r + i_in_1r;\n" |
"\t\t\tout_0i <= i_in_0i + i_in_1i;\n" |
"\t\t\t//\n" |
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n" |
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n" |
"\t\tend\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
"\t\t\tout_0r <= i_in_0r + i_in_1r;\n" |
"\t\t\tout_0i <= i_in_0i + i_in_1i;\n" |
"\t\t\t//\n" |
"\t\t\tout_1r <= i_in_0r - i_in_1r;\n" |
"\t\t\tout_1i <= i_in_0i - i_in_1i;\n" |
"\t\tend\n" |
"\n" |
"\t// Now, if the master control program doesn't want to keep all of our\n" |
"\t// bits, we can shift down to OWIDTH bits here.\n" |
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\t// Now, if the master control program doesn't want to keep all of\n" |
"\t// our bits, we can shift down to OWIDTH bits here.\n" |
"\tassign\to_out_0r = out_0r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_0i = out_0i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_1r = out_1r[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\tassign\to_out_1i = out_1i[(IWIDTH-SHIFT):(IWIDTH+1-OWIDTH-SHIFT)];\n" |
"\n" |
"\tassign\to_left = { o_out_0r, o_out_0i };\n" |
"\tassign\to_right = { o_out_1r, o_out_1i };\n" |
"\tassign\to_left = { o_out_0r, o_out_0i };\n" |
"\tassign\to_right = { o_out_1r, o_out_1i };\n" |
"\n" |
"endmodule\n"); |
fclose(fp); |
326,6 → 328,10
"\treg\t[(AWIDTH+BWIDTH-1):0]\tacc[0:(AWIDTH-1)];\n" |
"\tgenvar k;\n" |
"\n" |
"\t// If we were forced to stay within two\'s complement arithmetic,\n" |
"\t// taking the absolute value here would require an additional bit.\n" |
"\t// However, because our results are now unsigned, we can stay\n" |
"\t// within the number of bits given (for now).\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
431,65 → 437,70
"// o_1[m] = mem[01xxx1]\n" |
"// ...\n" |
"//\n" |
"// The answer is that, yes we can but: we need to use four memory banks\n" |
"// to do it properly. These four banks are defined by the two bits\n" |
"// that determine the top and bottom of the correct address. Larger\n" |
"// FFT\'s would require more memories.\n" |
"//\n" |
"//\n"); |
fprintf(fp, |
"module dblreverse(i_clk, i_rst, i_ce, i_in_0, i_in_1,\n" |
"\t\to_out_0, o_out_1, o_sync);\n" |
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n" |
"\tinput\t\t\t\ti_clk, i_rst, i_ce;\n" |
"\tinput\t\t[(2*WIDTH-1):0]\ti_in_0, i_in_1;\n" |
"\toutput\treg\t[(2*WIDTH-1):0]\to_out_0, o_out_1;\n" |
"\toutput\treg\t\t\to_sync;\n" |
"\t\to_out_0, o_out_1, o_sync);\n" |
"\tparameter\t\t\tLGSIZE=4, WIDTH=24;\n" |
"\tinput\t\t\t\ti_clk, i_rst, i_ce;\n" |
"\tinput\t\t[(2*WIDTH-1):0]\ti_in_0, i_in_1;\n" |
"\toutput\treg\t[(2*WIDTH-1):0]\to_out_0, o_out_1;\n" |
"\toutput\treg\t\t\to_sync;\n" |
"\n" |
"\treg\tin_reset;\n" |
"\treg\t[(LGSIZE):0]\tiaddr;\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_0e [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_0o [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_1e [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_1o [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\tin_reset;\n" |
"\treg\t[(LGSIZE):0]\tiaddr;\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_0e [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_0o [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_1e [0:((1<<(LGSIZE-1))-1)];\n" |
"\treg\t[(2*WIDTH-1):0]\tmem_1o [0:((1<<(LGSIZE-1))-1)];\n" |
"\n" |
"\twire\t[(2*LGSIZE-1):0] braddr;\n" |
"\tgenvar\tk;\n" |
"\tgenerate for(k=0; k<LGSIZE; k++)\n" |
"\t\tassign braddr[k] = iaddr[LGSIZE-1-k];\n" |
"\tendgenerate\n" |
"\twire\t[(2*LGSIZE-1):0] braddr;\n" |
"\tgenvar\tk;\n" |
"\tgenerate for(k=0; k<LGSIZE; k++)\n" |
"\t\tassign braddr[k] = iaddr[LGSIZE-1-k];\n" |
"\tendgenerate\n" |
"\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_rst)\n" |
"\t\tbegin\n" |
"\t\t\tiaddr <= 0;\n" |
"\t\t\tin_reset <= 1'b1;\n" |
"\t\tend else if (i_ce)\n" |
"\t\tbegin\n" |
"\t\t\tif (iaddr[(LGSIZE-1)])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tmem_1e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n" |
"\t\t\t\tmem_1o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n" |
"\t\t\tend else begin\n" |
"\t\t\t\tmem_0e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n" |
"\t\t\t\tmem_0o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n" |
"\t\t\tend\n" |
"\t\t\tiaddr <= iaddr + 2;\n" |
"\t\t\tif (&iaddr[(LGSIZE-1):1])\n" |
"\t\t\t\tin_reset <= 1'b0;\n" |
"\t\t\tif (in_reset)\n" |
"\t\t\tbegin\n" |
"\t\t\t\to_out_0 <= {(2*WIDTH){1'b0}};\n" |
"\t\t\t\to_out_1 <= {(2*WIDTH){1'b0}};\n" |
"\t\t\t\to_sync <= 1'b0;\n" |
"\t\t\tend else\n" |
"\t\t\tbegin\n" |
"\t\t\t\tif (braddr[0])\n" |
"\t\t\t\tbegin\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_rst)\n" |
"\t\tbegin\n" |
"\t\t\tiaddr <= 0;\n" |
"\t\t\tin_reset <= 1'b1;\n" |
"\t\tend else if (i_ce)\n" |
"\t\tbegin\n" |
"\t\t\tif (iaddr[(LGSIZE-1)])\n" |
"\t\t\tbegin\n" |
"\t\t\t\tmem_1e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n" |
"\t\t\t\tmem_1o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n" |
"\t\t\tend else begin\n" |
"\t\t\t\tmem_0e[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_0;\n" |
"\t\t\t\tmem_0o[{iaddr[LGSIZE],iaddr[(LGSIZE-2):1]}] <= i_in_1;\n" |
"\t\t\tend\n" |
"\t\t\tiaddr <= iaddr + 2;\n" |
"\t\t\tif (&iaddr[(LGSIZE-1):1])\n" |
"\t\t\t\tin_reset <= 1'b0;\n" |
"\t\t\tif (in_reset)\n" |
"\t\t\tbegin\n" |
"\t\t\t\to_out_0 <= {(2*WIDTH){1'b0}};\n" |
"\t\t\t\to_out_1 <= {(2*WIDTH){1'b0}};\n" |
"\t\t\t\to_sync <= 1'b0;\n" |
"\t\t\tend else\n" |
"\t\t\tbegin\n" |
"\t\t\t\tif (braddr[0])\n" |
"\t\t\t\tbegin\n" |
"\t\t\t\t\to_out_0 <= mem_0o[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n" |
"\t\t\t\t\to_out_1 <= mem_1o[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n" |
"\t\t\t\tend else begin\n" |
"\t\t\t\tend else begin\n" |
"\t\t\t\t\to_out_0 <= mem_0e[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n" |
"\t\t\t\t\to_out_1 <= mem_1e[{~iaddr[LGSIZE],braddr[(LGSIZE-2):1]}];\n" |
"\t\t\t\tend\n" |
"\t\t\t\to_sync <= ~(|iaddr[(LGSIZE-1):0]);\n" |
"\t\t\tend\n" |
"\t\tend\n" |
"\t\t\t\tend\n" |
"\t\t\t\to_sync <= ~(|iaddr[(LGSIZE-1):0]);\n" |
"\t\t\tend\n" |
"\t\tend\n" |
"\n" |
"endmodule;\n"); |
|
578,46 → 589,46
|
fprintf(fp, |
"module\tbutterfly(i_clk, i_ce, i_coef, i_left, i_right, i_aux,\n" |
"\t\to_left, o_right, o_aux);\n" |
"\t// Public changeable parameters ...\n" |
"\tparameter IWIDTH=16,CWIDTH=IWIDTH,OWIDTH=IWIDTH;\n" |
"\t// Parameters specific to the core that should not be changed.\n" |
"\tparameter MPYDELAY=(IWIDTH+1 < CWIDTH)?(IWIDTH+2):(CWIDTH+1),\n" |
"\t\t\tSHIFT=0, ROUND=1;\n" |
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n" |
"\t// this value is fractional, then round up to the nearest\n" |
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n" |
"\tparameter LGDELAY=5;\n" |
"\tinput i_clk, i_ce;\n" |
"\tinput [(2*CWIDTH-1):0] i_coef;\n" |
"\tinput [(2*IWIDTH-1):0] i_left, i_right;\n" |
"\tinput i_aux;\n" |
"\toutput wire [(2*OWIDTH-1):0] o_left, o_right;\n" |
"\toutput wire o_aux;\n" |
"\t\to_left, o_right, o_aux);\n" |
"\t// Public changeable parameters ...\n" |
"\tparameter IWIDTH=16,CWIDTH=IWIDTH+4,OWIDTH=IWIDTH+1;\n" |
"\t// Parameters specific to the core that should not be changed.\n" |
"\tparameter MPYDELAY=5'd20, // (IWIDTH+1 < CWIDTH)?(IWIDTH+4):(CWIDTH+3),\n" |
"\t\t\tSHIFT=0, ROUND=0;\n" |
"\t// The LGDELAY should be the base two log of the MPYDELAY. If\n" |
"\t// this value is fractional, then round up to the nearest\n" |
"\t// integer: LGDELAY=ceil(log(MPYDELAY)/log(2));\n" |
"\tparameter\tLGDELAY=5;\n" |
"\tinput\t\ti_clk, i_ce;\n" |
"\tinput\t\t[(2*CWIDTH-1):0] i_coef;\n" |
"\tinput\t\t[(2*IWIDTH-1):0] i_left, i_right;\n" |
"\tinput\t\ti_aux;\n" |
"\toutput\twire [(2*OWIDTH-1):0] o_left, o_right;\n" |
"\toutput\twire o_aux;\n" |
"\n" |
"\twire [(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n" |
"\twire\t[(OWIDTH-1):0] o_left_r, o_left_i, o_right_r, o_right_i;\n" |
"\n" |
"\treg [(2*IWIDTH-1):0] r_left, r_right;\n" |
"\treg r_aux, r_aux_2;\n" |
"\treg [(2*CWIDTH-1):0] r_coef, r_coef_2;\n" |
"\twire [(CWIDTH-1):0] r_coef_r, r_coef_i;\n" |
"\tassign r_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n" |
"\tassign r_coef_i = r_coef_2[ (CWIDTH-1):0];\n" |
"\twire [(IWIDTH-1):0] r_left_r, r_left_i, r_right_r, r_right_i;\n" |
"\tassign r_left_r = i_left[ (2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign r_left_i = i_left[ (IWIDTH-1):0];\n" |
"\tassign r_right_r = i_right[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign r_right_i = i_right[(IWIDTH-1):0];\n" |
"\treg\t[(2*IWIDTH-1):0]\tr_left, r_right;\n" |
"\treg\t\t\t\tr_aux, r_aux_2;\n" |
"\treg\t[(2*CWIDTH-1):0]\tr_coef, r_coef_2;\n" |
"\twire\tsigned\t[(CWIDTH-1):0]\tr_coef_r, r_coef_i;\n" |
"\tassign\tr_coef_r = r_coef_2[ (2*CWIDTH-1):(CWIDTH)];\n" |
"\tassign\tr_coef_i = r_coef_2[ ( CWIDTH-1):0];\n" |
"\twire\tsigned\t[(IWIDTH-1):0]\tr_left_r, r_left_i, r_right_r, r_right_i;\n" |
"\tassign\tr_left_r = r_left[ (2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\tr_left_i = r_left[ (IWIDTH-1):0];\n" |
"\tassign\tr_right_r = r_right[(2*IWIDTH-1):(IWIDTH)];\n" |
"\tassign\tr_right_i = r_right[(IWIDTH-1):0];\n" |
"\n" |
"\treg [(IWIDTH):0] r_sum_r, r_sum_i, r_dif_r, r_dif_i;\n" |
"\treg\tsigned\t[(IWIDTH):0]\tr_sum_r, r_sum_i, r_dif_r, r_dif_i;\n" |
"\n" |
"\treg [(LGDELAY-1):0] fifo_addr;\n" |
"\twire [(LGDELAY-1):0] fifo_read_addr;\n" |
"\t/* verilator lint_off WIDTH */\n" |
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n" |
"\t/* verilator lint_on WIDTH */\n" |
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n" |
"\n" |
"\treg [(LGDELAY-1):0] fifo_addr;\n" |
"\twire [(LGDELAY-1):0] fifo_read_addr;\n" |
"\tassign fifo_read_addr = fifo_addr - MPYDELAY;\n" |
"\treg [(2*IWIDTH+2):0] fifo_left [ 0:((1<<LGDELAY)-1)];\n" |
"\n"); |
fprintf(fp, |
"\t// Set up the input to the multiply\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
635,7 → 646,11
"\t\t\tr_aux_2 <= r_aux;\n" |
"\t\t\tr_coef_2<= r_coef;\n" |
"\t\tend\n" |
"\n" |
"\n"); |
fprintf(fp, |
"\t// Don\'t forget to record the even side, since it doesn\'t need\n" |
"\t// to be multiplied, but yet we still need the results in sync\n" |
"\t// with the answer when it is ready.\n" |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
646,33 → 661,62
"\t\t\tfifo_addr <= fifo_addr + 1;\n" |
"\t\tend\n" |
"\n" |
"\twire [(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n" |
"\tassign ir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n" |
"\tassign ir_coef_i = r_coef_2[(CWIDTH-1):0];\n" |
"\twire [(IWIDTH+CWIDTH+1+2-1):0] p_one, p_two, p_three;\n" |
"\twire\tsigned\t[(CWIDTH-1):0] ir_coef_r, ir_coef_i;\n" |
"\tassign\tir_coef_r = r_coef_2[(2*CWIDTH-1):CWIDTH];\n" |
"\tassign\tir_coef_i = r_coef_2[(CWIDTH-1):0];\n" |
"\twire\tsigned\t[((IWIDTH+2)+(CWIDTH+1)-1):0]\tp_one, p_two, p_three;\n" |
"\n" |
"\t// Multiply output is always a width of IWIDTH+CWIDTH-1. ALWAYS.\n" |
"\t// We take care of dropping the width to OWIDTH in our routine\n" |
"\t// below, but this is the definition of a multiply.\n" |
"\n"); |
fprintf(fp, |
"\t// Multiply output is always a width of the sum of the widths of\n" |
"\t// the two inputs. ALWAYS. This is independent of the number of\n" |
"\t// bits in p_one, p_two, or p_three. These values needed to \n" |
"\t// accumulate a bit (or two) each. However, this approach to a\n" |
"\t// three multiply complex multiply cannot increase the total\n" |
"\t// number of bits in our final output. We\'ll take care of\n" |
"\t// dropping back down to the proper width, OWIDTH, in our routine\n" |
"\t// below.\n" |
"\n" |
"\n"); |
fprintf(fp, |
"\t// We accomplish here \"Karatsuba\" multiplication. That is,\n" |
"\t// by doing three multiplies we accomplish the work of four.\n" |
"\t// Let\'s prove to ourselves that this works ... We wish to\n" |
"\t// multiply: (a+jb) * (c+jd), where a+jb is given by\n" |
"\t//\ta + jb = r_dif_r + j r_dif_i, and\n" |
"\t//\tc + jd = ir_coef_r + j ir_coef_i.\n" |
"\t// We do this by calculating the intermediate products P1, P2,\n" |
"\t// and P3 as\n" |
"\t//\tP1 = ac\n" |
"\t//\tP2 = bd\n" |
"\t//\tP3 = (a + b) * (c + d)\n" |
"\t// and then complete our final answer with\n" |
"\t//\tac - bd = P1 - P2 (this checks)\n" |
"\t//\tad + bc = P3 - P2 - P1\n" |
"\t//\t = (ac + bc + ad + bd) - bd - ac\n" |
"\t//\t = bc + ad (this checks)\n" |
"\n" |
"\n" |
"// This should really be based upon an IF\n" |
"// if (IWIDTH < CWIDTH) then ...\n" |
"\n"); |
fprintf(fp, |
"\t// This should really be based upon an IF, such as in\n" |
"\t// if (IWIDTH < CWIDTH) then ...\n" |
"\t// However, this is the only (other) way I know to do it.\n" |
"\tgenerate\n" |
"\tif (CWIDTH < IWIDTH+1)\n" |
"\tbegin\n" |
"\t\t// We need to pad these first two multiplies by an extra\n" |
"\t\t// just to keep them aligned with the third, simpler,\n" |
"\t\t// multiply.\n" |
"\t\t// bit just to keep them aligned with the third,\n" |
"\t\t// simpler, multiply.\n" |
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p1(i_clk, i_ce,\n" |
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n" |
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r}, p_one);\n" |
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p2(i_clk, i_ce,\n" |
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r},\n" |
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i},\n" |
"\t\t\t\t{r_dif_i[IWIDTH],r_dif_i}, p_two);\n" |
"\t\tshiftaddmpy #(CWIDTH+1,IWIDTH+2) p3(i_clk, i_ce,\n" |
"\t\t\t\tir_coef_i+ir_coef_r, r_dif_r + r_dif_i, p_three);\n" |
"\t\t\t\tir_coef_i+ir_coef_r,\n" |
"\t\t\t\tr_dif_r + r_dif_i,\n" |
"\t\t\t\tp_three);\n" |
"\tend else begin\n" |
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p1a(i_clk, i_ce,\n" |
"\t\t\t\t{r_dif_r[IWIDTH],r_dif_r},\n" |
679,7 → 723,7
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_one);\n" |
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p2a(i_clk, i_ce,\n" |
"\t\t\t\t{r_dif_i[IWIDTH], r_dif_i},\n" |
"\t\t\t\t{ir_coef_r[CWIDTH-1],ir_coef_r}, p_two);\n" |
"\t\t\t\t{ir_coef_i[CWIDTH-1],ir_coef_i}, p_two);\n" |
"\t\tshiftaddmpy #(IWIDTH+2,CWIDTH+1) p3a(i_clk, i_ce,\n" |
"\t\t\t\tr_dif_r+r_dif_i,\n" |
"\t\t\t\tir_coef_i+ir_coef_r,\n" |
686,20 → 730,35
"\t\t\t\tp_three);\n" |
"\tend\n" |
"\tendgenerate\n" |
"\n" |
"\n"); |
fprintf(fp, |
"\t// These values are held in memory and delayed during the\n" |
"\t// multiply. Here, we recover them. During the multiply,\n" |
"\t// values were multiplied by 2^(CWIDTH-2)*exp{-j*2*pi*...},\n" |
"\t// therefore, the left_x values need to be right shifted by\n" |
"\t// CWIDTH-2 as well. The additional bits come from a sign\n" |
"\t// extension.\n" |
"\twire aux;\n" |
"\twire [(IWIDTH+CWIDTH):0] left_i, left_r;\n" |
"\treg [(2*IWIDTH+2):0] fifo_read;\n" |
"\tassign left_r = { fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH){1'b0}} };\n" |
"\tassign left_i = { fifo_read[((IWIDTH+1)-1):0], {(CWIDTH){1'b0}} };\n" |
"\tassign aux = fifo_read[2*IWIDTH+2];\n" |
"\twire\tsigned\t[(IWIDTH+CWIDTH):0] fifo_i, fifo_r;\n" |
"\treg\t\t[(2*IWIDTH+2):0] fifo_read;\n" |
"\tassign\tfifo_r = { {2{fifo_read[2*(IWIDTH+1)-1]}}, fifo_read[(2*(IWIDTH+1)-1):(IWIDTH+1)], {(CWIDTH-2){1'b0}} };\n" |
"\tassign\tfifo_i = { {2{fifo_read[(IWIDTH+1)-1]}}, fifo_read[((IWIDTH+1)-1):0], {(CWIDTH-2){1'b0}} };\n" |
"\tassign\taux = fifo_read[2*IWIDTH+2];\n" |
"\n" |
"\n" |
"\treg [(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i, b_right_r, b_right_i, mpy_r, mpy_i;\n" |
"\treg [(CWIDTH+IWIDTH+3-1):0] rnd;\n" |
"\tassign rnd = ((~ROUND)||(SHIFT==0))?\n" |
"\t\t\t({(CWIDTH+IWIDTH+3){1'b0}})\n" |
"\t\t\t: ({ {(OWIDTH+1+SHIFT){1'b0}},1'b1,{(CWIDTH+IWIDTH+3-2-OWIDTH-SHIFT){1'b0}} });\n" |
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] b_left_r, b_left_i,\n" |
"\t\t\t\t\t\tb_right_r, b_right_i;\n" |
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] mpy_r, mpy_i;\n" |
"\treg\tsigned\t[(CWIDTH+IWIDTH+3-1):0] rnd;\n" |
"\tgenerate\n" |
"\tif ((~ROUND)||(CWIDTH+IWIDTH-OWIDTH-SHIFT<1))\n" |
"\t\tassign rnd = ({(CWIDTH+IWIDTH+3){1'b0}});\n" |
"\telse\n" |
"\t\tassign rnd = ({ {(OWIDTH+3+SHIFT){1'b0}},1'b1,\n" |
"\t\t\t\t{(CWIDTH+IWIDTH-OWIDTH-SHIFT-1){1'b0}} });\n" |
"\tendgenerate\n" |
"\n"); |
fprintf(fp, |
"\talways @(posedge i_clk)\n" |
"\t\tif (i_ce)\n" |
"\t\tbegin\n" |
706,6 → 765,9
"\t\t\t// First clock, recover all values\n" |
"\t\t\tfifo_read <= fifo_left[fifo_read_addr];\n" |
"\t\t\t// These values are IWIDTH+CWIDTH+3 bits wide\n" |
"\t\t\t// although they only need to be (IWIDTH+1)\n" |
"\t\t\t// + (CWIDTH) bits wide. (We\'ve got two\n" |
"\t\t\t// extra bits we need to get rid of.)\n" |
"\t\t\tmpy_r <= p_one - p_two;\n" |
"\t\t\tmpy_i <= p_three - p_one - p_two;\n" |
"\n" |
712,19 → 774,45
"\t\t\t// Second clock, round and latch for final clock\n" |
"\t\t\tb_right_r <= mpy_r + rnd;\n" |
"\t\t\tb_right_i <= mpy_i + rnd;\n" |
"\t\t\tb_left_r <= { {2{left_r[(IWIDTH+CWIDTH)]}},left_r } + rnd;\n" |
"\t\t\tb_left_i <= { {2{left_i[(IWIDTH+CWIDTH)]}},left_i } + rnd;\n" |
"\t\t\tb_left_r <= { {2{fifo_r[(IWIDTH+CWIDTH)]}},fifo_r } + rnd;\n" |
"\t\t\tb_left_i <= { {2{fifo_i[(IWIDTH+CWIDTH)]}},fifo_i } + rnd;\n" |
"\t\t\to_aux <= aux;\n" |
"\t\tend\n" |
"\n" |
"\n"); |
fprintf(fp, |
"\t// Final clock--clock and remove unnecessary bits.\n" |
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to OWIDTH,\n" |
"\t// and SHIFT by SHIFT bits in the process.\n" |
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n" |
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n" |
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n" |
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH+2-SHIFT):(CWIDTH+IWIDTH+2-SHIFT-OWIDTH+1)];\n" |
"\t// We have (IWIDTH+CWIDTH+3) bits here, we need to drop down to\n" |
"\t// OWIDTH, and SHIFT by SHIFT bits in the process. The trick is\n" |
"\t// that we don\'t need (IWIDTH+CWIDTH+3) bits. We\'ve accumulated\n" |
"\t// them, but the actual values will never fill all these bits.\n" |
"\t// In particular, we only need:\n" |
"\t//\t IWIDTH bits for the input\n" |
"\t//\t +1 bit for the add/subtract\n" |
"\t//\t+CWIDTH bits for the coefficient multiply\n" |
"\t//\t +1 bit for the add/subtract in the complex multiply\n" |
"\t//\t ------\n" |
"\t//\t (IWIDTH+CWIDTH+2) bits at full precision.\n" |
"\t//\n" |
"\t// However, the coefficient multiply multiplied by a maximum value\n" |
"\t// of 2^(CWIDTH-2). Thus, we only have\n" |
"\t//\t IWIDTH bits for the input\n" |
"\t//\t +1 bit for the add/subtract\n" |
"\t//\t+CWIDTH-2 bits for the coefficient multiply\n" |
"\t//\t +1 (optional) bit for the add/subtract in the cpx mpy.\n" |
"\t//\t -------- ... multiply. (This last bit may be shifted out.)\n" |
"\t//\t (IWIDTH+CWIDTH) valid output bits. \n" |
"\t// Now, if the user wants to keep any extras of these (via OWIDTH),\n" |
"\t// or if he wishes to arbitrarily shift some of these off (via\n" |
"\t// SHIFT) we accomplish that here.\n" |
"\tassign o_left_r = b_left_r[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n" |
"\tassign o_left_i = b_left_i[ (CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n" |
"\tassign o_right_r = b_right_r[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n" |
"\tassign o_right_i = b_right_i[(CWIDTH+IWIDTH-1-SHIFT-1):(CWIDTH+IWIDTH-OWIDTH-SHIFT-1)];\n" |
"\n" |
"\t// As a final step, we pack our outputs into two packed two\'s\n" |
"\t// complement numbers per output word, so that each output word\n" |
"\t// has (2*OWIDTH) bits in it, with the top half being the real\n" |
"\t// portion and the bottom half being the imaginary portion.\n" |
"\tassign o_left = { o_left_r, o_left_i };\n" |
"\tassign o_right= { o_right_r,o_right_i};\n" |
"\n" |
907,13 → 995,14
"\t\t\tend else\n" |
"\t\t\t\to_sync <= 1'b0;\n" |
"\t\tend\n" |
"\n" |
"\n", (inv)?"i":""); |
fprintf(fstage, |
"\tbutterfly #(.IWIDTH(IWIDTH),.CWIDTH(CWIDTH),.OWIDTH(OWIDTH),\n" |
"\t\t\t.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n" |
"\t\t\t.MPYDELAY(%d\'d%d),.LGDELAY(LGBDLY),.SHIFT(BFLYSHIFT))\n" |
"\t\tbfly(i_clk, (b_ce&i_ce), ib_c,\n" |
"\t\t\tib_a, ib_b, ib_sync, ob_a, ob_b, ob_sync);\n" |
"endmodule;\n", |
(inv)?"i":""); |
lgdelay(nbits, xtra), (1<xtra)?(nbits+4):(nbits+xtra+3)); |
} |
|
void usage(void) { |
/sw/Makefile
9,17 → 9,17
# Pattern rule: build any object file from the C++ source of the same stem.
# $@ is the target (.o) and $< is the first prerequisite (.cpp).
%.o: %.cpp
	$(CXX) -c -o $@ $<
|
.PHONY: alltest |
alltest: test itest shiftaddmpy butterfly dblreverse qtrstage dblstage |
.PHONY: test |
test: fft ifft shiftaddmpy butterfly dblreverse qtrstage dblstage |
|
.PHONY: test |
test: fftgen |
.PHONY: fft |
fft: fftgen |
./fftgen -f 2048 -n 16 |
cd $(CORED)/; verilator -cc fftmain.v |
cd $(OBJDR); make -f Vfftmain.mk |
|
.PHONY: itest |
itest: fftgen |
.PHONY: ifft |
ifft: fftgen |
./fftgen -f 2048 -1 -n 24 -m 24 |
cd $(CORED)/; verilator -cc ifftmain.v |
cd $(OBJDR); make -f Vifftmain.mk |
37,7 → 37,7
.PHONY: butterfly |
butterfly: $(OBJDR)/Vbutterfly__ALL.a |
|
$(CORED)/butterfly.v: test |
$(CORED)/butterfly.v: fft |
$(OBJDR)/Vbutterfly.cpp $(OBJDR)/Vbutterfly.h: $(CORED)/butterfly.v |
cd $(CORED)/; verilator -cc butterfly.v |
$(OBJDR)/Vbutterfly__ALL.a: $(OBJDR)/Vbutterfly.h |
47,7 → 47,7
.PHONY: dblreverse |
dblreverse: $(OBJDR)/Vdblreverse__ALL.a |
|
$(CORED)/dblreverse.v: test |
$(CORED)/dblreverse.v: fft |
$(OBJDR)/Vdblreverse.cpp $(OBJDR)/Vdblreverse.h: $(CORED)/dblreverse.v |
cd $(CORED)/; verilator -cc dblreverse.v |
$(OBJDR)/Vdblreverse__ALL.a: $(OBJDR)/Vdblreverse.h |
57,7 → 57,7
.PHONY: qtrstage |
qtrstage: $(OBJDR)/Vqtrstage__ALL.a |
|
$(CORED)/qtrstage.v: test |
$(CORED)/qtrstage.v: fft |
$(OBJDR)/Vqtrstage.cpp $(OBJDR)/Vqtrstage.h: $(CORED)/qtrstage.v |
cd $(CORED)/; verilator -cc qtrstage.v |
$(OBJDR)/Vqtrstage__ALL.a: $(OBJDR)/Vqtrstage.h |
67,7 → 67,7
.PHONY: dblstage |
dblstage: $(OBJDR)/Vdblstage__ALL.a |
|
$(CORED)/dblstage.v: test |
$(CORED)/dblstage.v: fft |
$(OBJDR)/Vdblstage.cpp $(OBJDR)/Vdblstage.h: $(CORED)/dblstage.v |
cd $(CORED)/; verilator -cc dblstage.v |
$(OBJDR)/Vdblstage__ALL.a: $(OBJDR)/Vdblstage.h |
74,6 → 74,7
$(OBJDR)/Vdblstage__ALL.a: $(OBJDR)/Vdblstage.cpp |
cd $(OBJDR)/; make -f Vdblstage.mk |
|
# Remove the generator binary, its object file, and the whole $(CORED)
# directory.  -f makes the first rm succeed even when fftgen/fftgen.o do not
# exist (for example on a repeated `make clean`); without it the recipe would
# stop with an error before $(CORED) is removed.
.PHONY: clean
clean:
	rm -f fftgen fftgen.o
	rm -rf $(CORED)