URL https://opencores.org/ocsvn/dblclockfft/dblclockfft/trunk

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [rtl/] [longbimpy.v] - Blame information for rev 36

Go to most recent revision | Details | Compare with Previous | View Log


////////////////////////////////////////////////////////////////////////////////
//
// Filename:    ../rtl/longbimpy.v
//
// Project:     A General Purpose Pipelined FFT Implementation
//
// Purpose:     A portable shift and add multiply, built with the knowledge
//      of the existence of a six bit LUT and carry chain.  That knowledge
//      allows us to multiply two bits from one value at a time against all
//      of the bits of the other value.  This sub multiply is called the
//      bimpy.
//
//      For minimal processing delay, make the first parameter the one with
//      the least bits, so that AWIDTH <= BWIDTH.
//
//
//
// Creator:     Dan Gisselquist, Ph.D.
//              Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2018, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program.  (It's in the $(ROOT)/doc directory, run make with no
// target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
//
// License:     GPL, v3, as defined and found on www.gnu.org,
//              http://www.gnu.org/licenses/gpl.html
//
//
////////////////////////////////////////////////////////////////////////////////
//
//
`default_nettype        none
//
// longbimpy: a pipelined, signed, shift-and-add multiplier.
//
// Ports:
//   i_clk          Clock.  Every register below updates on its rising edge.
//   i_ce           Clock enable.  No register in this module changes unless
//                  i_ce is high, so the whole pipeline stalls when it is low.
//   i_a_unsorted   First operand, IAW bits.  Its top bit is used as a sign bit.
//   i_b_unsorted   Second operand, IBW bits.  Its top bit is used as a sign bit.
//   o_r            Registered product, AW+BW (== IAW+IBW) bits wide.
//
// Method: both operands are converted to sign + magnitude, the magnitudes
// are multiplied LUTB (two) bits of the narrower operand at a time using the
// external bimpy module, the partial products are accumulated one tableau
// row per clock, and the sign is re-applied on the last clock.
//
// NOTE(review): the sign shift register r_s lines up with the final
// accumulator only if bimpy has exactly one clock of latency.  bimpy is
// defined elsewhere -- confirm before changing either module's timing.
module  longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);
        parameter       IAW=8,  // The width of i_a, min width is 5
                        IBW=12, // The width of i_b, can be anything
                        // The following parameter should not be changed by
                        // any implementation; it is derived from the values
                        // above.  (It is not referenced anywhere else in
                        // this module: o_r is sized as AW+BW, which equals
                        // IAW+IBW.)
                        OW=IAW+IBW;     // The output width
        localparam      AW = (IAW<IBW) ? IAW : IBW, // Narrower of IAW, IBW
                        BW = (IAW<IBW) ? IBW : IAW, // Wider of IAW, IBW
                        IW=(AW+1)&(-2), // Internal width of A (AW rounded
                                        // up to an even number of bits)
                        LUTB=2, // How many bits we can multiply by at once
                        TLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau
        // The inputs carry an explicit net type.  This file is compiled
        // under `default_nettype none, where a port without a net type
        // would otherwise depend on an implicit net declaration.
        input   wire                    i_clk, i_ce;
        input   wire    [(IAW-1):0]     i_a_unsorted;
        input   wire    [(IBW-1):0]     i_b_unsorted;
        output  reg     [(AW+BW-1):0]   o_r;

        //
        // Swap parameter order, so that AW <= BW -- for performance
        // reasons
        wire    [AW-1:0] i_a;
        wire    [BW-1:0] i_b;
        generate if (IAW <= IBW)
        begin : NO_PARAM_CHANGE
                assign i_a = i_a_unsorted;
                assign i_b = i_b_unsorted;
        end else begin : SWAP_PARAMETERS
                assign i_a = i_b_unsorted;
                assign i_b = i_a_unsorted;
        end endgenerate

        // Magnitudes of the two operands and the sign of their product
        reg     [(IW-1):0]       u_a;
        reg     [(BW-1):0]       u_b;
        reg                     sgn;

        // Pipeline storage:
        //   r_a[k]  bits of u_a that have not yet been multiplied at stage k
        //   r_b[k]  copy of u_b delayed to match r_a[k]
        //   r_s     shift register carrying sgn alongside the pipeline
        //   acc[k]  running sum of the partial products
        reg     [(IW-1-2*(LUTB)):0]      r_a[0:(TLEN-3)];
        reg     [(BW-1):0]               r_b[0:(TLEN-3)];
        reg     [(TLEN-1):0]             r_s;
        reg     [(IW+BW-1):0]            acc[0:(TLEN-2)];
        genvar k;

        // First step:
        // Switch to unsigned arithmetic for our multiply, keeping track
        // of the sign along the way.  We'll then add the sign again later
        // at the end.
        //
        // If we were forced to stay within two's complement arithmetic,
        // taking the absolute value here would require an additional bit.
        // However, because our results are now unsigned, we can stay
        // within the number of bits given (for now).
        generate if (IW > AW)
        begin : ABS_AND_ADD_BIT_TO_A
                // AW is odd here, so IW == AW+1: pad the magnitude with a
                // leading zero so that u_a splits into whole LUTB-bit groups.
                always @(posedge i_clk)
                        if (i_ce)
                                u_a <= { 1'b0, (i_a[AW-1])?(-i_a):(i_a) };
        end else begin : ABS_A
                always @(posedge i_clk)
                        if (i_ce)
                                u_a <= (i_a[AW-1])?(-i_a):(i_a);
        end endgenerate

        always @(posedge i_clk)
                if (i_ce)
                begin
                        u_b <= (i_b[BW-1])?(-i_b):(i_b);
                        // The product is negative when exactly one operand
                        // has its sign bit set
                        sgn <= i_a[AW-1] ^ i_b[BW-1];
                end

        wire    [(BW+LUTB-1):0]  pr_a, pr_b;

        //
        // Second step: First two 2xN products.
        //
        // Since we have no tableau of additions (yet), we can do both
        // of the first two rows at the same time and add them together.
        // Every later stage already has a running sum to accumulate into,
        // so each of those handles only one new product per clock--only
        // this first stage can do two at a time.
        bimpy   #(BW) lmpy_0(i_clk,i_ce,u_a[(  LUTB-1):   0], u_b, pr_a);
        bimpy   #(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);
        // Hold on to the bits of u_a that the two multiplies above did not
        // consume, together with a matching copy of u_b
        always @(posedge i_clk)
                if (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];
        always @(posedge i_clk)
                if (i_ce) r_b[0] <= u_b;
        // Shift the sign along, one position per enabled clock
        always @(posedge i_clk)
                if (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };
        // pr_b comes from the next LUTB bits up, so it is weighted by
        // 2^LUTB relative to pr_a before the two are summed
        always @(posedge i_clk) // One clk after p[0],p[1] become valid
                if (i_ce) acc[0] <= { {(IW-LUTB){1'b0}}, pr_a}
                          +{ {(IW-(2*LUTB)){1'b0}}, pr_b, {(LUTB){1'b0}} };

        generate // Keep track of intermediate values, before multiplying them
        if (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)
        begin : gencopies
                always @(posedge i_clk)
                if (i_ce)
                begin
                        // Drop the LUTB bits consumed by stage k and
                        // zero-fill from the top
                        r_a[k+1] <= { {(LUTB){1'b0}},
                                r_a[k][(IW-1-(2*LUTB)):LUTB] };
                        r_b[k+1] <= r_b[k];
                end
        end endgenerate

        generate // The actual multiply and accumulate stage
        if (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)
        begin : genstages
                // First, the multiply: 2-bits times BW bits
                wire    [(BW+LUTB-1):0] genp;
                bimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);

                // Then the accumulate step -- on the next clock.  Stage k
                // handles the (k+2)'th LUTB-bit group of u_a, so its
                // product is shifted left by LUTB*(k+2) bits.
                always @(posedge i_clk)
                        if (i_ce)
                                acc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1'b0}},
                                        genp, {(LUTB*(k+2)){1'b0}} };
        end endgenerate

        // Last step: put the sign back, and register the low AW+BW bits
        // of the result as the output.
        wire    [(IW+BW-1):0]    w_r;
        assign  w_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];
        always @(posedge i_clk)
                if (i_ce)
                        o_r <= w_r[(AW+BW-1):0];

        // When A was padded to an even width, the top IW-AW bits of w_r
        // are never used.  Tie them to a dummy wire, with Verilator's
        // UNUSED lint warning switched off around it.
        generate if (IW > AW)
        begin : VUNUSED
                // verilator lint_off UNUSED
                wire    [(IW-AW)-1:0]    unused;
                assign  unused = w_r[(IW+BW-1):(AW+BW)];
                // verilator lint_on UNUSED
        end endgenerate

endmodule

Line No.	Rev	Author	Line
1	36	dgisselq	`////////////////////////////////////////////////////////////////////////////////`
2			`//`
3			`// Filename: ../rtl/longbimpy.v`
4			`//`
5			`// Project: A General Purpose Pipelined FFT Implementation`
6			`//`
7			`// Purpose: A portable shift and add multiply, built with the knowledge`
8			`// of the existence of a six bit LUT and carry chain. That knowledge`
9			`// allows us to multiply two bits from one value at a time against all`
10			`// of the bits of the other value. This sub multiply is called the`
11			`// bimpy.`
12			`//`
13			`// For minimal processing delay, make the first parameter the one with`
14			`// the least bits, so that AWIDTH <= BWIDTH.`
15			`//`
16			`//`
17			`//`
18			`// Creator: Dan Gisselquist, Ph.D.`
19			`// Gisselquist Technology, LLC`
20			`//`
21			`////////////////////////////////////////////////////////////////////////////////`
22			`//`
23			`// Copyright (C) 2015-2018, Gisselquist Technology, LLC`
24			`//`
25			`// This program is free software (firmware): you can redistribute it and/or`
26			`// modify it under the terms of the GNU General Public License as published`
27			`// by the Free Software Foundation, either version 3 of the License, or (at`
28			`// your option) any later version.`
29			`//`
30			`// This program is distributed in the hope that it will be useful, but WITHOUT`
31			`// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
32			`// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
33			`// for more details.`
34			`//`
35			`// You should have received a copy of the GNU General Public License along`
36			`// with this program. (It's in the $(ROOT)/doc directory, run make with no`
37			`// target there if the PDF file isn't present.) If not, see`
38			`// <http://www.gnu.org/licenses/> for a copy.`
39			`//`
40			`// License: GPL, v3, as defined and found on www.gnu.org,`
41			`// http://www.gnu.org/licenses/gpl.html`
42			`//`
43			`//`
44			`////////////////////////////////////////////////////////////////////////////////`
45			`//`
46			`//`
47			`` `default_nettype none ``
48			`//`
49			`module longbimpy(i_clk, i_ce, i_a_unsorted, i_b_unsorted, o_r);`
50			`parameter IAW=8, // The width of i_a, min width is 5`
51			`IBW=12, // The width of i_b, can be anything`
52			`// The following three parameters should not be changed`
53			`// by any implementation, but are based upon hardware`
54			`// and the above values:`
55			`OW=IAW+IBW; // The output width`
56			`localparam AW = (IAW<IBW) ? IAW : IBW,`
57			`BW = (IAW<IBW) ? IBW : IAW,`
58			`IW=(AW+1)&(-2), // Internal width of A`
59			`LUTB=2, // How many bits we can multiply by at once`
60			`TLEN=(AW+(LUTB-1))/LUTB; // Nmbr of rows in our tableau`
61			`input i_clk, i_ce;`
62			`input [(IAW-1):0] i_a_unsorted;`
63			`input [(IBW-1):0] i_b_unsorted;`
64			`output reg [(AW+BW-1):0] o_r;`
65
66			`//`
67			`// Swap parameter order, so that AW <= BW -- for performance`
68			`// reasons`
69			`wire [AW-1:0] i_a;`
70			`wire [BW-1:0] i_b;`
71			`generate if (IAW <= IBW)`
72			`begin : NO_PARAM_CHANGE`
73			`assign i_a = i_a_unsorted;`
74			`assign i_b = i_b_unsorted;`
75			`end else begin : SWAP_PARAMETERS`
76			`assign i_a = i_b_unsorted;`
77			`assign i_b = i_a_unsorted;`
78			`end endgenerate`
79
80			`reg [(IW-1):0] u_a;`
81			`reg [(BW-1):0] u_b;`
82			`reg sgn;`
83
84			`reg [(IW-1-2*(LUTB)):0] r_a[0:(TLEN-3)];`
85			`reg [(BW-1):0] r_b[0:(TLEN-3)];`
86			`reg [(TLEN-1):0] r_s;`
87			`reg [(IW+BW-1):0] acc[0:(TLEN-2)];`
88			`genvar k;`
89
90			`// First step:`
91			`// Switch to unsigned arithmetic for our multiply, keeping track`
92			`// of the sign along the way. We'll then add the sign again later at`
93			`// the end.`
94			`//`
95			`// If we were forced to stay within two's complement arithmetic,`
96			`// taking the absolute value here would require an additional bit.`
97			`// However, because our results are now unsigned, we can stay`
98			`// within the number of bits given (for now).`
99			`generate if (IW > AW)`
100			`begin`
101			`always @(posedge i_clk)`
102			`if (i_ce)`
103			`u_a <= { 1'b0, (i_a[AW-1])?(-i_a):(i_a) };`
104			`end else begin`
105			`always @(posedge i_clk)`
106			`if (i_ce)`
107			`u_a <= (i_a[AW-1])?(-i_a):(i_a);`
108			`end endgenerate`
109
110			`always @(posedge i_clk)`
111			`if (i_ce)`
112			`begin`
113			`u_b <= (i_b[BW-1])?(-i_b):(i_b);`
114			`sgn <= i_a[AW-1] ^ i_b[BW-1];`
115			`end`
116
117			`wire [(BW+LUTB-1):0] pr_a, pr_b;`
118
119			`//`
120			`// Second step: First two 2xN products.`
121			`//`
122			`// Since we have no tableau of additions (yet), we can do both`
123			`// of the first two rows at the same time and add them together.`
124			`// For the next round, we'll then have a previous sum to accumulate`
125			`// with new and subsequent product, and so only do one product at`
126			`// a time can follow this--but the first clock can do two at a time.`
127			`bimpy #(BW) lmpy_0(i_clk,i_ce,u_a[( LUTB-1): 0], u_b, pr_a);`
128			`bimpy #(BW) lmpy_1(i_clk,i_ce,u_a[(2*LUTB-1):LUTB], u_b, pr_b);`
129			`always @(posedge i_clk)`
130			`if (i_ce) r_a[0] <= u_a[(IW-1):(2*LUTB)];`
131			`always @(posedge i_clk)`
132			`if (i_ce) r_b[0] <= u_b;`
133			`always @(posedge i_clk)`
134			`if (i_ce) r_s <= { r_s[(TLEN-2):0], sgn };`
135			`always @(posedge i_clk) // One clk after p[0],p[1] become valid`
136			`if (i_ce) acc[0] <= { {(IW-LUTB){1'b0}}, pr_a}`
137			`+{ {(IW-(2*LUTB)){1'b0}}, pr_b, {(LUTB){1'b0}} };`
138
139			`generate // Keep track of intermediate values, before multiplying them`
140			`if (TLEN > 3) for(k=0; k<TLEN-3; k=k+1)`
141			`begin : gencopies`
142			`always @(posedge i_clk)`
143			`if (i_ce)`
144			`begin`
145			`r_a[k+1] <= { {(LUTB){1'b0}},`
146			`r_a[k][(IW-1-(2*LUTB)):LUTB] };`
147			`r_b[k+1] <= r_b[k];`
148			`end`
149			`end endgenerate`
150
151			`generate // The actual multiply and accumulate stage`
152			`if (TLEN > 2) for(k=0; k<TLEN-2; k=k+1)`
153			`begin : genstages`
154			`// First, the multiply: 2-bits times BW bits`
155			`wire [(BW+LUTB-1):0] genp;`
156			`bimpy #(BW) genmpy(i_clk,i_ce,r_a[k][(LUTB-1):0],r_b[k], genp);`
157
158			`// Then the accumulate step -- on the next clock`
159			`always @(posedge i_clk)`
160			`if (i_ce)`
161			`acc[k+1] <= acc[k] + {{(IW-LUTB*(k+3)){1'b0}},`
162			`genp, {(LUTB*(k+2)){1'b0}} };`
163			`end endgenerate`
164
165			`wire [(IW+BW-1):0] w_r;`
166			`assign w_r = (r_s[TLEN-1]) ? (-acc[TLEN-2]) : acc[TLEN-2];`
167			`always @(posedge i_clk)`
168			`if (i_ce)`
169			`o_r <= w_r[(AW+BW-1):0];`
170
171			`generate if (IW > AW)`
172			`begin : VUNUSED`
173			`// verilator lint_off UNUSED`
174			`wire [(IW-AW)-1:0] unused;`
175			`assign unused = w_r[(IW+BW-1):(AW+BW)];`
176			`// verilator lint_on UNUSED`
177			`end endgenerate`
178
179			`endmodule`

Browse

Tools

Subversion Repositories dblclockfft

[/] [dblclockfft/] [trunk/] [rtl/] [longbimpy.v] - Blame information for rev 36