URL https://opencores.org/ocsvn/zipcpu/zipcpu/trunk

Subversion Repositories zipcpu

[/] [zipcpu/] [trunk/] [rtl/] [core/] [mpyop.v] - Blame information for rev 209

Details | Compare with Previous | View Log


////////////////////////////////////////////////////////////////////////////////
//
// Filename:    mpyop.v
//
// Project:     Zip CPU -- a small, lightweight, RISC CPU soft core
//
// Purpose:     This code has been pulled from the cpuops.v file so as to
//              encapsulate the multiply component--the one component that
//      (can't be) formally verified well, and so must be abstracted away.
//      This separation was done to support potential future abstraction.
//
//
// Creator:     Dan Gisselquist, Ph.D.
//              Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2019, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program.  (It's in the $(ROOT)/doc directory.  Run make with no
// target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
//
// License:     GPL, v3, as defined and found on www.gnu.org,
//              http://www.gnu.org/licenses/gpl.html
//
//
////////////////////////////////////////////////////////////////////////////////
//
//
`default_nettype        none
//
module  mpyop(i_clk,i_reset, i_stb, i_op, i_a, i_b, o_valid, o_busy, o_result, o_hi);
        //
        // mpyop -- 32x32 bit multiply, with the implementation chosen at
        // elaboration time by IMPLEMENT_MPY:
        //
        //   0     MPYNONE  no multiplier; o_result is tied to zero
        //   1     MPY1CK   purely combinatorial product of i_a and i_b
        //   2     MPY2CK   operands registered, product combinatorial
        //   3     MPY3CK   operands registered, then product registered
        //   4     MPY4CK   operands registered, 16x16 partial products
        //                  registered, then their sum registered
        //   else  MPYSLOW  product delegated to a slowmpy instance
        //
        // In every option i_op[0] selects a signed (1) or unsigned (0)
        // interpretation of i_a and i_b, and i_op[1] is what ends up on o_hi.
        // i_reset, where it is used at all, only clears the mpypipe valid
        // tracking registers; the data registers are never reset.
        //
        // The following parameter selects which multiply algorithm we use.
        // Timing performance is strictly dependent upon it.
        parameter       IMPLEMENT_MPY = 1;
        input   wire            i_clk, i_reset, i_stb;
        input   wire    [1:0]    i_op; // 2'b00=MPY, 2'b10=MPYUHI, 2'b11=MPYSHI
        input   wire    [31:0]   i_a, i_b;
        output  wire            o_valid; // True if we'll be valid on the next clock;
        output  wire            o_busy; // The multiply is busy if true
        output  wire    [63:0]   o_result; // Where we dump the multiply result
        output  reg             o_hi;   // Return the high half of the multiply
 
 
        // A 4-way multiplexer can be done in one 6-LUT.
        // A 16-way multiplexer can therefore be done in 4x 6-LUT's with
        //      the Xilinx multiplexer fabric that follows. 
        // Given that we wish to apply this multiplexer approach to 33-bits,
        // this will cost a minimum of 132 6-LUTs.
 
// i_stb instead of this_is_a_multiply_op
// o_result
// o_busy
// o_done
        generate
        if (IMPLEMENT_MPY == 0)
        begin : MPYNONE // No multiply support.
 
                // With no multiplier present the result is a constant zero,
                // the unit is never busy, and o_valid simply follows i_stb.
                assign  o_result   = 64'h00;
                assign  o_busy     = 1'b0;
                assign  o_valid    = i_stb;
                always @(*) o_hi = 1'b0; // Not needed
 
`ifdef  VERILATOR
                // Collect the inputs this option never reads into one wire,
                // declared between the lint_off/lint_on pragmas below.
                // Width: 1+1+1+2+32+32 = 69 bits = 32+32+5.
                // verilator lint_off UNUSED
                wire    [32+32+5-1:0]    mpy_unused;
                assign  mpy_unused = { i_clk, i_reset, i_stb, i_op, i_a, i_b };
                // verilator lint_on  UNUSED
`endif
        end else begin : IMPY
        if (IMPLEMENT_MPY == 1)
        begin : MPY1CK // Our single clock option (no extra clocks)
 
                // Each operand is widened to 64 bits.  The upper 32 bits are
                // copies of the operand's MSB when i_op[0] (signed) is set,
                // and zero otherwise, so one signed 64x64 multiply serves
                // both the signed and the unsigned case.
                wire    signed  [63:0]   w_mpy_a_input, w_mpy_b_input;
 
                assign  w_mpy_a_input = {{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
                assign  w_mpy_b_input = {{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
 
                // No registers at all: o_result follows i_a, i_b and i_op
                // combinatorially.
                assign  o_result = w_mpy_a_input * w_mpy_b_input;
 
                // o_valid is never raised in this option.
                // NOTE(review): presumably the instantiating logic uses
                // o_result directly in the same clock and does not wait for
                // o_valid -- confirm against the parent module.
                assign  o_busy  = 1'b0;
                assign  o_valid = 1'b0;
                always @(*) o_hi = i_op[1];
 
`ifdef  VERILATOR
                // Reference the inputs this option does not otherwise need
                // inside the lint_off/lint_on pragmas below.  (i_op[1] is
                // listed here although it also drives o_hi above.)
                // verilator lint_off UNUSED
                wire    [3:0]    mpy_unused;
                assign  mpy_unused = { i_clk, i_reset, i_stb, i_op[1] };
                // verilator lint_on  UNUSED
`endif
 
        end else begin: MPN1
        if (IMPLEMENT_MPY == 2)
        begin : MPY2CK // Our two clock option (ALU must pause for 1 clock)
 
                // First clock: register the operands, widened to 64 bits
                // exactly as in the single clock option.  These registers
                // load on every clock, whether or not i_stb is set.
                reg     signed  [63:0]   r_mpy_a_input, r_mpy_b_input;
                always @(posedge i_clk)
                begin
                        r_mpy_a_input <={{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
                        r_mpy_b_input <={{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
                end
 
                // The product is combinatorial from the registered operands.
                assign  o_result = r_mpy_a_input * r_mpy_b_input;
                assign  o_busy  = 1'b0;
 
                // mpypipe is i_stb delayed by one clock (cleared by reset),
                // so it is high while the registered operands belong to a
                // strobed request.
                reg     mpypipe;
                initial mpypipe = 1'b0;
                always @(posedge i_clk)
                        if (i_reset)
                                mpypipe <= 1'b0;
                        else
                                mpypipe <= (i_stb);
 
                assign  o_valid = mpypipe; // this_is_a_multiply_op;
                // Remember i_op[1] of the most recently strobed request
                always @(posedge i_clk)
                if (i_stb)
                        o_hi  <= i_op[1];
 
        end else begin : MPN2
        if (IMPLEMENT_MPY == 3)
        begin : MPY3CK // Our three clock option (ALU pauses for 2 clocks)
                // Both a signed and an unsigned product are computed from the
                // same registered operands; the (delayed) sign request picks
                // between them at the output.
                reg     signed  [63:0]   r_smpy_result;
                reg             [63:0]   r_umpy_result;
                reg     signed  [31:0]   r_mpy_a_input, r_mpy_b_input;
                reg             [1:0]    mpypipe;
                reg             [1:0]    r_sgn;
 
                // Two bit shift register following i_stb: bit 0 is the
                // request one clock later, bit 1 two clocks later.
                initial mpypipe = 2'b0;
                always @(posedge i_clk)
                        if (i_reset)
                                mpypipe <= 2'b0;
                        else
                        mpypipe <= { mpypipe[0], i_stb };
 
                // First clock
                // Register the raw 32-bit operands, and shift i_op[0] into
                // r_sgn so that r_sgn[1] is two clocks behind the inputs,
                // the same delay as the registered products below.
                always @(posedge i_clk)
                begin
                        r_mpy_a_input <= i_a[31:0];
                        r_mpy_b_input <= i_b[31:0];
                        r_sgn <= { r_sgn[0], i_op[0] };
                end
 
                // Second clock
`ifdef  VERILATOR
                // Under Verilator the operands are explicitly sign extended
                // (signed product) or zero extended (unsigned product) to
                // 64 bits before being multiplied.
                wire    signed  [63:0]   s_mpy_a_input, s_mpy_b_input;
                wire            [63:0]   u_mpy_a_input, u_mpy_b_input;
 
                assign  s_mpy_a_input = {{(32){r_mpy_a_input[31]}},r_mpy_a_input};
                assign  s_mpy_b_input = {{(32){r_mpy_b_input[31]}},r_mpy_b_input};
                assign  u_mpy_a_input = {32'h00,r_mpy_a_input};
                assign  u_mpy_b_input = {32'h00,r_mpy_b_input};
                always @(posedge i_clk)
                        r_smpy_result <= s_mpy_a_input * s_mpy_b_input;
                always @(posedge i_clk)
                        r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
`else
 
                // Otherwise the 32-bit operands are multiplied directly and
                // assigned into the 64-bit result registers.  The unsigned
                // wires are plain (unsigned) copies of the signed operand
                // registers.
                wire            [31:0]   u_mpy_a_input, u_mpy_b_input;
 
                assign  u_mpy_a_input = r_mpy_a_input;
                assign  u_mpy_b_input = r_mpy_b_input;
 
                always @(posedge i_clk)
                        r_smpy_result <= r_mpy_a_input * r_mpy_b_input;
                always @(posedge i_clk)
                        r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
`endif
 
                // Remember i_op[1] of the most recently strobed request
                always @(posedge i_clk)
                if (i_stb)
                        o_hi  <= i_op[1];
                // Busy during the clock in which the product is being
                // registered; valid on the clock after that.
                assign  o_busy  = mpypipe[0];
                assign  o_result = (r_sgn[1])?r_smpy_result:r_umpy_result;
                assign  o_valid = mpypipe[1];
 
                // Results are then set on the third clock
        end else begin : MPN3
        if (IMPLEMENT_MPY == 4)
        begin : MPY4CK // Our four clock option
                reg     [63:0]   r_mpy_result;
                reg     [31:0]   r_mpy_a_input, r_mpy_b_input;
                reg             r_mpy_signed;
                reg     [2:0]    mpypipe;
 
                // First clock, latch in the inputs
                initial mpypipe = 3'b0;
                always @(posedge i_clk)
                begin
                        // mpypipe indicates we have a multiply in the
                        // pipeline.  In this case, the multiply
                        // pipeline has three register stages (inputs,
                        // partial products, sum), so we need three
                        // bits in the pipe.
                        if (i_reset)
                                mpypipe <= 3'h0;
                        else begin
                                mpypipe[0] <= i_stb;
                                mpypipe[1] <= mpypipe[0];
                                mpypipe[2] <= mpypipe[1];
                        end
 
                        // For a signed multiply the sign bit of each operand
                        // is inverted, i.e. 2^31 is added to the operand so
                        // that it can be treated as unsigned below.  The
                        // pp_s term later removes the offset again.
                        if (i_op[0]) // i.e. if signed multiply
                        begin
                                r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
                                r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
                        end else begin
                                r_mpy_a_input <= i_a[31:0];
                                r_mpy_b_input <= i_b[31:0];
                        end
                        // The signed bit really only matters in the
                        // case of 64 bit multiply.  We'll keep track
                        // of it, though, and pretend in all other
                        // cases.
                        r_mpy_signed  <= i_op[0];
 
                        if (i_stb)
                                o_hi  <= i_op[1];
                end
 
                // Busy while a request is in either of the first two stages;
                // valid once it has reached the third (the summed result).
                assign  o_busy  = |mpypipe[1:0];
                assign  o_valid = mpypipe[2];
 
                // Second clock, do the multiplies, get the "partial
                // products".  Here, we break our input up into two
                // halves, 
                //
                //   A  = (2^16 ah + al)
                //   B  = (2^16 bh + bl)
                //
                // and use these to compute partial products.
                //
                //   AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)
                //
                // Since we're following the FOIL algorithm to get here,
                // we'll name these partial products according to FOIL.
                //
                // The trick is what happens if A or B is signed.  In
                // those cases, the real value of A will not be given by
                //      A = (2^16 ah + al)
                // but rather
                //      A = (2^16 ah[31^] + al) - 2^31
                //  (where we have flipped the sign bit of A)
                // and so ...
                //
                // AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
                //      = 2^32(ah*bh)
                //              +2^16 (ah*bl+al*bh)
                //              +(al*bl)
                //              - 2^31 (2^16 bh+bl + 2^16 ah+al)
                //              + 2^62
                //      = 2^32(ah*bh)
                //              +2^16 (ah*bl+al*bh)
                //              +(al*bl)
                //              + 2^31 (2^31 - (2^16 bh+bl + 2^16 ah+al))
                //
                // The last term is what pp_s holds (before the 2^31
                // scaling, which is applied in the third clock).
                //
                reg     [31:0]   pp_f, pp_l; // F and L from FOIL
                reg     [32:0]   pp_oi; // The O and I from FOIL
                reg     [32:0]   pp_s;
                always @(posedge i_clk)
                begin
                        pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
                        pp_oi<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0]
                                + r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
                        pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
                        // And a special one for the sign
                        // pp_s = 2^31 - (A' + B'), kept to 33 bits, where
                        // A' and B' are the sign-flipped operands.  It is
                        // zero for an unsigned multiply.
                        if (r_mpy_signed)
                                pp_s <= 32'h8000_0000-(
                                        r_mpy_a_input[31:0]
                                        + r_mpy_b_input[31:0]);
                        else
                                pp_s <= 33'h0;
                end
 
                // Third clock, add the results and produce a product
                // The low 16 bits come straight from pp_l.  The sum below
                // forms result bits [63:16], so within it pp_oi sits at
                // weight 2^16, pp_s (shifted left by 15) at weight 2^31,
                // and pp_f (shifted left by 16) at weight 2^32.
                always @(posedge i_clk)
                begin
                        r_mpy_result[15:0] <= pp_l[15:0];
                        r_mpy_result[63:16] <=
                                { 32'h00, pp_l[31:16] }
                                + { 15'h00, pp_oi }
                                + { pp_s, 15'h00 }
                                + { pp_f, 16'h00 };
                end
 
                assign  o_result = r_mpy_result;
                // Fourth clock -- results are clocked into writeback
        end else begin : MPYSLOW
 
                // Any other IMPLEMENT_MPY value: hand the multiply to a
                // slowmpy instance (defined outside this file).  That
                // instance drives o_busy and o_valid directly.
                // verilator lint_off UNUSED
                wire            unused_aux;
                wire    [65:0]   full_result;
                // verilator lint_on  UNUSED
 
                // The operands are widened to 33 bits; the extra MSB is the
                // operand's sign bit for a signed multiply (i_op[0] set) and
                // zero otherwise.
                // NOTE(review): the constant 1'b0 input and the unused_aux
                // output presumably form slowmpy's auxiliary pass-through
                // channel -- confirm against slowmpy's port list.
                slowmpy #(.LGNA(6), .NA(33)) slowmpyi(i_clk, i_reset, i_stb,
                        { (i_op[0])&(i_a[31]), i_a },
                        { (i_op[0])&(i_b[31]), i_b }, 1'b0, o_busy,
                                o_valid, full_result, unused_aux);
 
                // Only the low 64 of the 66 result bits are kept.
                assign  o_result = full_result[63:0];
 
                // Remember i_op[1] of the most recently strobed request
                always @(posedge i_clk)
                if (i_stb)
                        o_hi  <= i_op[1];
 
        // Close MPYSLOW and the nested MPN3, MPN2, MPN1 and IMPY branches
        end end end end end
        endgenerate // All possible multiply results have been determined
 
endmodule

Browse

Tools

Subversion Repositories zipcpu

[/] [zipcpu/] [trunk/] [rtl/] [core/] [mpyop.v] - Blame information for rev 209

Line No.	Rev	Author	Line
1	209	dgisselq	`////////////////////////////////////////////////////////////////////////////////`
2			`//`
3			`// Filename: mpyop.v`
4			`//`
5			`// Project: Zip CPU -- a small, lightweight, RISC CPU soft core`
6			`//`
7			`// Purpose: This code has been pulled from the cpuops.v file so as to`
8			`// encapsulate the multiply component--the one component that`
9			`// (can't be) formally verified well, and so must be abstracted away.`
10			`// This separation was done to support potential future abstraction.`
11			`//`
12			`//`
13			`// Creator: Dan Gisselquist, Ph.D.`
14			`// Gisselquist Technology, LLC`
15			`//`
16			`////////////////////////////////////////////////////////////////////////////////`
17			`//`
18			`// Copyright (C) 2015-2019, Gisselquist Technology, LLC`
19			`//`
20			`// This program is free software (firmware): you can redistribute it and/or`
21			`// modify it under the terms of the GNU General Public License as published`
22			`// by the Free Software Foundation, either version 3 of the License, or (at`
23			`// your option) any later version.`
24			`//`
25			`// This program is distributed in the hope that it will be useful, but WITHOUT`
26			`// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or`
27			`// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
28			`// for more details.`
29			`//`
30			`// You should have received a copy of the GNU General Public License along`
31			`// with this program. (It's in the $(ROOT)/doc directory. Run make with no`
32			`// target there if the PDF file isn't present.) If not, see`
33			`// <http://www.gnu.org/licenses/> for a copy.`
34			`//`
35			`// License: GPL, v3, as defined and found on www.gnu.org,`
36			`// http://www.gnu.org/licenses/gpl.html`
37			`//`
38			`//`
39			`////////////////////////////////////////////////////////////////////////////////`
40			`//`
41			`//`
42			`default_nettype none
43			`//`
44			`module mpyop(i_clk,i_reset, i_stb, i_op, i_a, i_b, o_valid, o_busy, o_result, o_hi);`
45			`// The following parameter selects which multiply algorithm we use.`
46			`// Timing performance is strictly dependent upon it.`
47			`parameter IMPLEMENT_MPY = 1;`
48			`input wire i_clk, i_reset, i_stb;`
49			`input wire [1:0] i_op; // 2'b00=MPY, 2'b10=MPYUHI, 2'b11=MPYSHI`
50			`input wire [31:0] i_a, i_b;`
51			`output wire o_valid; // True if we'll be valid on the next clock;`
52			`output wire o_busy; // The multiply is busy if true`
53			`output wire [63:0] o_result; // Where we dump the multiply result`
54			`output reg o_hi; // Return the high half of the multiply`
55
56
57			`// A 4-way multiplexer can be done in one 6-LUT.`
58			`// A 16-way multiplexer can therefore be done in 4x 6-LUT's with`
59			`// the Xilinx multiplexer fabric that follows.`
60			`// Given that we wish to apply this multiplexer approach to 33-bits,`
61			`// this will cost a minimum of 132 6-LUTs.`
62
63			`// i_stb instead of this_is_a_multiply_op`
64			`// o_result`
65			`// o_busy`
66			`// o_done`
67			`generate`
68			`if (IMPLEMENT_MPY == 0)`
69			`begin : MPYNONE // No multiply support.`
70
71			`assign o_result = 64'h00;`
72			`assign o_busy = 1'b0;`
73			`assign o_valid = i_stb;`
74			`always @(*) o_hi = 1'b0; // Not needed`
75
76			`ifdef VERILATOR
77			`// verilator lint_off UNUSED`
78			`wire [32+32+5-1:0] mpy_unused;`
79			`assign mpy_unused = { i_clk, i_reset, i_stb, i_op, i_a, i_b };`
80			`// verilator lint_on UNUSED`
81			`endif
82			`end else begin : IMPY`
83			`if (IMPLEMENT_MPY == 1)`
84			`begin : MPY1CK // Our single clock option (no extra clocks)`
85
86			`wire signed [63:0] w_mpy_a_input, w_mpy_b_input;`
87
88			`assign w_mpy_a_input = {{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};`
89			`assign w_mpy_b_input = {{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};`
90
91			`assign o_result = w_mpy_a_input * w_mpy_b_input;`
92
93			`assign o_busy = 1'b0;`
94			`assign o_valid = 1'b0;`
95			`always @(*) o_hi = i_op[1];`
96
97			`ifdef VERILATOR
98			`// verilator lint_off UNUSED`
99			`wire [3:0] mpy_unused;`
100			`assign mpy_unused = { i_clk, i_reset, i_stb, i_op[1] };`
101			`// verilator lint_on UNUSED`
102			`endif
103
104			`end else begin: MPN1`
105			`if (IMPLEMENT_MPY == 2)`
106			`begin : MPY2CK // Our two clock option (ALU must pause for 1 clock)`
107
108			`reg signed [63:0] r_mpy_a_input, r_mpy_b_input;`
109			`always @(posedge i_clk)`
110			`begin`
111			`r_mpy_a_input <={{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};`
112			`r_mpy_b_input <={{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};`
113			`end`
114
115			`assign o_result = r_mpy_a_input * r_mpy_b_input;`
116			`assign o_busy = 1'b0;`
117
118			`reg mpypipe;`
119			`initial mpypipe = 1'b0;`
120			`always @(posedge i_clk)`
121			`if (i_reset)`
122			`mpypipe <= 1'b0;`
123			`else`
124			`mpypipe <= (i_stb);`
125
126			`assign o_valid = mpypipe; // this_is_a_multiply_op;`
127			`always @(posedge i_clk)`
128			`if (i_stb)`
129			`o_hi <= i_op[1];`
130
131			`end else begin : MPN2`
132			`if (IMPLEMENT_MPY == 3)`
133			`begin : MPY3CK // Our three clock option (ALU pauses for 2 clocks)`
134			`reg signed [63:0] r_smpy_result;`
135			`reg [63:0] r_umpy_result;`
136			`reg signed [31:0] r_mpy_a_input, r_mpy_b_input;`
137			`reg [1:0] mpypipe;`
138			`reg [1:0] r_sgn;`
139
140			`initial mpypipe = 2'b0;`
141			`always @(posedge i_clk)`
142			`if (i_reset)`
143			`mpypipe <= 2'b0;`
144			`else`
145			`mpypipe <= { mpypipe[0], i_stb };`
146
147			`// First clock`
148			`always @(posedge i_clk)`
149			`begin`
150			`r_mpy_a_input <= i_a[31:0];`
151			`r_mpy_b_input <= i_b[31:0];`
152			`r_sgn <= { r_sgn[0], i_op[0] };`
153			`end`
154
155			`// Second clock`
156			`ifdef VERILATOR
157			`wire signed [63:0] s_mpy_a_input, s_mpy_b_input;`
158			`wire [63:0] u_mpy_a_input, u_mpy_b_input;`
159
160			`assign s_mpy_a_input = {{(32){r_mpy_a_input[31]}},r_mpy_a_input};`
161			`assign s_mpy_b_input = {{(32){r_mpy_b_input[31]}},r_mpy_b_input};`
162			`assign u_mpy_a_input = {32'h00,r_mpy_a_input};`
163			`assign u_mpy_b_input = {32'h00,r_mpy_b_input};`
164			`always @(posedge i_clk)`
165			`r_smpy_result <= s_mpy_a_input * s_mpy_b_input;`
166			`always @(posedge i_clk)`
167			`r_umpy_result <= u_mpy_a_input * u_mpy_b_input;`
168			`else
169
170			`wire [31:0] u_mpy_a_input, u_mpy_b_input;`
171
172			`assign u_mpy_a_input = r_mpy_a_input;`
173			`assign u_mpy_b_input = r_mpy_b_input;`
174
175			`always @(posedge i_clk)`
176			`r_smpy_result <= r_mpy_a_input * r_mpy_b_input;`
177			`always @(posedge i_clk)`
178			`r_umpy_result <= u_mpy_a_input * u_mpy_b_input;`
179			`endif
180
181			`always @(posedge i_clk)`
182			`if (i_stb)`
183			`o_hi <= i_op[1];`
184			`assign o_busy = mpypipe[0];`
185			`assign o_result = (r_sgn[1])?r_smpy_result:r_umpy_result;`
186			`assign o_valid = mpypipe[1];`
187
188			`// Results are then set on the third clock`
189			`end else begin : MPN3`
190			`if (IMPLEMENT_MPY == 4)`
191			`begin : MPY4CK // The three clock option`
192			`reg [63:0] r_mpy_result;`
193			`reg [31:0] r_mpy_a_input, r_mpy_b_input;`
194			`reg r_mpy_signed;`
195			`reg [2:0] mpypipe;`
196
197			`// First clock, latch in the inputs`
198			`initial mpypipe = 3'b0;`
199			`always @(posedge i_clk)`
200			`begin`
201			`// mpypipe indicates we have a multiply in the`
202			`// pipeline. In this case, the multiply`
203			`// pipeline is a two stage pipeline, so we need`
204			`// two bits in the pipe.`
205			`if (i_reset)`
206			`mpypipe <= 3'h0;`
207			`else begin`
208			`mpypipe[0] <= i_stb;`
209			`mpypipe[1] <= mpypipe[0];`
210			`mpypipe[2] <= mpypipe[1];`
211			`end`
212
213			`if (i_op[0]) // i.e. if signed multiply`
214			`begin`
215			`r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};`
216			`r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};`
217			`end else begin`
218			`r_mpy_a_input <= i_a[31:0];`
219			`r_mpy_b_input <= i_b[31:0];`
220			`end`
221			`// The signed bit really only matters in the`
222			`// case of 64 bit multiply. We'll keep track`
223			`// of it, though, and pretend in all other`
224			`// cases.`
225			`r_mpy_signed <= i_op[0];`
226
227			`if (i_stb)`
228			`o_hi <= i_op[1];`
229			`end`
230
231			`assign o_busy = \|mpypipe[1:0];`
232			`assign o_valid = mpypipe[2];`
233
234			`// Second clock, do the multiplies, get the "partial`
235			`// products". Here, we break our input up into two`
236			`// halves,`
237			`//`
238			`// A = (2^16 ah + al)`
239			`// B = (2^16 bh + bl)`
240			`//`
241			`// and use these to compute partial products.`
242			`//`
243			`// AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)`
244			`//`
245			`// Since we're following the FOIL algorithm to get here,`
246			`// we'll name these partial products according to FOIL.`
247			`//`
248			`// The trick is what happens if A or B is signed. In`
249			`// those cases, the real value of A will not be given by`
250			`// A = (2^16 ah + al)`
251			`// but rather`
252			`// A = (2^16 ah[31^] + al) - 2^31`
253			`// (where we have flipped the sign bit of A)`
254			`// and so ...`
255			`//`
256			`// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)`
257			`// = 2^32(ah*bh)`
258			`// +2^16 (ah*bl+al*bh)`
259			`// +(al*bl)`
260			`// - 2^31 (2^16 bh+bl + 2^16 ah+al)`
261			`// - 2^62`
262			`// = 2^32(ah*bh)`
263			`// +2^16 (ah*bl+al*bh)`
264			`// +(al*bl)`
265			`// - 2^31 (2^16 bh+bl + 2^16 ah+al + 2^31)`
266			`//`
267			`reg [31:0] pp_f, pp_l; // F and L from FOIL`
268			`reg [32:0] pp_oi; // The O and I from FOIL`
269			`reg [32:0] pp_s;`
270			`always @(posedge i_clk)`
271			`begin`
272			`pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];`
273			`pp_oi<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0]`
274			`+ r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];`
275			`pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];`
276			`// And a special one for the sign`
277			`if (r_mpy_signed)`
278			`pp_s <= 32'h8000_0000-(`
279			`r_mpy_a_input[31:0]`
280			`+ r_mpy_b_input[31:0]);`
281			`else`
282			`pp_s <= 33'h0;`
283			`end`
284
285			`// Third clock, add the results and produce a product`
286			`always @(posedge i_clk)`
287			`begin`
288			`r_mpy_result[15:0] <= pp_l[15:0];`
289			`r_mpy_result[63:16] <=`
290			`{ 32'h00, pp_l[31:16] }`
291			`+ { 15'h00, pp_oi }`
292			`+ { pp_s, 15'h00 }`
293			`+ { pp_f, 16'h00 };`
294			`end`
295
296			`assign o_result = r_mpy_result;`
297			`// Fourth clock -- results are clocked into writeback`
298			`end else begin : MPYSLOW`
299
300			`// verilator lint_off UNUSED`
301			`wire unused_aux;`
302			`wire [65:0] full_result;`
303			`// verilator lint_on UNUSED`
304
305			`slowmpy #(.LGNA(6), .NA(33)) slowmpyi(i_clk, i_reset, i_stb,`
306			`{ (i_op[0])&(i_a[31]), i_a },`
307			`{ (i_op[0])&(i_b[31]), i_b }, 1'b0, o_busy,`
308			`o_valid, full_result, unused_aux);`
309
310			`assign o_result = full_result[63:0];`
311
312			`always @(posedge i_clk)`
313			`if (i_stb)`
314			`o_hi <= i_op[1];`
315
316			`end end end end end`
317			`endgenerate // All possible multiply results have been determined`
318
319			`endmodule`