URL https://opencores.org/ocsvn/openarty/openarty/trunk

Subversion Repositories openarty

[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Blame information for rev 39

Go to most recent revision | Details | Compare with Previous | View Log


///////////////////////////////////////////////////////////////////////////
//
// Filename:    fastops.v
//
// Project:     Zip CPU -- a small, lightweight, RISC CPU soft core
//
// Purpose:     This supports the instruction set reordering of operations
//              created by the second generation instruction set, as well as
//      the new operations of POPC (population count) and BREV (bit reversal).
//
//
// Creator:     Dan Gisselquist, Ph.D.
//              Gisselquist Technology, LLC
//
///////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2016, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// License:     GPL, v3, as defined and found on www.gnu.org,
//              http://www.gnu.org/licenses/gpl.html
//
//
///////////////////////////////////////////////////////////////////////////
//
//
// fastops -- two stage ALU with a pipelined 32x32 multiplier.
//
// Every operation is computed from i_a/i_b into its own first-stage register.
// A second stage then copies one of those registers into o_c, selected by the
// opcode that was captured on the clock where i_ce was high.
//
// Ports:
//      i_clk           Clock; all state changes on its rising edge.
//      i_rst           Synchronous reset of the valid/busy pipeline only.
//      i_ce            Strobe: i_op, i_a and i_b describe a new operation.
//      i_valid         Not referenced inside this module.
//      i_op            Operation select; see the case statement below.
//      i_a, i_b        Operands.
//      o_c             Result.
//      o_f             Flags, { overflow, negative, carry, zero }.
//      o_valid         High for one clock when o_c/o_f belong to an operation
//                      accepted via i_ce.
//      o_illegal       Tied to zero.
//      o_busy          High while a multiply is in the stages ahead of the
//                      point where ALU operations join the valid pipeline.
//                      i_ce must be held low while o_busy is high.
//
// Latency, counted in rising edges starting with the one that samples i_ce:
//      2 edges for everything except the multiplies,
//      6 edges for MPY, MPYHU and MPYHS.
//
// Operands are only sampled on the i_ce edge; they need not be held.
//
module  fastops(i_clk,i_rst, i_ce, i_valid, i_op, i_a, i_b, o_c, o_f, o_valid,
                        o_illegal, o_busy);
        input           i_clk, i_rst, i_ce;
        input           [3:0]    i_op;
        input           [31:0]   i_a, i_b;
        input                   i_valid;
        output  reg     [31:0]   o_c;
        output  wire    [3:0]    o_f;
        output  wire            o_valid;
        output  wire            o_illegal;
        output  wire            o_busy;

        // Rotate-left logic: shift a doubled copy of A and keep the top half
        wire    [63:0]   w_rol_tmp;
        assign  w_rol_tmp = { i_a, i_a } << i_b[4:0];
        reg     [31:0]   r_rol_result;
        always @(posedge i_clk)
                r_rol_result <= w_rol_tmp[63:32]; // Won't set flags

        // Shift register logic.  Each result is 33 bits wide so that the last
        // bit shifted out lands in the extra bit and can become the carry.
        //
        // The arithmetic shift is done on dedicated signed wires.  Placing
        // $signed(...) >>> n directly inside a ?: whose other arm is unsigned
        // makes the whole expression unsigned, turning >>> into a logical
        // shift.
        wire    signed  [32:0]   w_asr_input, w_asr_shifted;
        assign  w_asr_input   = { i_a, 1'b0 };
        assign  w_asr_shifted = w_asr_input >>> i_b[4:0];

        reg     [32:0]           r_lsr_result, r_asr_result, r_lsl_result;
        always @(posedge i_clk)
        begin
                // Shift amounts of 32 or more saturate
                r_asr_result <= (|i_b[31:5]) ? {(33){i_a[31]}}
                                : w_asr_shifted;                        // ASR
                r_lsr_result <= (|i_b[31:5]) ? 33'h00
                                : ( { i_a, 1'b0 } >> (i_b[4:0]) );      // LSR
                r_lsl_result <= (|i_b[31:5]) ? 33'h00
                                : ( { 1'b0, i_a } << (i_b[4:0]) );      // LSL
        end

        // Bit reversal pre-logic
        wire    [31:0]   w_brev_result;
        reg     [31:0]   r_brev_result;
        genvar  k;
        generate
        for(k=0; k<32; k=k+1)
        begin : bit_reversal_cpuop
                assign w_brev_result[k] = i_b[31-k];
        end endgenerate
        always @(posedge i_clk)
                r_brev_result <= w_brev_result;

        // Popcount logic: number of set bits in B, zero extended to 32 bits
        wire    [31:0]   w_popc_result;
        reg     [5:0]    r_popc_result;
        always @(posedge i_clk)
                r_popc_result <=
                 ({5'h0,i_b[ 0]}+{5'h0,i_b[ 1]}+{5'h0,i_b[ 2]}+{5'h0,i_b[ 3]})
                +({5'h0,i_b[ 4]}+{5'h0,i_b[ 5]}+{5'h0,i_b[ 6]}+{5'h0,i_b[ 7]})
                +({5'h0,i_b[ 8]}+{5'h0,i_b[ 9]}+{5'h0,i_b[10]}+{5'h0,i_b[11]})
                +({5'h0,i_b[12]}+{5'h0,i_b[13]}+{5'h0,i_b[14]}+{5'h0,i_b[15]})
                +({5'h0,i_b[16]}+{5'h0,i_b[17]}+{5'h0,i_b[18]}+{5'h0,i_b[19]})
                +({5'h0,i_b[20]}+{5'h0,i_b[21]}+{5'h0,i_b[22]}+{5'h0,i_b[23]})
                +({5'h0,i_b[24]}+{5'h0,i_b[25]}+{5'h0,i_b[26]}+{5'h0,i_b[27]})
                +({5'h0,i_b[28]}+{5'h0,i_b[29]}+{5'h0,i_b[30]}+{5'h0,i_b[31]});
        assign  w_popc_result = { 26'h00, r_popc_result };

        // Prelogic for our flags registers.
        //
        // r_set_ovfl/r_pre_sign are captured together with the opcode (first
        // stage); set_ovfl/pre_sign are their copies registered on the same
        // edge as o_c (second stage), so that the overflow flag always
        // describes the operation whose result is in o_c.
        wire    z, n, v;
        reg     c, pre_sign, set_ovfl;
        reg     r_pre_sign, r_set_ovfl;
        always @(posedge i_clk)
                if (i_ce)
                begin
                        r_set_ovfl <= (((i_op==4'h0)&&(i_a[31] != i_b[31]))//SUB&CMP
                                ||((i_op==4'h2)&&(i_a[31] == i_b[31])) // ADD
                                ||(i_op == 4'h6) // LSL
                                ||(i_op == 4'h5)); // LSR
                        r_pre_sign <= i_a[31];
                end

        // AND (4'b0001) and OR (4'b0011) share one result register.  The two
        // opcodes differ in bit 1 only, so that is the bit that must select.
        reg     [31:0]   r_logical;
        always @(posedge i_clk)
                r_logical <= (i_op[1]) ? (i_a | i_b) : (i_a & i_b);

        reg     [32:0]   r_sum, r_diff;
        reg     [31:0]   r_ldilo, r_bypass, r_xor;
        always @(posedge i_clk)
                r_sum <= { 1'b0, i_a } + { 1'b0, i_b }; // Add
        always @(posedge i_clk)
                r_diff <= {1'b0, i_a } - { 1'b0, i_b }; // SUB
        always @(posedge i_clk)
                r_xor    <= i_a ^ i_b;                  // XOR
        always @(posedge i_clk)
                r_ldilo  <= { i_a[31:16], i_b[15:0] };   // LDILO
        always @(posedge i_clk)
                r_bypass <= i_b;                        // LOD/MOV,ETC

        //
        // Multiply logic
        //
        reg     [63:0]   r_mpy_result;   // Our final goal

        reg     [31:0]   r_mpy_a_input, r_mpy_b_input;
        reg             r_mpy_signed;

        // True for the three opcodes that read r_mpy_result:
        // 4'h8 (MPY), 4'hA (MPYHU) and 4'hB (MPYHS).
        wire    mpy;
        assign  mpy = (i_op[3:1] == 3'h5)||(i_op[3:0] == 4'h8);

        // First clock, latch in the inputs
        always @(posedge i_clk)
        begin
                if (i_op[0]) // i.e. if signed multiply
                begin
                        // Flipping the sign bit maps a signed x onto the
                        // unsigned value x + 2^31
                        r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
                        r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
                end else begin
                        r_mpy_a_input <= i_a[31:0];
                        r_mpy_b_input <= i_b[31:0];
                end
                // The signed bit really only matters in the case of 64 bit
                // multiply.  We'll keep track of it, though, and pretend in
                // all other cases.
                r_mpy_signed  <= i_op[0];
        end

        // Second clock, do the multiplies, get the "partial products".  Here,
        // we break our input up into two halves,
        //
        //   A  = (2^16 ah + al)
        //   B  = (2^16 bh + bl)
        //
        // and use these to compute partial products.
        //
        //   AB = 2^32 ah*bh + 2^16 (ah*bl + al*bh) + al*bl
        //
        // Since we're following the FOIL algorithm to get here,
        // we'll name these partial products according to FOIL.
        //
        // The trick is what happens if A and B are signed.  In that case
        // the registered inputs A', B' have their sign bits flipped, so the
        // true values are A'-2^31 and B'-2^31, and
        //
        //   AB = (A' - 2^31) * (B' - 2^31)
        //      = A'B' - 2^31 (A' + B') + 2^62
        //      = A'B' + 2^31 (2^31 - (A' + B'))
        //
        // pp_s holds the bracketed correction, 2^31 - (A'+B'), modulo 2^33.
        // It carries a weight of 2^31, so 33 bits reach exactly up to bit 63.
        //
        reg     [31:0]   pp_f, pp_o, pp_i, pp_l; // F, O, I and L from FOIL
        reg     [32:0]   pp_s;
        always @(posedge i_clk)
        begin
                pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
                pp_o<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0];
                pp_i<=r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
                pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
                // And a special one for the sign
                if (r_mpy_signed)
                        pp_s <= 33'h0_8000_0000
                                - ({ 1'b0, r_mpy_a_input[31:0] }
                                        + { 1'b0, r_mpy_b_input[31:0] });
                else
                        pp_s <= 33'h0;
        end

        // Third clock, merge five terms into three:
        //
        //      lo (33 bits, weight 2^0 ) = pp_l + 2^31 pp_s[0]
        //      oi (33 bits, weight 2^16) = pp_o + pp_i
        //      hi (32 bits, weight 2^32) = pp_f + pp_s[32:1]   (mod 2^32)
        //
        reg     [32:0]   partial_mpy_oi, partial_mpy_lo;
        reg     [31:0]   partial_mpy_hi;
        always @(posedge i_clk)
                begin
                        partial_mpy_lo[30:0] <= pp_l[30:0];
                        partial_mpy_lo[32:31]<= { 1'b0, pp_s[0] }
                                                + { 1'b0, pp_l[31] };
                        partial_mpy_oi[32:0] <= { 1'b0, pp_o } + { 1'b0, pp_i };
                        partial_mpy_hi[31:0] <= pp_s[32:1] + pp_f;
                end

        // Fourth clock -- add in 16 bit columns, keeping each column's carry
        // in its own register so that no carry chain is longer than 17 bits.
        //
        //      product[15: 0] = lo[15:0]
        //      product[31:16] = oi[15:0] + lo[31:16]           carry: 2cl
        //      product[47:32] = hi[15:0] + oi[31:16] + lo[32]  carry: 2ch
        //      product[63:48] = hi[31:16] + oi[32]
        //
        reg     partial_mpy_2cl, partial_mpy_2ch;
        reg     [31:0]   partial_mpy_2lo, partial_mpy_2hi;
        always @(posedge i_clk)
                begin
                        partial_mpy_2lo[15:0] <= partial_mpy_lo[15:0];
                        { partial_mpy_2cl, partial_mpy_2lo[31:16] }
                                <= { 1'b0, partial_mpy_oi[15:0] }
                                        + { 1'b0, partial_mpy_lo[31:16] };
                        { partial_mpy_2ch, partial_mpy_2hi[15:0] }
                                <= { 1'b0, partial_mpy_hi[15:0] }
                                        + { 1'b0, partial_mpy_oi[31:16] }
                                        + { 16'h0, partial_mpy_lo[32] };
                        partial_mpy_2hi[31:16] <= partial_mpy_hi[31:16]
                                        + { 15'h0, partial_mpy_oi[32] };
                end

        // Fifth clock -- deal with final carries: 2cl enters the upper word
        // at bit 0 (product bit 32), 2ch at bit 16 (product bit 48).
        always @(posedge i_clk)
                begin
                        r_mpy_result[31:0] <= partial_mpy_2lo[31:0];
                        r_mpy_result[63:32] <= partial_mpy_2hi+
                                { 15'h0,partial_mpy_2ch,15'h0, partial_mpy_2cl};
                end
        // The sixth clock moves the product into o_c, below.

        //
        // The master ALU case statement
        //
        // r_op only changes when an operation is accepted.  A multiply needs
        // its opcode five clocks after it was issued, and i_op is not
        // required to stay put for that long.
        reg     [3:0]    r_op;
        always @(posedge i_clk)
                if (i_ce)
                        r_op <= i_op;

        always @(posedge i_clk)
        begin
                pre_sign <= r_pre_sign;
                set_ovfl <= r_set_ovfl;
                c <= 1'b0;      // default; overridden by the cases that set it
                casez(r_op)
                4'b0000:{c,o_c } <= r_diff;             // CMP/SUB
                4'b00?1:   o_c   <= r_logical;          // BTST/And/Or
                4'b0010:{c,o_c } <= r_sum;              // Add
                4'b0100:   o_c   <= r_xor;              // Xor
                4'b0101:{o_c,c } <= r_lsr_result;       // LSR
                4'b0110:{c,o_c } <= r_lsl_result;       // LSL
                4'b0111:{o_c,c } <= r_asr_result;       // ASR
                4'b1000:   o_c   <= r_mpy_result[31:0]; // MPY
                4'b1001:   o_c   <= r_ldilo;            // LODILO
                4'b1010:   o_c   <= r_mpy_result[63:32]; // MPYHU
                4'b1011:   o_c   <= r_mpy_result[63:32]; // MPYHS
                4'b1100:   o_c   <= r_brev_result;      // BREV
                4'b1101:   o_c   <= w_popc_result;      // POPC
                4'b1110:   o_c   <= r_rol_result;       // ROL
                default:   o_c   <= r_bypass;           // MOV, LDI
                endcase
        end

        // With the multiply implemented (as above), there are no illegal
        // results.
        assign o_illegal = 1'b0;

        assign  z = (o_c == 32'h0000); // This really costs us a clock ...
        assign  n = (o_c[31]);
        assign  v = (set_ovfl)&&(pre_sign != o_c[31]);

        assign  o_f = { v, n, c, z };

        //
        // Valid/busy pipeline: one bit per register stage between i_ce and
        // o_c.  A multiply enters at bit 0 and walks all six stages.  Any
        // other operation enters at bit 4, one stage before the output.
        //
        //      bit 0..4        multiply is in clock 1..5 of its pipeline
        //      bit 4           (also) ALU result is in its first stage register
        //      bit 5           o_c holds the result
        //
        // o_busy covers bits 3:0.  Once a multiply has reached bit 4 a new
        // operation may be accepted: the earliest it can reach bit 5 is the
        // clock after the multiply does.
        //
        reg     [5:0]    alu_pipe;
        always @(posedge i_clk)
                if (i_rst)
                        alu_pipe <= 6'h0;
                else
                        alu_pipe <= { alu_pipe[4],
                                        ((i_ce)&&(!mpy))||(alu_pipe[3]),
                                        alu_pipe[2:0],
                                        (i_ce)&&(mpy) };

        assign  o_valid = alu_pipe[5];
        assign  o_busy  = (|alu_pipe[3:0]);
endmodule

Browse

Tools

Subversion Repositories openarty

[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Blame information for rev 39

Line No.	Rev	Author	Line
1	3	dgisselq	`///////////////////////////////////////////////////////////////////////////`
2			`//`
3			`// Filename: fastops.v`
4			`//`
5			`// Project: Zip CPU -- a small, lightweight, RISC CPU soft core`
6			`//`
7			`// Purpose: This supports the instruction set reordering of operations`
8			`// created by the second generation instruction set, as well as`
9			`// the new operations of POPC (population count) and BREV (bit reversal).`
10			`//`
11			`//`
12			`// Creator: Dan Gisselquist, Ph.D.`
13			`// Gisselquist Technology, LLC`
14			`//`
15			`///////////////////////////////////////////////////////////////////////////`
16			`//`
17			`// Copyright (C) 2015-2016, Gisselquist Technology, LLC`
18			`//`
19			`// This program is free software (firmware): you can redistribute it and/or`
20			`// modify it under the terms of the GNU General Public License as published`
21			`// by the Free Software Foundation, either version 3 of the License, or (at`
22			`// your option) any later version.`
23			`//`
24			`// This program is distributed in the hope that it will be useful, but WITHOUT`
25			`// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
26			`// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
27			`// for more details.`
28			`//`
29			`// License: GPL, v3, as defined and found on www.gnu.org,`
30			`// http://www.gnu.org/licenses/gpl.html`
31			`//`
32			`//`
33			`///////////////////////////////////////////////////////////////////////////`
34			`//`
35			`module fastops(i_clk,i_rst, i_ce, i_valid, i_op, i_a, i_b, o_c, o_f, o_valid,`
36			`o_illegal, o_busy);`
37			`input i_clk, i_rst, i_ce;`
38			`input [3:0] i_op;`
39			`input [31:0] i_a, i_b;`
40			`input i_valid;`
41			`output reg [31:0] o_c;`
42			`output wire [3:0] o_f;`
43			`output wire o_valid;`
44			`output wire o_illegal;`
45			`output wire o_busy;`
46
47			`// Rotate-left logic`
48			`wire [63:0] w_rol_tmp;`
49			`assign w_rol_tmp = { i_a, i_a } << i_b[4:0];`
50			`reg [31:0] r_rol_result;`
51			`always @(posedge i_clk)`
52			`r_rol_result <= w_rol_tmp[63:32]; // Won't set flags`
53
54			`// Shift register logic`
55			`reg [32:0] r_lsr_result, r_asr_result, r_lsl_result;`
56			`always @(posedge i_clk)`
57			`begin`
58			`r_asr_result <= (\|i_b[31:5])? {(33){i_a[31]}}`
59			`: ( $signed({i_a, 1'b0 })>>> (i_b[4:0]) );// ASR`
60			`r_lsr_result <= (\|i_b[31:5])? 33'h00`
61			`: ( { i_a, 1'b0 } >> (i_b[4:0]) );// LSR`
62			`r_lsl_result <= (\|i_b[31:5])? 33'h00 : {1'b0, i_a } << i_b[4:0]; // LSL`
63			`end`
64
65			`// Bit reversal pre-logic`
66			`wire [31:0] w_brev_result;`
67			`reg [31:0] r_brev_result;`
68			`genvar k;`
69			`generate`
70			`for(k=0; k<32; k=k+1)`
71			`begin : bit_reversal_cpuop`
72			`assign w_brev_result[k] = i_b[31-k];`
73			`end endgenerate`
74			`always @(posedge i_clk)`
75			`r_brev_result <= w_brev_result;`
76
77			`// Popcount logic`
78			`wire [31:0] w_popc_result;`
79			`reg [5:0] r_popc_result;`
80			`always @(posedge i_clk)`
81			`r_popc_result =`
82			`({5'h0,i_b[ 0]}+{5'h0,i_b[ 1]}+{5'h0,i_b[ 2]}+{5'h0,i_b[ 3]})`
83			`+({5'h0,i_b[ 4]}+{5'h0,i_b[ 5]}+{5'h0,i_b[ 6]}+{5'h0,i_b[ 7]})`
84			`+({5'h0,i_b[ 8]}+{5'h0,i_b[ 9]}+{5'h0,i_b[10]}+{5'h0,i_b[11]})`
85			`+({5'h0,i_b[12]}+{5'h0,i_b[13]}+{5'h0,i_b[14]}+{5'h0,i_b[15]})`
86			`+({5'h0,i_b[16]}+{5'h0,i_b[17]}+{5'h0,i_b[18]}+{5'h0,i_b[19]})`
87			`+({5'h0,i_b[20]}+{5'h0,i_b[21]}+{5'h0,i_b[22]}+{5'h0,i_b[23]})`
88			`+({5'h0,i_b[24]}+{5'h0,i_b[25]}+{5'h0,i_b[26]}+{5'h0,i_b[27]})`
89			`+({5'h0,i_b[28]}+{5'h0,i_b[29]}+{5'h0,i_b[30]}+{5'h0,i_b[31]});`
90			`assign w_popc_result = { 26'h00, r_popc_result };`
91
92			`// Prelogic for our flags registers`
93			`wire z, n, v;`
94			`reg c, pre_sign, set_ovfl;`
95			`always @(posedge i_clk)`
96			`if (i_ce) // 1 LUT`
97			`set_ovfl =(((i_op==4'h0)&&(i_a[31] != i_b[31]))//SUB&CMP`
98			`\|\|((i_op==4'h2)&&(i_a[31] == i_b[31])) // ADD`
99			`\|\|(i_op == 4'h6) // LSL`
100			`\|\|(i_op == 4'h5)); // LSR`
101
102			`reg [31:0] r_logical;`
103			`always @(posedge i_clk)`
104			`r_logical <= (i_op[0]) ? (i_a & i_b) : (i_a \| i_b);`
105
106			`reg [32:0] r_sum, r_diff;`
107			`reg [31:0] r_ldilo, r_bypass, r_xor;`
108			`always @(posedge i_clk)`
109			`r_sum <= i_a + i_b; // Add`
110			`always @(posedge i_clk)`
111			`r_diff <= {1'b0, i_a } - { 1'b0, i_b }; // SUB`
112			`always @(posedge i_clk)`
113			`r_xor <= i_a ^ i_b; // XOR`
114			`always @(posedge i_clk)`
115			`r_ldilo <= { i_a[31:16], i_b[15:0] }; // LDILO`
116			`always @(posedge i_clk)`
117			`r_bypass <= i_b; // LOD/MOV,ETC`
118
119			`reg mpyhi;`
120			`wire mpybusy;`
121
122			`//`
123			`// Multiply logic`
124			`//`
125			`reg [63:0] r_mpy_result; // Our final goal`
126
127			`// The three clock option`
128			`reg [31:0] r_mpy_a_input, r_mpy_b_input;`
129			`reg r_mpy_signed;`
130			`reg [1:0] mpypipe;`
131
132			`wire mpy;`
133			`assign mpy = (i_op[3:1] == 3'h5)\|\|(i_op[3:0] != 4'h8);`
134
135			`// First clock, latch in the inputs`
136			`always @(posedge i_clk)`
137			`begin`
138			`if (i_op[0]) // i.e. if signed multiply`
139			`begin`
140			`r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};`
141			`r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};`
142			`end else begin`
143			`r_mpy_a_input <= i_a[31:0];`
144			`r_mpy_b_input <= i_b[31:0];`
145			`end`
146			`// The signed bit really only matters in the case of 64 bit`
147			`// multiply. We'll keep track of it, though, and pretend in`
148			`// all other cases.`
149			`r_mpy_signed <= i_op[0];`
150
151			`mpyhi = i_op[1];`
152			`end`
153
154			`// Second clock, do the multiplies, get the "partial products". Here,`
155			`// we break our input up into two halves,`
156			`//`
157			`// A = (2^16 ah + al)`
158			`// B = (2^16 bh + bl)`
159			`//`
160			`// and use these to compute partial products.`
161			`//`
162			`// AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)`
163			`//`
164			`// Since we're following the FOIL algorithm to get here,`
165			`// we'll name these partial products according to FOIL.`
166			`//`
167			`// The trick is what happens if A or B is signed. In`
168			`// those cases, the real value of A will not be given by`
169			`// A = (2^16 ah + al)`
170			`// but rather`
171			`// A = (2^16 ah[31^] + al) - 2^31`
172			`// (where we have flipped the sign bit of A) and so ...`
173			`//`
174			`// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)`
175			`// = 2^32(ah*bh)`
176			`// +2^16 (ah*bl+al*bh)`
177			`// +(al*bl)`
178			`// - 2^31 (2^16 bh+bl + 2^16 ah+al)`
179			`// - 2^62`
180			`// = 2^32(ah*bh)`
181			`// +2^16 (ah*bl+al*bh)`
182			`// +(al*bl)`
183			`// - 2^31 (2^16 bh+bl + 2^16 ah+al + 2^31)`
184			`//`
185			`reg [31:0] pp_f, pp_o, pp_i, pp_l; // F, O, I and L from FOIL`
186			`reg [32:0] pp_s;`
187			`always @(posedge i_clk)`
188			`begin`
189			`pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];`
190			`pp_o<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0];`
191			`pp_i<=r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];`
192			`pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];`
193			`// And a special one for the sign`
194			`if (r_mpy_signed)`
195			`pp_s <= 32'h8000_0000-( r_mpy_a_input[31:0]`
196			`+ r_mpy_b_input[31:0]);`
197			`else`
198			`pp_s <= 33'h0;`
199			`end`
200
201			`// Third clock, add the results and produce a product`
202			`// r_mpy_result[63:16] <=`
203			`// { 32'h00, pp_l[31:16] }`
204			`// + { 16'h00, pp_o }`
205			`// + { 16'h00, pp_i }`
206			`// + { pp_s, 15'h00 }`
207			`// + { pp_f, 16'h00 };`
208			`//`
209			`// 16'h00 16'h00 pp_l[31:16] ppl[15:]`
210			`// 16'h00 pp_o[31:16] pp_o[15:0] 16'h00`
211			`// 16'h00 pp_i[31:16] pp_i[15:0] 16'h00`
212			`// pp_s[32:17] pp_s[16:1] pp_s[0],15'h0 16'h00`
213			`// pp_f[31:16] pp_f[31:16] 16'h00 16'h00`
214			`//`
215			`// 16'h0 15'h0,lo[32] lo[31:16] lo[15:]`
216			`// 15'h0,oi[32] oi[31:16] oi[15:0] 16'h00`
217			`// hi[31:0] hi[15:0] 16'h00`
218			`//`
219			`//`
220			`reg [32:0] partial_mpy_oi, partial_mpy_lo;`
221			`reg [31:0] partial_mpy_hi;`
222			`always @(posedge i_clk)`
223			`begin`
224			`partial_mpy_lo[30:0]<= pp_l[30:0];`
225			`partial_mpy_lo[32:31]<= pp_s[0]+pp_l[31];`
226			`partial_mpy_oi[32:0]<= pp_o + pp_i;`
227			`partial_mpy_hi[31:0]<= pp_s[32:1] + pp_f;`
228			`end`
229			`reg partial_mpy_2cl, partial_mpy_2ch;`
230			`reg [31:0] partial_mpy_2lo, partial_mpy_2hi;`
231			`// Fourth clock -- Finish adding our partial results`
232			`always @(posedge i_clk)`
233			`begin`
234			`partial_mpy_2lo[15:0] <= partial_mpy_lo[15:0];`
235			`{ partial_mpy_2cl, partial_mpy_2lo[31:16] }`
236			`<= partial_mpy_oi[15:0] + partial_mpy_lo[31:16];`
237			`{ partial_mpy_2ch, partial_mpy_2hi[15:0] }`
238			`<= partial_mpy_oi[32:16] + partial_mpy_hi[16:0];`
239			`partial_mpy_2hi[31:17] <= partial_mpy_2hi[31:17];`
240			`end`
241			`// Fifth clock -- deal with final carries`
242			`always @(posedge i_clk)`
243			`begin`
244			`r_mpy_result[31:0] <= partial_mpy_2lo[31:0];`
245			`r_mpy_result[63:32] <= partial_mpy_2hi+`
246			`{ 14'h0,partial_mpy_2ch,15'h0, partial_mpy_2cl};`
247			`end`
248			`// Fifth clock -- results are available for writeback.`
249
250			`//`
251			`// The master ALU case statement`
252			`//`
253			`reg [3:0] r_op;`
254			`always @(posedge i_clk)`
255			`begin`
256			`r_op <= i_op;`
257			`pre_sign <= (i_a[31]);`
258			`c <= 1'b0;`
259			`casez(r_op)`
260			`4'b0000:{c,o_c } <= r_diff; // CMP/SUB`
261			`4'b00?1: o_c <= r_logical; // BTST/And/Or`
262			`4'b0010:{c,o_c } <= r_sum; // Add`
263			`4'b0100: o_c <= r_xor; // Xor`
264			`4'b0101:{o_c,c } <= r_lsr_result; // LSR`
265			`4'b0110:{c,o_c } <= r_lsl_result; // LSL`
266			`4'b0111:{o_c,c } <= r_asr_result; // ASR`
267			`4'b1000: o_c <= r_mpy_result[31:0]; // MPY`
268			`4'b1001: o_c <= r_ldilo; // LODILO`
269			`4'b1010: o_c <= r_mpy_result[63:32]; // MPYHU`
270			`4'b1011: o_c <= r_mpy_result[63:32]; // MPYHS`
271			`4'b1100: o_c <= r_brev_result; // BREV`
272			`4'b1101: o_c <= w_popc_result; // POPC`
273			`4'b1110: o_c <= r_rol_result; // ROL`
274			`default: o_c <= r_bypass; // MOV, LDI`
275			`endcase`
276			`end`
277
278			`// With the multiply implemented (as above), there are no illegal`
279			`// results.`
280			`assign o_illegal = 1'b0;`
281
282			`assign z = (o_c == 32'h0000); // This really costs us a clock ...`
283			`assign n = (o_c[31]);`
284			`assign v = (set_ovfl)&&(pre_sign != o_c[31]);`
285
286			`assign o_f = { v, n, c, z };`
287
288			`reg [2:0] alu_pipe;`
289			`always @(posedge i_clk)`
290			`if (i_rst)`
291			`alu_pipe <= 3'h0;`
292			`else`
293			`alu_pipe <= { alu_pipe[1], (i_ce)&(~mpy)\|alu_pipe[0],`
294			`(i_ce)&(mpy) };`
295			`//`
296			`// A longer pipeline would look like:`
297			`//`
298			`// alu_pipe <= { alu_pipe[2:1], (i_ce)&(~mpy)\|alu_pipe[1], alu_pipe[0],`
299			`// (i_ce)&mpy;`
300			`// o_busy <= (\|alu_pipe[1:0])`
301
302			`assign o_valid = alu_pipe[2];`
303			`assign o_busy = alu_pipe[0];`
304			`endmodule`