URL
https://opencores.org/ocsvn/openarty/openarty/trunk
Subversion Repositories openarty
[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Rev 32
Go to most recent revision | Compare with Previous | Blame | View Log
///////////////////////////////////////////////////////////////////////////
//
// Filename:	fastops.v
//
// Project:	Zip CPU -- a small, lightweight, RISC CPU soft core
//
// Purpose:	This supports the instruction set reordering of operations
//		created by the second generation instruction set, as well as
//		the new operations of POPC (population count) and BREV (bit
//		reversal).
//
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
//
///////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2016, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// License:	GPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/gpl.html
//
//
///////////////////////////////////////////////////////////////////////////
//
// Structure
//
//	Every operation is computed from i_a/i_b into its own result register
//	on one clock.  On the following clock the registered opcode (r_op)
//	selects one of those registers into o_c.  The flag pre-logic (sign of
//	i_a, overflow enable) travels through the same two stages so that
//	o_f always describes the value currently in o_c.
//
// Ports
//	i_clk		Clock; everything is registered on its rising edge.
//	i_rst		Synchronous reset of the valid/busy pipeline (alu_pipe).
//	i_ce		Marks an operation being issued: it loads the overflow
//			pre-logic and starts the valid pipeline.
//	i_valid		Not used by this module.
//	i_op		Operation select; see the case statement below.
//	i_a, i_b	Operands.
//	o_c		Result.
//	o_f		Flags, { v, n, c, z }.
//	o_valid		alu_pipe[2]: two clocks after i_ce for non-multiply
//			operations, three clocks after i_ce for multiplies.
//	o_illegal	Constant zero.
//	o_busy		alu_pipe[0]: high for the clock following a multiply
//			being issued.
//
// NOTE(review): the multiply data path below passes through five register
//	stages before r_mpy_result, plus the o_c output register, i.e. six
//	clocks from operands to o_c, and o_c only selects r_mpy_result while
//	r_op (a one clock delayed copy of i_op) is a multiply opcode.
//	alu_pipe raises o_valid three clocks after a multiply is issued.
//	Unless the surrounding CPU holds the operands and opcode and ignores
//	the early o_valid, alu_pipe/o_busy need to be lengthened (see the
//	"longer pipeline" sketch at the bottom of the file) -- confirm
//	against the CPU's expectations before relying on multiplies here.
//
module	fastops(i_clk,i_rst, i_ce, i_valid, i_op, i_a, i_b, o_c, o_f, o_valid,
			o_illegal, o_busy);
	input		i_clk, i_rst, i_ce;
	input	[3:0]	i_op;
	input	[31:0]	i_a, i_b;
	input		i_valid;
	output	reg	[31:0]	o_c;
	output	wire	[3:0]	o_f;
	output	wire		o_valid;
	output	wire		o_illegal;
	output	wire		o_busy;

	// Rotate-left logic
	//
	// Doubling the operand and keeping the upper word of the shifted
	// result rotates i_a left by i_b[4:0].
	wire	[63:0]	w_rol_tmp;
	assign	w_rol_tmp = { i_a, i_a } << i_b[4:0];
	reg	[31:0]	r_rol_result;
	always @(posedge i_clk)
		r_rol_result <= w_rol_tmp[63:32]; // Won't set flags

	// Shift register logic
	//
	// The shifts are 33 bits wide so that the last bit shifted out lands
	// in the extra bit and can be used as the carry flag.
	//
	// The arithmetic shift is done on a wire declared signed.  Writing
	// $signed(...) >>> n directly inside the ?: below would not work:
	// the other arm of the ?: is unsigned, which makes the whole
	// expression unsigned, and >>> then fills with zeros.
	wire	signed	[32:0]	w_pre_asr_input, w_pre_asr_shifted;
	assign	w_pre_asr_input   = { i_a, 1'b0 };
	assign	w_pre_asr_shifted = w_pre_asr_input >>> i_b[4:0];

	reg	[32:0]	r_lsr_result, r_asr_result, r_lsl_result;
	always @(posedge i_clk)
	begin
		// Any shift of 32 or more empties the register (or fills it
		// with the sign, for ASR)
		r_asr_result <= (|i_b[31:5])? {33{i_a[31]}}
				: w_pre_asr_shifted;			// ASR
		r_lsr_result <= (|i_b[31:5])? 33'h00
				: ( { i_a, 1'b0 } >> (i_b[4:0]) );	// LSR
		r_lsl_result <= (|i_b[31:5])? 33'h00
				: ( { 1'b0, i_a } << (i_b[4:0]) );	// LSL
	end

	// Bit reversal pre-logic
	wire	[31:0]	w_brev_result;
	reg	[31:0]	r_brev_result;
	genvar	k;
	generate
	for(k=0; k<32; k=k+1)
	begin : bit_reversal_cpuop
		assign w_brev_result[k] = i_b[31-k];
	end endgenerate
	always @(posedge i_clk)
		r_brev_result <= w_brev_result;

	// Popcount logic
	//
	// Each bit is zero extended to six bits before being added, so the
	// sum (at most 32) cannot overflow.
	wire	[31:0]	w_popc_result;
	reg	[5:0]	r_popc_result;
	always @(posedge i_clk)
		r_popc_result <=
		 ({5'h0,i_b[ 0]}+{5'h0,i_b[ 1]}+{5'h0,i_b[ 2]}+{5'h0,i_b[ 3]})
		+({5'h0,i_b[ 4]}+{5'h0,i_b[ 5]}+{5'h0,i_b[ 6]}+{5'h0,i_b[ 7]})
		+({5'h0,i_b[ 8]}+{5'h0,i_b[ 9]}+{5'h0,i_b[10]}+{5'h0,i_b[11]})
		+({5'h0,i_b[12]}+{5'h0,i_b[13]}+{5'h0,i_b[14]}+{5'h0,i_b[15]})
		+({5'h0,i_b[16]}+{5'h0,i_b[17]}+{5'h0,i_b[18]}+{5'h0,i_b[19]})
		+({5'h0,i_b[20]}+{5'h0,i_b[21]}+{5'h0,i_b[22]}+{5'h0,i_b[23]})
		+({5'h0,i_b[24]}+{5'h0,i_b[25]}+{5'h0,i_b[26]}+{5'h0,i_b[27]})
		+({5'h0,i_b[28]}+{5'h0,i_b[29]}+{5'h0,i_b[30]}+{5'h0,i_b[31]});
	assign	w_popc_result = { 26'h00, r_popc_result };

	// Prelogic for our flags registers
	//
	// r_set_ovfl and r_pre_sign are captured on the same clock as the
	// result registers; set_ovfl and pre_sign are their copies made on
	// the clock that loads o_c.
	wire	z, n, v;
	reg	c, pre_sign, set_ovfl;
	reg	r_pre_sign, r_set_ovfl;
	always @(posedge i_clk)
		if (i_ce) // 1 LUT
			r_set_ovfl <=(((i_op==4'h0)&&(i_a[31] != i_b[31]))//SUB&CMP
				||((i_op==4'h2)&&(i_a[31] == i_b[31])) // ADD
				||(i_op == 4'h6) // LSL
				||(i_op == 4'h5)); // LSR
	always @(posedge i_clk)
		r_pre_sign <= i_a[31];

	// AND (4'h1, also used for BTST) and OR (4'h3) share one result
	// register.  The two opcodes differ only in bit 1.
	reg	[31:0]	r_logical;
	always @(posedge i_clk)
		r_logical <= (i_op[1]) ? (i_a | i_b) : (i_a & i_b);

	reg	[32:0]	r_sum, r_diff;
	reg	[31:0]	r_ldilo, r_bypass, r_xor;
	always @(posedge i_clk)
		r_sum <= { 1'b0, i_a } + { 1'b0, i_b };		// Add
	always @(posedge i_clk)
		r_diff <= { 1'b0, i_a } - { 1'b0, i_b };	// SUB
	always @(posedge i_clk)
		r_xor <= i_a ^ i_b;				// XOR
	always @(posedge i_clk)
		r_ldilo <= { i_a[31:16], i_b[15:0] };		// LDILO
	always @(posedge i_clk)
		r_bypass <= i_b;				// LOD/MOV,ETC


	//
	// Multiply logic
	//
	reg	[63:0]	r_mpy_result; // Our final goal

	reg	[31:0]	r_mpy_a_input, r_mpy_b_input;
	reg		r_mpy_signed;

	// MPY is 4'h8, MPYHU is 4'ha and MPYHS is 4'hb
	wire		mpy;
	assign	mpy = (i_op[3:1] == 3'h5)||(i_op[3:0] == 4'h8);

	// First clock, latch in the inputs
	always @(posedge i_clk)
	begin
		if (i_op[0]) // i.e. if signed multiply
		begin
			// Flipping the sign bit turns the signed value A into
			// the unsigned value A + 2^31
			r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
			r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
		end else begin
			r_mpy_a_input <= i_a[31:0];
			r_mpy_b_input <= i_b[31:0];
		end
		// The signed bit really only matters in the case of 64 bit
		// multiply.  We'll keep track of it, though, and pretend in
		// all other cases.
		r_mpy_signed  <= i_op[0];
	end

	// Second clock, do the multiplies, get the "partial products".  Here,
	// we break our input up into two halves,
	//
	//   A  = (2^16 ah + al)
	//   B  = (2^16 bh + bl)
	//
	// and use these to compute partial products.
	//
	//   AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)
	//
	// Since we're following the FOIL algorithm to get here,
	// we'll name these partial products according to FOIL.
	//
	// The trick is what happens if A or B is signed.  In
	// those cases, the real value of A will not be given by
	//	A = (2^16 ah + al)
	// but rather
	//	A = (2^16 ah[31^] + al) - 2^31
	//  (where we have flipped the sign bit of A) and so ...
	//
	// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
	//	= 2^32(ah*bh)
	//		+2^16 (ah*bl+al*bh)
	//		+(al*bl)
	//		- 2^31 (2^16 bh+bl + 2^16 ah+al)
	//		+ 2^62
	//	= 2^32(ah*bh)
	//		+2^16 (ah*bl+al*bh)
	//		+(al*bl)
	//		+ 2^31 (2^31 - (2^16 bh+bl) - (2^16 ah+al))
	//
	// The last term, without its factor of 2^31, is pp_s.  Only 33 bits
	// of it are kept: once shifted up by 31 those are all that fit in a
	// 64 bit result.
	//
	reg	[31:0]	pp_f, pp_o, pp_i, pp_l; // F, O, I and L from FOIL
	reg	[32:0]	pp_s;
	always @(posedge i_clk)
	begin
		pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
		pp_o<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0];
		pp_i<=r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
		pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
		// And a special one for the sign
		if (r_mpy_signed)
			pp_s <= 33'h0_8000_0000
				- ({ 1'b0, r_mpy_a_input } + { 1'b0, r_mpy_b_input });
		else
			pp_s <= 33'h0;
	end

	// Third clock, start adding.  The product we are after is
	//
	//	(pp_f << 32) + ((pp_o + pp_i) << 16) + pp_l + (pp_s << 31)
	//
	// which is first gathered into three partial sums:
	//
	//	partial_mpy_lo = pp_l + (pp_s[0] << 31)		weight 2^0
	//	partial_mpy_oi = pp_o + pp_i			weight 2^16
	//	partial_mpy_hi = pp_f + pp_s[32:1]		weight 2^32
	//
	reg	[32:0]	partial_mpy_oi, partial_mpy_lo;
	reg	[31:0]	partial_mpy_hi;
	always @(posedge i_clk)
	begin
		partial_mpy_lo[30: 0]<= pp_l[30:0];
		partial_mpy_lo[32:31]<= pp_s[0]+pp_l[31];
		partial_mpy_oi[32: 0]<= pp_o + pp_i;
		partial_mpy_hi[31: 0]<= pp_s[32:1] + pp_f;
	end

	// Fourth clock -- add the middle sum into the low and high sums,
	// keeping the carries aside for the next clock.
	//
	//	partial_mpy_2lo + (partial_mpy_2cl << 32)
	//		= partial_mpy_lo + (partial_mpy_oi[15:0] << 16)
	//	partial_mpy_2hi + (partial_mpy_2ch << 17)
	//		= partial_mpy_hi + partial_mpy_oi[32:16]
	//
	// The low carry needs two bits: partial_mpy_lo[32:16] is a 17 bit
	// value, and adding a 16 bit value to it can reach past 2^17.
	reg	[1:0]	partial_mpy_2cl;
	reg		partial_mpy_2ch;
	reg	[31:0]	partial_mpy_2lo, partial_mpy_2hi;
	always @(posedge i_clk)
	begin
		partial_mpy_2lo[15:0] <= partial_mpy_lo[15:0];
		{ partial_mpy_2cl, partial_mpy_2lo[31:16] }
			<= { 2'b00, partial_mpy_oi[15:0] }
				+ { 1'b0, partial_mpy_lo[32:16] };
		{ partial_mpy_2ch, partial_mpy_2hi[16:0] }
			<= { 1'b0, partial_mpy_oi[32:16] }
				+ { 1'b0, partial_mpy_hi[16:0] };
		// The top bits pass straight through; the carry into them is
		// applied on the next clock
		partial_mpy_2hi[31:17] <= partial_mpy_hi[31:17];
	end

	// Fifth clock -- deal with final carries
	always @(posedge i_clk)
	begin
		r_mpy_result[31:0] <= partial_mpy_2lo[31:0];
		r_mpy_result[63:32] <= partial_mpy_2hi
				+ { 14'h0, partial_mpy_2ch, 17'h0 }
				+ { 30'h0, partial_mpy_2cl };
	end

	//
	// The master ALU case statement
	//
	// r_op is i_op delayed by one clock, which lines it up with the
	// result registers above.
	//
	reg	[3:0]	r_op;
	always @(posedge i_clk)
	begin
		r_op     <= i_op;
		pre_sign <= r_pre_sign;
		set_ovfl <= r_set_ovfl;
		c <= 1'b0;	// Unless the selected operation provides a carry
		casez(r_op)
		4'b0000:{c,o_c } <= r_diff;		// CMP/SUB
		4'b00?1:   o_c   <= r_logical;		// BTST/And/Or
		4'b0010:{c,o_c } <= r_sum;		// Add
		4'b0100:   o_c   <= r_xor;		// Xor
		4'b0101:{o_c,c } <= r_lsr_result;	// LSR
		4'b0110:{c,o_c } <= r_lsl_result;	// LSL
		4'b0111:{o_c,c } <= r_asr_result;	// ASR
		4'b1000:   o_c   <= r_mpy_result[31:0];	// MPY
		4'b1001:   o_c   <= r_ldilo;		// LODILO
		4'b1010:   o_c   <= r_mpy_result[63:32]; // MPYHU
		4'b1011:   o_c   <= r_mpy_result[63:32]; // MPYHS
		4'b1100:   o_c   <= r_brev_result;	// BREV
		4'b1101:   o_c   <= w_popc_result;	// POPC
		4'b1110:   o_c   <= r_rol_result;	// ROL
		default:   o_c   <= r_bypass;		// MOV, LDI
		endcase
	end

	// With the multiply implemented (as above), there are no illegal
	// results.
	assign o_illegal = 1'b0;

	assign	z = (o_c == 32'h0000); // This really costs us a clock ...
	assign	n = (o_c[31]);
	assign	v = (set_ovfl)&&(pre_sign != o_c[31]);

	assign	o_f = { v, n, c, z };

	// Valid/busy tracking.  A multiply enters at bit zero and takes three
	// clocks to reach bit two; anything else enters at bit one and takes
	// two.
	reg	[2:0]	alu_pipe;
	always @(posedge i_clk)
		if (i_rst)
			alu_pipe <= 3'h0;
		else
			alu_pipe <= { alu_pipe[1],
					((i_ce)&(~mpy))|alu_pipe[0],
					(i_ce)&(mpy) };
	//
	// A longer pipeline would look like:
	//
	// alu_pipe <= { alu_pipe[2:1], (i_ce)&(~mpy)|alu_pipe[1], alu_pipe[0],
	//			(i_ce)&mpy;
	// o_busy <= (|alu_pipe[1:0])
	assign	o_valid = alu_pipe[2];
	assign	o_busy  = alu_pipe[0];

endmodule
Go to most recent revision | Compare with Previous | Blame | View Log