URL https://opencores.org/ocsvn/openarty/openarty/trunk
Subversion Repositories openarty

[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Rev 4

Go to most recent revision | Compare with Previous | Blame | View Log
///////////////////////////////////////////////////////////////////////////
//
// Filename:	fastops.v
//
// Project:	Zip CPU -- a small, lightweight, RISC CPU soft core
//
// Purpose:	This supports the instruction set reordering of operations
//		created by the second generation instruction set, as well as
//	the new operations of POPC (population count) and BREV (bit reversal).
//
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
//
///////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2015-2016, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// License:	GPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/gpl.html
//
//
///////////////////////////////////////////////////////////////////////////
//
//
// fastops -- registered (pipelined) ALU.
//
// Every candidate result is computed from i_a/i_b and registered on each
// clock; one clock later the registered opcode (r_op) selects which of those
// candidates is copied into o_c.  Non-multiply results therefore reach o_c
// two clocks after the operands are presented.
//
// Ports:
//	i_clk		Clock.  All state is updated on the rising edge.
//	i_rst		Synchronous reset of the valid/busy pipeline (alu_pipe) only.
//	i_ce		Strobe: a new operation is being presented this clock.
//	i_valid		Not referenced inside this module.
//	i_op		Operation select (see the case statement near the bottom).
//	i_a, i_b	Operands.
//	o_c		Result.
//	o_f		Flags, packed as { v (overflow), n (negative), c (carry), z }.
//	o_valid		alu_pipe[2]: a result is being presented.
//	o_illegal	Tied low; every opcode is decoded.
//	o_busy		alu_pipe[0]: set for the clock following a multiply strobe.
//
// NOTE(review): r_op and the candidate result registers are not gated by
// i_ce, so this appears to assume i_op/i_a/i_b are held stable until the
// result has been taken -- confirm against the CPU that drives this module.
//
// NOTE(review): the multiplier below has five register stages ahead of
// r_mpy_result plus the o_c register, while alu_pipe asserts o_valid three
// clocks after i_ce for a multiply.  The valid/busy handshake for multiplies
// still needs to be lengthened to match (see the comment next to alu_pipe).
//
module	fastops(i_clk,i_rst, i_ce, i_valid, i_op, i_a, i_b, o_c, o_f, o_valid,
			o_illegal, o_busy);
	input		i_clk, i_rst, i_ce;
	input		[3:0]	i_op;
	input		[31:0]	i_a, i_b;
	input			i_valid;
	output	reg	[31:0]	o_c;
	output	wire	[3:0]	o_f;
	output	wire		o_valid;
	output	wire		o_illegal;
	output	wire		o_busy;

	// Rotate-left logic: shifting the doubled word left by i_b[4:0] leaves
	// the rotated value in the upper 32 bits.
	wire	[63:0]	w_rol_tmp;
	assign	w_rol_tmp = { i_a, i_a } << i_b[4:0];
	reg	[31:0]	r_rol_result;
	always @(posedge i_clk)
		r_rol_result <= w_rol_tmp[63:32]; // Won't set flags

	// Shift register logic.  Each result is 33 bits wide so that the last
	// bit shifted out is kept (as the LSB for right shifts, as the MSB for
	// the left shift) and can become the carry flag.
	//
	// The arithmetic shift is done on dedicated signed wires.  Writing
	// $signed(...) >>> n directly inside a ?: whose other arm is unsigned
	// makes the whole expression unsigned, which turns >>> into a logical
	// shift.
	wire	signed	[32:0]	w_asr_input, w_asr_shifted;
	assign	w_asr_input   = { i_a, 1'b0 };
	assign	w_asr_shifted = w_asr_input >>> i_b[4:0];

	reg	[32:0]		r_lsr_result, r_asr_result, r_lsl_result;
	always @(posedge i_clk)
	begin
		// Shift counts of 32 or more saturate: all sign bits for ASR,
		// all zeros for LSR and LSL.
		r_asr_result <= (|i_b[31:5])? {(33){i_a[31]}}
				: w_asr_shifted;			// ASR
		r_lsr_result <= (|i_b[31:5])? 33'h00
				: ( { i_a, 1'b0 } >> (i_b[4:0]) );// LSR
		r_lsl_result <= (|i_b[31:5])? 33'h00 : {1'b0, i_a } << i_b[4:0];	// LSL
	end

	// Bit reversal pre-logic: result bit k is operand-B bit (31-k).
	wire	[31:0]	w_brev_result;
	reg	[31:0]	r_brev_result;
	genvar	k;
	generate
	for(k=0; k<32; k=k+1)
	begin : bit_reversal_cpuop
		assign w_brev_result[k] = i_b[31-k];
	end endgenerate
	always @(posedge i_clk)
		r_brev_result <= w_brev_result;

	// Popcount logic: the number of set bits in i_b (0..32 fits in 6 bits).
	// Non-blocking, so that the o_c register below always samples the
	// previous clock's count, like every other candidate result.
	wire	[31:0]	w_popc_result;
	reg	[5:0]	r_popc_result;
	always @(posedge i_clk)
		r_popc_result <=
		 ({5'h0,i_b[ 0]}+{5'h0,i_b[ 1]}+{5'h0,i_b[ 2]}+{5'h0,i_b[ 3]})
		+({5'h0,i_b[ 4]}+{5'h0,i_b[ 5]}+{5'h0,i_b[ 6]}+{5'h0,i_b[ 7]})
		+({5'h0,i_b[ 8]}+{5'h0,i_b[ 9]}+{5'h0,i_b[10]}+{5'h0,i_b[11]})
		+({5'h0,i_b[12]}+{5'h0,i_b[13]}+{5'h0,i_b[14]}+{5'h0,i_b[15]})
		+({5'h0,i_b[16]}+{5'h0,i_b[17]}+{5'h0,i_b[18]}+{5'h0,i_b[19]})
		+({5'h0,i_b[20]}+{5'h0,i_b[21]}+{5'h0,i_b[22]}+{5'h0,i_b[23]})
		+({5'h0,i_b[24]}+{5'h0,i_b[25]}+{5'h0,i_b[26]}+{5'h0,i_b[27]})
		+({5'h0,i_b[28]}+{5'h0,i_b[29]}+{5'h0,i_b[30]}+{5'h0,i_b[31]});
	assign	w_popc_result = { 26'h00, r_popc_result };

	// Prelogic for our flags registers
	wire	z, n, v;
	reg	c, pre_sign, set_ovfl;
	// set_ovfl records, when an operation is strobed in, whether that
	// operation is one for which a change of sign means overflow.
	always @(posedge i_clk)
		if (i_ce) // 1 LUT
			set_ovfl <=(((i_op==4'h0)&&(i_a[31] != i_b[31]))//SUB&CMP
				||((i_op==4'h2)&&(i_a[31] == i_b[31])) // ADD
				||(i_op == 4'h6) // LSL
				||(i_op == 4'h5)); // LSR

	// AND/OR share one result register.  The two opcodes routed to it
	// (4'b0001 and 4'b0011) differ only in bit 1, so that is the bit which
	// must choose between them (AND = 4'h1, OR = 4'h3 in the Zip CPU
	// instruction set).
	reg	[31:0]	r_logical;
	always @(posedge i_clk)
		r_logical <= (i_op[1]) ? (i_a | i_b) : (i_a & i_b);

	reg	[32:0]	r_sum, r_diff;
	reg	[31:0]	r_ldilo, r_bypass, r_xor;
	always @(posedge i_clk)
		r_sum <= {1'b0, i_a } + { 1'b0, i_b };	// Add, bit 32 = carry
	always @(posedge i_clk)
		r_diff <= {1'b0, i_a } - { 1'b0, i_b };	// SUB, bit 32 = borrow
	always @(posedge i_clk)
		r_xor    <= i_a ^ i_b;			// XOR
	always @(posedge i_clk)
		r_ldilo  <= { i_a[31:16], i_b[15:0] };	// LDILO
	always @(posedge i_clk)
		r_bypass <= i_b;			// LOD/MOV,ETC

	reg	mpyhi;	// Registered i_op[1]; not currently read anywhere

	//
	// Multiply logic
	//
	reg	[63:0]	r_mpy_result;	// Our final goal

	reg	[31:0]	r_mpy_a_input, r_mpy_b_input;
	reg		r_mpy_signed;

	// The multiply opcodes are 4'h8 (MPY) and 4'hA/4'hB (MPYHU/MPYHS).
	wire	mpy;
	assign	mpy = (i_op[3:1] == 3'h5)||(i_op[3:0] == 4'h8);

	// First clock, latch in the inputs
	always @(posedge i_clk)
	begin
		if (i_op[0]) // i.e. if signed multiply
		begin
			// Flip the sign bits: the multiplier then works on
			// A' = A + 2^31 and B' = B + 2^31, both unsigned.
			r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
			r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
		end else begin
			r_mpy_a_input <= i_a[31:0];
			r_mpy_b_input <= i_b[31:0];
		end
		// The signed bit really only matters in the case of 64 bit
		// multiply.  We'll keep track of it, though, and pretend in
		// all other cases.
		r_mpy_signed  <= i_op[0];

		mpyhi  <= i_op[1];
	end

	// Second clock, do the multiplies, get the "partial products".  Here,
	// we break our input up into two halves,
	//
	//   A  = (2^16 ah + al)
	//   B  = (2^16 bh + bl)
	//
	// and use these to compute partial products.
	//
	//   AB = 2^32 (ah*bh) + 2^16 (ah*bl + al*bh) + (al*bl)
	//
	// Since we're following the FOIL algorithm to get here,
	// we'll name these partial products according to FOIL.
	//
	// The trick is what happens if A or B is signed.  In those cases the
	// sign bit was flipped on the first clock, so the registered inputs
	// are A' = A + 2^31 and B' = B + 2^31, and
	//
	//   AB = (A' - 2^31) * (B' - 2^31)
	//	= A'B' - 2^31 (A' + B') + 2^62
	//	= A'B' + 2^31 (2^31 - (A' + B'))
	//
	// The last term is pp_s, which is added in at a weight of 2^31.  Only
	// its low 33 bits matter, as 2^31 * 2^33 = 2^64 falls off the top of
	// the 64-bit product.
	//
	reg	[31:0]	pp_f, pp_o, pp_i, pp_l; // F, O, I and L from FOIL
	reg	[32:0]	pp_s;
	always @(posedge i_clk)
	begin
		pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
		pp_o<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0];
		pp_i<=r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
		pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
		// And a special one for the sign
		if (r_mpy_signed)
			pp_s <= 33'h0_8000_0000-( { 1'b0, r_mpy_a_input[31:0] }
						+ { 1'b0, r_mpy_b_input[31:0] });
		else
			pp_s <= 33'h0;
	end

	// Third clock, start adding the partial products.  The product is
	//
	//	  pp_l
	//	+ 2^16 (pp_o + pp_i)
	//	+ 2^31 pp_s
	//	+ 2^32 pp_f
	//
	// which is regrouped here into three terms:
	//
	//	partial_mpy_lo (33 bits, weight 2^0 ) = pp_l + 2^31 pp_s[0]
	//	partial_mpy_oi (33 bits, weight 2^16) = pp_o + pp_i
	//	partial_mpy_hi (32 bits, weight 2^32) = pp_f + pp_s[32:1]
	//
	reg	[32:0]	partial_mpy_oi, partial_mpy_lo;
	reg	[31:0]	partial_mpy_hi;
	always @(posedge i_clk)
		begin
			partial_mpy_lo[30:0]<= pp_l[30:0];
			partial_mpy_lo[32:31]<= pp_s[0]+pp_l[31];
			partial_mpy_oi[32:0]<= pp_o + pp_i;
			partial_mpy_hi[31:0]<= pp_s[32:1] + pp_f;
		end
	reg	partial_mpy_2cl, partial_mpy_2ch;
	reg	[31:0]	partial_mpy_2lo, partial_mpy_2hi;
	// Fourth clock -- Finish adding our partial results, using two short
	// adders whose carries are resolved on the next clock.
	//
	//   Low word (weight 2^0):
	//	bits [15:0]  = lo[15:0]
	//	bits [31:16] = oi[15:0] + lo[31:16], carry out -> partial_mpy_2cl
	//   High word (weight 2^32):
	//	bits [16:0]  = oi[32:16] + hi[16:0] + lo[32],
	//				carry out (weight 2^17) -> partial_mpy_2ch
	//	bits [31:17] = hi[31:17], still waiting on partial_mpy_2ch
	always @(posedge i_clk)
		begin
			partial_mpy_2lo[15:0] <= partial_mpy_lo[15:0];
			{ partial_mpy_2cl, partial_mpy_2lo[31:16] }
				<= { 1'b0, partial_mpy_oi[15:0] }
					+ { 1'b0, partial_mpy_lo[31:16] };
			{ partial_mpy_2ch, partial_mpy_2hi[16:0] }
				<= { 1'b0, partial_mpy_oi[32:16] }
					+ { 1'b0, partial_mpy_hi[16:0] }
					+ { 17'h0, partial_mpy_lo[32] };
			partial_mpy_2hi[31:17] <= partial_mpy_hi[31:17];
		end
	// Fifth clock -- deal with final carries: partial_mpy_2ch enters the
	// high word at bit 17, partial_mpy_2cl at bit 0.
	always @(posedge i_clk)
		begin
			r_mpy_result[31:0] <= partial_mpy_2lo[31:0];
			r_mpy_result[63:32] <= partial_mpy_2hi+
				{ 14'h0,partial_mpy_2ch,16'h0, partial_mpy_2cl};
		end
	// After the fifth clock, r_mpy_result holds the product.

	//
	// The master ALU case statement.  r_op is the opcode registered on the
	// previous clock, so it lines up with the candidate results that were
	// registered on that same clock.
	//
	reg	[3:0]	r_op;
	always @(posedge i_clk)
	begin
		r_op <= i_op;
		pre_sign <= (i_a[31]);
		c <= 1'b0;	// Default; overridden by the arms that set carry
		casez(r_op)
		4'b0000:{c,o_c } <= r_diff;		// CMP/SUB
		4'b00?1:   o_c   <= r_logical;		// BTST/And/Or
		4'b0010:{c,o_c } <= r_sum;		// Add
		4'b0100:   o_c   <= r_xor;		// Xor
		4'b0101:{o_c,c } <= r_lsr_result;	// LSR
		4'b0110:{c,o_c } <= r_lsl_result;	// LSL
		4'b0111:{o_c,c } <= r_asr_result;	// ASR
		4'b1000:   o_c   <= r_mpy_result[31:0]; // MPY
		4'b1001:   o_c   <= r_ldilo;		// LODILO
		4'b1010:   o_c   <= r_mpy_result[63:32]; // MPYHU
		4'b1011:   o_c   <= r_mpy_result[63:32]; // MPYHS
		4'b1100:   o_c   <= r_brev_result;	// BREV
		4'b1101:   o_c   <= w_popc_result;	// POPC
		4'b1110:   o_c   <= r_rol_result;	// ROL
		default:   o_c   <= r_bypass;		// MOV, LDI
		endcase
	end

	// With the multiply implemented (as above), there are no illegal
	// results.
	assign o_illegal = 1'b0;

	assign	z = (o_c == 32'h0000); // This really costs us a clock ...
	assign	n = (o_c[31]);
	assign	v = (set_ovfl)&&(pre_sign != o_c[31]);

	assign	o_f = { v, n, c, z };

	// Valid/busy pipeline.  A non-multiply strobe enters at alu_pipe[1]
	// and reaches o_valid two clocks after i_ce; a multiply strobe enters
	// at alu_pipe[0] and takes one clock more.
	reg	[2:0]	alu_pipe;
	always @(posedge i_clk)
		if (i_rst)
			alu_pipe <= 3'h0;
		else
			alu_pipe <= { alu_pipe[1],
				((i_ce)&(~mpy))|alu_pipe[0],
				(i_ce)&(mpy) };
	//
	// A longer pipeline would look like:
	//
	// alu_pipe <= { alu_pipe[2:1], (i_ce)&(~mpy)|alu_pipe[1], alu_pipe[0],
	//			(i_ce)&mpy;
	// o_busy <= (|alu_pipe[1:0])

	assign	o_valid = alu_pipe[2];
	assign	o_busy  = alu_pipe[0];
endmodule
Go to most recent revision | Compare with Previous | Blame | View Log
Browse

Tools

Subversion Repositories openarty

[/] [openarty/] [trunk/] [rtl/] [cpu/] [fastops.v] - Rev 4