
// Subversion Repositories zipcpu
//
// [/] [zipcpu/] [trunk/] [rtl/] [core/] [mpyop.v] - Rev 209
//
// Compare with Previous | Blame | View Log

// Filename:	mpyop.v
// Project:	Zip CPU -- a small, lightweight, RISC CPU soft core
// Purpose:	This code has been pulled from the cpuops.v file so as to
//		encapsulate the multiply component--the one component that
//	(can't be) formally verified well, and so must be abstracted away.
//	This separation was done to support potential future abstraction.
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
// Copyright (C) 2015-2019, Gisselquist Technology, LLC
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
// You should have received a copy of the GNU General Public License along
// with this program.  (It's in the $(ROOT)/doc directory.  Run make with no
// target there if the PDF file isn't present.)  If not, see
// <http://www.gnu.org/licenses/> for a copy.
// License:	GPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/gpl.html
`default_nettype	none
// mpyop: 32x32-bit multiply producing a 64-bit product.  The implementation
// is selected at elaboration time by IMPLEMENT_MPY; every variant drives the
// same o_valid/o_busy/o_result/o_hi outputs.
module	mpyop(i_clk,i_reset, i_stb, i_op, i_a, i_b, o_valid, o_busy, o_result, o_hi);
	// The following parameter selects which multiply algorithm we use.
	// Timing performance is strictly dependent upon it.
	//   0: no multiply (o_result is tied to zero)
	//   1: combinational multiply
	//   2: operands registered, then multiplied
	//   3: operands registered, product registered
	//   4: operands, 16x16 partial products and sum each registered
	//   any other value: the slowmpy submodule is instantiated
	parameter	IMPLEMENT_MPY = 1;
	// i_stb marks a multiply request; i_reset clears the pipeline
	// tracking registers of the clocked implementations.
	input	wire		i_clk, i_reset, i_stb;
	// i_op[0] requests sign extension of the operands, i_op[1] is
	// passed through to o_hi.
	input	wire	[1:0]	i_op; // 2'b00=MPY, 2'b10=MPYUHI, 2'b11=MPYSHI
	// The two 32-bit operands
	input	wire	[31:0]	i_a, i_b;
	output	wire		o_valid; // True if we'll be valid on the next clock;
	output	wire		o_busy; // The multiply is busy if true
	output	wire	[63:0]	o_result; // Where we dump the multiply result
	output	reg		o_hi;	// Return the high half of the multiply
	// A 4-way multiplexer can be done in one 6-LUT.
	// A 16-way multiplexer can therefore be done in 4x 6-LUT's with
	//	the Xilinx multiplexer fabric that follows. 
	// Given that we wish to apply this multiplexer approach to 33-bits,
	// this will cost a minimum of 132 6-LUTs.
// i_stb instead of this_is_a_multiply_op
// o_result
// o_busy
// o_done
	// Exactly one of the following implementations is elaborated, chosen
	// by IMPLEMENT_MPY.  This generate region is closed by the endgenerate
	// at the bottom of the module.
	generate
	if (IMPLEMENT_MPY == 0)
	begin : MPYNONE // No multiply support.
		// With no multiplier, the result is a constant zero, the unit
		// is never busy, and the request strobe is passed straight
		// through as the valid indication.
		assign	o_result   = 64'h00;
		assign	o_busy     = 1'b0;
		assign	o_valid    = i_stb;
		always @(*) o_hi = 1'b0; // Not needed

		// Gather every input into a dummy wire so that lint tools
		// don't complain about unused ports in this configuration.
		// verilator lint_off UNUSED
		wire	[32+32+5-1:0]	mpy_unused;
		assign	mpy_unused = { i_clk, i_reset, i_stb, i_op, i_a, i_b };
		// verilator lint_on  UNUSED
	end else begin : IMPY
	if (IMPLEMENT_MPY == 1)
	begin : MPY1CK // Combinational multiply: no extra clocks are used
		// Extension bit for each operand: the operand's own top bit
		// when i_op[0] is set, zero otherwise.
		wire			ext_a, ext_b;
		// The operands, widened to the full 64-bit product width
		wire	signed	[63:0]	wide_a, wide_b;

		assign	ext_a  = i_op[0] & i_a[31];
		assign	ext_b  = i_op[0] & i_b[31];
		assign	wide_a = { {(32){ext_a}}, i_a[31:0] };
		assign	wide_b = { {(32){ext_b}}, i_b[31:0] };

		// The product follows the inputs combinationally
		assign	o_result = wide_a * wide_b;

		// This variant never reports busy, and never raises o_valid
		assign	o_valid = 1'b0;
		assign	o_busy  = 1'b0;

		always @(*)
			o_hi = i_op[1];

		// Inputs this variant does not otherwise need are collected
		// here to keep lint tools quiet.
		// verilator lint_off UNUSED
		wire	[3:0]	mpy_unused;
		assign	mpy_unused = { i_clk, i_reset, i_stb, i_op[1] };
		// verilator lint_on  UNUSED
	end else begin: MPN1
	if (IMPLEMENT_MPY == 2)
	begin : MPY2CK // Our two clock option (ALU must pause for 1 clock)
		// First clock: register both operands, extended to 64 bits.
		// The extension bit is the operand's sign bit when i_op[0]
		// is set, and zero otherwise.
		reg	signed	[63:0]	r_mpy_a_input, r_mpy_b_input;
		always @(posedge i_clk)
		begin
			r_mpy_a_input <={{(32){(i_a[31])&(i_op[0])}},i_a[31:0]};
			r_mpy_b_input <={{(32){(i_b[31])&(i_op[0])}},i_b[31:0]};
		end

		// The product is formed combinationally from the registered
		// operands.
		assign	o_result = r_mpy_a_input * r_mpy_b_input;
		assign	o_busy  = 1'b0;

		// mpypipe is i_stb delayed by one clock, cleared on reset.
		reg	mpypipe;
		initial	mpypipe = 1'b0;
		always @(posedge i_clk)
			if (i_reset)
				mpypipe <= 1'b0;
			else
				mpypipe <= (i_stb);

		assign	o_valid = mpypipe; // this_is_a_multiply_op;

		// Remember which half of the product was requested
		always @(posedge i_clk)
		if (i_stb)
			o_hi  <= i_op[1];
	end else begin : MPN2
	if (IMPLEMENT_MPY == 3)
	begin : MPY3CK // Our three clock option (ALU pauses for 2 clocks)
		reg	signed	[63:0]	r_smpy_result;
		reg		[63:0]	r_umpy_result;
		reg	signed	[31:0]	r_mpy_a_input, r_mpy_b_input;
		reg		[1:0]	mpypipe;
		reg		[1:0]	r_sgn;

		// mpypipe shifts i_stb along with the multiply, so that
		// mpypipe[0] is set one clock after the request and
		// mpypipe[1] one clock after that.
		initial	mpypipe = 2'b0;
		always @(posedge i_clk)
			if (i_reset)
				mpypipe <= 2'b0;
			else
				mpypipe <= { mpypipe[0], i_stb };

		// First clock: register the operands, and shift the
		// signed/unsigned selection along with them.
		always @(posedge i_clk)
		begin
			r_mpy_a_input <= i_a[31:0];
			r_mpy_b_input <= i_b[31:0];
			r_sgn <= { r_sgn[0], i_op[0] };
		end

		// Second clock: compute both the signed and the unsigned
		// product.  The two variants below produce the same values;
		// the first widens the operands to 64 bits explicitly, the
		// second leaves the widening to the 64-bit assignment
		// context.  Only one of them may be compiled in, since they
		// declare the same names and drive the same registers.
`ifdef	VERILATOR
		wire	signed	[63:0]	s_mpy_a_input, s_mpy_b_input;
		wire		[63:0]	u_mpy_a_input, u_mpy_b_input;

		assign	s_mpy_a_input = {{(32){r_mpy_a_input[31]}},r_mpy_a_input};
		assign	s_mpy_b_input = {{(32){r_mpy_b_input[31]}},r_mpy_b_input};
		assign	u_mpy_a_input = {32'h00,r_mpy_a_input};
		assign	u_mpy_b_input = {32'h00,r_mpy_b_input};

		always @(posedge i_clk)
			r_smpy_result <= s_mpy_a_input * s_mpy_b_input;
		always @(posedge i_clk)
			r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
`else
		wire		[31:0]	u_mpy_a_input, u_mpy_b_input;

		assign	u_mpy_a_input = r_mpy_a_input;
		assign	u_mpy_b_input = r_mpy_b_input;

		always @(posedge i_clk)
			r_smpy_result <= r_mpy_a_input * r_mpy_b_input;
		always @(posedge i_clk)
			r_umpy_result <= u_mpy_a_input * u_mpy_b_input;
`endif

		// Remember which half of the product was requested
		always @(posedge i_clk)
		if (i_stb)
			o_hi  <= i_op[1];

		assign	o_busy  = mpypipe[0];
		// r_sgn[1] is the i_op[0] that accompanied the operands whose
		// product is now in the result registers.
		assign	o_result = (r_sgn[1])?r_smpy_result:r_umpy_result;
		assign	o_valid = mpypipe[1];
		// Results are then set on the third clock
	end else begin : MPN3
	if (IMPLEMENT_MPY == 4)
	begin : MPY4CK // The four clock option
		reg	[63:0]	r_mpy_result;
		reg	[31:0]	r_mpy_a_input, r_mpy_b_input;
		reg		r_mpy_signed;
		reg	[2:0]	mpypipe;

		// First clock, latch in the inputs
		initial	mpypipe = 3'b0;
		always @(posedge i_clk)
		begin
			// mpypipe indicates we have a multiply in the
			// pipeline.  The request strobe is shifted through
			// three bits: the low two drive o_busy, and the
			// top one drives o_valid.
			if (i_reset)
				mpypipe <= 3'h0;
			else begin
				mpypipe[0] <= i_stb;
				mpypipe[1] <= mpypipe[0];
				mpypipe[2] <= mpypipe[1];
			end

			// For a signed multiply, flip the sign bit of each
			// operand.  This biases each operand by 2^31 so that
			// unsigned partial products can be used below; the
			// bias is removed again by the pp_s term.
			if (i_op[0]) // i.e. if signed multiply
			begin
				r_mpy_a_input <= {(~i_a[31]),i_a[30:0]};
				r_mpy_b_input <= {(~i_b[31]),i_b[30:0]};
			end else begin
				r_mpy_a_input <= i_a[31:0];
				r_mpy_b_input <= i_b[31:0];
			end

			// The signed bit really only matters in the
			// case of 64 bit multiply.  We'll keep track
			// of it, though, and pretend in all other
			// cases.
			r_mpy_signed  <= i_op[0];

			if (i_stb)
				o_hi  <= i_op[1];
		end

		assign	o_busy  = |mpypipe[1:0];
		assign	o_valid = mpypipe[2];

		// Second clock, do the multiplies, get the "partial
		// products".  Here, we break our input up into two
		// halves,
		//
		//   A  = (2^16 ah + al)
		//   B  = (2^16 bh + bl)
		//
		// and use these to compute partial products.
		//
		//   AB = (2^32 ah*bh + 2^16 (ah*bl + al*bh) + (al*bl)
		//
		// Since we're following the FOIL algorithm to get here,
		// we'll name these partial products according to FOIL.
		//
		// The trick is what happens if A or B is signed.  In
		// those cases, the real value of A will not be given by
		//	A = (2^16 ah + al)
		// but rather
		//	A = (2^16 ah[31^] + al) - 2^31
		//  (where we have flipped the sign bit of A)
		// and so ...
		//
		// AB= (2^16 ah + al - 2^31) * (2^16 bh + bl - 2^31)
		//	= 2^32(ah*bh)
		//		+2^16 (ah*bl+al*bh)
		//		+(al*bl)
		//		- 2^31 (2^16 bh+bl + 2^16 ah+al)
		//		+ 2^62
		//	= 2^32(ah*bh)
		//		+2^16 (ah*bl+al*bh)
		//		+(al*bl)
		//		+ 2^31 (2^31 - (2^16 bh+bl + 2^16 ah+al))
		//
		// The last term is pp_s below, weighted by 2^31.
		reg	[31:0]	pp_f, pp_l; // F and L from FOIL
		reg	[32:0]	pp_oi; // The O and I from FOIL
		reg	[32:0]	pp_s;
		always @(posedge i_clk)
		begin
			pp_f<=r_mpy_a_input[31:16]*r_mpy_b_input[31:16];
			pp_oi<=r_mpy_a_input[31:16]*r_mpy_b_input[15: 0]
				+ r_mpy_a_input[15: 0]*r_mpy_b_input[31:16];
			pp_l<=r_mpy_a_input[15: 0]*r_mpy_b_input[15: 0];
			// And a special one for the sign
			if (r_mpy_signed)
				pp_s <= 32'h8000_0000-(
					r_mpy_a_input[31:0]
					+ r_mpy_b_input[31:0]);
			else
				pp_s <= 33'h0;
		end

		// Third clock, add the results and produce a product
		always @(posedge i_clk)
		begin
			// The low 16 bits come from al*bl alone
			r_mpy_result[15:0] <= pp_l[15:0];
			// Everything else is summed relative to bit 16:
			// pp_oi has weight 2^16, pp_s 2^31, and pp_f 2^32.
			r_mpy_result[63:16] <=
			  	{ 32'h00, pp_l[31:16] }
				+ { 15'h00, pp_oi }
				+ { pp_s, 15'h00 }
				+ { pp_f, 16'h00 };
		end

		assign	o_result = r_mpy_result;
		// Fourth clock -- results are clocked into writeback
	end else begin : MPYSLOW
		// Fallback for any other IMPLEMENT_MPY value: hand the work
		// to the slowmpy submodule.  Each operand is widened to 33
		// bits, with the extra bit copying the operand's sign bit
		// when i_op[0] is set and zero otherwise.  Only the low 64
		// bits of the 66-bit product are kept.
		// NOTE(review): slowmpy is defined elsewhere; its port order
		// and signedness are assumed from this instantiation.
		// verilator lint_off UNUSED
		wire		unused_aux;
		wire	[65:0]	full_result;
		// verilator lint_on  UNUSED
		slowmpy #(.LGNA(6), .NA(33)) slowmpyi(i_clk, i_reset, i_stb,
			{ (i_op[0])&(i_a[31]), i_a },
			{ (i_op[0])&(i_b[31]), i_b }, 1'b0, o_busy,
				o_valid, full_result, unused_aux);
		assign	o_result = full_result[63:0];

		// Remember which half of the product was requested
		always @(posedge i_clk)
		if (i_stb)
			o_hi  <= i_op[1];
	end end end end end
	endgenerate // All possible multiply results have been determined

endmodule

// Compare with Previous | Blame | View Log
//
// powered by: WebSVN 2.1.0
//
// © copyright 1999-2024, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.