OpenCores

Rev 47	Rev 56
Line 1...	Line 1...
`-- #################################################################################################`	`-- #################################################################################################`
`-- # << NEORV32 - CPU Co-Processor: Integer Multiplier/Divider Unit (RISC-V "M" Extension)>> #`	`-- # << NEORV32 - CPU Co-Processor: Integer Multiplier/Divider Unit (RISC-V "M" Extension)>> #`
`-- # ********************************************************************************************* #`	`-- # ********************************************************************************************* #`
`-- # Multiplier and Divider unit. Implements the RISC-V RV32-M CPU extension. #`	`-- # Multiplier and Divider unit. Implements the RISC-V M CPU extension. #`
`-- # Multiplier core (signed/unsigned) uses serial algorithm. -> 32+4 cycles latency #`	`-- # #`
`-- # Divider core (unsigned) uses serial algorithm. -> 32+6 cycles latency #`	`-- # Multiplier core (signed/unsigned) uses classical serial algorithm. Unit latency: 31+3 cycles #`
`-- # Multiplications can be mapped to DSP block when FAST_MUL_EN = true. #`	`-- # Divider core (unsigned) uses classical serial algorithm. Unit latency: 32+4 cycles #`
	`-- # #`
	`-- # Multiplications can be mapped to DSP blocks (faster!) when FAST_MUL_EN = true. #`
	`-- # Unit latency: 3 cycles #`
`-- # ********************************************************************************************* #`	`-- # ********************************************************************************************* #`
`-- # BSD 3-Clause License #`	`-- # BSD 3-Clause License #`
`-- # #`	`-- # #`
`-- # Copyright (c) 2021, Stephan Nolting. All rights reserved. #`	`-- # Copyright (c) 2021, Stephan Nolting. All rights reserved. #`
`-- # #`	`-- # #`
Line 63...	Line 66...
`);`	`);`
`end neorv32_cpu_cp_muldiv;`	`end neorv32_cpu_cp_muldiv;`

`architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is`	`architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is`

`-- advanced configuration --`
`constant dsp_add_reg_stage_c : boolean := false; -- add another register stage to DSP-based multiplication for timing-closure`

`-- operations --`	`-- operations --`
`constant cp_op_mul_c : std_ulogic_vector(2 downto 0) := "000"; -- mul`	`constant cp_op_mul_c : std_ulogic_vector(2 downto 0) := "000"; -- mul`
`constant cp_op_mulh_c : std_ulogic_vector(2 downto 0) := "001"; -- mulh`	`constant cp_op_mulh_c : std_ulogic_vector(2 downto 0) := "001"; -- mulh`
`constant cp_op_mulhsu_c : std_ulogic_vector(2 downto 0) := "010"; -- mulhsu`	`constant cp_op_mulhsu_c : std_ulogic_vector(2 downto 0) := "010"; -- mulhsu`
`constant cp_op_mulhu_c : std_ulogic_vector(2 downto 0) := "011"; -- mulhu`	`constant cp_op_mulhu_c : std_ulogic_vector(2 downto 0) := "011"; -- mulhu`
Line 77...	Line 77...
`constant cp_op_divu_c : std_ulogic_vector(2 downto 0) := "101"; -- divu`	`constant cp_op_divu_c : std_ulogic_vector(2 downto 0) := "101"; -- divu`
`constant cp_op_rem_c : std_ulogic_vector(2 downto 0) := "110"; -- rem`	`constant cp_op_rem_c : std_ulogic_vector(2 downto 0) := "110"; -- rem`
`constant cp_op_remu_c : std_ulogic_vector(2 downto 0) := "111"; -- remu`	`constant cp_op_remu_c : std_ulogic_vector(2 downto 0) := "111"; -- remu`

`-- controller --`	`-- controller --`
`type state_t is (IDLE, DECODE, INIT_OPX, INIT_OPY, PROCESSING, FINALIZE, COMPLETED, FAST_MUL);`	`type state_t is (IDLE, DIV_PREPROCESS, PROCESSING, FINALIZE, COMPLETED);`
`signal state : state_t;`	`signal state : state_t;`
`signal cnt : std_ulogic_vector(4 downto 0);`	`signal cnt : std_ulogic_vector(4 downto 0);`
`signal cp_op : std_ulogic_vector(2 downto 0); -- operation to execute`	`signal cp_op : std_ulogic_vector(2 downto 0); -- operation to execute`
`signal cp_op_ff : std_ulogic_vector(2 downto 0); -- operation that was executed`	`signal cp_op_ff : std_ulogic_vector(2 downto 0); -- operation that was executed`
`signal start : std_ulogic;`	`signal start_div : std_ulogic;`
	`signal start_mul : std_ulogic;`
`signal operation : std_ulogic;`	`signal operation : std_ulogic;`
`signal rs1, opx, opy : std_ulogic_vector(data_width_c-1 downto 0); -- input operands`	`signal div_opx : std_ulogic_vector(data_width_c-1 downto 0);`
`signal opx_is_signed : std_ulogic;`	`signal div_opy : std_ulogic_vector(data_width_c-1 downto 0);`
`signal opy_is_signed : std_ulogic;`	`signal rs1_is_signed : std_ulogic;`
	`signal rs2_is_signed : std_ulogic;`
`signal opy_is_zero : std_ulogic;`	`signal opy_is_zero : std_ulogic;`
`signal div_res_corr : std_ulogic;`	`signal div_res_corr : std_ulogic;`
`signal valid : std_ulogic;`	`signal valid : std_ulogic;`

`-- divider core --`	`-- divider core --`
Line 100...	Line 102...
`signal div_sign_comp_in : std_ulogic_vector(data_width_c-1 downto 0);`	`signal div_sign_comp_in : std_ulogic_vector(data_width_c-1 downto 0);`
`signal div_sign_comp : std_ulogic_vector(data_width_c-1 downto 0);`	`signal div_sign_comp : std_ulogic_vector(data_width_c-1 downto 0);`
`signal div_res : std_ulogic_vector(data_width_c-1 downto 0);`	`signal div_res : std_ulogic_vector(data_width_c-1 downto 0);`

`-- multiplier core --`	`-- multiplier core --`
	`signal mul_product_p : std_ulogic_vector(63 downto 0);`
	`signal mul_product_s : std_ulogic_vector(63 downto 0);`
`signal mul_product : std_ulogic_vector(63 downto 0);`	`signal mul_product : std_ulogic_vector(63 downto 0);`
`signal mul_do_add : std_ulogic_vector(data_width_c downto 0);`	`signal mul_do_add : std_ulogic_vector(data_width_c downto 0);`
`signal mul_sign_cycle : std_ulogic;`	`signal mul_sign_cycle : std_ulogic;`
`signal mul_p_sext : std_ulogic;`	`signal mul_p_sext : std_ulogic;`
`signal mul_op_x : signed(32 downto 0); -- for using DSPs`	`signal mul_op_x : signed(32 downto 0); -- for using DSPs`
`signal mul_op_y : signed(32 downto 0); -- for using DSPs`	`signal mul_op_y : signed(32 downto 0); -- for using DSPs`
`signal mul_buf_ff : signed(65 downto 0); -- for using DSPs`	`signal mul_buf_ff : signed(65 downto 0); -- for using DSPs`
`signal mul_buf2_ff : signed(65 downto 0); -- for using DSPs`

`begin`	`begin`

`-- Co-Processor Controller ----------------------------------------------------------------`	`-- Co-Processor Controller ----------------------------------------------------------------`
`-- -------------------------------------------------------------------------------------------`	`-- -------------------------------------------------------------------------------------------`
`coprocessor_ctrl: process(rstn_i, clk_i)`	`coprocessor_ctrl: process(rstn_i, clk_i)`
`begin`	`begin`
`if (rstn_i = '0') then`	`if (rstn_i = '0') then`
`state <= IDLE;`	`state <= IDLE;`
`opx <= (others => '0');`	`div_opx <= (others => def_rst_val_c);`
`opy <= (others => '0');`	`div_opy <= (others => def_rst_val_c);`
`rs1 <= (others => '0');`	`cnt <= (others => def_rst_val_c);`
`cnt <= (others => '0');`	`cp_op_ff <= (others => def_rst_val_c);`
`start <= '0';`	`start_div <= '0';`
`valid <= '0';`	`valid <= '0';`
`div_res_corr <= '0';`	`div_res_corr <= def_rst_val_c;`
`opy_is_zero <= '0';`	`opy_is_zero <= def_rst_val_c;`
`cp_op_ff <= (others => '0');`
`elsif rising_edge(clk_i) then`	`elsif rising_edge(clk_i) then`
`-- defaults --`	`-- defaults --`
`start <= '0';`	`start_div <= '0';`
`valid <= '0';`	`valid <= '0';`
`cp_op_ff <= cp_op;`

`-- FSM --`	`-- FSM --`
`case state is`	`case state is`
`when IDLE =>`	`when IDLE =>`
	`cp_op_ff <= cp_op;`
`if (start_i = '1') then`	`if (start_i = '1') then`
`opx <= rs1_i;`	`if (operation = '1') then -- division`
`rs1 <= rs1_i;`	`cnt <= "11111";`
`opy <= rs2_i;`	`state <= DIV_PREPROCESS;`
`state <= DECODE;`	`else`
	`cnt <= "11110";`
	`if (FAST_MUL_EN = true) then`
	`state <= FINALIZE;`
	`else`
	`state <= PROCESSING;`
	`end if;`
	`end if;`
`end if;`	`end if;`

`when DECODE =>`	`when DIV_PREPROCESS =>`
`--`	`-- check relevant input signs --`
`if (cp_op = cp_op_div_c) then -- result sign compensation for div?`	`if (cp_op = cp_op_div_c) then -- result sign compensation for div?`
`div_res_corr <= opx(opx'left) xor opy(opy'left);`	`div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);`
`elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?`	`elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?`
`div_res_corr <= opx(opx'left);`	`div_res_corr <= rs1_i(rs1_i'left);`
`else`	`else`
`div_res_corr <= '0';`	`div_res_corr <= '0';`
`end if;`	`end if;`
`--`	`-- divide by zero? --`
`if (or_all_f(opy) = '0') then -- divide by 0?`	`opy_is_zero <= not or_all_f(rs2_i); -- set if rs2 = 0`
`opy_is_zero <= '1';`	`-- abs(rs1) --`
	`if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?`
	`div_opx <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive`
`else`	`else`
`opy_is_zero <= '0';`	`div_opx <= rs1_i;`
`end if;`	`end if;`
`--`	`-- abs(rs2) --`
`cnt <= "11111";`	`if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?`
`if (operation = '1') then -- division`	`div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive`
`state <= INIT_OPX;`
`else -- multiplication`
`start <= '1';`
`if (FAST_MUL_EN = true) then`
`state <= FAST_MUL;`
`else`	`else`
`state <= PROCESSING;`	`div_opy <= rs2_i;`
`end if;`
`end if;`

`when INIT_OPX =>`
`if ((opx(opx'left) and opx_is_signed) = '1') then -- signed division?`
`opx <= div_sign_comp; -- make positive`
`end if;`
`state <= INIT_OPY;`

`when INIT_OPY =>`
`start <= '1';`
`if ((opy(opy'left) and opy_is_signed) = '1') then -- signed division?`
`opy <= div_sign_comp; -- make positive`
`end if;`	`end if;`
	`--`
	`start_div <= '1';`
`state <= PROCESSING;`	`state <= PROCESSING;`

`when PROCESSING =>`	`when PROCESSING =>`
`cnt <= std_ulogic_vector(unsigned(cnt) - 1);`	`cnt <= std_ulogic_vector(unsigned(cnt) - 1);`
`if (cnt = "00000") then`	`if (cnt = "00000") then`
`state <= FINALIZE;`	`state <= FINALIZE;`
`end if;`	`end if;`

`when FAST_MUL =>`
`state <= FINALIZE;`

`when FINALIZE =>`	`when FINALIZE =>`
`state <= COMPLETED;`	`state <= COMPLETED;`

`when COMPLETED =>`	`when COMPLETED =>`
`valid <= '1';`	`valid <= '1';`
Line 205...	Line 197...
`end process coprocessor_ctrl;`	`end process coprocessor_ctrl;`

`-- co-processor command --`	`-- co-processor command --`
`cp_op <= ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c);`	`cp_op <= ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c);`

`-- operation --`	`-- operation: 0=mul, 1=div --`
`operation <= '1' when (cp_op = cp_op_div_c) or (cp_op = cp_op_divu_c) or (cp_op = cp_op_rem_c) or (cp_op = cp_op_remu_c) else '0';`	`operation <= '1' when (cp_op(2) = '1') else '0';`

`-- opx (rs1) signed? --`	`-- opx (rs1) signed? --`
`opx_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_mulhsu_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';`	`rs1_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_mulhsu_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';`

`-- opy (rs2) signed? --`	`-- opy (rs2) signed? --`
`opy_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';`	`rs2_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';`

	`-- start MUL operation (do it fast!) --`
	`start_mul <= '1' when (state = IDLE) and (start_i = '1') and (operation = '0') else '0';`


`-- Multiplier Core (signed/unsigned) ------------------------------------------------------`	`-- Multiplier Core (signed/unsigned) ------------------------------------------------------`
`-- -------------------------------------------------------------------------------------------`	`-- -------------------------------------------------------------------------------------------`
`multiplier_core: process(clk_i)`	`multiplier_core_serial: process(rstn_i, clk_i)`
`begin`	`begin`
`if rising_edge(clk_i) then`	`if (rstn_i = '0') then`
`-- ---------------------------------------------------------`	`mul_product_s <= (others => def_rst_val_c);`
	`elsif rising_edge(clk_i) then`
`if (FAST_MUL_EN = false) then -- use small iterative computation`	`if (FAST_MUL_EN = false) then -- use small iterative computation`
`if (start = '1') then -- start new multiplication`	`if (start_mul = '1') then -- start new multiplication`
`mul_product(63 downto 32) <= (others => '0');`	`mul_product_s(63 downto 32) <= (others => '0');`
`mul_product(31 downto 00) <= opy;`	`mul_product_s(31 downto 00) <= rs2_i;`
`elsif ((state = PROCESSING) or (state = FINALIZE)) and (operation = '0') then`	`elsif (state = PROCESSING) or (state = FINALIZE) then -- processing step or sign-finalization step`
`mul_product(63 downto 31) <= mul_do_add(32 downto 0);`	`mul_product_s(63 downto 31) <= mul_do_add(32 downto 0);`
`mul_product(30 downto 00) <= mul_product(31 downto 1);`	`mul_product_s(30 downto 00) <= mul_product_s(31 downto 1);`
`end if;`
`-- ---------------------------------------------------------`
`else -- use direct approach using (several!) DSP blocks`
`if (start = '1') then`
`mul_op_x <= signed((opx(opx'left) and opx_is_signed) & opx);`
`mul_op_y <= signed((opy(opy'left) and opy_is_signed) & opy);`
`end if;`	`end if;`
`mul_buf_ff <= mul_op_x * mul_op_y;`
`if (dsp_add_reg_stage_c = true) then -- add another reg stage?`
`mul_buf2_ff <= mul_buf_ff;`
`mul_product <= std_ulogic_vector(mul_buf2_ff(63 downto 0)); -- let the register balancing do the magic here`
`else`
`mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here`
`end if;`	`end if;`
`end if;`	`end if;`
	`end process multiplier_core_serial;`

	`multiplier_core_dsp: process(clk_i)`
	`begin`
	`if rising_edge(clk_i) then`
	`if (FAST_MUL_EN = true) then -- use direct approach using DSP blocks`
	`if (start_mul = '1') then`
	`mul_op_x <= signed((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i);`
	`mul_op_y <= signed((rs2_i(rs2_i'left) and rs2_is_signed) & rs2_i);`
	`end if;`
	`mul_buf_ff <= mul_op_x * mul_op_y;`
	`mul_product_p <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here`
	`end if;`
`end if;`	`end if;`
`end process multiplier_core;`	`end process multiplier_core_dsp;`

	`mul_product <= mul_product_p when (FAST_MUL_EN = true) else mul_product_s;`

`-- MUL: do another addition --`	`-- do another addition --`
`mul_update: process(mul_product, mul_sign_cycle, mul_p_sext, opx_is_signed, opx)`	`mul_update: process(mul_product, mul_sign_cycle, mul_p_sext, rs1_is_signed, rs1_i)`
`begin`	`begin`
`-- current bit of opy to take care of --`	`-- current bit of rs2_i to take care of --`
`if (mul_product(0) = '1') then -- multiply with 1`	`if (mul_product(0) = '1') then -- multiply with 1`
`if (mul_sign_cycle = '1') then -- for signed operations only: take care of negative weighted MSB -> multiply with -1`	`if (mul_sign_cycle = '1') then -- for signed operations only: take care of negative weighted MSB -> multiply with -1`
`mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) - unsigned((opx(opx'left) and opx_is_signed) & opx));`	`mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) - unsigned((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i));`
`else -- multiply with +1`	`else -- multiply with +1`
`mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) + unsigned((opx(opx'left) and opx_is_signed) & opx));`	`mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) + unsigned((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i));`
`end if;`	`end if;`
`else -- multiply with 0`	`else -- multiply with 0`
`mul_do_add <= mul_p_sext & mul_product(63 downto 32);`	`mul_do_add <= mul_p_sext & mul_product(63 downto 32);`
`end if;`	`end if;`
`end process mul_update;`	`end process mul_update;`

`-- sign control --`	`-- sign control --`
`mul_sign_cycle <= opy_is_signed when (state = FINALIZE) else '0';`	`mul_sign_cycle <= rs2_is_signed when (state = FINALIZE) else '0';`
`mul_p_sext <= mul_product(mul_product'left) and opx_is_signed;`	`mul_p_sext <= mul_product(mul_product'left) and rs1_is_signed;`


`-- Divider Core (unsigned) ----------------------------------------------------------------`	`-- Divider Core (unsigned) ----------------------------------------------------------------`
`-- -------------------------------------------------------------------------------------------`	`-- -------------------------------------------------------------------------------------------`
`divider_core: process(clk_i)`	`divider_core: process(rstn_i, clk_i)`
`begin`	`begin`
`if rising_edge(clk_i) then`	`if (rstn_i = '0') then`
`if (start = '1') then -- start new division`	`quotient <= (others => def_rst_val_c);`
`quotient <= opx;`	`remainder <= (others => def_rst_val_c);`
	`elsif rising_edge(clk_i) then`
	`if (start_div = '1') then -- start new division`
	`quotient <= div_opx;`
`remainder <= (others => '0');`	`remainder <= (others => '0');`
`elsif ((state = PROCESSING) or (state = FINALIZE)) and (operation = '1') then -- running?`	`elsif (state = PROCESSING) or (state = FINALIZE) then -- running?`
`quotient <= quotient(30 downto 0) & (not div_sub(32));`	`quotient <= quotient(30 downto 0) & (not div_sub(32));`
`if (div_sub(32) = '0') then -- still overflowing`	`if (div_sub(32) = '0') then -- still overflowing`
`remainder <= div_sub(31 downto 0);`	`remainder <= div_sub(31 downto 0);`
`else -- underflow`	`else -- underflow`
`remainder <= remainder(30 downto 0) & quotient(31);`	`remainder <= remainder(30 downto 0) & quotient(31);`
`end if;`	`end if;`
`end if;`	`end if;`
`end if;`	`end if;`
`end process divider_core;`	`end process divider_core;`

`-- DIV: try another subtraction --`	`-- try another subtraction --`
`div_sub <= std_ulogic_vector(unsigned('0' & remainder(30 downto 0) & quotient(31)) - unsigned('0' & opy));`	`div_sub <= std_ulogic_vector(unsigned('0' & remainder(30 downto 0) & quotient(31)) - unsigned('0' & div_opy));`

`-- Div sign compensation --`	`-- result sign compensation --`
`div_sign_comp_in <= opx when (state = INIT_OPX) else`	`div_sign_comp_in <= quotient when (cp_op = cp_op_div_c) else remainder;`
`opy when (state = INIT_OPY) else`
`quotient when ((cp_op = cp_op_div_c) or (cp_op = cp_op_divu_c)) else remainder;`
`div_sign_comp <= std_ulogic_vector(0 - unsigned(div_sign_comp_in));`	`div_sign_comp <= std_ulogic_vector(0 - unsigned(div_sign_comp_in));`

`-- result sign correction --`
`div_res <= div_sign_comp when (div_res_corr = '1') and (opy_is_zero = '0') else div_sign_comp_in;`	`div_res <= div_sign_comp when (div_res_corr = '1') and (opy_is_zero = '0') else div_sign_comp_in;`


`-- Data Output ----------------------------------------------------------------------------`	`-- Data Output ----------------------------------------------------------------------------`
`-- -------------------------------------------------------------------------------------------`	`-- -------------------------------------------------------------------------------------------`
`operation_result: process(clk_i)`	`operation_result: process(rstn_i, clk_i)`
`begin`	`begin`
`if rising_edge(clk_i) then`	`if (rstn_i = '0') then`
	`res_o <= (others => def_rst_val_c);`
	`elsif rising_edge(clk_i) then`
`res_o <= (others => '0');`	`res_o <= (others => '0');`
`if (valid = '1') then`	`if (valid = '1') then`
`case cp_op_ff is`	`case cp_op_ff is`
`when cp_op_mul_c =>`	`when cp_op_mul_c =>`
`res_o <= mul_product(31 downto 00);`	`res_o <= mul_product(31 downto 00);`
Line 318...	Line 318...
`res_o <= quotient;`	`res_o <= quotient;`
`when cp_op_rem_c =>`	`when cp_op_rem_c =>`
`if (opy_is_zero = '0') then`	`if (opy_is_zero = '0') then`
`res_o <= div_res;`	`res_o <= div_res;`
`else`	`else`
`res_o <= rs1;`	`res_o <= rs1_i;`
`end if;`	`end if;`
`when others => -- cp_op_remu_c`	`when others => -- cp_op_remu_c`
`res_o <= remainder;`	`res_o <= remainder;`
`end case;`	`end case;`
`end if;`	`end if;`

Line 1...

-- #################################################################################################

-- #################################################################################################

-- # << NEORV32 - CPU Co-Processor: Integer Multiplier/Divider Unit (RISC-V "M" Extension)>>       #

-- # << NEORV32 - CPU Co-Processor: Integer Multiplier/Divider Unit (RISC-V "M" Extension)>>       #

-- # ********************************************************************************************* #

-- # ********************************************************************************************* #

-- # Multiplier and Divider unit. Implements the RISC-V RV32-M CPU extension.                      #

-- # Multiplier and Divider unit. Implements the RISC-V M CPU extension.                           #

-- # Multiplier core (signed/unsigned) uses serial algorithm. -> 32+4 cycles latency               #

-- #                                                                                               #

-- # Divider core (unsigned) uses serial algorithm. -> 32+6 cycles latency                         #

-- # Multiplier core (signed/unsigned) uses classical serial algorithm. Unit latency: 31+3 cycles  #

-- # Multiplications can be mapped to DSP block when FAST_MUL_EN = true.                           #

-- # Divider core (unsigned) uses classical serial algorithm. Unit latency: 32+4 cycles            #

-- #                                                                                               #

-- # Multiplications can be mapped to DSP blocks (faster!) when FAST_MUL_EN = true.                #

-- # Unit latency: 3 cycles                                                                        #

-- # ********************************************************************************************* #

-- # ********************************************************************************************* #

-- # BSD 3-Clause License                                                                          #

-- # BSD 3-Clause License                                                                          #

-- #                                                                                               #

-- #                                                                                               #

-- # Copyright (c) 2021, Stephan Nolting. All rights reserved.                                     #

-- # Copyright (c) 2021, Stephan Nolting. All rights reserved.                                     #

-- #                                                                                               #

-- #                                                                                               #

Line 63...

Line 66...

);

);

end neorv32_cpu_cp_muldiv;

end neorv32_cpu_cp_muldiv;

architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is

architecture neorv32_cpu_cp_muldiv_rtl of neorv32_cpu_cp_muldiv is

  -- advanced configuration --

  constant dsp_add_reg_stage_c : boolean := false; -- add another register stage to DSP-based multiplication for timing-closure

  -- operations --

  -- operations --

  constant cp_op_mul_c    : std_ulogic_vector(2 downto 0) := "000"; -- mul

  constant cp_op_mul_c    : std_ulogic_vector(2 downto 0) := "000"; -- mul

  constant cp_op_mulh_c   : std_ulogic_vector(2 downto 0) := "001"; -- mulh

  constant cp_op_mulh_c   : std_ulogic_vector(2 downto 0) := "001"; -- mulh

  constant cp_op_mulhsu_c : std_ulogic_vector(2 downto 0) := "010"; -- mulhsu

  constant cp_op_mulhsu_c : std_ulogic_vector(2 downto 0) := "010"; -- mulhsu

  constant cp_op_mulhu_c  : std_ulogic_vector(2 downto 0) := "011"; -- mulhu

  constant cp_op_mulhu_c  : std_ulogic_vector(2 downto 0) := "011"; -- mulhu

Line 77...

  constant cp_op_divu_c   : std_ulogic_vector(2 downto 0) := "101"; -- divu

  constant cp_op_divu_c   : std_ulogic_vector(2 downto 0) := "101"; -- divu

  constant cp_op_rem_c    : std_ulogic_vector(2 downto 0) := "110"; -- rem

  constant cp_op_rem_c    : std_ulogic_vector(2 downto 0) := "110"; -- rem

  constant cp_op_remu_c   : std_ulogic_vector(2 downto 0) := "111"; -- remu

  constant cp_op_remu_c   : std_ulogic_vector(2 downto 0) := "111"; -- remu

  -- controller --

  -- controller --

  type state_t is (IDLE, DECODE, INIT_OPX, INIT_OPY, PROCESSING, FINALIZE, COMPLETED, FAST_MUL);

  type state_t is (IDLE, DIV_PREPROCESS, PROCESSING, FINALIZE, COMPLETED);

  signal state         : state_t;

  signal state         : state_t;

  signal cnt           : std_ulogic_vector(4 downto 0);

  signal cnt           : std_ulogic_vector(4 downto 0);

  signal cp_op         : std_ulogic_vector(2 downto 0); -- operation to execute

  signal cp_op         : std_ulogic_vector(2 downto 0); -- operation to execute

  signal cp_op_ff      : std_ulogic_vector(2 downto 0); -- operation that was executed

  signal cp_op_ff      : std_ulogic_vector(2 downto 0); -- operation that was executed

  signal start         : std_ulogic;

  signal start_div     : std_ulogic;

  signal start_mul     : std_ulogic;

  signal operation     : std_ulogic;

  signal operation     : std_ulogic;

  signal rs1, opx, opy : std_ulogic_vector(data_width_c-1 downto 0); -- input operands

  signal div_opx       : std_ulogic_vector(data_width_c-1 downto 0);

  signal opx_is_signed : std_ulogic;

  signal div_opy       : std_ulogic_vector(data_width_c-1 downto 0);

  signal opy_is_signed : std_ulogic;

  signal rs1_is_signed : std_ulogic;

  signal rs2_is_signed : std_ulogic;

  signal opy_is_zero   : std_ulogic;

  signal opy_is_zero   : std_ulogic;

  signal div_res_corr  : std_ulogic;

  signal div_res_corr  : std_ulogic;

  signal valid         : std_ulogic;

  signal valid         : std_ulogic;

  -- divider core --

  -- divider core --

Line 100...

Line 102...

  signal div_sign_comp_in : std_ulogic_vector(data_width_c-1 downto 0);

  signal div_sign_comp_in : std_ulogic_vector(data_width_c-1 downto 0);

  signal div_sign_comp    : std_ulogic_vector(data_width_c-1 downto 0);

  signal div_sign_comp    : std_ulogic_vector(data_width_c-1 downto 0);

  signal div_res          : std_ulogic_vector(data_width_c-1 downto 0);

  signal div_res          : std_ulogic_vector(data_width_c-1 downto 0);

  -- multiplier core --

  -- multiplier core --

  signal mul_product_p  : std_ulogic_vector(63 downto 0);

  signal mul_product_s  : std_ulogic_vector(63 downto 0);

  signal mul_product    : std_ulogic_vector(63 downto 0);

  signal mul_product    : std_ulogic_vector(63 downto 0);

  signal mul_do_add     : std_ulogic_vector(data_width_c downto 0);

  signal mul_do_add     : std_ulogic_vector(data_width_c downto 0);

  signal mul_sign_cycle : std_ulogic;

  signal mul_sign_cycle : std_ulogic;

  signal mul_p_sext     : std_ulogic;

  signal mul_p_sext     : std_ulogic;

  signal mul_op_x       : signed(32 downto 0); -- for using DSPs

  signal mul_op_x       : signed(32 downto 0); -- for using DSPs

  signal mul_op_y       : signed(32 downto 0); -- for using DSPs

  signal mul_op_y       : signed(32 downto 0); -- for using DSPs

  signal mul_buf_ff     : signed(65 downto 0); -- for using DSPs

  signal mul_buf_ff     : signed(65 downto 0); -- for using DSPs

  signal mul_buf2_ff    : signed(65 downto 0); -- for using DSPs

begin

begin

  -- Co-Processor Controller ----------------------------------------------------------------

  -- Co-Processor Controller ----------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  coprocessor_ctrl: process(rstn_i, clk_i)

  coprocessor_ctrl: process(rstn_i, clk_i)

  begin

  begin

    if (rstn_i = '0') then

    if (rstn_i = '0') then

      state        <= IDLE;

      state        <= IDLE;

      opx          <= (others => '0');

      div_opx      <= (others => def_rst_val_c);

      opy          <= (others => '0');

      div_opy      <= (others => def_rst_val_c);

      rs1          <= (others => '0');

      cnt          <= (others => def_rst_val_c);

      cnt          <= (others => '0');

      cp_op_ff     <= (others => def_rst_val_c);

      start        <= '0';

      start_div    <= '0';

      valid        <= '0';

      valid        <= '0';

      div_res_corr <= '0';

      div_res_corr <= def_rst_val_c;

      opy_is_zero  <= '0';

      opy_is_zero  <= def_rst_val_c;

      cp_op_ff     <= (others => '0');

    elsif rising_edge(clk_i) then

    elsif rising_edge(clk_i) then

      -- defaults --

      -- defaults --

      start    <= '0';

      start_div <= '0';

      valid    <= '0';

      valid    <= '0';

      cp_op_ff <= cp_op;

      -- FSM --

      -- FSM --

      case state is

      case state is

        when IDLE =>

        when IDLE =>

          cp_op_ff <= cp_op;

          if (start_i = '1') then

          if (start_i = '1') then

            opx   <= rs1_i;

            if (operation = '1') then -- division

            rs1   <= rs1_i;

              cnt <= "11111";

            opy   <= rs2_i;

              state <= DIV_PREPROCESS;

            state <= DECODE;

            else

              cnt <= "11110";

              if (FAST_MUL_EN = true) then

                state <= FINALIZE;

              else

                state <= PROCESSING;

              end if;

            end if;

          end if;

          end if;

        when DECODE =>

        when DIV_PREPROCESS =>

--

          -- check relevant input signs --

          if (cp_op = cp_op_div_c) then -- result sign compensation for div?

          if (cp_op = cp_op_div_c) then -- result sign compensation for div?

            div_res_corr <= opx(opx'left) xor opy(opy'left);

            div_res_corr <= rs1_i(rs1_i'left) xor rs2_i(rs2_i'left);

          elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?

          elsif (cp_op = cp_op_rem_c) then -- result sign compensation for rem?

            div_res_corr <= opx(opx'left);

            div_res_corr <= rs1_i(rs1_i'left);

          else

          else

            div_res_corr <= '0';

            div_res_corr <= '0';

          end if;

          end if;

--

          -- divide by zero? --

          if (or_all_f(opy) = '0') then -- divide by 0?

          opy_is_zero <= not or_all_f(rs2_i); -- set if rs2 = 0

            opy_is_zero <= '1';

          -- abs(rs1) --

          if ((rs1_i(rs1_i'left) and rs1_is_signed) = '1') then -- signed division?

            div_opx <= std_ulogic_vector(0 - unsigned(rs1_i)); -- make positive

          else

          else

            opy_is_zero <= '0';

            div_opx <= rs1_i;

          end if;

          end if;

--

          -- abs(rs2) --

          cnt   <= "11111";

          if ((rs2_i(rs2_i'left) and rs2_is_signed) = '1') then -- signed division?

          if (operation = '1') then -- division

            div_opy <= std_ulogic_vector(0 - unsigned(rs2_i)); -- make positive

            state <= INIT_OPX;

          else -- multiplication

            start <= '1';

            if (FAST_MUL_EN = true) then

              state <= FAST_MUL;

            else

            else

              state <= PROCESSING;

            div_opy <= rs2_i;

            end if;

          end if;

        when INIT_OPX =>

          if ((opx(opx'left) and opx_is_signed) = '1') then -- signed division?

            opx <= div_sign_comp; -- make positive

          end if;

          state <= INIT_OPY;

        when INIT_OPY =>

          start <= '1';

          if ((opy(opy'left) and opy_is_signed) = '1') then -- signed division?

            opy <= div_sign_comp; -- make positive

          end if;

          end if;

--

          start_div <= '1';

          state <= PROCESSING;

          state <= PROCESSING;

        when PROCESSING =>

        when PROCESSING =>

          cnt <= std_ulogic_vector(unsigned(cnt) - 1);

          cnt <= std_ulogic_vector(unsigned(cnt) - 1);

          if (cnt = "00000") then

          if (cnt = "00000") then

            state <= FINALIZE;

            state <= FINALIZE;

          end if;

          end if;

        when FAST_MUL =>

          state <= FINALIZE;

        when FINALIZE =>

        when FINALIZE =>

          state <= COMPLETED;

          state <= COMPLETED;

        when COMPLETED =>

        when COMPLETED =>

          valid <= '1';

          valid <= '1';

Line 205...

Line 197...

  end process coprocessor_ctrl;

  end process coprocessor_ctrl;

  -- co-processor command --

  -- co-processor command --

  cp_op <= ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c);

  cp_op <= ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c);

  -- operation --

  -- operation: 0=mul, 1=div --

  operation <= '1' when (cp_op = cp_op_div_c) or (cp_op = cp_op_divu_c) or (cp_op = cp_op_rem_c) or (cp_op = cp_op_remu_c) else '0';

  operation <= '1' when (cp_op(2) = '1') else '0';

  -- opx (rs1) signed? --

  -- opx (rs1) signed? --

  opx_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_mulhsu_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';

  rs1_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_mulhsu_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';

  -- opy (rs2) signed? --

  -- opy (rs2) signed? --

  opy_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';

  rs2_is_signed <= '1' when (cp_op = cp_op_mulh_c) or (cp_op = cp_op_div_c) or (cp_op = cp_op_rem_c) else '0';

  -- start MUL operation (do it fast!) --

  start_mul <= '1' when (state = IDLE) and (start_i = '1') and (operation = '0') else '0';

  -- Multiplier Core (signed/unsigned) ------------------------------------------------------

  -- Multiplier Core (signed/unsigned) ------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  multiplier_core: process(clk_i)

  multiplier_core_serial: process(rstn_i, clk_i)

  begin

  begin

    if rising_edge(clk_i) then

    if (rstn_i = '0') then

      -- ---------------------------------------------------------

      mul_product_s <= (others => def_rst_val_c);

    elsif rising_edge(clk_i) then

      if (FAST_MUL_EN = false) then -- use small iterative computation

      if (FAST_MUL_EN = false) then -- use small iterative computation

        if (start = '1') then -- start new multiplication

        if (start_mul = '1') then -- start new multiplication

          mul_product(63 downto 32) <= (others => '0');

          mul_product_s(63 downto 32) <= (others => '0');

          mul_product(31 downto 00) <= opy;

          mul_product_s(31 downto 00) <= rs2_i;

        elsif ((state = PROCESSING) or (state = FINALIZE)) and (operation = '0') then

        elsif (state = PROCESSING) or (state = FINALIZE) then -- processing step or sign-finalization step

          mul_product(63 downto 31) <= mul_do_add(32 downto 0);

          mul_product_s(63 downto 31) <= mul_do_add(32 downto 0);

          mul_product(30 downto 00) <= mul_product(31 downto 1);

          mul_product_s(30 downto 00) <= mul_product_s(31 downto 1);

        end if;

      -- ---------------------------------------------------------

      else -- use direct approach using (several!) DSP blocks

        if (start = '1') then

          mul_op_x <= signed((opx(opx'left) and opx_is_signed) & opx);

          mul_op_y <= signed((opy(opy'left) and opy_is_signed) & opy);

        end if;

        end if;

        mul_buf_ff <= mul_op_x * mul_op_y;

        if (dsp_add_reg_stage_c = true) then -- add another reg stage?

          mul_buf2_ff <= mul_buf_ff;

          mul_product <= std_ulogic_vector(mul_buf2_ff(63 downto 0)); -- let the register balancing do the magic here

        else

          mul_product <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here

        end if;

        end if;

      end if;

      end if;

  end process multiplier_core_serial;

  multiplier_core_dsp: process(clk_i)

  begin

    if rising_edge(clk_i) then

      if (FAST_MUL_EN = true) then -- use direct approach using DSP blocks

        if (start_mul = '1') then

          mul_op_x <= signed((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i);

          mul_op_y <= signed((rs2_i(rs2_i'left) and rs2_is_signed) & rs2_i);

        end if;

        mul_buf_ff    <= mul_op_x * mul_op_y;

        mul_product_p <= std_ulogic_vector(mul_buf_ff(63 downto 0)); -- let the register balancing do the magic here

      end if;

    end if;

    end if;

  end process multiplier_core;

  end process multiplier_core_dsp;

  mul_product <= mul_product_p when (FAST_MUL_EN = true) else mul_product_s;

  -- MUL: do another addition --

  -- do another addition --

  mul_update: process(mul_product, mul_sign_cycle, mul_p_sext, opx_is_signed, opx)

  mul_update: process(mul_product, mul_sign_cycle, mul_p_sext, rs1_is_signed, rs1_i)

  begin

  begin

    -- current bit of opy to take care of --

    -- current bit of rs2_i to take care of --

    if (mul_product(0) = '1') then -- multiply with 1

    if (mul_product(0) = '1') then -- multiply with 1

      if (mul_sign_cycle = '1') then -- for signed operations only: take care of negative weighted MSB -> multiply with -1

      if (mul_sign_cycle = '1') then -- for signed operations only: take care of negative weighted MSB -> multiply with -1

        mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) - unsigned((opx(opx'left) and opx_is_signed) & opx));

        mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) - unsigned((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i));

      else -- multiply with +1

      else -- multiply with +1

        mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) + unsigned((opx(opx'left) and opx_is_signed) & opx));

        mul_do_add <= std_ulogic_vector(unsigned(mul_p_sext & mul_product(63 downto 32)) + unsigned((rs1_i(rs1_i'left) and rs1_is_signed) & rs1_i));

      end if;

      end if;

    else -- multiply with 0

    else -- multiply with 0

      mul_do_add <= mul_p_sext & mul_product(63 downto 32);

      mul_do_add <= mul_p_sext & mul_product(63 downto 32);

    end if;

    end if;

  end process mul_update;

  end process mul_update;

  -- sign control --

  -- sign control --

  mul_sign_cycle <= opy_is_signed when (state = FINALIZE) else '0';

  mul_sign_cycle <= rs2_is_signed when (state = FINALIZE) else '0';

  mul_p_sext     <= mul_product(mul_product'left) and opx_is_signed;

  mul_p_sext     <= mul_product(mul_product'left) and rs1_is_signed;

  -- Divider Core (unsigned) ----------------------------------------------------------------

  -- Divider Core (unsigned) ----------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  divider_core: process(clk_i)

  divider_core: process(rstn_i, clk_i)

  begin

  begin

    if rising_edge(clk_i) then

    if (rstn_i = '0') then

      if (start = '1') then -- start new division

      quotient  <= (others => def_rst_val_c);

        quotient  <= opx;

      remainder <= (others => def_rst_val_c);

    elsif rising_edge(clk_i) then

      if (start_div = '1') then -- start new division

        quotient  <= div_opx;

        remainder <= (others => '0');

        remainder <= (others => '0');

      elsif ((state = PROCESSING) or (state = FINALIZE)) and (operation = '1') then -- running?

      elsif (state = PROCESSING) or (state = FINALIZE) then -- running?

        quotient <= quotient(30 downto 0) & (not div_sub(32));

        quotient <= quotient(30 downto 0) & (not div_sub(32));

        if (div_sub(32) = '0') then -- still overflowing

        if (div_sub(32) = '0') then -- still overflowing

          remainder <= div_sub(31 downto 0);

          remainder <= div_sub(31 downto 0);

        else -- underflow

        else -- underflow

          remainder <= remainder(30 downto 0) & quotient(31);

          remainder <= remainder(30 downto 0) & quotient(31);

        end if;

        end if;

      end if;

      end if;

    end if;

    end if;

  end process divider_core;

  end process divider_core;

  -- DIV: try another subtraction --

  -- try another subtraction --

  div_sub <= std_ulogic_vector(unsigned('0' & remainder(30 downto 0) & quotient(31)) - unsigned('0' & opy));

  div_sub <= std_ulogic_vector(unsigned('0' & remainder(30 downto 0) & quotient(31)) - unsigned('0' & div_opy));

  -- Div sign compensation --

  -- result sign compensation --

  div_sign_comp_in <= opx when (state = INIT_OPX) else

  div_sign_comp_in <= quotient when (cp_op = cp_op_div_c) else remainder;

                      opy when (state = INIT_OPY) else

                      quotient when ((cp_op = cp_op_div_c) or (cp_op = cp_op_divu_c)) else remainder;

  div_sign_comp <= std_ulogic_vector(0 - unsigned(div_sign_comp_in));

  div_sign_comp <= std_ulogic_vector(0 - unsigned(div_sign_comp_in));

  -- result sign correction --

  div_res <= div_sign_comp when (div_res_corr = '1') and (opy_is_zero = '0') else div_sign_comp_in;

  div_res <= div_sign_comp when (div_res_corr = '1') and (opy_is_zero = '0') else div_sign_comp_in;

  -- Data Output ----------------------------------------------------------------------------

  -- Data Output ----------------------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  -- -------------------------------------------------------------------------------------------

  operation_result: process(clk_i)

  operation_result: process(rstn_i, clk_i)

  begin

  begin

    if rising_edge(clk_i) then

    if (rstn_i = '0') then

      res_o <= (others => def_rst_val_c);

    elsif rising_edge(clk_i) then

      res_o <= (others => '0');

      res_o <= (others => '0');

      if (valid = '1') then

      if (valid = '1') then

        case cp_op_ff is

        case cp_op_ff is

          when cp_op_mul_c =>

          when cp_op_mul_c =>

            res_o <= mul_product(31 downto 00);

            res_o <= mul_product(31 downto 00);

Line 318...

            res_o <= quotient;

            res_o <= quotient;

          when cp_op_rem_c =>

          when cp_op_rem_c =>

            if (opy_is_zero = '0') then

            if (opy_is_zero = '0') then

              res_o <= div_res;

              res_o <= div_res;

            else

            else

              res_o <= rs1;

              res_o <= rs1_i;

            end if;

            end if;

          when others => -- cp_op_remu_c

          when others => -- cp_op_remu_c

            res_o <= remainder;

            res_o <= remainder;

        end case;

        end case;

      end if;

      end if;

Browse

Tools

Subversion Repositories neorv32

[/] [neorv32/] [trunk/] [rtl/] [core/] [neorv32_cpu_cp_muldiv.vhd] - Diff between revs 47 and 56