OpenCores
URL https://opencores.org/ocsvn/neorv32/neorv32/trunk

Subversion Repositories neorv32

[/] [neorv32/] [trunk/] [rtl/] [core/] [neorv32_cpu_cp_bitmanip.vhd] - Rev 74

Compare with Previous | Blame | View Log

-- #################################################################################################
-- # << NEORV32 - CPU Co-Processor: Bit-Manipulation Co-Processor Unit (RISC-V "B" Extension) >>   #
-- # ********************************************************************************************* #
-- # Supported B sub-extensions (Zb*):                                                             #
-- # - Zba: Address-generation instructions                                                        #
-- # - Zbb: Basic bit-manipulation instructions                                                    #
-- # - Zbs: Single-bit instructions                                                                #
-- # - Zbc: Carry-less multiplication instructions                                                 #
-- #                                                                                               #
-- # NOTE: This is a first implementation of the bit-manipulation co-processor that supports all   #
-- #       sub-sets of the B extension. Hence, it is not yet optimized for area, latency or speed. #
-- # ********************************************************************************************* #
-- # BSD 3-Clause License                                                                          #
-- #                                                                                               #
-- # Copyright (c) 2022, Stephan Nolting. All rights reserved.                                     #
-- #                                                                                               #
-- # Redistribution and use in source and binary forms, with or without modification, are          #
-- # permitted provided that the following conditions are met:                                     #
-- #                                                                                               #
-- # 1. Redistributions of source code must retain the above copyright notice, this list of        #
-- #    conditions and the following disclaimer.                                                   #
-- #                                                                                               #
-- # 2. Redistributions in binary form must reproduce the above copyright notice, this list of     #
-- #    conditions and the following disclaimer in the documentation and/or other materials        #
-- #    provided with the distribution.                                                            #
-- #                                                                                               #
-- # 3. Neither the name of the copyright holder nor the names of its contributors may be used to  #
-- #    endorse or promote products derived from this software without specific prior written      #
-- #    permission.                                                                                #
-- #                                                                                               #
-- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS   #
-- # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF               #
-- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE    #
-- # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
-- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE #
-- # GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED    #
-- # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
-- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED  #
-- # OF THE POSSIBILITY OF SUCH DAMAGE.                                                            #
-- # ********************************************************************************************* #
-- # The NEORV32 Processor - https://github.com/stnolting/neorv32              (c) Stephan Nolting #
-- #################################################################################################
 
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
 
library neorv32;
use neorv32.neorv32_package.all;
 
-- Bit-manipulation co-processor interface.
-- An operation is triggered by start_i; the operands, the shift amount, the comparator "less"
-- flag and the decoded instruction are captured at that point. Completion is signaled by a
-- one-cycle pulse on valid_o. res_o is a registered output that is loaded with the result on
-- the clock edge at which valid is set and is cleared to zero on every other clock edge.
entity neorv32_cpu_cp_bitmanip is
  generic (
    FAST_SHIFT_EN : boolean -- use barrel shifter for shift operations (false: iterative serial shifter is generated instead)
  );
  port (
    -- global control --
    clk_i   : in  std_ulogic; -- global clock, rising edge
    rstn_i  : in  std_ulogic; -- global reset, low-active, async
    ctrl_i  : in  std_ulogic_vector(ctrl_width_c-1 downto 0); -- main control bus (instruction bits for decoding, trap flag for abort)
    start_i : in  std_ulogic; -- trigger operation
    -- data input --
    cmp_i   : in  std_ulogic_vector(1 downto 0); -- comparator status (only bit cmp_less_c is sampled here, used for min/max)
    rs1_i   : in  std_ulogic_vector(data_width_c-1 downto 0); -- rf source 1
    rs2_i   : in  std_ulogic_vector(data_width_c-1 downto 0); -- rf source 2
    shamt_i : in  std_ulogic_vector(index_size_f(data_width_c)-1 downto 0); -- shift amount (rotate distance / single-bit index)
    -- result and status --
    res_o   : out std_ulogic_vector(data_width_c-1 downto 0); -- operation result (registered, zero when no result is being output)
    valid_o : out std_ulogic -- data output valid
  );
end neorv32_cpu_cp_bitmanip;
 
architecture neorv32_cpu_cp_bitmanip_rtl of neorv32_cpu_cp_bitmanip is
 
  -- Sub-extension configuration ----------------------------
  -- Note that this configuration does NOT affect the CPU's (illegal) instruction decoding logic!
  -- Setting one of these constants to false forces the decoded command bits (cmd) of the
  -- according sub-extension to '0' inside this co-processor.
  constant zbb_en_c : boolean := true;
  constant zba_en_c : boolean := true;
  constant zbc_en_c : boolean := true;
  constant zbs_en_c : boolean := true;
  -- --------------------------------------------------------
 
  -- Operation indices --
  -- Each constant is the bit position of one operation inside the one-hot command vectors
  -- (cmd / cmd_buf) and, at the same time, the index of that operation's entry in the
  -- result arrays (res_int / res_out).

  -- Zbb - logic with negate --
  constant op_andn_c    : natural := 0;
  constant op_orn_c     : natural := 1;
  constant op_xnor_c    : natural := 2;
  -- Zbb - count leading/trailing zero bits --
  constant op_clz_c     : natural := 3;
  constant op_ctz_c     : natural := 4;
  -- Zbb - count population --
  constant op_cpop_c    : natural := 5;
  -- Zbb - integer minimum/maximum --
  constant op_max_c     : natural := 6; -- signed/unsigned
  constant op_min_c     : natural := 7; -- signed/unsigned
  -- Zbb - sign- and zero-extension --
  constant op_sextb_c   : natural := 8;
  constant op_sexth_c   : natural := 9;
  constant op_zexth_c   : natural := 10;
  -- Zbb - bitwise rotation --
  constant op_rol_c     : natural := 11;
  constant op_ror_c     : natural := 12; -- also rori
  -- Zbb - or-combine --
  constant op_orcb_c    : natural := 13;
  -- Zbb - byte-reverse --
  constant op_rev8_c    : natural := 14;
  -- Zba - shifted-add --
  constant op_sh1add_c  : natural := 15;
  constant op_sh2add_c  : natural := 16;
  constant op_sh3add_c  : natural := 17;
  -- Zbs - single-bit operations --
  constant op_bclr_c    : natural := 18;
  constant op_bext_c    : natural := 19;
  constant op_binv_c    : natural := 20;
  constant op_bset_c    : natural := 21;
  -- Zbc - carry-less multiplication --
  constant op_clmul_c   : natural := 22;
  constant op_clmulh_c  : natural := 23;
  constant op_clmulr_c  : natural := 24;
  -- total number of operations (width of cmd/cmd_buf, size of res_int/res_out) --
  constant op_width_c   : natural := 25;
 
  -- controller --
  type ctrl_state_t is (S_IDLE, S_START_SHIFT, S_BUSY_SHIFT, S_START_CLMUL, S_BUSY_CLMUL);
  signal ctrl_state   : ctrl_state_t;
  signal cmd, cmd_buf : std_ulogic_vector(op_width_c-1 downto 0); -- cmd: decoded from ctrl_i (combinatorial); cmd_buf: copy latched on start_i
  signal valid        : std_ulogic; -- registered "operation done" strobe, drives valid_o and the output gate

  -- operand buffers (all captured when start_i is set in S_IDLE) --
  signal rs1_reg : std_ulogic_vector(data_width_c-1 downto 0);
  signal rs2_reg : std_ulogic_vector(data_width_c-1 downto 0);
  signal sha_reg : std_ulogic_vector(index_size_f(data_width_c)-1 downto 0);
  signal less_ff : std_ulogic; -- buffered cmp_i(cmp_less_c), used by min/max

  -- serial shifter --
  type shifter_t is record
    start   : std_ulogic; -- one-cycle start pulse from the controller
    run     : std_ulogic; -- combinatorial "keep shifting" flag ('0' = done)
    bcnt    : std_ulogic_vector(index_size_f(data_width_c) downto 0); -- bit counter
    cnt     : std_ulogic_vector(index_size_f(data_width_c) downto 0); -- iteration counter
    cnt_max : std_ulogic_vector(index_size_f(data_width_c) downto 0); -- number of iterations for cpop/rotate (serial shifter only)
    sreg    : std_ulogic_vector(data_width_c-1 downto 0); -- shift register / rotate result
  end record;
  signal shifter : shifter_t;

  -- barrel shifter --
  -- one entry per shifter stage: the highest index is the (optionally bit-reversed) input, index 0 is the final output
  type bs_level_t is array (index_size_f(data_width_c) downto 0) of std_ulogic_vector(data_width_c-1 downto 0);
  signal bs_level : bs_level_t;

  -- operation results --
  -- res_int: raw result per operation; res_out: same result gated to zero unless the operation is selected in cmd_buf
  type res_t is array (0 to op_width_c-1) of std_ulogic_vector(data_width_c-1 downto 0);
  signal res_int, res_out : res_t;

  -- shifted-add unit --
  signal adder_core : std_ulogic_vector(data_width_c-1 downto 0);

  -- one-hot shifter --
  signal one_hot_core : std_ulogic_vector(data_width_c-1 downto 0); -- single '1' at the bit position given by sha_reg

  -- carry-less multiplier --
  type clmultiplier_t is record
    start : std_ulogic; -- one-cycle start pulse from the controller
    busy  : std_ulogic; -- '1' while the down-counter cnt is non-zero
    rs2   : std_ulogic_vector(data_width_c-1 downto 0); -- second operand (bit-reversed for clmulr)
    cnt   : std_ulogic_vector(index_size_f(data_width_c) downto 0); -- remaining iterations
    prod  : std_ulogic_vector(2*data_width_c-1 downto 0); -- shift/xor accumulator (lower half initially holds the first operand)
  end record;
  signal clmul : clmultiplier_t;
 
begin
 
  -- Sub-Extension Configuration ------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Elaboration/simulation-time report only: "assert false" always fires and, with severity
  -- note, just prints which sub-extensions are enabled by the *_en_c constants.
  assert false report
  "NEORV32 CPU: Implementing bit-manipulation (B) sub-extensions " &
  cond_sel_string_f(zba_en_c, "Zba ", "") &
  cond_sel_string_f(zbb_en_c, "Zbb ", "") &
  cond_sel_string_f(zbc_en_c, "Zbc ", "") &
  cond_sel_string_f(zbs_en_c, "Zbs ", "") &
  ""
  severity note;
 
 
  -- Instruction Decoding (One-Hot) ---------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- A minimal decoding logic is used here just to distinguish between the different B instructions.
  -- A more precise decoding and the valid-instruction check are done by the CPU control unit.
  -- Nevertheless, the decoding has to be one-hot for every instruction that reaches this unit
  -- because cmd is used to steer the shifter and the results of all set cmd bits are OR-ed together.

  -- Zbb - Basic bit-manipulation instructions --
  cmd(op_andn_c)   <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "10") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_1_c downto ctrl_ir_funct3_0_c) = "11") else '0';
  cmd(op_orn_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "10") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_1_c downto ctrl_ir_funct3_0_c) = "10") else '0';
  cmd(op_xnor_c)   <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "10") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_1_c downto ctrl_ir_funct3_0_c) = "00") else '0';
  --
  cmd(op_max_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c) = "11") else '0';
  cmd(op_min_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c) = "10") else '0';
  cmd(op_zexth_c)  <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '0') else '0';
  --
  cmd(op_orcb_c)   <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "01") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "101") else '0';
  --
  -- The unary operations (clz, ctz, cpop, sext.b, sext.h) are selected by funct12(2:0). For the
  -- rotate instructions these bits are part of the shift amount (rori) or of the rs2 address
  -- (ror, rol), so they can take any value there. The unary operations therefore also have to
  -- check funct3(2) = '0' (excludes ror/rori, which use funct3 = "101") and opcode(5) = '0'
  -- (excludes rol, which is decoded with opcode(5) = '1'). Without these checks, e.g. a "rori"
  -- by 8 would also be decoded as "clz" and corrupt the rotate result.
  cmd(op_clz_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct12_2_c downto ctrl_ir_funct12_0_c) = "000") and (ctrl_i(ctrl_ir_funct3_2_c) = '0') and (ctrl_i(ctrl_ir_opcode7_5_c) = '0') else '0';
  cmd(op_ctz_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct12_2_c downto ctrl_ir_funct12_0_c) = "001") and (ctrl_i(ctrl_ir_funct3_2_c) = '0') and (ctrl_i(ctrl_ir_opcode7_5_c) = '0') else '0';
  cmd(op_cpop_c)   <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct12_2_c downto ctrl_ir_funct12_0_c) = "010") and (ctrl_i(ctrl_ir_funct3_2_c) = '0') and (ctrl_i(ctrl_ir_opcode7_5_c) = '0') else '0';
  cmd(op_sextb_c)  <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct12_2_c downto ctrl_ir_funct12_0_c) = "100") and (ctrl_i(ctrl_ir_funct3_2_c) = '0') and (ctrl_i(ctrl_ir_opcode7_5_c) = '0') else '0';
  cmd(op_sexth_c)  <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct12_2_c downto ctrl_ir_funct12_0_c) = "101") and (ctrl_i(ctrl_ir_funct3_2_c) = '0') and (ctrl_i(ctrl_ir_opcode7_5_c) = '0') else '0';
  cmd(op_rol_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "001") and (ctrl_i(ctrl_ir_opcode7_5_c) = '1') else '0';
  cmd(op_ror_c)    <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "101") and (ctrl_i(ctrl_ir_funct3_2_c) = '1') else '0';
  cmd(op_rev8_c)   <= '1' when (zbb_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "101") else '0';

  -- Zba - Address generation instructions --
  cmd(op_sh1add_c) <= '1' when (zba_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "01") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c) = "01") else '0';
  cmd(op_sh2add_c) <= '1' when (zba_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "01") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c) = "10") else '0';
  cmd(op_sh3add_c) <= '1' when (zba_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "01") and (ctrl_i(ctrl_ir_funct12_7_c) = '0') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c) = "11") else '0';

  -- Zbs - Single-bit instructions --
  cmd(op_bclr_c)   <= '1' when (zbs_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "10") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c) = '0') else '0';
  cmd(op_bext_c)   <= '1' when (zbs_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "10") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c) = '1') else '0';
  cmd(op_binv_c)   <= '1' when (zbs_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "11") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c) = '0') else '0';
  cmd(op_bset_c)   <= '1' when (zbs_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "01") and (ctrl_i(ctrl_ir_funct12_7_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c) = '0') else '0';

  -- Zbc - Carry-less multiplication instructions --
  cmd(op_clmul_c)  <= '1' when (zbc_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "001") else '0';
  cmd(op_clmulh_c) <= '1' when (zbc_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "011") else '0';
  cmd(op_clmulr_c) <= '1' when (zbc_en_c = true) and (ctrl_i(ctrl_ir_funct12_10_c downto ctrl_ir_funct12_9_c) = "00") and (ctrl_i(ctrl_ir_funct12_5_c) = '1') and (ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_0_c) = "010") else '0';
 
 
  -- Co-Processor Controller ----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Central FSM. In S_IDLE it waits for start_i, then captures the decoded command, both
  -- operands, the shift amount and the comparator "less" flag. Single-cycle operations set
  -- "valid" right away; shift/count and carry-less multiply operations pulse the start signal
  -- of their unit and wait in a BUSY state until that unit reports completion (or a trap occurs).
  coprocessor_ctrl: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      ctrl_state    <= S_IDLE;
      cmd_buf       <= (others => def_rst_val_c);
      rs1_reg       <= (others => def_rst_val_c);
      rs2_reg       <= (others => def_rst_val_c);
      sha_reg       <= (others => def_rst_val_c);
      less_ff       <= def_rst_val_c;
      clmul.start   <= '0';
      shifter.start <= '0';
      valid         <= '0';
    elsif rising_edge(clk_i) then
      -- defaults: all strobes are single-cycle pulses unless set again below --
      shifter.start <= '0';
      clmul.start   <= '0';
      valid         <= '0';

      -- fsm --
      case ctrl_state is

        when S_IDLE => -- wait for operation trigger
        -- ------------------------------------------------------------
          if (start_i = '1') then
            -- buffer everything the function units need (inputs are only sampled here) --
            less_ff <= cmp_i(cmp_less_c);
            cmd_buf <= cmd;
            rs1_reg <= rs1_i;
            rs2_reg <= rs2_i;
            sha_reg <= shamt_i;
            if ((cmd(op_clz_c) or cmd(op_ctz_c) or cmd(op_cpop_c) or cmd(op_ror_c) or cmd(op_rol_c)) = '1') then -- multi-cycle shift operation
              if (FAST_SHIFT_EN = false) then -- default: iterative computation
                shifter.start <= '1';
                ctrl_state <= S_START_SHIFT;
              else -- full-parallel computation
                ctrl_state <= S_BUSY_SHIFT; -- shifter.run is constant '0' in this configuration, so this is a single wait cycle
              end if;
            elsif (zbc_en_c = true) and ((cmd(op_clmul_c) or cmd(op_clmulh_c) or cmd(op_clmulr_c)) = '1') then -- multi-cycle clmul operation
              clmul.start <= '1';
              ctrl_state  <= S_START_CLMUL;
            else -- all remaining operations are computed combinatorially from the buffered operands
              valid      <= '1';
              ctrl_state <= S_IDLE;
            end if;
          end if;

        when S_START_SHIFT => -- one cycle delay to start shift operation
        -- ------------------------------------------------------------
          ctrl_state <= S_BUSY_SHIFT;

        when S_BUSY_SHIFT => -- wait for multi-cycle shift operation to finish
        -- ------------------------------------------------------------
          if (shifter.run = '0') or (ctrl_i(ctrl_trap_c) = '1') then -- abort on trap
            valid      <= '1';
            ctrl_state <= S_IDLE;
          end if;

        when S_START_CLMUL => -- one cycle delay to start clmul operation
        -- ------------------------------------------------------------
          ctrl_state <= S_BUSY_CLMUL;

        when S_BUSY_CLMUL => -- wait for multi-cycle clmul operation to finish
        -- ------------------------------------------------------------
          if (clmul.busy = '0') or (ctrl_i(ctrl_trap_c) = '1') then -- abort on trap
            valid      <= '1';
            ctrl_state <= S_IDLE;
          end if;

        when others => -- undefined
        -- ------------------------------------------------------------
          ctrl_state <= S_IDLE;

      end case;
    end if;
  end process coprocessor_ctrl;
 
 
  -- Shifter Function Core (iterative: small but slow) --------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Only generated when FAST_SHIFT_EN is false. One right-shift of sreg per clock cycle while
  -- shifter.run is set. The same register is used for rotation (result in sreg), zero counting
  -- (result in cnt) and population count (result in bcnt).
  serial_shifter:
  if (FAST_SHIFT_EN = false) generate
    shifter_unit: process(rstn_i, clk_i)
      variable new_bit_v : std_ulogic; -- bit that is shifted into the MSB of sreg
    begin
      if (rstn_i = '0') then
        shifter.cnt     <= (others => def_rst_val_c);
        shifter.sreg    <= (others => def_rst_val_c);
        shifter.cnt_max <= (others => def_rst_val_c);
        shifter.bcnt    <= (others => def_rst_val_c);
      elsif rising_edge(clk_i) then
        if (shifter.start = '1') then -- trigger new shift
          shifter.cnt <= (others => '0');
          -- shift operand --
          if (cmd_buf(op_clz_c) = '1') or (cmd_buf(op_rol_c) = '1') then -- count LEADING zeros / rotate LEFT
            shifter.sreg <= bit_rev_f(rs1_reg); -- reverse - we can only do right shifts here
          else -- ctz, cpop, ror
            shifter.sreg <= rs1_reg;
          end if;
          -- max shift amount --
          if (cmd_buf(op_cpop_c) = '1') then -- population count
            -- only the MSB of cnt_max is set: one iteration more than the largest value sha_reg can encode
            shifter.cnt_max <= (others => '0');
            shifter.cnt_max(shifter.cnt_max'left) <= '1';
          else
            shifter.cnt_max <= '0' & sha_reg; -- rotate by the buffered shift amount
          end if;
          shifter.bcnt <= (others => '0');
        elsif (shifter.run = '1') then -- right shifts only
          -- rotate: LSB wraps around into the MSB; clz/ctz: shift in '1'; cpop: shift in '0'
          new_bit_v := ((cmd_buf(op_ror_c) or cmd_buf(op_rol_c)) and shifter.sreg(0)) or (cmd_buf(op_clz_c) or cmd_buf(op_ctz_c));
          shifter.sreg <= new_bit_v & shifter.sreg(shifter.sreg'left downto 1); -- ro[r/l]/lsr(for counting)
          shifter.cnt  <= std_ulogic_vector(unsigned(shifter.cnt) + 1); -- iteration counter
          if (shifter.sreg(0) = '1') then -- a set bit is leaving the register
            shifter.bcnt <= std_ulogic_vector(unsigned(shifter.bcnt) + 1); -- bit counter
          end if;
        end if;
      end if;
    end process shifter_unit;
  end generate;
 
  -- run control --
  -- shifter.run is the combinatorial "keep shifting" flag of the serial shifter:
  --  * clz/ctz:     continue as long as the LSB of the shift register is zero
  --  * cpop/rotate: continue until the iteration counter has reached cnt_max
  serial_shifter_ctrl:
  if (FAST_SHIFT_EN = false) generate
    shifter.run <= (not shifter.sreg(0)) when ((cmd_buf(op_clz_c) = '1') or (cmd_buf(op_ctz_c) = '1')) else
                   '0'                   when (shifter.cnt = shifter.cnt_max) else
                   '1';
  end generate;
 
 
  -- Shifter Function Core (parallel: fast but large) ---------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Only generated when FAST_SHIFT_EN is true. All three results (population count, zero count
  -- and rotation) are computed in parallel from the buffered operand and registered on every
  -- clock edge; the output selector picks the one that belongs to the buffered command.
  -- shifter.cnt_max is not driven in this configuration.
  barrel_shifter_async_sync:
  if (FAST_SHIFT_EN = true) generate
    shifter_unit_fast: process(rstn_i, clk_i)
      variable new_bit_v : std_ulogic; -- NOTE(review): declared but never used in this process
    begin
      if (rstn_i = '0') then
        shifter.cnt     <= (others => def_rst_val_c);
        shifter.sreg    <= (others => def_rst_val_c);
        shifter.bcnt    <= (others => def_rst_val_c);
      elsif rising_edge(clk_i) then
        -- population count --
        shifter.bcnt <= std_ulogic_vector(to_unsigned(popcount_f(rs1_reg), shifter.bcnt'length));
        -- count leading/trailing zeros --
        if cmd_buf(op_clz_c) = '1' then -- leading
          shifter.cnt <= std_ulogic_vector(to_unsigned(leading_zeros_f(rs1_reg), shifter.cnt'length));
        else -- trailing: the leading zeros of the bit-reversed operand
          shifter.cnt <= std_ulogic_vector(to_unsigned(leading_zeros_f(bit_rev_f(rs1_reg)), shifter.cnt'length));
        end if;
        -- barrel shifter --
        shifter.sreg <= bs_level(0); -- rol/ror[i]
      end if;
    end process shifter_unit_fast;
    shifter.run <= '0'; -- we are done already!
  end generate;
 
  -- barrel shifter array --
  -- Combinatorial rotate-right network with one stage per bit of sha_reg. Stage i rotates
  -- the output of stage i+1 right by 2**i positions if sha_reg(i) is set and passes it
  -- through unchanged otherwise. The highest level holds the input, level 0 the result.
  barrel_shifter_async:
  if (FAST_SHIFT_EN = true) generate
    shifter_unit_async: process(rs1_reg, sha_reg, cmd_buf, bs_level)
    begin
      -- input level: convert left shifts to right shifts --
      if (cmd_buf(op_rol_c) = '1') then -- is left shift?
        bs_level(index_size_f(data_width_c)) <= bit_rev_f(rs1_reg); -- reverse bit order of input operand
      else
        bs_level(index_size_f(data_width_c)) <= rs1_reg;
      end if;

      -- shifter array --
      for i in index_size_f(data_width_c)-1 downto 0 loop
        if (sha_reg(i) = '1') then
          -- rotate right by 2**i: the lowest 2**i bits wrap around to the top ...
          bs_level(i)(data_width_c-1 downto data_width_c-(2**i)) <= bs_level(i+1)((2**i)-1 downto 0);
          -- ... and the remaining upper bits move down by 2**i positions
          bs_level(i)((data_width_c-(2**i))-1 downto 0) <= bs_level(i+1)(data_width_c-1 downto 2**i);
        else
          bs_level(i) <= bs_level(i+1);
        end if;
      end loop;
    end process shifter_unit_async;
  end generate;
 
 
  -- Shifted-Add Core -----------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- adder_core = rs2 + (rs1 << N). The shift distance N (1, 2 or 3) is selected by bits 2:1 of
  -- funct3, taken directly from the control bus; any other selection yields a don't-care operand.
  shift_adder: process(rs1_reg, rs2_reg, ctrl_i)
    variable sel_v     : std_ulogic_vector(1 downto 0);
    variable rs1_shl_v : unsigned(data_width_c-1 downto 0);
  begin
    sel_v := ctrl_i(ctrl_ir_funct3_2_c downto ctrl_ir_funct3_1_c);
    if (sel_v = "01") then -- << 1
      rs1_shl_v := shift_left(unsigned(rs1_reg), 1);
    elsif (sel_v = "10") then -- << 2
      rs1_shl_v := shift_left(unsigned(rs1_reg), 2);
    elsif (sel_v = "11") then -- << 3
      rs1_shl_v := shift_left(unsigned(rs1_reg), 3);
    else -- undefined
      rs1_shl_v := (others => '-');
    end if;
    adder_core <= std_ulogic_vector(unsigned(rs2_reg) + rs1_shl_v);
  end process shift_adder;
 
 
  -- One-Hot Generator Core -----------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Builds the single-bit mask for the Zbs operations: exactly the bit addressed by sha_reg
  -- is set. The mask is all-zero if the Zbs sub-extension is disabled.
  shift_one_hot: process(sha_reg)
  begin
    for i in one_hot_core'range loop
      if (zbs_en_c = true) and (to_integer(unsigned(sha_reg)) = i) then
        one_hot_core(i) <= '1';
      else
        one_hot_core(i) <= '0';
      end if;
    end loop;
  end process shift_one_hot;
 
 
  -- Carry-Less Multiplication Core ---------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Iterative shift-and-xor multiplier. clmul.prod is loaded with the first operand in its
  -- lower half and zero in its upper half; in every busy cycle the whole register is shifted
  -- right by one bit and, if the bit leaving at position 0 is set, the second operand is
  -- xor-ed into the upper half. clmul.cnt counts the remaining iterations.
  -- NOTE(review): the literal indices 63/32/31 below assume data_width_c = 32, whereas the
  -- declaration of clmul.prod is parameterized with data_width_c - confirm.
  clmul_core: process(rstn_i, clk_i)
  begin
    if (rstn_i = '0') then
      clmul.cnt  <= (others => def_rst_val_c);
      clmul.prod <= (others => def_rst_val_c);
    elsif rising_edge(clk_i) then
      if (clmul.start = '1') then -- start new multiplication
        -- iteration counter: only the MSB set --
        clmul.cnt                 <= (others => '0');
        clmul.cnt(clmul.cnt'left) <= '1';
        clmul.prod(63 downto 32)  <= (others => '0');
        if (cmd_buf(op_clmulr_c) = '1') then -- reverse input operands?
          clmul.prod(31 downto 00) <= bit_rev_f(rs1_reg);
        else
          clmul.prod(31 downto 00) <= rs1_reg;
        end if;
      elsif (clmul.busy = '1') then -- processing
        clmul.cnt <= std_ulogic_vector(unsigned(clmul.cnt) - 1);
        -- upper half: conditional xor with the second operand, stored one position to the right --
        if (clmul.prod(0) = '1') then
          clmul.prod(62 downto 31) <= clmul.prod(63 downto 32) xor clmul.rs2;
        else
          clmul.prod(62 downto 31) <= clmul.prod(63 downto 32);
        end if;
        -- lower half: plain right shift (bit 63 is not written here and keeps its cleared start value) --
        clmul.prod(30 downto 00) <= clmul.prod(31 downto 1);
      end if;
    end if;
  end process clmul_core;

  -- reverse input operands? --
  clmul.rs2 <= bit_rev_f(rs2_reg) when (cmd_buf(op_clmulr_c) = '1') else rs2_reg;

  -- multiplier busy? (as long as the iteration counter is not zero) --
  clmul.busy <= '1' when (or_reduce_f(clmul.cnt) = '1') else '0';
 
 
  -- Operation Results ----------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Raw (ungated) result of every operation, computed from the buffered operands. Operations
  -- that share a function unit use only one entry of res_int; the entries of their siblings
  -- are tied to zero ("unused/redundant").

  -- logic with negate --
  res_int(op_andn_c) <= rs1_reg and (not rs2_reg);
  res_int(op_orn_c)  <= rs1_reg or  (not rs2_reg);
  res_int(op_xnor_c) <= rs1_reg xor (not rs2_reg);

  -- count leading/trailing zeros (zero-extended counter value; entry is shared by clz and ctz) --
  res_int(op_clz_c)(data_width_c-1 downto shifter.cnt'left+1) <= (others => '0');
  res_int(op_clz_c)(shifter.cnt'left downto 0) <= shifter.cnt;
  res_int(op_ctz_c) <= (others => '0'); -- unused/redundant

  -- count set bits (zero-extended bit counter value) --
  res_int(op_cpop_c)(data_width_c-1 downto shifter.bcnt'left+1) <= (others => '0');
  res_int(op_cpop_c)(shifter.bcnt'left downto 0) <= shifter.bcnt;

  -- min/max select (entry is shared by min and max) --
  -- min: rs1 is selected if the buffered "less" flag is set; max: the selection is inverted
  res_int(op_min_c) <= rs1_reg when ((less_ff xor cmd_buf(op_max_c)) = '1') else rs2_reg;
  res_int(op_max_c) <= (others => '0'); -- unused/redundant

  -- sign-extension --
  res_int(op_sextb_c)(data_width_c-1 downto 8) <= (others => rs1_reg(7));
  res_int(op_sextb_c)(7 downto 0) <= rs1_reg(7 downto 0); -- sign-extend byte
  res_int(op_sexth_c)(data_width_c-1 downto 16) <= (others => rs1_reg(15));
  res_int(op_sexth_c)(15 downto 0) <= rs1_reg(15 downto 0); -- sign-extend half-word
  res_int(op_zexth_c)(data_width_c-1 downto 16) <= (others => '0');
  res_int(op_zexth_c)(15 downto 0) <= rs1_reg(15 downto 0); -- zero-extend half-word

  -- rotate right/left --
  res_int(op_ror_c) <= shifter.sreg;
  res_int(op_rol_c) <= bit_rev_f(shifter.sreg); -- reverse to compensate internal right-only shifts

  -- or-combine.byte (every byte becomes all-ones if any of its bits is set, all-zero otherwise) --
  or_combine_gen:
  for i in 0 to (data_width_c/8)-1 generate -- sub-byte loop
    res_int(op_orcb_c)(i*8+7 downto i*8) <= (others => or_reduce_f(rs1_reg(i*8+7 downto i*8)));
  end generate; -- i

  -- reversal.8 (byte swap) --
  res_int(op_rev8_c) <= bswap32_f(rs1_reg);

  -- address generation instructions (entry is shared by sh1add, sh2add and sh3add) --
  res_int(op_sh1add_c) <= adder_core;
  res_int(op_sh2add_c) <= (others => '0'); -- unused/redundant
  res_int(op_sh3add_c) <= (others => '0'); -- unused/redundant

  -- single-bit instructions (one_hot_core has a single '1' at the bit index given by sha_reg) --
  res_int(op_bclr_c) <= rs1_reg and (not one_hot_core);
  res_int(op_bext_c)(data_width_c-1 downto 1) <= (others => '0');
  res_int(op_bext_c)(0) <= '1' when (or_reduce_f(rs1_reg and one_hot_core) = '1') else '0';
  res_int(op_binv_c) <= rs1_reg xor one_hot_core;
  res_int(op_bset_c) <= rs1_reg or one_hot_core;

  -- carry-less multiplication instructions --
  res_int(op_clmul_c)  <= clmul.prod(31 downto 00);
  res_int(op_clmulh_c) <= clmul.prod(63 downto 32);
  res_int(op_clmulr_c) <= bit_rev_f(clmul.prod(31 downto 00)); -- operands were bit-reversed on input, so reverse the result as well
 
 
  -- Output Selector ------------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Gates every raw result with the buffered one-hot command: res_out(i) carries res_int(i)
  -- if the according operation is selected and is all-zero otherwise.
  output_select_gen:
  for i in 0 to op_width_c-1 generate

    -- result entries that are shared by several operations --
    sel_clz_ctz:
    if (i = op_clz_c) generate
      res_out(i) <= res_int(i) when ((cmd_buf(op_clz_c) or cmd_buf(op_ctz_c)) = '1') else (others => '0');
    end generate;

    sel_min_max:
    if (i = op_min_c) generate
      res_out(i) <= res_int(i) when ((cmd_buf(op_min_c) or cmd_buf(op_max_c)) = '1') else (others => '0');
    end generate;

    sel_shxadd:
    if (i = op_sh1add_c) generate
      res_out(i) <= res_int(i) when ((cmd_buf(op_sh1add_c) or cmd_buf(op_sh2add_c) or cmd_buf(op_sh3add_c)) = '1') else (others => '0');
    end generate;

    -- entries of operations whose result is delivered via a shared entry: unused/redundant --
    sel_unused:
    if (i = op_ctz_c) or (i = op_max_c) or (i = op_sh2add_c) or (i = op_sh3add_c) generate
      res_out(i) <= (others => '0');
    end generate;

    -- all remaining operations: one command bit selects exactly one result --
    sel_direct:
    if (i /= op_clz_c)    and (i /= op_ctz_c)    and
       (i /= op_min_c)    and (i /= op_max_c)    and
       (i /= op_sh1add_c) and (i /= op_sh2add_c) and (i /= op_sh3add_c) generate
      res_out(i) <= res_int(i) when (cmd_buf(i) = '1') else (others => '0');
    end generate;

  end generate; -- i
 
 
  -- Output Gate ----------------------------------------------------------------------------
  -- -------------------------------------------------------------------------------------------
  -- Registered result output: the gated results of all operations are OR-ed together and
  -- stored in res_o when "valid" is set; on every other clock edge res_o is cleared.
  -- Entries of res_out that belong to unused/redundant slots are all-zero and therefore do
  -- not contribute to the merged value.
  output_gate: process(rstn_i, clk_i)
    variable merged_v : std_ulogic_vector(data_width_c-1 downto 0);
  begin
    if (rstn_i = '0') then
      res_o <= (others => def_rst_val_c);
    elsif rising_edge(clk_i) then
      merged_v := (others => '0');
      if (valid = '1') then
        for i in res_out'range loop
          merged_v := merged_v or res_out(i);
        end loop;
      end if;
      res_o <= merged_v;
    end if;
  end process output_gate;

  -- valid output --
  valid_o <= valid;
 
 
end neorv32_cpu_cp_bitmanip_rtl;
 

Compare with Previous | Blame | View Log

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.