URL
https://opencores.org/ocsvn/rv01_riscv_core/rv01_riscv_core/trunk
Subversion Repositories rv01_riscv_core
[/] [rv01_riscv_core/] [trunk/] [VHDL/] [RV01_sbuf_2w.vhd] - Rev 2
Compare with Previous | Blame | View Log
-----------------------------------------------------------------
--                                                             --
-----------------------------------------------------------------
--                                                             --
-- Copyright (C) 2017 Stefano Tonello                          --
--                                                             --
-- This source file may be used and distributed without        --
-- restriction provided that this copyright statement is not   --
-- removed from the file and that any derivative work contains --
-- the original copyright notice and the associated disclaimer.--
--                                                             --
-- THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY         --
-- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED   --
-- TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS   --
-- FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL THE AUTHOR      --
-- OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,         --
-- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES    --
-- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE   --
-- GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR        --
-- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF  --
-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT   --
-- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT  --
-- OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE         --
-- POSSIBILITY OF SUCH DAMAGE.                                 --
--                                                             --
-----------------------------------------------------------------

---------------------------------------------------------------
-- RV01 Store buffer
---------------------------------------------------------------

library IEEE;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;

library work;
use work.RV01_CONSTS_PKG.all;
use work.RV01_TYPES_PKG.all;
use work.RV01_FUNCS_PKG.all;
use work.RV01_OP_PKG.all;

-- Dual-write store buffer: a DEPTH-entry queue of pending stores.
-- Up to two entries can be pushed per cycle (WE_i) and one entry
-- (always entry zero) is popped per cycle and presented on the
-- WE_o/LS_OP_o/BE_o/Q_o/SADR_o outputs.
--
-- Generics:
--   NW              : number of write/read request slots.
--                     NOTE(review): the architecture compares
--                     PUSH/RE_i against 2-bit literals and builds
--                     a 3-bit vector from POP & RE_i, so only
--                     NW = 2 is actually supported.
--   DEPTH           : number of buffer entries.
--                     NOTE(review): BF is taken from
--                     SBV_q(DEPTH-4), so DEPTH must be >= 4.
--   SIMULATION_ONLY : when '1' the run-time checkers at the end
--                     of the architecture are generated.
entity RV01_SBUF_2W is
  generic(
    NW : natural := 2;
    DEPTH : natural := 4;
    SIMULATION_ONLY : std_logic := '0'
  );
  port(
    CLK_i : in std_logic;
    RST_i : in std_logic;
    CLRB_i : in std_logic; -- clear buffer flag
    KTS_i : in std_logic; -- kill top store
    RE_i : in std_logic_vector(NW-1 downto 0); -- SB read enable
    WE_i : in std_logic_vector(NW-1 downto 0); -- SB write enable
    BE0_i : in std_logic_vector(4-1 downto 0); -- inst #0 byte enable
    BE1_i : in std_logic_vector(4-1 downto 0); -- inst #1 byte enable
    D0_i : in std_logic_vector(SDLEN-1 downto 0); -- inst #0 store data
    D1_i : in std_logic_vector(SDLEN-1 downto 0); -- inst #1 store data
    -- per-slot valid flags; gate the load/store flags LD0/LD1/ST0.
    -- (explicit "in" added; it is the default port mode anyway)
    IX1_V_i : in std_logic_vector(2-1 downto 0);
    LS_OP0_i : in LS_OP_T;
    LS_OP1_i : in LS_OP_T;
    DADR0_i : in ADR_T;
    DADR1_i : in ADR_T;
    -- just for debugging purpose (only read by the checkers)
    SADR0_i : in ADR_T;
    SADR1_i : in ADR_T;

    BF_o : out std_logic; -- buffer full flag
    NOPR_o : out std_logic; -- no pending reads flag
    S2LAC_o : out std_logic_vector(2-1 downto 0); -- store-2-load conflict
    WE_o : out std_logic;
    LS_OP_o : out LS_OP_T;
    BE_o : out std_logic_vector(4-1 downto 0);
    Q_o : out std_logic_vector(SDLEN-1 downto 0);
    SADR_o : out ADR_T
  );
end RV01_SBUF_2W;

architecture ARC of RV01_SBUF_2W is

  constant ZERO : std_logic_vector(DEPTH-1 downto 0) := (others => '0');

  -- number of address bits (starting from bit 2) compared by
  -- the store-to-load conflict check
  constant MTCH_WIDTH : natural := 8;

  -- store buffer entry type
  type SB_ENTRY_T is record
    LS_OP : LS_OP_T;
    BE : std_logic_vector(4-1 downto 0);
    DATA : std_logic_vector(SDLEN-1 downto 0);
    ADR : ADR_T;
  end record;

  -- store buffer type
  type SB_T is array (natural range<>) of SB_ENTRY_T;

  -- "_q" signals are registers, their un-suffixed twins are the
  -- combinational next-state values.
  signal SB,SB_q : SB_T(DEPTH-1 downto 0);
  signal SBV_q,SBV : std_logic_vector(DEPTH-1 downto 0);
  signal SB_NEW0,SB_NEW1 : SB_ENTRY_T;
  signal TP,TP_q : integer range -1 to DEPTH+2;
  signal PUSH : std_logic_vector(NW-1 downto 0);
  signal POP,BF : std_logic;
  signal MTCH0 : std_logic_vector(DEPTH-1 downto 0);
  signal MTCH1 : std_logic_vector(DEPTH downto 0);
  signal LD0,LD1,ST0,LS0 : std_logic;
  signal PR_CNT_q : natural range 0 to DEPTH-1;
  signal PR_CNT : integer range -1 to DEPTH+1;
  signal S2LAC : std_logic_vector(2-1 downto 0);
  signal PRV_q,PRV : std_logic_vector(DEPTH-1 downto 0);

  -- OR-reduction of all the bits of vector V
  -- (returns '0' for a null vector).
  function wired_or(V : std_logic_vector) return std_logic is
    variable WO : std_logic;
  begin
    WO := '0';
    for i in V'LOW to V'HIGH loop
      WO := WO or V(i);
    end loop;
    return(WO);
  end function;

begin

  ----------------------------------------------------
  -- Notes
  ----------------------------------------------------

  -- Store buffer is organised like a queue which is
  -- written when a store reaches IX1 stage and read
  -- when a store reaches IX3 stage (but only if memory
  -- write port is not occupied by an active load, in
  -- order to minimize pipe stalls).
  -- Buffered stores are always read from entry zero
  -- and written on entry pointed by TP_q (the tail
  -- pointer).

  -- In order to detect store-to-load conflicts, both
  -- load addresses must be compared against buffered
  -- store addresses. In addition, IX1 instruction #1
  -- load address must be compared to IX1 instruction
  -- #0 store address.

  -- When a conflict is detected, the involved store
  -- must be allowed to proceed in order to remove
  -- the conflict. If the store is buffered, a buffer
  -- read is forced, while, if it's still in IX1,
  -- it's allowed to move to next pipe stages.

  -- CLRB_i and NOPR_o signals have been added to
  -- support exception processing: CLRB_i allows to
  -- empty the buffer when an exception is raised and
  -- NOPR_o tells exception logic that it is safe to
  -- raise exceptions because stores eventually remaining
  -- in the buffer are newer than IX3 instructions ready
  -- to raise exceptions.

  -- This "2w" version supports dual stores.

  -- 10/11/2015
  -- CLRB_i is now coincident with CLRP signal, as
  -- NOPR_o has been permanently set to '1'. In this
  -- way re-fetch, exception servicing and return from
  -- exception can start even if there are still pending
  -- reads in store buffer. Such result is obtained by
  -- invalidating, on CLRB_i assertion, store buffer
  -- entries for which no read request is pending (the
  -- remaining ones are related to instructions older
  -- than the one(s) in IX3 and therefore can be
  -- completed safely).
  -- NOTE(review): in this revision NOPR_o is NOT tied
  -- to '1' (that assignment is commented out below),
  -- it is derived from PR_CNT_q and RE_i; confirm
  -- which behavior is intended.

  -- 11/02/2017
  -- KTS_i input is added to handle B/J mis-predictions
  -- (and some special case of jalr instruction)
  -- triggering a B/J in IX2.
  -- KTS_i is set if, in the previous cycle, IX1
  -- instruction #0 triggered a B/J when instruction
  -- #1 was a store (under such condition an entry
  -- corresponding to a nullified store has been written
  -- to the buffer).
  -- If KTS_i is set, current top entry must be
  -- invalidated.
  -- Note: when KTS_i is set, WE_i is always equal to
  -- "00".

  ----------------------------------------------------
  -- Pending read counter
  ----------------------------------------------------

  -- If a buffer read is requested (RE_i /= "00") but the
  -- buffer can't be read because memory write port is
  -- in use by a valid load/store instruction, the read
  -- request is recorded by incrementing the pending
  -- read counter PR_CNT_q.
  -- A pending read is actually performed when the memory
  -- write port is available (no valid load/store is using
  -- it), when this event occurs, the pending read counter
  -- is decremented.
  -- If a read is requested in the same cycle where a
  -- pending read is performed, the pending read counter
  -- remains unchanged.

  -- pending read counter/vector registers (synchronous reset)
  process(CLK_i)
  begin
    if(CLK_i = '1' and CLK_i'event) then
      --if(RST_i = '1' or CLRB_i = '1') then
      if(RST_i = '1') then
        PR_CNT_q <= 0;
        PRV_q <= (others => '0');
      else
        PR_CNT_q <= PR_CNT;
        PRV_q <= PRV;
      end if;
    end if;
  end process;

  -- pending read counter next value: +1 for each read request
  -- bit set in RE_i, -1 if a pop is performed.
  process(PR_CNT_q,RE_i,POP)
    variable TMP : std_logic_vector(3-1 downto 0);
  begin
    TMP := POP & RE_i;
    case TMP is
      when "001"|"010"|"111" =>
        PR_CNT <= PR_CNT_q + 1;
      when "011" =>
        PR_CNT <= PR_CNT_q + 2;
      when "100" =>
        PR_CNT <= PR_CNT_q - 1;
      when others =>
        PR_CNT <= PR_CNT_q;
    end case;
  end process;

  -- PRV_q is an "alternative" view of PR_CNT_q: if
  -- PR_CNT_q = n, PRV_q(n-1:0) = all-1. PRV_q is
  -- used to set SBV_q when CLRB_i gets asserted.

  process(PRV_q,RE_i,POP)
    variable TMP : std_logic_vector(3-1 downto 0);
  begin
    TMP := POP & RE_i;
    case TMP is
      when "001"|"010"|"111" =>
        -- net +1: shift a '1' in from the bottom
        PRV <= PRV_q(DEPTH-2 downto 0) & '1';
      when "011" =>
        -- +2: shift two '1's in from the bottom
        PRV <= PRV_q(DEPTH-3 downto 0) & "11";
      when "100" =>
        -- -1: shift a '0' in from the top
        PRV <= '0' & PRV_q(DEPTH-1 downto 1);
      when others =>
        PRV <= PRV_q;
    end case;
  end process;

  NOPR_o <= '1' when (PR_CNT_q = 0 and RE_i = "00") else '0';
  --NOPR_o <= '1';

  ----------------------------------------------------
  -- Buffer data registers
  ----------------------------------------------------

  -- When CLRB_i gets asserted, SBV_q is set to PRV
  -- (and TP_q to PR_CNT), thereby invalidating all
  -- entries for which there's no pending read; remaining
  -- entries are older than instruction(s) in IX3 and can
  -- be completed safely.
  -- Such "trick" allows instruction flow change to run
  -- in parallel with buffer entries completion.

  process(CLK_i)
  begin
    if(CLK_i = '1' and CLK_i'event) then
      --if(RST_i = '1' or CLRB_i = '1') then
      if(RST_i = '1') then
        SBV_q <= (others => '0');
        TP_q <= 0;
      elsif(CLRB_i = '1') then
        TP_q <= PR_CNT;
        SBV_q <= PRV;
      else
        SBV_q <= SBV;
        TP_q <= TP;
      end if;
      -- entry payload is not reset: validity is tracked by SBV_q
      SB_q <= SB;
    end if;
  end process;

  ----------------------------------------------------
  -- Buffer data updating logic
  ----------------------------------------------------

  -- store buffer new entries (positional aggregates, field
  -- order is LS_OP, BE, DATA, ADR)
  SB_NEW0 <= (
    LS_OP0_i,
    BE0_i,
    D0_i,
    DADR0_i
  );

  SB_NEW1 <= (
    LS_OP1_i,
    BE1_i,
    D1_i,
    DADR1_i
  );

  -- Buffer is written when a valid store instruction
  -- reaches stage IX1.

  PUSH <= WE_i;

  -- Buffer is read when:
  -- 1) IX1 instruction #0 is not a valid L/S and there's an
  -- active read request (RE_i /= "00"), OR
  -- 2) IX1 instruction #0 is not a valid L/S and there's a
  -- pending read request (PR_CNT_q > 0), OR
  -- 3) a forced pop is needed.
  -- NOTE(review): no separate forced-pop term appears in the
  -- expression below; pops are also suppressed while CLRB_i
  -- is asserted.

  POP <= not(CLRB_i) when (
    LS0 = '0' and (RE_i /= "00" or PR_CNT_q > 0)
  ) else '0';

  -- store buffer data updating logic

  process(SB_q,SBV_q,TP_q,PUSH,POP,KTS_i,SB_NEW0,SB_NEW1)
  begin
    for k in 0 to DEPTH-1 loop

      if(PUSH = "11" and POP = '1') then

        -- used entries are shifted down one position
        -- (deleting bottom one and emptying top),
        -- emptied top entry and entry above it are
        -- loaded with new data.

        if(k = TP_q) then
          SBV(k) <= '1';
          SB(k) <= SB_NEW1;
        elsif(k = TP_q-1) then
          SBV(k) <= '1';
          SB(k) <= SB_NEW0;
        elsif(k < DEPTH-1) then
          SBV(k) <= SBV_q(k+1);
          SB(k) <= SB_q(k+1);
        else
          SBV(k) <= '0';
          SB(k) <= SB_q(k);
        end if;

        TP <= TP_q + 1;

      elsif((PUSH = "01" or PUSH = "10") and POP = '1') then

        -- used entries are shifted down one position
        -- (deleting bottom one and emptying top),
        -- emptied top entry is loaded with new data.

        if(k = TP_q-1) then
          SBV(k) <= '1';
          if(PUSH = "01") then
            SB(k) <= SB_NEW0;
          else
            SB(k) <= SB_NEW1;
          end if;
        elsif(k < DEPTH-1) then
          SBV(k) <= SBV_q(k+1);
          SB(k) <= SB_q(k+1);
        else
          SBV(k) <= '0';
          SB(k) <= SB_q(k);
        end if;

        TP <= TP_q;

      elsif(PUSH = "11") then

        -- top empty entry and entry above it are
        -- loaded with new data, other entries remain
        -- unchanged.

        if(k = TP_q+1) then
          SBV(k) <= '1';
          SB(k) <= SB_NEW1;
        elsif(k = TP_q) then
          SBV(k) <= '1';
          SB(k) <= SB_NEW0;
        else
          SBV(k) <= SBV_q(k);
          SB(k) <= SB_q(k);
        end if;

        TP <= TP_q + 2;

      elsif(PUSH = "01" or PUSH = "10") then

        -- top empty entry is loaded with new data,
        -- other entries remain unchanged.

        if(k = TP_q) then
          SBV(k) <= '1';
          if(PUSH = "01") then
            SB(k) <= SB_NEW0;
          else
            SB(k) <= SB_NEW1;
          end if;
        else
          SBV(k) <= SBV_q(k);
          SB(k) <= SB_q(k);
        end if;

        TP <= TP_q + 1;

      elsif(POP = '1') then

        -- used entries are shifted down one position
        -- (deleting bottom one and emptying top).
        -- If KTS_i is set the entry below the top one
        -- is invalidated too and the tail pointer moves
        -- back by two.

        if(k = TP_q-1) then
          SBV(k) <= '0';
          SB(k) <= SB_q(k); -- don't care!
        elsif(k = TP_q-2 and KTS_i = '1') then
          SBV(k) <= '0';
          SB(k) <= SB_q(k); -- don't care
        elsif(k < DEPTH-1) then
          SBV(k) <= SBV_q(k+1);
          SB(k) <= SB_q(k+1);
        else
          SBV(k) <= '0';
          SB(k) <= SB_q(k);
        end if;

        if(KTS_i = '1') then
          TP <= TP_q - 2;
        else
          TP <= TP_q - 1;
        end if;

      else

        -- no push and no pop: entries hold their content,
        -- except for the top one which is invalidated
        -- when KTS_i is set.

        if(k = TP_q-1 and KTS_i = '1') then
          SBV(k) <= '0';
        else
          SBV(k) <= SBV_q(k);
        end if;
        SB(k) <= SB_q(k);

        if(KTS_i = '1') then
          TP <= TP_q - 1;
        else
          TP <= TP_q;
        end if;

      end if;

    end loop;
  end process;

  ----------------------------------------------------
  -- Store-to-load conflict check
  ----------------------------------------------------

  -- MTCHm(n) flag is set if store buffer n-th entry is
  -- valid and slot #m load address matches entry
  -- address. MTCH1(DEPTH) is set if inst. #0 is a store
  -- and inst. #1 load addr. matches inst. #0 store one.
  -- Comparison is restricted to MTCH_WIDTH bits, at the
  -- cost of possible "fake" matches, in order to reduce
  -- delay.

  process(SB_q,SBV_q,DADR0_i,DADR1_i,ST0)
  begin
    for k in 0 to DEPTH-1 loop

      if(DADR0_i((MTCH_WIDTH+2)-1 downto 2) =
        SB_q(k).ADR((MTCH_WIDTH+2)-1 downto 2)
      ) then
        MTCH0(k) <= SBV_q(k);
      else
        MTCH0(k) <= '0';
      end if;

      if(DADR1_i((MTCH_WIDTH+2)-1 downto 2) =
        SB_q(k).ADR((MTCH_WIDTH+2)-1 downto 2)
      ) then
        MTCH1(k) <= SBV_q(k);
      else
        MTCH1(k) <= '0';
      end if;

    end loop;

    -- inst. #1 address vs. inst. #0 address (same cycle)
    if(DADR0_i((MTCH_WIDTH+2)-1 downto 2) =
      DADR1_i((MTCH_WIDTH+2)-1 downto 2)
    ) then
      MTCH1(DEPTH) <= ST0;
    else
      MTCH1(DEPTH) <= '0';
    end if;

  end process;

  -- inst. #0 store flag
  ST0 <= IX1_V_i(0) when (
    LS_OP0_i = LS_SB or
    LS_OP0_i = LS_SH or
    LS_OP0_i = LS_SW
  ) else '0';

  -- inst. #0 load flag
  LD0 <= IX1_V_i(0) when (
    LS_OP0_i = LS_LB or
    LS_OP0_i = LS_LH or
    LS_OP0_i = LS_LW
  ) else '0';

  -- inst. #1 load flag
  LD1 <= IX1_V_i(1) when (
    LS_OP1_i = LS_LB or
    LS_OP1_i = LS_LH or
    LS_OP1_i = LS_LW
  ) else '0';

  -- inst. #0 load/store flag
  LS0 <= LD0 or ST0;

  -- Buffer full flag. The original intent (kept below,
  -- commented out) was to treat the buffer as full when
  -- the number of empty entries is not enough for the
  -- entries being pushed; the active assignment simply
  -- follows the valid flag of entry DEPTH-4 (entry zero
  -- for the default DEPTH = 4).
  -- NOTE(review): this requires DEPTH >= 4.

  --BF <= '1' when (
  --(SBV_q(DEPTH-3) = '1' and (PUSH = "11")) or
  --(SBV_q(DEPTH-2) = '1' and (PUSH = "10" or PUSH = "01"))
  --) else '0';

  BF <= SBV_q(DEPTH-4);

  -- A conflict is detected if an active load address
  -- matches a buffered store one.
  -- A force-pop creates a special case of conflict
  -- because the load in slot #0 can't be performed
  -- in order to execute the pending store which is
  -- force-popped.

  S2LAC(0) <= LD0 and wired_or(MTCH0);
  S2LAC(1) <= LD1 and wired_or(MTCH1);

  ----------------------------------------------------
  -- outputs
  ----------------------------------------------------

  BF_o <= BF;

  S2LAC_o(0) <= S2LAC(0);
  S2LAC_o(1) <= S2LAC(1);

  -- popped store is always taken from entry zero
  WE_o <= POP;
  LS_OP_o <= SB_q(0).LS_OP;
  BE_o <= SB_q(0).BE;
  Q_o <= SB_q(0).DATA;
  SADR_o <= SB_q(0).ADR;

  ----------------------------------------------------
  -- Checkers
  ----------------------------------------------------

  -- synthesis translate_off

  GCHK0: if SIMULATION_ONLY = '1' generate

    --assert not(
    -- (WE_i /= "00" and BF = '1') and
    -- (CLK_i = '1' and CLK_i'event)and
    -- (RST_i = '0')
    --)
    --report "attempted write when store buffer is full!"
    --severity FAILURE;

    -- a read request requires at least one valid entry
    assert not(
      (RE_i /= "00" and SBV_q(0) = '0') and
      (CLK_i = '1' and CLK_i'event)and
      (RST_i = '0')
    )
    report "attempted read when store buffer is empty!"
    severity FAILURE;

    -- pending + newly requested reads can't exceed the
    -- number of buffered entries
    assert not(
      (
        (PR_CNT_q > TP_q) or
        (PR_CNT_q > TP_q-1 and (RE_i = "01" or RE_i ="10")) or
        (PR_CNT_q > TP_q-2 and RE_i = "11")
      ) and
      (CLK_i = '1' and CLK_i'event) and
      (RST_i = '0')
    )
    report "pending read count + read requests > tail pointer in store buffer!"
    severity FAILURE;

    -- requested read address(es) must match the address of
    -- the entry (entries) next in line for reading
    assert not(
      (
        (RE_i = "01" and SADR0_i /= SB_q(PR_CNT_q).ADR) or
        (RE_i = "10" and SADR1_i /= SB_q(PR_CNT_q).ADR) or
        (RE_i = "11" and (SADR0_i /= SB_q(PR_CNT_q).ADR or SADR1_i /= SB_q(PR_CNT_q+1).ADR))
      ) and
      (CLK_i = '1' and CLK_i'event) and
      (RST_i = '0')
    )
    report "invalid read requests!"
    severity FAILURE;

  end generate;

  -- synthesis translate_on

end ARC;