Line 77... |
Line 77... |
-- In subsequent versions of the cache, the tag size needs to be enlarged AND
|
-- In subsequent versions of the cache, the tag size needs to be enlarged AND
|
-- some of the top bits might be omitted when they're not needed to implement
|
-- some of the top bits might be omitted when they're not needed to implement
|
-- the default memory map (namely bit 30 which is always '0').
|
-- the default memory map (namely bit 30 which is always '0').
|
--
|
--
|
--
|
--
|
-- @note3: Possible bug in Quartus-II and workaround
|
-- @note3: Synthesis problem in Quartus-II and workaround
|
--
|
--
|
-- I had to put a 'dummy' mux between the cache line store and the CPU in order
|
-- I had to put a 'dummy' mux between the cache line store and the CPU in order
|
-- to get rid of a quirk in the Quartus-II synthesizer (V9.0 build 235).
|
-- to get rid of a quirk in the Quartus-II synthesizer (several versions).
|
-- If we omit this extra dummy layer of logic the synth will fail to infer the
|
-- If we omit this extra dummy layer of logic the synth will fail to infer the
|
-- tag table as a BRAM and will use logic fabric instead, crippling performance.
|
-- tag table as a BRAM and will use logic fabric instead, crippling performance.
|
-- The mux is otherwise useless and hits performance badly, but so far I haven't
|
-- The mux is otherwise useless and hits performance badly, but so far I haven't
|
-- found any other way to overcome this bug, not even with the help of the
|
-- found any other way to overcome this bug, not even with the help of the
|
-- Altera support forum.
|
-- Altera support forum.
|
|
-- Probable cause of this behavior: according to the Cyclone-II manual (section
|
|
-- 'M4K Routing Interface'), no direct connection is possible between an M4K
|
|
-- data output and the address input of another M4K (in this case, the cache
|
|
-- line BRAM and the register bank BRAM). And apparently Quartus-2 won't insert
|
|
-- intermediate logic itself for some reason.
|
|
-- This does not happen with ISE on Spartan-3.
|
|
-- FIXME: Move this comment to the relevant section of the doc.
|
--
|
--
|
-- @note4: Startup values for the cache tables
|
-- @note4: Startup values for the cache tables
|
--
|
--
|
-- The cache tables have been given startup values; these are only for simulation
|
-- The cache tables have been given startup values; these are only for simulation
|
-- convenience and have no effect on the cache behaviour (and obviously they
|
-- convenience and have no effect on the cache behaviour (and obviously they
|
Line 925... |
Line 932... |
else
|
else
|
-- Raise 'read_pending' as soon as we know a read is to be done.
|
-- Raise 'read_pending' as soon as we know a read is to be done.
|
-- Clear it as soon as the read/refill has STARTED.
|
-- Clear it as soon as the read/refill has STARTED.
|
-- Can be raised again after a read is started and before it's done.
|
-- Can be raised again after a read is started and before it's done.
|
-- data_rd_addr_reg always has the addr of any pending read.
|
-- data_rd_addr_reg always has the addr of any pending read.
|
if data_miss='1' then --data_rd_vma='1' then
|
if data_miss='1' then
|
read_pending <= '1';
|
read_pending <= '1';
|
data_rd_addr_reg <= data_addr(31 downto 2);
|
data_rd_addr_reg <= data_addr(31 downto 2);
|
elsif data_refill_start='1' or ps=data_read_io_0 or
|
elsif data_refill_start='1' or ps=data_read_io_0 or
|
ps=data_ignore_read then
|
ps=data_ignore_read then
|
read_pending <= '0';
|
read_pending <= '0';
|
end if;
|
end if;
|
|
|
-- Raise 'write_pending' at the 1st cycle of a write, clear it when
|
-- Raise 'write_pending' at the 1st cycle of a write, clear it when
|
-- the write (writethrough actually) operation has been done.
|
-- the write (writethrough actually) operation has been done.
|
-- data_wr_addr_reg always has the addr of any pending write
|
-- data_wr_addr_reg always has the addr of any pending write
|
if byte_we/="0000" and ps=idle and write_pending='0' then
|
if byte_we/="0000" then
|
byte_we_reg <= byte_we;
|
byte_we_reg <= byte_we;
|
data_wr_reg <= data_wr;
|
data_wr_reg <= data_wr;
|
data_wr_addr_reg <= data_addr(31 downto 2);
|
data_wr_addr_reg <= data_addr(31 downto 2);
|
write_pending <= '1';
|
write_pending <= '1';
|
elsif ps=data_writethrough_sram_1b or
|
elsif ps=data_writethrough_sram_1b or
|
Line 1020... |
Line 1027... |
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
-- Code cache
|
-- Code cache
|
|
|
-- CPU is wired directly to cache output, no muxes -- or at least it SHOULD be.
|
-- CPU is wired directly to cache output, no muxes -- or at least it SHOULD be.
|
-- Due to an apparent bug in Quartus-2 (V9.0 build 235), if we omit this extra
|
-- Due to some unknown reason, if we omit this extra dummy layer of logic the
|
-- dummy layer of logic the synth will fail to infer the tag table as a BRAM.
|
-- synth (Quartus-II) will fail to infer the tag table as a BRAM.
|
-- (@note3)
|
-- (@note3)
|
code_rd <= code_cache_rd when reset='0' else X"00000000";
|
code_rd <= code_cache_rd when reset='0' else X"00000000";
|
|
|
-- Register here the requested code tag so we can compare it to the tag in the
|
-- Register here the requested code tag so we can compare it to the tag in the
|
-- cache store. Note we register and match the 'line valid' bit together with
|
-- cache store. Note we register and match the 'line valid' bit together with
|
Line 1180... |
Line 1187... |
data_miss_cached <= '1' when
|
data_miss_cached <= '1' when
|
(data_tag_match_valid='1' and data_tags_match='0') or
|
(data_tag_match_valid='1' and data_tags_match='0') or
|
data_miss_by_invalidation='1'
|
data_miss_by_invalidation='1'
|
else '0';
|
else '0';
|
|
|
-- Select the proper data_miss signal
|
-- Select the proper data_miss source with a mux
|
data_miss <= data_miss_uncached when cache_enable='0' else data_miss_cached;
|
data_miss <= data_miss_uncached when cache_enable='0' else data_miss_cached;
|
|
|
|
|
-- Data line address used for both read and write into the table
|
-- Data line address used for both read and write into the table
|
data_line_addr <=
|
data_line_addr <=
|
-- when the CPU wants to invalidate D-Cache lines, the addr comes from the
|
-- When the CPU wants to invalidate D-Cache lines, the addr comes from the
|
-- data bus (see @note1)
|
-- data bus (see @note1)
|
data_wr(7 downto 0) when byte_we(3)='1' and ic_invalidate='1'
|
data_wr(7 downto 0) when byte_we(3)='1' and ic_invalidate='1'
|
-- otherwise the addr comes from the data address as usual
|
-- otherwise the addr comes from the data address as usual
|
else data_addr(11 downto 4);
|
else data_addr(11 downto 4);
|
|
|
data_word_addr <= data_addr(11 downto 2);
|
data_word_addr <= data_addr(11 downto 2);
|
data_word_addr_wr <= data_line_addr & conv_std_logic_vector(data_refill_ctr,LINE_INDEX_SIZE);
|
data_word_addr_wr <= data_line_addr & conv_std_logic_vector(data_refill_ctr,LINE_INDEX_SIZE);
|
-- NOTE: the tag will be marked as INVALID ('1') when the CPU is invalidating
|
-- NOTE: the tag will be marked as INVALID ('1') when the CPU is invalidating
|
-- code lines (@note1)
|
-- code lines (@note1)
|
|
-- FIXME explain role of ic_invalidate in this logic
|
data_tag <=
|
data_tag <=
|
(ic_invalidate or not data_tag_match_valid) &
|
(ic_invalidate or not data_tag_match_valid) &
|
data_addr(31 downto 27) &
|
data_addr(31 downto 27) &
|
data_addr(11+DATA_TAG_SIZE-5 downto 11+1);
|
data_addr(11+DATA_TAG_SIZE-5 downto 11+1);
|
|
|
Line 1247... |
Line 1255... |
-- Refill data comes from BRAM or from SRAM (including 16- and 8- bit interfaces)
|
-- Refill data comes from BRAM or from SRAM (including 16- and 8- bit interfaces)
|
with ps select data_refill_data <=
|
with ps select data_refill_data <=
|
bram_rd_data when data_refill_bram_1,
|
bram_rd_data when data_refill_bram_1,
|
sram_rd_data when others;
|
sram_rd_data when others;
|
|
|
|
------------------------------------------------------------------------------
|
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
--------------------------------------------------------------------------------
|
|
-- OLD Data cache (unimplemented -- uses stub cache logic)
|
|
|
|
-- -- CPU data input mux: direct cache output OR uncached io input
|
|
-- with ps select data_rd <=
|
|
-- io_rd_data when data_read_io_1,
|
|
-- data_cache_rd when others;
|
|
--
|
|
-- -- All the tag match logic is unfinished and will be simplified away in synth.
|
|
-- -- The 'cache' is really a single register.
|
|
-- data_cache_rd <= data_cache_store;
|
|
-- data_cache_tag <= data_cache_tag_store;
|
|
--
|
|
-- data_cache_memory:
|
|
-- process(clk)
|
|
-- begin
|
|
-- if clk'event and clk='1' then
|
|
-- if reset='1' then
|
|
-- -- in the real hardware the tag store can't be reset and it's up
|
|
-- -- to the SW to initialize the cache.
|
|
-- data_cache_tag_store <= (others => '0');
|
|
-- data_cache_store <= (others => '0');
|
|
-- else
|
|
-- -- Refill data cache if necessary
|
|
-- if ps=data_refill_sram_1 or ps=data_refill_sram8_3 then
|
|
-- data_cache_tag_store <=
|
|
-- "01" & data_rd_addr_reg(t_data_tag'high-2 downto t_data_tag'low);
|
|
-- data_cache_store <= sram_rd_data;
|
|
-- elsif ps=data_refill_bram_1 then
|
|
-- data_cache_tag_store <=
|
|
-- "01" & data_rd_addr_reg(t_data_tag'high-2 downto t_data_tag'low);
|
|
-- data_cache_store <= bram_rd_data;
|
|
-- end if;
|
|
-- end if;
|
|
-- end if;
|
|
-- end process data_cache_memory;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
--------------------------------------------------------------------------------
|
|
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
-- SRAM interface
|
-- SRAM interface
|
|
|
-- Note these signals are meant to be connected directly to FPGA pins (and then
|
-- Note these signals are meant to be connected directly to FPGA pins (and then
|
-- to a SRAM, of course). They are the only signals whose tco we care about.
|
-- to a SRAM, of course). They are the only signals whose tco we care about.
|
Line 1415... |
Line 1373... |
|
|
|
|
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
-- CPU stall control
|
-- CPU stall control
|
|
|
-- FIXME data_miss should be raised only on the cycle a data miss is detected,
|
|
-- otherwise it overlaps data_wait
|
|
--@@@data_miss <= read_pending; -- FIXME stub; will change with real D-Cache
|
|
|
|
-- Stall the CPU when either state machine needs it
|
-- Stall the CPU when either state machine needs it
|
mem_wait <=
|
mem_wait <=
|
(code_wait or data_wait or -- code or data refill in course
|
(code_wait or data_wait or -- code or data refill in course
|
code_miss or data_miss -- code or data miss
|
code_miss or data_miss -- code or data miss
|
) and not reset; -- FIXME stub
|
) and not reset; -- FIXME stub
|
Line 1458... |
Line 1412... |
'1' when data_refill_sram8_3,
|
'1' when data_refill_sram8_3,
|
'1' when data_refill_bram_0,
|
'1' when data_refill_bram_0,
|
'1' when data_refill_bram_1,
|
'1' when data_refill_bram_1,
|
'1' when data_refill_bram_2,
|
'1' when data_refill_bram_2,
|
'1' when data_read_io_0,
|
'1' when data_read_io_0,
|
-- Otherwise, we stall the CPU the cycle after a RD or WR is triggered
|
-- In any other state, stall CPU only if there's a RD/WR pending.
|
read_pending or write_pending when idle,
|
read_pending or write_pending when others;
|
|
|
'0' when others;
|
|
|
|
end architecture direct;
|
end architecture direct;
|
|
|
No newline at end of file
|
No newline at end of file
|