OpenCores
URL https://opencores.org/ocsvn/zipcpu/zipcpu/trunk

Subversion Repositories zipcpu

[/] [zipcpu/] [trunk/] [rtl/] [core/] [dcache.v] - Rev 208

Go to most recent revision | Compare with Previous | Blame | View Log

////////////////////////////////////////////////////////////////////////////////
//
// Filename:	dcache.v
//
// Project:	Zip CPU -- a small, lightweight, RISC CPU soft core
//
// Purpose:	To provide a simple data cache for the ZipCPU.  The cache is
//		designed to be a drop in replacement for the pipememm memory
//	unit currently existing within the ZipCPU.  The goal of this unit is
//	to achieve single cycle read access to any memory in the last cache line
//	used, or two cycle access to any memory currently in the cache.
//
//	The cache separates between four types of accesses, one write and three
//	read access types.  The read accesses are split between those that are
//	not cacheable, those that are in the cache, and those that are not.
//
//	1. Write accesses always create writes to the bus.  For these reasons,
//		these may always be considered cache misses.
//
//		Writes to memory locations within the cache must also update
//		cache memory immediately, to keep the cache in synch.
//
//		It is our goal to be able to maintain single cycle write
//		accesses for memory bursts.
//
//	2. Read access to non-cacheable memory locations will also immediately
//		go to the bus, just as all write accesses go to the bus.
//
//	3. Read accesses to cacheable memory locations will immediately read
//		from the appropriate cache line.  However, since thee valid
//		line will take a second clock to read, it may take up to two
//		clocks to know if the memory was in cache.  For this reason,
//		we bypass the test for the last validly accessed cache line.
//
//		We shall design these read accesses so that reads to the cache
//		may take place concurrently with other writes to the bus.
//
//	Errors in cache reads will void the entire cache line.  For this reason,
//	cache lines must always be of a smaller in size than any associated
//	virtual page size--lest in the middle of reading a page a TLB miss
//	take place referencing only a part of the cacheable page.
//
//	
//
//
// Creator:	Dan Gisselquist, Ph.D.
//		Gisselquist Technology, LLC
//
////////////////////////////////////////////////////////////////////////////////
//
// Copyright (C) 2016, Gisselquist Technology, LLC
//
// This program is free software (firmware): you can redistribute it and/or
// modify it under the terms of  the GNU General Public License as published
// by the Free Software Foundation, either version 3 of the License, or (at
// your option) any later version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTIBILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// License:	GPL, v3, as defined and found on www.gnu.org,
//		http://www.gnu.org/licenses/gpl.html
//
//
////////////////////////////////////////////////////////////////////////////////
//
//
module	dcache(i_clk, i_rst, i_pipe_stb, i_lock,
		i_op, i_addr, i_data, i_oreg,
			o_busy, o_pipe_stalled, o_valid, o_err, o_wreg,o_data,
		o_wb_cyc_gbl, o_wb_cyc_lcl, o_wb_stb_gbl, o_wb_stb_lcl,
			o_wb_we, o_wb_addr, o_wb_data,
		i_wb_ack, i_wb_stall, i_wb_err, i_wb_data);
	parameter	LGCACHELEN = 8,
			ADDRESS_WIDTH=32,
			LGNLINES=5, // Log of the number of separate cache lines
			IMPLEMENT_LOCK=0,
			NAUX=5;	// # of aux d-wires to keep aligned w/memops
	localparam 	SDRAM_BIT = 26;
	localparam	FLASH_BIT = 22;
	localparam	BLKRAM_BIT= 15;
	localparam	AW = ADDRESS_WIDTH; // Just for ease of notation below
	localparam	CS = LGCACHELEN; // Number of bits in a cache address
	localparam	LS = CS-LGNLINES; // Bits to spec position w/in cline
	localparam	LGAUX = 3; // log_2 of the maximum number of piped data 
	input			i_clk, i_rst;
	// Interface from the CPU
	input			i_pipe_stb, i_lock;
	input			i_op;
	input	[31:0]		i_addr;
	input	[31:0]		i_data;
	input	[(NAUX-1):0]	i_oreg;	// Aux data, such as reg to write to
	// Outputs, going back to the CPU
	output	wire		o_busy, o_pipe_stalled, o_valid, o_err;
	output reg [(NAUX-1):0]	o_wreg;
	output	reg	[31:0]	o_data;
	// Wishbone bus master outputs
	output	wire		o_wb_cyc_gbl, o_wb_cyc_lcl;
	output	reg		o_wb_stb_gbl, o_wb_stb_lcl;
	output	reg		o_wb_we;
	output	reg	[(AW-1):0]	o_wb_addr;
	output	reg	[31:0]		o_wb_data;
	// Wishbone bus slave response inputs
	input				i_wb_ack, i_wb_stall, i_wb_err;
	input		[31:0]		i_wb_data;
 
 
	reg	cyc, stb, last_ack, end_of_line, last_line_stb;
 
 
	reg	[((1<<LGNLINES)-1):0] c_v;	// One bit per cache line, is it valid?
	reg	[(AW-LS-1):0]	c_vtags	[0:((1<<LGNLINES)-1)];
	reg	[31:0]		c_mem	[0:((1<<CS)-1)];
	// reg	[((1<<LGNLINES)-1):0]	c_wr; // Is the cache line writable?
	// reg	c_wdata;
	// reg	c_waddr;
 
	// To simplify writing to the cache, and the job of the synthesizer to
	// recognize that a cache write needs to take place, we'll take an extra
	// clock to get there, and use these c_w... registers to capture the
	// data in the meantime.
	reg			c_wr;
	reg	[31:0]		c_wdata;
	reg	[(CS-1):0]	c_waddr;
 
	reg	[(AW-LS-1):0]	last_tag;
 
 
	wire	[(LGNLINES-1):0]	i_cline;
	wire	[(CS-1):0]	i_caddr;
	wire	[(AW-LS-1):0]	i_ctag;
 
	assign	i_cline = i_addr[(CS-1):LS];
	assign	i_caddr = i_addr[(CS-1):0];
	assign	i_ctag  = i_addr[(AW-1):LS];
 
	wire	cache_miss_inow, w_cachable;
	assign	cache_miss_inow = (last_tag != i_addr[31:LS])||(!c_v[i_cline]);
	assign	w_cachable = (i_addr[31:30]!=2'b11)&&(!i_lock)&&(
				((SDRAM_BIT>0)&&(i_addr[SDRAM_BIT]))
				||((FLASH_BIT>0)&&(i_addr[FLASH_BIT]))
				||((BLKRAM_BIT>0)&&(i_addr[BLKRAM_BIT])));
 
	reg	r_cachable, r_svalid, r_dvalid, r_rd, r_cache_miss, r_rvalid;
	reg	[(AW-1):0]	r_addr;
	reg	[31:0]		r_idata, r_ddata, r_rdata;
	wire	[(LGNLINES-1):0]	r_cline;
	wire	[(CS-1):0]	r_caddr;
	wire	[(AW-LS-1):0]	r_ctag;
 
	assign	r_cline = r_addr[(CS-1):LS];
	assign	r_caddr = r_addr[(CS-1):0];
	assign	r_ctag  = r_addr[(AW-1):LS];
 
 
	reg	wr_cstb, r_iv, pipeable_op, non_pipeable_op, in_cache;
	reg	[(AW-LS-1):0]	r_itag;
 
	//
	// The one-clock delayed read values from the cache.
	//
	initial	r_rd = 1'b0;
	initial	r_cachable = 1'b0;
	initial	r_svalid = 1'b0;
	initial	r_dvalid = 1'b0;
	always @(posedge i_clk)
	begin
		// The single clock path
		r_idata <= c_mem[i_addr[(CS-1):0]];
		// The valid for the single clock path
		//	Only ... we need to wait if we are currently writing
		//	to our cache.
		r_svalid<= (!i_op)&&(!cache_miss_inow)&&(w_cachable)
				&&(i_pipe_stb)&&(!c_wr)&&(!wr_cstb);
 
		//
		// The two clock in-cache path
		//
		// Some preliminaries that needed to be calculated on the first
		// clock
		if (!o_busy)
		begin
			r_iv   <= c_v[i_cline];
			r_itag <= c_vtags[i_cline];
			r_addr <= i_addr;
			r_cachable <= (!i_op)&&(w_cachable)&&(i_pipe_stb);
		end else begin
			r_iv   <= c_v[r_cline];
			r_itag <= c_vtags[r_cline];
		end
		// r_idata still contains the right answer
		r_rd <= (i_pipe_stb)&&(!i_op);
		r_ddata  <= r_idata;
		// r_itag contains the tag we didn't have available to us on the
		// last clock, r_ctag is a bit select from r_addr containing a
		// one clock delayed address.
		r_dvalid <= (r_itag == r_ctag)&&(r_iv)&&(r_cachable);
		if ((r_itag == r_ctag)&&(r_iv)&&(r_cachable))
			last_tag <= r_ctag;
 
		// r_cache miss takes a clock cycle.  It is only ever true for
		// something that should be cachable, but isn't in the cache.
		// A cache miss is only true _if_
		// 1. A read was requested
		// 2. It is for a cachable address, AND
		// 3. It isn't in the cache on the first read
		//	or the second read
		// 4. The read hasn't yet started to get this address
		r_cache_miss <= ((!cyc)||(o_wb_we))&&(r_cachable)
				// One clock path -- miss
				&&(!r_svalid)
				// Two clock path -- misses as well
				&&(r_rd)&&(!r_svalid)
				&&((r_itag != r_ctag)||(!r_iv));
 
		r_rdata <= c_mem[r_addr[(CS-1):0]];
		r_rvalid<= ((i_wb_ack)&&(last_ack));
	end
 
`define	DC_IDLE		2'b00
`define	DC_WRITE	2'b01
`define	DC_READS	2'b10
`define	DC_READC	2'b11
	reg	[1:0]	state;
 
	reg	[(AW-LS-1):0]	wr_wtag, wr_vtag;
	reg	[31:0]		wr_data;
	reg	[(CS-1):0]	wr_addr;
	always @(posedge i_clk)
	begin
		// By default, update the cache from the write 1-clock ago
		c_wr <= (wr_cstb)&&(wr_wtag == wr_vtag);
		c_wdata <= wr_data;
		c_waddr <= wr_addr[(CS-1):0];
 
		wr_cstb <= 1'b0;
		wr_vtag <= c_vtags[o_wb_addr[(CS-LS-1):0]];
		wr_wtag <= o_wb_addr[(AW-LS-1):0];
		wr_data <= o_wb_data;
		wr_addr <= o_wb_addr[(CS-1):0];
 
 
		if (LS <= 1)
			end_of_line <= 1'b1;
		else
			end_of_line<=(cyc)&&((c_waddr[(LS-1):1]=={(LS-1){1'b1}})
				||((i_wb_ack)
				&&(c_waddr[(LS-1):0]=={{(LS-2){1'b1}},2'b01})));
 
		if (LS <= 1)
			last_line_stb <= 1'b1;
		else
			last_line_stb <= (stb)&&
				((o_wb_addr[(LS-1):1]=={(LS-1){1'b1}})
				||((!i_wb_stall)
					&&(o_wb_addr[(LS-1):0]
						=={{(LS-2){1'b1}},2'b01})));
 
		//
		if (state == `DC_IDLE)
			pipeable_op <= 1'b0;
		if (state == `DC_IDLE)
			non_pipeable_op <= 1'b0;
 
 
		if (state == `DC_IDLE)
		begin
			o_wb_we <= 1'b0;
			o_wb_data <= i_data;
			pipeable_op <= 1'b0;
			non_pipeable_op <= 1'b1;
 
			cyc <= 1'b0;
			stb <= 1'b0;
 
			r_wb_cyc_gbl <= 1'b0;
			r_wb_cyc_lcl <= 1'b0;
			o_wb_stb_gbl <= 1'b0;
			o_wb_stb_lcl <= 1'b0;
 
			in_cache <= (i_op)&&(w_cachable);
			if ((i_pipe_stb)&&(i_op))
			begin // Write  operation
				state <= `DC_WRITE;
				o_wb_addr <= i_addr;
				o_wb_we <= 1'b1;
				pipeable_op <= 1'b1;
 
				cyc <= 1'b1;
				stb <= 1'b1;
 
				r_wb_cyc_gbl <= (i_addr[31:30]!=2'b11);
				r_wb_cyc_lcl <= (i_addr[31:30]==2'b11);
				o_wb_stb_gbl <= (i_addr[31:30]!=2'b11);
				o_wb_stb_lcl <= (i_addr[31:30]==2'b11);
 
			end else if (r_cache_miss)
			begin
				state <= `DC_READC;
				o_wb_addr <= { i_ctag, {(LS){1'b0}} };
				non_pipeable_op <= 1'b1;
 
				cyc <= 1'b1;
				stb <= 1'b1;
				r_wb_cyc_gbl <= 1'b1;
				o_wb_stb_gbl <= 1'b1;
			end else if ((i_pipe_stb)&&(!w_cachable))
			begin // Read non-cachable memory area
				state <= `DC_READS;
				o_wb_addr <= i_addr;
				pipeable_op <= 1'b1;
 
				cyc <= 1'b1;
				stb <= 1'b1;
				r_wb_cyc_gbl <= (i_addr[31:30]!=2'b11);
				r_wb_cyc_lcl <= (i_addr[31:30]==2'b11);
				o_wb_stb_gbl <= (i_addr[31:30]!=2'b11);
				o_wb_stb_lcl <= (i_addr[31:30]==2'b11);
			end // else we stay idle
 
		end else if (state == `DC_READC)
		begin
			// We enter here once we have committed to reading
			// data into a cache line.
			if ((stb)&&(!i_wb_stall))
			begin
				stb <= (!last_line_stb);
				o_wb_stb_gbl <= (!last_line_stb);
				o_wb_addr[(LS-1):0] <= o_wb_addr[(LS-1):0]+1'b1;
			end
 
			if(stb)
				c_v[o_wb_addr[(CS-LS-1):0]] <= 1'b0;
 
			c_wr <= (i_wb_ack);
			c_wdata <= o_wb_data;
			c_waddr <= ((c_wr)?(c_waddr+1'b1):c_waddr);
 
			c_vtags[o_wb_addr[(CS-LS-1):0]]<= o_wb_addr[(AW-LS-1):0];
 
			if (((i_wb_ack)&&(end_of_line))||(i_wb_err))
			begin
				state           <= `DC_IDLE;
				non_pipeable_op <= 1'b0;
				cyc <= 1'b0;
				r_wb_cyc_gbl <= 1'b0;
				r_wb_cyc_lcl <= 1'b0;
				//
				c_v[o_wb_addr[(CS-LS-1):0]] <= i_wb_ack;
			end
		end else if (state == `DC_READS)
		begin
			// We enter here once we have committed to reading
			// data that cannot go into a cache line
			if ((!i_wb_stall)&&(!i_pipe_stb))
			begin
				stb <= 1'b0;
				o_wb_stb_gbl <= 1'b0;
				o_wb_stb_lcl <= 1'b0;
				pipeable_op <= 1'b0;
			end
 
			if ((!i_wb_stall)&&(i_pipe_stb))
				o_wb_addr <= i_data;
 
			c_wr <= 1'b0;
 
			if (((i_wb_ack)&&(last_ack))||(i_wb_err))
			begin
				state        <= `DC_IDLE;
				cyc          <= 1'b0;
				r_wb_cyc_gbl <= 1'b0;
				r_wb_cyc_lcl <= 1'b0;
			end
		end else if (state == `DC_WRITE)
		begin
			// c_wr    <= (c_v[])&&(c_tag[])&&(in_cache)&&(stb);
			c_wdata <= o_wb_data;
			c_waddr <= (state == `DC_IDLE)?i_caddr
				: ((c_wr)?(c_waddr+1'b1):c_waddr);
 
			if ((!i_wb_stall)&&(!i_pipe_stb))
			begin
				stb          <= 1'b0;
				o_wb_stb_gbl <= 1'b0;
				o_wb_stb_lcl <= 1'b0;
				pipeable_op  <= 1'b0;
			end
 
			wr_cstb  <= (stb)&&(!i_wb_stall)&&(in_cache);
 
			if ((stb)&&(!i_wb_stall)&&(i_pipe_stb))
				o_wb_addr <= i_addr;
			if ((stb)&&(!i_wb_stall)&&(i_pipe_stb))
				o_wb_data <= i_data;
 
			if (((i_wb_ack)&&(last_ack))||(i_wb_err))
			begin
				state        <= `DC_IDLE;
				cyc          <= 1'b0;
				r_wb_cyc_gbl <= 1'b0;
				r_wb_cyc_lcl <= 1'b0;
			end
		end
	end
 
	//
	// Writes to the cache
	//
	// These have been made as simple as possible.  Note that the c_wr
	// line has already been determined, as have the write value and address
	// on the last clock.  Further, this structure is defined to match the
	// block RAM design of as many architectures as possible.
	// 
	always @(posedge i_clk)
		if (c_wr)
			c_mem[c_waddr] <= c_wdata;
 
	//
	// Reads from the cache
	//
	// Some architectures require that all reads be registered.  We
	// accomplish that here.  Whether or not the result of this read is
	// going to be our output will need to be determined with combinatorial
	// logic on the output.
	//
	reg	[31:0]	cached_idata, cached_rdata;
	always @(posedge i_clk)
		cached_idata <= c_mem[i_caddr];
 
	always @(posedge i_clk)
		cached_rdata <= c_mem[r_caddr];
 
// o_data can come from one of three places:
// 1. The cache, assuming the data was in the last cache line
// 2. The cache, second clock, assuming the data was in the cache at all
// 3. The cache, after filling the cache
// 4. The wishbone state machine, upon reading the value desired.
	always @(posedge i_clk)
		if (r_svalid)
			o_data <= cached_idata;
		else if ((i_wb_ack)&&(pipeable_op))
			o_data <= i_wb_data;
		else
			o_data <= cached_rdata;
	always @(posedge i_clk)
		o_valid <= (r_svalid)||((i_wb_ack)&&(pipeable_op))
				||(r_dvalid)||(r_rvalid);
	always @(posedge i_clk)
		o_err <= (cyc)&&(i_wb_err);
 
	assign	o_busy = (state != `DC_IDLE);
 
 
	//
	// Handle our auxilliary data lines.
	//
	// These just go into a FIFO upon request, and then get fed back out
	// upon completion of an OP.
	//
	// These are currently designed for handling bursts of writes or
	// non-cachable  reads.
	//
	// A very similar structure will be used once we switch to using an
	// MMU, in order to make certain memory operations are synchronous
	// enough to deal with bus errors.
	//
	reg	[(LGAUX-1):0]	aux_head, aux_tail;
	reg	[(NAUX-1):0]	aux_fifo [0:((1<<LGAUX)-1)];
	initial	aux_head = 0;
	initial	aux_tail = 0;
	always @(posedge i_clk)
	begin
		if ((i_rst)||(i_wb_err))
			aux_head <= 0;
		else if ((i_pipe_stb)&&(!o_busy))
			aux_head <= aux_head + 1'b1;
		aux_fifo[aux_head] <= i_oreg;
	end
	always @(posedge i_clk)
	begin
		if ((i_rst)||(i_wb_err))
			aux_tail <= 0;
		else if (o_valid) // ||(aux_tail[WBIT])&&(no-mmu-error)
			aux_tail <= aux_tail + 1'b1;
		o_wreg <= aux_fifo[aux_tail];
	end
 
	//
	// We can use our FIFO addresses to pre-calculate when an ACK is going
	// to be the last_noncachable_ack.
 
 
	assign o_pipe_stalled=((pipeable_op)&&(i_wb_stall))||(non_pipeable_op);
	// pipeable_op must become zero when stb goes low
 
	always @(posedge i_clk)
	begin
		lock_gbl <= (i_lock)&&((r_wb_cyc_gbl)||(lock_gbl));
		lock_lcl <= (i_lock)&&((r_wb_cyc_lcl)||(lock_lcl));
	end
 
	assign	o_wb_cyc_gbl = (r_wb_cyc_gbl)||(lock_gbl);
	assign	o_wb_cyc_lcl = (r_wb_cyc_lcl)||(lock_lcl);
endmodule
 

Go to most recent revision | Compare with Previous | Blame | View Log

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.