//////////////////////////////////////////////////////////////////////////////////
// Engineer:       Agner Fog
//
// Create Date:    2020-05-03
// Last modified:  2021-07-30
// Module Name:    fetch
// Project Name:   ForwardCom soft core
// Target Devices: Artix 7
// Tool Versions:  Vivado v. 2020.1
// License:        CERN-OHL-W v. 2 or later
// Description:    Instruction fetch and unconditional jump, call, and return
//
//////////////////////////////////////////////////////////////////////////////////
`include "defines.vh"

// Code address to jump to when the reset button is pressed.
// All sizes and addresses in this group are in bytes; reset_target in the fetch module
// converts loader_start_address to a word address.
parameter max_loader_size      = (`MAX_LOADER_SIZE) << 2;           // loader size in bytes (presumably `MAX_LOADER_SIZE counts 32-bit words; << 2 converts to bytes)
parameter code_memory_start    = 2**`CODE_ADDR_START;
parameter code_memory_size     = 2**(`CODE_ADDR_WIDTH+2);           // `CODE_ADDR_WIDTH address bits of 32-bit words, expressed in bytes
//parameter code_memory_end    = code_memory_start + code_memory_size;
parameter loader_start_address = code_memory_size - max_loader_size; // address of loader relative to code memory start, in bytes

// upper 7 bits of instruction word identifying unconditional jump or call
parameter instruction_jump_uncond = 7'b0111100;       // next bit is 1 for call, 0 for jump. The rest is 24 bits signed offset
// upper 11 bits of instruction word identifying return instruction
parameter instruction_return      = 11'b01110111110;
// upper 11 bits of instruction word identifying sys_return instruction
parameter instruction_sys_return  = 11'b01111111110;
// upper 4 bits of any 1-word control transfer instruction
parameter instruction_jumpa       = 4'b0111;
// upper 8 bits of any 2-word control transfer instruction
parameter instruction_jump2w      = 8'b10101000;
// upper 8 bits of any 3-word control transfer instruction
parameter instruction_jump3w      = 8'b11001000;
// bit OP1 for push and pop instructions (= 56,57)
parameter instruction_push_pop    = 6'b111000;
// upper 11 bits of instruction word identifying read_perfs serializing instruction. Need M bit too
parameter instruction_read_perfs  = 11'b01000100101;


// Fetch module: fetch instructions from memory or code cache.
//
// Holds a small buffer of 32-bit code words read from the code cache, presents the
// instruction at buffer position 0 to the decoder, and executes unconditional jumps,
// direct calls and returns itself so that they bypass the rest of the pipeline.
module fetch
(   input clock,                                // system clock (100 MHz)
    input clock_enable,                         // clock enable. Used when single-stepping
    input reset,                                // system reset.
    input restart,                              // restart running program
    input valid_in,                             // valid data from code cache ready
    input stall_in,                             // a later stage in pipeline is stalled
    input jump_in,                              // a jump target is coming from the ALU. jump_pointer has been sent to the code cache
    input nojump_in,                            // signal from ALU that the jump target is the next instruction
    input [`CODE_ADDR_WIDTH-1:0] jump_pointer,  // jump target from ALU
    input [`CODE_DATA_WIDTH-1:0] read_data,     // data from code cache
    input [`CODE_ADDR_WIDTH-1:0] return_pop_data,// Return address popped here at return instruction
    output reg [`CODE_ADDR_WIDTH-2:0] read_addr_out, // read address relative to code memory start
    output reg read_enable_out,                 // code cache read enable
    output reg valid_out,                       // An instruction is ready for output to decoder
    output reg jump_out,                        // A jump instruction is bypassing the pipeline
    output reg [`CODE_ADDR_WIDTH-1:0] instruction_pointer_out, // address of current instruction
    output reg [95:0] instruction_out,          // current instruction, up to 3 words long
    output reg call_e_out,                      // Executing call instruction. push_data contains return address
    output reg return_e_out,                    // Executing return instruction. return address is available in advance on pop_data
    output reg stall_predict_out,               // Predict that decoder will use multiple clock cycles
    output reg [`CODE_ADDR_WIDTH-1:0] call_push_data_out, // Return address pushed here at call instruction
    output reg [31:0] debug1_out                // temporary debug output
);

// Efficient handling of jumps, calls, and returns:
// Unconditional jumps, calls, and returns are executed directly in the fetch unit rather
// than waiting for the instruction to go through the pipeline.
// Conditional and indirect jumps must go to the ALU. The jump target address is fed from the ALU
// directly to the code cache in order to save one clock cycle.
// Direct calls and returns are communicating directly with the call stack.
// Indirect calls are handled in both fetch unit and ALU. The return address is pushed on the
// call stack by the fetch module while the target address comes from the ALU.
// Return addresses are obtained from the call stack. It takes one clock to send a call or return
// request to the call stack and another clock to retrieve the return address from the stack.
// Therefore, it is not possible to execute a return in the first clock cycle after another
// call or return. The fetch module does not check for this because the second return is delayed
// for a clock cycle anyway to wait for the target to be fetched from the code cache.

parameter fetch_buffer_size = 8;               // number of 32-bit words in instruction fetch buffer

// Name suffixes on local variables:
// 0: relates to the instruction that is currently in output registers
// 1: relates to the instruction that is being generated in the current clock cycle
// 2: relates to the instruction that will be generated in the next clock cycle

reg   [0:fetch_buffer_size-1][31:0] fetch_buffer; // instruction buffer, (fetch_buffer_size) * 32-bit words
reg   unsigned [3:0] valid_words0;             // number of valid 32-bit words in fetch_buffer
logic unsigned [3:0] valid_words1;             // number of valid words in fetch_buffer in next clock cycle
logic unsigned [1:0] instruction_length0;      // length of current instruction, in 32-bit words
logic unsigned [1:0] instruction_length1;      // length of next instruction, in 32-bit words
logic unsigned [1:0] instruction_length2;      // length of 2. next instruction, in 32-bit words (not assigned or read in this module)
logic instruction_ready0;                      // current instruction has been fetched
logic instruction_ready1;                      // instruction 1 will be dispatched in next clock cycle

logic [1:0] buffer_action;                     // 0: idle. nothing dispatched. buffer is full or waiting for data
                                               // 1: fill buffer. nothing dispatched. new data arriving from code cache
                                               // 2: dispatch. instruction 0 is dispatched to the pipeline. shift down data
                                               // 3: dispatch and fill.
logic shift_out0;                              // instruction 0 is dispatched in this clock cycle and fetch_buffer is shifted to get the next instruction to position 0
logic unsigned [1:0] dispatch_length0;         // length of dispatched instruction
logic send_next;                               // send an address to code cache. true if buffer is sure not to overflow in next two clocks
logic [3:0] fetch_buffer_pos;                  // position where to write to fetch_buffer from cache

logic early_jump;                              // jump instruction detected in instruction 1 or 2
logic conditional_jump;                        // a conditional or indirect jump or call detected in instruction 1. Wait for ALU to find target
logic [1:0] call_instruction;                  // 1: any kind of call or trap detected in instruction 1 or 2. Push return address on stack
                                               // 2: return or system return instruction detected. pop return address from stack
logic unsigned [`CODE_ADDR_WIDTH-1:0] early_jump_addr; // target address for early jump
reg   unsigned [`CODE_ADDR_WIDTH:0] jump_target;  // save jump target address. may be calculated here for unconditional jump, or input from ALU for conditional jump
logic unsigned [`CODE_ADDR_WIDTH:0] reset_target; // Address of loader or restart code
reg   restart_underway;                        // remember restarting is in process

logic unsigned [`CODE_ADDR_WIDTH-1:0] return_addr; // return address after call instruction
logic [31:0] word1;                            // first word of instruction 1
logic unsigned [`CODE_ADDR_WIDTH-1:0] instruction_pointer1; // address of instruction 1

reg [3:0] jump_case;                           // for debug display only. may be removed

// It takes two clock cycles to fetch data from the code cache: one clock to send an address to
// the code cache, and one clock to send the data from the code cache.
// The following three shift registers are keeping track for the data that is underway:
// next_underway is tracking sequential code, target_underway is tracking jump targets,
// and wait_for_target tells that we are waiting for a jump target to be calculated and fetched.

reg [1:0] next_underway;                       // next_underway is a shift register indicating that code words are underway from the code cache
                                               // next_underway is shifted right with zero extension
                                               // next_underway[0]: data arrived from code cache
                                               // next_underway[1]: next address has been sent to code cache

reg [2:0] target_underway;                     // target_underway is a shift register indicating that a jump target is underway:
                                               // target_underway is shifted right with zero extension
                                               // 100: system reset
                                               // 010: wait for target to be fetched from code cache
                                               // 001: target code is inserted in fetch_buffer. Clear wait_for_target

reg wait_for_target;                           // wait_for_target indicates that an unconditional jump, call, or return
                                               // is waiting for the target to be fetched from the code cache

reg wait_for_jump;                             // wait_for_jump indicates that a conditional or indirect jump or call
                                               // has been dispatched and is waiting for the ALU to deliver the target address


// Analyze the status of fetch_buffer:
always_comb begin

    // if (restart == 0): Start address is loader address
    // if (restart == 1): Start address is restart address = loader address + 1
    // loader_start_address is in bytes; >> 3 gives the 64-bit doubleword index and the
    // appended bit selects the word within that doubleword.
    reset_target = {loader_start_address >> 3, (restart | restart_underway)};

    // Find length and position of instruction 0
    if (valid_words0 > 0) begin
        instruction_length0 = fetch_buffer[0][31] ? fetch_buffer[0][31:30] : 2'b01; // the length of instruction 0
        // instruction 0 is ready if all words belonging to the instruction are fetched.
        instruction_ready0 = (valid_words0 >= instruction_length0) && !target_underway[0] && !wait_for_target;
        shift_out0 = instruction_ready0 & !stall_in & !reset & (!wait_for_jump | nojump_in); // instruction 0 will be dispatched in this clock cycle
    end else begin
        // First instruction has not been fetched yet
        instruction_length0 = 0;
        instruction_ready0 = 0;
        shift_out0 = 0;
    end

    // number of words dispatched
    if (shift_out0)
        dispatch_length0 = instruction_length0;
    else
        dispatch_length0 = 0;

    buffer_action[0] = (next_underway[0] | target_underway[0]) & valid_in; // fill buffer
    buffer_action[1] = shift_out0;             // instruction 0 dispatched. shift down buffer

    // predict if the next instruction, i.e. instruction 1, will be ready in next clock cycle
    if (target_underway[0] & valid_in) begin
        if (jump_target[0])
            valid_words1 = 1;                  // jumping to an odd address. use only the upper half of read_data
        else
            valid_words1 = 2;                  // jumping to even address. use 64 bits read_data
    end else if (wait_for_target) begin
        valid_words1 = 0;
    end else begin
        if (next_underway[0] & valid_in)
            valid_words1 = valid_words0 - dispatch_length0 + 2;
        else
            valid_words1 = valid_words0 - dispatch_length0;
    end

    // Find first word of instruction 1 for the sake of early jump detection and predecoding.
    // (Here, I am shortening the critical path
    // valid_words0 -> instruction_length0 -> instruction_ready0 -> shift_out0 -> dispatch_length0
    // -> valid_words1 -> word1 -> instruction_length1 -> early_jump_addr -> instruction_pointer_out
    // by postponing "if (valid_words1 != 0)")
    if (target_underway[0] && valid_in) begin  // get instruction1 from jump target
        if (jump_target[0]) begin
            word1 = read_data[63:32];          // jumping to odd address
        end else begin
            word1 = read_data[31:0];
        end
        instruction_pointer1 = jump_target;
        instruction_length1 = word1[31] ? word1[31:30] : 2'b01; // length of second instruction

    end else if (valid_words0 > instruction_length0) begin // instruction 1 is already in buffer
        word1 = fetch_buffer[instruction_length0];
        instruction_pointer1 = instruction_pointer_out + instruction_length0;
        instruction_length1 = word1[31] ? word1[31:30] : 2'b01; // length of second instruction

    end else if (valid_words0 == instruction_length0) begin // instruction 1 is going into buffer in this clock cycle
        word1 = read_data[31:0];
        instruction_pointer1 = instruction_pointer_out + instruction_length0;
        instruction_length1 = word1[31] ? word1[31:30] : 2'b01; // length of second instruction

    end else if (valid_words0 > 0) begin       // instruction 1 is partially in buffer
        word1 = fetch_buffer[0];
        instruction_pointer1 = instruction_pointer_out;
        instruction_length1 = word1[31] ? word1[31:30] : 2'b01; // length of second instruction

    end else begin
        word1 = 0;
        instruction_pointer1 = 0;              //64'HXXXXXXXXXXXXXXXX;
        instruction_length1 = 3;               // indicate not ready
    end


    // Look for jump, call, and return instructions in instruction 1
    // in order to fetch target as early as possible.
    // This is done in the following way:
    // Unconditional jumps, calls, and returns are handled as early as possible in order
    // to fetch early from the target address and thereby save time. However,
    // we have to check if there is a preceding jump or call in a preceding position in
    // fetch_buffer before we execute a jump, call, or return in position 2.
    // Conditional and indirect jumps are detected when they are in position 0 in fetch_buffer,
    // and we have to wait for the ALU to find the target address.
    // Indirect calls are also detected when they are in position 0 in fetch_buffer:
    // the return address is pushed on the call stack while we wait for the ALU to find the target address.
    // The following variables tell what we have found here:
    // early_jump:       An unconditional jump, call, or return detected in position 1 or 2.
    // conditional_jump: A conditional or indirect jump or call is detected. Wait for ALU to find target
    // call_instruction: 1: any kind of call detected. Push return address on stack
    //                   2: a return or sys_return instruction detected. Pop return address from stack

    conditional_jump = 0;
    early_jump = 0;
    early_jump_addr = 0;
    call_instruction = 0;
    return_addr = 0;

    instruction_ready1 = (valid_words1 >= instruction_length1) & !reset && (!wait_for_jump | nojump_in); // instruction 1 will be dispatched in next clock cycle
    //valid_out <= valid_words1 >= instruction_length1 & !reset && !early_jump & target_underway[2:1] == 0 & (!wait_for_jump | nojump_in);


    if (valid_words1 != 0 && word1[31:28] == instruction_jumpa) begin
        // Any single-word control transfer instruction is underway
        if ((word1[31:25] == instruction_jump_uncond) & !stall_in & (!wait_for_jump | nojump_in)) begin
            // unconditional jump or call instruction found in instruction 1
            early_jump = 1;
            // Add 24-bit signed offset to address of end of instruction.
            // The offset is sign-extended explicitly: instruction_pointer1 is unsigned, which makes
            // the whole expression unsigned, so a $signed() cast on the offset alone would be
            // zero-extended and backward jumps would fail when `CODE_ADDR_WIDTH > 24.
            early_jump_addr = {{(`CODE_ADDR_WIDTH){word1[23]}}, word1[23:0]} + instruction_pointer1 + 1;
            call_instruction = word1[24];      // 0: unconditional jump, 1: direct call
            return_addr = instruction_pointer1 + instruction_length1; // return address for call instruction
        end else if ((word1[31:21] == instruction_return || word1[31:21] == instruction_sys_return) & !stall_in & (!wait_for_jump | nojump_in)) begin
            // a return instruction is found in the first instruction
            early_jump = 1;
            early_jump_addr = return_pop_data; // get return address from call stack
            call_instruction = 2;              // 2 means return instruction
            return_addr = 0;
        end else if ((word1[`OP1] == `IJ_JUMP_INDIRECT_MEM+1 || word1[`OP1] == `IJ_JUMP_RELATIVE+1 || word1[`OP1] == `IJ_SYSCALL) & !stall_in & (!wait_for_jump | nojump_in)) begin
            // an indirect call or system call instruction is found in the first instruction
            early_jump = 0;
            early_jump_addr = 0;
            return_addr = instruction_pointer1 + instruction_length1; // return address to push on call stack
            conditional_jump = 1;              // this instruction must go to the ALU
            if (word1[`OP1] == `IJ_TRAP && word1[`MODE] == 7) begin
                // Trap or breakpoint in format 1.7C (IJ_TRAP == IJ_SYSCALL)
                // The breakpoint instruction should not push a return address on the call stack as long
                // as it only activates single step mode without calling any interrupt service routine.
                // Note: this code must be changed if any traps or trap instructions go to an interrupt
                // service routine that ends with a return or a system return.
                // Setting call_instruction to 1 here will make the next return instruction fail if the
                // trap does not end with a return.
                call_instruction = 0;
            end else begin
                // All other indirect call and system call instructions
                call_instruction = 1;
            end
        end else begin
            // other conditional or indirect jump instruction found in instruction 1
            early_jump = 0;
            early_jump_addr = 0;
            call_instruction = 0;
            conditional_jump = 1;              // this instruction must go to the ALU
            return_addr = 0;
        end

    end else if (valid_words1 > 1 && word1[31:24] == instruction_jump2w) begin
        // any double-word jump or call instruction found in the instruction 1
        early_jump = 0;
        early_jump_addr = 0;
        conditional_jump = 1;                  // this instruction must go to the ALU
        if (word1[5:0] == `IJ_JUMP_INDIRECT_MEM + 1 // indirect call
        || word1[5:0] == `IJ_JUMP_RELATIVE + 1      // call with relative pointer
        || word1[5:0] == `IJ_SYSCALL                // system call
        || word1[`OP1] == 7                         // system call
        ) begin
            call_instruction = !stall_in & (!wait_for_jump | nojump_in); // push return address on stack
            return_addr = instruction_pointer1 + instruction_length1;
        end else begin
            call_instruction = 0;
            return_addr = 0;
        end

    end else if (valid_words1 > 2 && word1[31:24] == instruction_jump3w) begin
        // any triple-word jump or call instruction found in first instruction
        early_jump = 0;
        early_jump_addr = 0;
        conditional_jump = 1;                  // this instruction must go to the ALU
        if (word1[5:0] == `IJ_JUMP_INDIRECT_MEM+1   // 64-bit call
        || word1[5:0] == `IJ_SYSCALL                // system call
        ) begin
            call_instruction = !stall_in & (!wait_for_jump | nojump_in); // push return address on stack
            return_addr = instruction_pointer1 + instruction_length1;
        end else begin
            call_instruction = 0;
            return_addr = 0;
        end
    end else if (valid_words1 != 0 && word1[31:21] == instruction_read_perfs && word1[`M]) begin
        // the serializing instruction read_perfs must flush the pipeline.
        // Use the conditional jump mechanism for this, and give a nojump_in when ready to resume feeding the pipeline
        conditional_jump = 1;                  // serializing instruction read_perfs
    end

    // Check if we can fill the buffer.
    // This section reads early_jump, so it must come after the jump detection above:
    // always_comb is not sensitive to variables written inside the block, and reading
    // early_jump before its assignment would use the value left over from the previous
    // evaluation in simulation.
    if ((target_underway[0] | early_jump | jump_in) & valid_in) begin // overwrite buffer with new jump target
        send_next = 1;
        fetch_buffer_pos = 0;
    end else begin
        if (shift_out0) begin
            fetch_buffer_pos = valid_words0 - instruction_length0;
        end else begin
            fetch_buffer_pos = valid_words0;
        end

        // determine whether we will fetch the next doubleword from the code cache.
        // maybe this can be tweaked a little better, but make sure the fetch buffer cannot overflow in case of stalls
        if (next_underway[0] & valid_in & next_underway[1]) begin
            send_next = fetch_buffer_pos < fetch_buffer_size - 6;
        end else if ((next_underway[0] & valid_in) | next_underway[1]) begin
            send_next = fetch_buffer_pos < fetch_buffer_size - 4;
        end else begin
            send_next = fetch_buffer_pos < fetch_buffer_size - 2;
        end
    end
end


// Generate code for all possible inputs to each word in fetch_buffer.
// The current instruction is removed, and the rest of fetch_buffer is shifted down to make space for next 2 words of code
// Data from the code cache are inserted into the first vacant space of fetch_buffer
genvar i;
generate
    // generation loop for each word in fetch_buffer
    for (i = 0; i < fetch_buffer_size; i++) begin
        always_ff @(posedge clock) if (clock_enable) begin

            if (i < fetch_buffer_pos && buffer_action[1]) begin
                // instruction 0 is being dispatched. shift down
                fetch_buffer[i][31:0] <= fetch_buffer[i+instruction_length0][31:0];

            end else if (i == fetch_buffer_pos && buffer_action[0]) begin
                // load first word
                if (target_underway[0] & jump_target[0]) begin
                    // jumping to an odd address. use only upper half of read_data
                    fetch_buffer[i][31:0] <= read_data[63:32];
                end else begin
                    // load first word
                    fetch_buffer[i][31:0] <= read_data[31:0];
                end

            end else if (i == fetch_buffer_pos + 1 && buffer_action[0]) begin
                // load second word
                fetch_buffer[i][31:0] <= read_data[63:32];

            end
        end
    end
endgenerate


// Calculate read_addr and instruction_pointer in next clock cycle
// The shift registers named target_underway and wait_for_target indicate if we are waiting for a jump target
always_ff @(posedge clock) if (clock_enable) begin

    // defaults; the reset branch below overrides both
    valid_words0 <= valid_words1;
    read_enable_out <= send_next;

    if (!stall_in) begin
        // send instruction to the decoder
        valid_out <= instruction_ready1 && !early_jump;

        // Unconditional jumps are bypassing the pipeline
        jump_out <= early_jump;

    end else if (instruction_ready1 && !early_jump) begin

        // Turn valid_out on, but not off, when there is stall_in.
        // This is necessary if there is a stall one instruction before a fast jump,
        // causing the jump bubble to be filled. Otherwise, it skips the first instruction after the jump
        valid_out <= 1;
    end

    jump_case <= 0;

    if (reset) begin
        // reset button pressed
        if (restart) restart_underway <= 1;
        next_underway <= 2'b00;
        target_underway <= 3'b100;
        wait_for_target <= 1;
        wait_for_jump <= 0;
        jump_target <= reset_target;
        read_addr_out <= reset_target >> 1;
        instruction_pointer_out <= reset_target;
        valid_words0 <= 0;
        read_enable_out <= 0;
        valid_out <= 0;
        jump_out <= 0;

    end else if (target_underway[2]) begin
        // first clock after reset
        jump_case <= 1;
        next_underway <= 2'b00;
        target_underway <= {1'b0,target_underway[2:1]}; // shift right to indicate when jump target arrives
        wait_for_target <= 1;                  // skip all instructions until jump target arrives
        instruction_pointer_out <= reset_target;
        jump_target <= reset_target;
        read_addr_out <= reset_target >> 1;

    end else if (early_jump) begin
        // unconditional jump detected in instruction 1
        jump_case <= 2;
        next_underway <= 2'b00;
        target_underway <= 3'b010;             // wait 2 clock cycles for target
        read_addr_out <= early_jump_addr >> 1;
        jump_target <= early_jump_addr;
        restart_underway <= 0;
        if (!stall_in) begin
            wait_for_target <= 1;              // skip all instructions until jump target arrives
            wait_for_jump <= 0;
            instruction_pointer_out <= early_jump_addr;
        end

    end else if (conditional_jump && (instruction_ready1 & !stall_in || shift_out0)) begin
        // conditional jump detected in instruction 1
        jump_case <= 3;
        next_underway <= {send_next,next_underway[1]}; // shift right to indicate when data arrives
        target_underway <= 3'b000;             // no jump target is underway until the ALU delivers one
        // read address is two words ahead because reading takes 2 clock cycles
        if (send_next) begin
            read_addr_out <= read_addr_out + 1;
        end
        wait_for_jump <= 1;                    // wait for jump target address from ALU
        jump_target <= 0;
        wait_for_target <= 0;
        if (shift_out0) begin
            // point to next instruction
            instruction_pointer_out <= instruction_pointer_out + instruction_length0;
        end
        /*if (!stall_in) begin
            jump_target <= 0;
            wait_for_target <= 0;
        end*/

    end else if (target_underway[0] & valid_in) begin
        // a jump target has arrived from code cache. (ignore any subsequent jump instructions)
        restart_underway <= 0;
        jump_case <= 4;
        next_underway <= {send_next, next_underway[1]}; // shift right to indicate when data arrives
        wait_for_target <= 0;                  // stop waiting for jump target
        target_underway <= 3'b000;
        read_addr_out <= read_addr_out + 1;
        if (!stall_in) begin
            instruction_pointer_out <= jump_target; // set address of current instruction
        end

    end else if (jump_in & wait_for_jump & valid_words1 >= instruction_length1) begin
        // a conditional or indirect jump instruction has been executed in ALU
        // the ALU has sent the target address directly to the code cache to save one clock cycle
        //next_underway <= 2'b00;
        restart_underway <= 0;
        jump_case <= 5;
        next_underway <= {send_next, next_underway[1]}; // shift right to indicate when data arrives
        target_underway <= 3'b001;             // wait one clock cycle for target
        if (!stall_in) begin
            wait_for_jump <= 0;
            read_addr_out <= (jump_pointer >> 1) + 1;
            wait_for_target <= 1;
            jump_target <= jump_pointer;
            instruction_pointer_out <= jump_pointer;
        end

    end else if (nojump_in & wait_for_jump) begin
        // a conditional or indirect jump instruction has been executed in ALU
        // and the target is the next instruction
        //next_underway <= {send_next,next_underway[1]}; // shift right to indicate when data arrives
        restart_underway <= 0;
        jump_case <= 6;
        next_underway <= {send_next, next_underway[1]}; // shift right to indicate when data arrives
        target_underway <= 3'b000;             // jump not taken: no jump target is underway
        wait_for_target <= 0;
        wait_for_jump <= 0;
        if (send_next) begin
            read_addr_out <= read_addr_out + 1;
        end
        // if (!stall_in) begin
        if (shift_out0) begin
            instruction_pointer_out <= instruction_pointer_out + instruction_length0;
        end

    end else begin
        // no new jump instruction
        restart_underway <= 0;
        jump_case <= 7;
        next_underway <= {send_next,next_underway[1]}; // shift right to indicate when data arrives
        target_underway <= {1'b0,target_underway[2:1]}; // shift right to indicate when jump target arrives

        // make ready for next read. Least significant address bit ignored because data bus is double size
        // read address is two words ahead because reading takes 2 clock cycles
        if (send_next) begin
            read_addr_out <= read_addr_out + 1;
        end
        if (shift_out0) begin
            // point to next instruction
            instruction_pointer_out <= instruction_pointer_out + instruction_length0;
        end

    end

    // communicate with call stack as soon as a call or return instruction is detected.
    // The outputs are cleared while target_underway[2:1] != 0 to avoid sending call_e_out
    // or return_e_out multiple times
    if (reset || target_underway[2:1] != 0) begin
        call_e_out <= 0;
        return_e_out <= 0;
        call_push_data_out <= 0;
    end else if (call_instruction == 1) begin
        call_e_out <= 1;
        return_e_out <= 0;
        call_push_data_out <= return_addr;
    end else if (call_instruction == 2) begin
        return_e_out <= 1;
        call_e_out <= 0;
        call_push_data_out <= 0;
    end else begin
        call_e_out <= 0;
        call_push_data_out <= 0;
        return_e_out <= 0;
    end

    // predict that decoder will use multiple clock cycles for push and pop instructions
    if (valid_words1 != 0 && word1[`IL] == 2'b01 && (word1[`MODE] == 3'b011 || (word1[`MODE] == 3'b00 && word1[`M]))
    && word1[`OP1] >> 1 == instruction_push_pop >> 1 && shift_out0) begin
        stall_predict_out <= 1;                // mode = 1.3 or 1.8, op1 = 56 or 57
    end else begin
        stall_predict_out <= 0;
    end

    // collect various signals for debugging purpose
    debug1_out[0]    <= early_jump;
    debug1_out[1]    <= conditional_jump;
    debug1_out[2]    <= 0;                     // unused bit, driven to 0 so the output has no undriven bits
    debug1_out[3]    <= stall_in;

    debug1_out[6:4]  <= valid_words1[2:0];
    debug1_out[7]    <= instruction_ready1;

    debug1_out[8]    <= buffer_action[0];      // fill buffer
    debug1_out[9]    <= buffer_action[1];      // shift_out0;
    debug1_out[11:10]<= dispatch_length0;

    debug1_out[15:12]<= fetch_buffer_pos;

    debug1_out[16]   <= send_next;
    debug1_out[17]   <= instruction_ready0;
    debug1_out[18]   <= nojump_in;
    debug1_out[19]   <= jump_in;
end

// register variables are assigned to avoid an extra clock delay:
assign debug1_out[21:20] = next_underway;
assign debug1_out[23:22] = target_underway[1:0];

assign debug1_out[27:24] = jump_case;          // jump handling case

assign debug1_out[28] = wait_for_target;
assign debug1_out[29] = wait_for_jump;
assign debug1_out[30] = 1'b0;                  // unused bit, driven to 0 so the output has no undriven bits
assign debug1_out[31] = valid_out;


// output instruction, 1-3 words
assign instruction_out[31:0]  = fetch_buffer[0][31:0];
assign instruction_out[63:32] = fetch_buffer[1][31:0];
assign instruction_out[95:64] = fetch_buffer[2][31:0];

endmodule