1 |
2 |
marcelos |
/*
|
2 |
|
|
* Copyright (c) 2018, Marcelo Samsoniuk
|
3 |
|
|
* All rights reserved.
|
4 |
|
|
*
|
5 |
|
|
* Redistribution and use in source and binary forms, with or without
|
6 |
|
|
* modification, are permitted provided that the following conditions are met:
|
7 |
|
|
*
|
8 |
|
|
* * Redistributions of source code must retain the above copyright notice, this
|
9 |
|
|
* list of conditions and the following disclaimer.
|
10 |
|
|
*
|
11 |
|
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
12 |
|
|
* this list of conditions and the following disclaimer in the documentation
|
13 |
|
|
* and/or other materials provided with the distribution.
|
14 |
|
|
*
|
15 |
|
|
* * Neither the name of the copyright holder nor the names of its
|
16 |
|
|
* contributors may be used to endorse or promote products derived from
|
17 |
|
|
* this software without specific prior written permission.
|
18 |
|
|
*
|
19 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
20 |
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
21 |
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
22 |
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
23 |
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
24 |
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
25 |
|
|
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
26 |
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
27 |
|
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
28 |
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
29 |
|
|
*/
|
30 |
|
|
|
31 |
|
|
`timescale 1ns / 1ps
|
32 |
|
|
|
33 |
|
|
// memory architecture
|
34 |
|
|
//
|
35 |
|
|
// TODO: fix the different memory architecture concepts:
|
36 |
|
|
// status:
|
37 |
|
|
// ICACHE: works without interrupt
|
38 |
|
|
// DCACHE: does not work!
|
39 |
|
|
// WAITSTATE: works
|
40 |
|
|
//
|
41 |
|
|
//`define __ICACHE__ // instruction cache
|
42 |
|
|
//`define __DCACHE__ // data cache (bug: simulation only)
|
43 |
|
|
//`define __WAITSTATES__ // wait-state tests, no cache
|
44 |
|
|
|
45 |
|
|
// peripheral configuration
|
46 |
|
|
//
|
47 |
|
|
// UART speed is set in bits per second, typically 115200 bps:
|
48 |
|
|
|
49 |
|
|
`define __UARTSPEED__ 115200
|
50 |
|
|
|
51 |
|
|
// darkriscv/darksocv configuration
|
52 |
|
|
//
|
53 |
|
|
// pipeline stages:
|
54 |
|
|
//
|
55 |
|
|
// 2-stage version: core and memory in different clock edges result in less
|
56 |
|
|
// clock performance, but less losses when the program counter changes
|
57 |
|
|
// (pipeline flush = 1 clock). Works like a 4-stage pipeline and resembles
|
58 |
|
|
// the 68040 clock scheme, with instruction per clock = 1. alternatively,
|
59 |
|
|
// it is possible to work w/ 1 wait-state and 1 clock edge, but with a penalty
|
60 |
|
|
// in performance (instruction per clock = 0.5).
|
61 |
|
|
//
|
62 |
|
|
// 3-stage version: core and memory in the same clock edge require one extra
|
63 |
|
|
// stage in the pipeline, but keep a good performance most of time
|
64 |
|
|
// (instruction per clock = 1). of course, read operations require 1
|
65 |
|
|
// wait-state, which means sometimes the read performance is reduced.
|
66 |
|
|
|
67 |
|
|
`define __3STAGE__
|
68 |
|
|
|
69 |
4 |
marcelos |
// read-modify-write cycle:
|
70 |
|
|
//
|
71 |
|
|
// Generate RMW cycles when writing in the memory. This option basically
|
72 |
|
|
// makes the read and write cycle symmetric and may work better in the cases
|
73 |
|
|
// when the 32-bit memory does not support separate write enables for
|
74 |
|
|
// separate 16-bit and 8-bit words. Typically, the RMW cycle results in a
|
75 |
|
|
// decrease of 5% in the performance (not the clock, but the instruction
|
76 |
|
|
// pipeline efficiency) due to memory wait-states.
|
77 |
|
|
// Additional note: the RMW cycle is required for -O3 compilation!
|
78 |
|
|
|
79 |
|
|
//`define __RMW_CYCLE__
|
80 |
|
|
|
81 |
2 |
marcelos |
// multi-threading support:
|
82 |
|
|
//
|
83 |
6 |
marcelos |
// Decreases clock performance by 20% (80MHz), but enables two or more
|
84 |
|
|
// contexts (threads) in the core. The threads work in symmetrical way,
|
85 |
|
|
// which means that they will start with exactly the same core parameters
|
86 |
|
|
// (same initial PC, same initial SP, etc). The boot.s code is designed
|
87 |
|
|
// to handle this difference and set each thread to different
|
88 |
|
|
// applications.
|
89 |
4 |
marcelos |
// Notes:
|
90 |
|
|
// a) threading is currently supported only in the 3-stage pipeline version.
|
91 |
|
|
// b) the old experimental "interrupt mode" was removed, which means that
|
92 |
|
|
// the multi-thread mode does not make anything "visible" other than
|
93 |
|
|
// increment the gpio register.
|
94 |
6 |
marcelos |
// c) the threading in the non-interrupt mode switches when the program flow
|
95 |
|
|
// changes, i.e. every jal instruction. When the core is idle, it is
|
96 |
|
|
// probably in a jal loop.
|
97 |
2 |
marcelos |
|
98 |
|
|
//`define __THREADING__
|
99 |
|
|
|
100 |
6 |
marcelos |
// number of threads: between 2 and n. Of course, it requires more and
|
101 |
|
|
// more FPGA space in order to implement it, depending on the FPGA technology.
|
102 |
|
|
|
103 |
|
|
`define NTHREADS 4
|
104 |
|
|
|
105 |
2 |
marcelos |
// performance measurement:
|
106 |
|
|
//
|
107 |
|
|
// The performance measurement can be done in the simulation level by
|
108 |
|
|
// enabling the __PERFMETER__ define, in order to check how the clock cycles
|
109 |
4 |
marcelos |
// are used in the core. The report is displayed when the FINISH_REQ signal
|
110 |
|
|
// is activated by the UART.
|
111 |
2 |
marcelos |
|
112 |
4 |
marcelos |
`define __PERFMETER__
|
113 |
2 |
marcelos |
|
114 |
|
|
// mac instruction:
|
115 |
|
|
//
|
116 |
|
|
// The mac instruction is similar to other register to register
|
117 |
|
|
// instructions, but with a different opcode 7'b1111111. The format is mac
|
118 |
|
|
// rd,r1,r2, but it is not currently possible to encode it in asm, so it is
|
119 |
|
|
// available in libc as int mac(int rd, short r1, short r2). Although it
|
120 |
|
|
// can be used to accelerate the mul/div operations, the mac operation is
|
121 |
|
|
// designed for DSP applications. with some effort (low level machine
|
122 |
|
|
// code), it is possible to reach a peak of 100MMAC/s @100MHz.
|
123 |
|
|
|
124 |
|
|
//`define __MAC16X16__
|
125 |
|
|
|
126 |
|
|
// RV32I vs RV32E:
|
127 |
|
|
//
|
128 |
|
|
// The difference between the RV32I and RV32E regarding the logic space is
|
129 |
|
|
// minimal in typical applications with modern 5 or 6 input LUT based FPGAs,
|
130 |
|
|
// but the RV32E is better with old 4 input LUT based FPGAs.
|
131 |
|
|
|
132 |
|
|
`define __RV32E__
|
133 |
|
|
|
134 |
|
|
// full harvard architecture:
|
135 |
|
|
//
|
136 |
|
|
// When defined, enforces that the instruction and data buses are connected
|
137 |
|
|
// to fully separate memory banks. Although the darkriscv always use
|
138 |
|
|
// harvard architecture in the core, with separate instruction and data
|
139 |
|
|
// buses, the logic levels outside the core can use different architectures
|
140 |
|
|
// and concepts, including von neumann, with a single bus shared by
|
141 |
|
|
// instruction and data access, as well a mix between harvard and von
|
142 |
|
|
// neumann, which is possible in the case of dual-port blockrams, where is
|
143 |
|
|
// possible connect two separate buses in a single memory bank. the main
|
144 |
|
|
// advantage of a single memory bank is that the .text and .data areas can
|
145 |
|
|
// be better allocated, but in this case it is not possible to protect the .text
|
146 |
|
|
// area as in the case of separate memory banks.
|
147 |
|
|
|
148 |
4 |
marcelos |
//`define __HARVARD__
|
149 |
2 |
marcelos |
|
150 |
|
|
// flexbuzz interface (experimental):
|
151 |
|
|
//
|
152 |
|
|
// A new data bus interface similar to a well known c*ldfire bus interface, in
|
153 |
|
|
// a way that part of the bus routing is moved to the core, in a way that
|
154 |
|
|
// is possible support different bus widths (8, 16 or 32 bit) and endians more
|
155 |
|
|
// easily (the new interface is natively big-endian, but the endian can be adjusted
|
156 |
|
|
// in the bus interface dynamically). Similarly to the standard 32-bit interface,
|
157 |
|
|
// the external logic must detect the RD/WR operation quick enough and assert HLT
|
158 |
|
|
// in order to insert wait-states and perform the required multiplexing to fit
|
159 |
|
|
// the DLEN operand size in the data bus width available.
|
160 |
|
|
|
161 |
4 |
marcelos |
`define __FLEXBUZZ__
|
162 |
|
|
|
163 |
|
|
// initial PC and SP
|
164 |
|
|
//
|
165 |
|
|
// it is possible to program the initial PC and SP. Typically, the PC is set
|
166 |
|
|
// to address 0, representing the start of ROM memory and the SP is set to
|
167 |
|
|
// the final of RAM memory. In the linker, the start of ROM memory matches
|
168 |
|
|
// with the .text area, which is defined in the boot.c code and the start of
|
169 |
|
|
// RAM memory matches with the .data and other volatile data, in a way that
|
170 |
|
|
// the stack can be positioned in the top of RAM and does not match with the
|
171 |
|
|
// .data.
|
172 |
|
|
|
173 |
2 |
marcelos |
`define __RESETPC__ 32'd0
|
174 |
|
|
`define __RESETSP__ 32'd8192
|
175 |
|
|
|
176 |
6 |
marcelos |
// UART queue:
|
177 |
|
|
//
|
178 |
|
|
// Optional RX/TX queue for communication oriented applications. The concept
|
179 |
|
|
// foresees 256 bytes for TX and RX, in a way that frames up to 128 bytes can
|
180 |
|
|
// be easily exchanged via UART.
|
181 |
|
|
|
182 |
|
|
//`define __UARTQUEUE__
|
183 |
|
|
|
184 |
2 |
marcelos |
// board definition:
|
185 |
|
|
//
|
186 |
|
|
// The board is automatically defined in the xst/xise files via Makefile or
|
187 |
|
|
// ISE. If that is not the case, please define your board name here:
|
188 |
|
|
|
189 |
|
|
//`define AVNET_MICROBOARD_LX9
|
190 |
|
|
//`define XILINX_AC701_A200
|
191 |
|
|
//`define QMTECH_SDRAM_LX16
|
192 |
|
|
|
193 |
|
|
// the following defines are automatically defined:
|
194 |
|
|
|
195 |
|
|
// Simulator detection: each block below defines SIMULATION with a distinct
// numeric id when the corresponding macro is already defined
// (presumably predefined by the respective simulator -- confirm per tool):
//   __ICARUS__       -> SIMULATION 1
//   XILINX_ISIM      -> SIMULATION 2
//   MODEL_TECH       -> SIMULATION 3
//   XILINX_SIMULATOR -> SIMULATION 4
// The blocks are independent (not chained with else), so if more than one of
// these macros is defined, SIMULATION is redefined and the last match wins.
// If none is defined, SIMULATION stays undefined.
`ifdef __ICARUS__
|
196 |
|
|
`define SIMULATION 1
|
197 |
|
|
`endif
|
198 |
|
|
|
199 |
|
|
`ifdef XILINX_ISIM
|
200 |
|
|
`define SIMULATION 2
|
201 |
|
|
`endif
|
202 |
|
|
|
203 |
|
|
`ifdef MODEL_TECH
|
204 |
|
|
`define SIMULATION 3
|
205 |
|
|
`endif
|
206 |
|
|
|
207 |
|
|
`ifdef XILINX_SIMULATOR
|
208 |
|
|
`define SIMULATION 4
|
209 |
|
|
`endif
|
210 |
|
|
|
211 |
|
|
`ifdef AVNET_MICROBOARD_LX9
|
212 |
|
|
`define BOARD_ID 1
|
213 |
|
|
//`define BOARD_CK 100000000
|
214 |
|
|
//`define BOARD_CK 66666666
|
215 |
|
|
//`define BOARD_CK 40000000
|
216 |
|
|
// example of DCM logic:
|
217 |
|
|
`define BOARD_CK_REF 100000000
|
218 |
|
|
`define BOARD_CK_MUL 2
|
219 |
|
|
`ifdef __3STAGE__
|
220 |
|
|
`define BOARD_CK_DIV 2 // 100MHz
|
221 |
|
|
`else
|
222 |
|
|
`define BOARD_CK_DIV 4 // 50MHz
|
223 |
|
|
`endif
|
224 |
|
|
`endif
|
225 |
|
|
|
226 |
|
|
`ifdef XILINX_AC701_A200
|
227 |
|
|
`define BOARD_ID 2
|
228 |
|
|
//`define BOARD_CK 90000000
|
229 |
|
|
`define BOARD_CK_REF 90000000
|
230 |
|
|
`define BOARD_CK_MUL 4
|
231 |
|
|
`define BOARD_CK_DIV 2
|
232 |
|
|
`endif
|
233 |
|
|
|
234 |
|
|
`ifdef QMTECH_SDRAM_LX16
|
235 |
|
|
`define BOARD_ID 3
|
236 |
|
|
`define BOARD_CK_REF 50000000
|
237 |
|
|
`define BOARD_CK_MUL 4
|
238 |
|
|
`define BOARD_CK_DIV 2
|
239 |
|
|
`define INVRES 1
|
240 |
|
|
`endif
|
241 |
|
|
|
242 |
|
|
`ifdef QMTECH_SPARTAN7_S15
|
243 |
|
|
`define BOARD_ID 4
|
244 |
|
|
`define BOARD_CK_REF 50000000
|
245 |
|
|
`define BOARD_CK_MUL 20
|
246 |
|
|
`define BOARD_CK_DIV 10
|
247 |
|
|
`define XILINX7CLK 1
|
248 |
|
|
`define VIVADO 1
|
249 |
|
|
`define INVRES 1
|
250 |
|
|
`endif
|
251 |
|
|
|
252 |
|
|
`ifdef LATTICE_BREVIA2_XP2
|
253 |
|
|
`define BOARD_ID 5
|
254 |
|
|
`define BOARD_CK 50000000
|
255 |
|
|
`define INVRES 1
|
256 |
|
|
`endif
|
257 |
|
|
|
258 |
|
|
`ifdef PISWORDS_RS485_LX9
|
259 |
|
|
`define BOARD_ID 6
|
260 |
|
|
`define BOARD_CK_REF 50000000
|
261 |
|
|
`define BOARD_CK_MUL 4
|
262 |
|
|
`define BOARD_CK_DIV 2
|
263 |
|
|
`define INVRES 1
|
264 |
|
|
`endif
|
265 |
|
|
|
266 |
|
|
`ifdef DIGILENT_SPARTAN3_S200
|
267 |
|
|
`define BOARD_ID 7
|
268 |
|
|
`define BOARD_CK 50000000
|
269 |
|
|
`endif
|
270 |
|
|
|
271 |
|
|
`ifdef ALIEXPRESS_HPC40GBE_K420
|
272 |
|
|
`define BOARD_ID 8
|
273 |
|
|
//`define BOARD_CK 200000000
|
274 |
|
|
`define BOARD_CK_REF 100000000
|
275 |
6 |
marcelos |
`define BOARD_CK_MUL 12
|
276 |
2 |
marcelos |
`define BOARD_CK_DIV 5
|
277 |
|
|
`define XILINX7CLK 1
|
278 |
|
|
`define INVRES 1
|
279 |
|
|
`endif
|
280 |
|
|
|
281 |
|
|
`ifdef QMTECH_ARTIX7_A35
|
282 |
|
|
`define BOARD_ID 9
|
283 |
|
|
`define BOARD_CK_REF 50000000
|
284 |
|
|
`define BOARD_CK_MUL 20
|
285 |
|
|
`define BOARD_CK_DIV 10
|
286 |
|
|
`define XILINX7CLK 1
|
287 |
|
|
`define VIVADO 1
|
288 |
|
|
`define INVRES 1
|
289 |
|
|
`endif
|
290 |
|
|
|
291 |
6 |
marcelos |
`ifdef ALIEXPRESS_HPC40GBE_XKCU040
|
292 |
|
|
`define BOARD_ID 10
|
293 |
|
|
//`define BOARD_CK 200000000
|
294 |
|
|
`define BOARD_CK_REF 100000000
|
295 |
|
|
`define BOARD_CK_MUL 8 // x8/2 = 400MHZ (overclock!)
|
296 |
|
|
`define BOARD_CK_DIV 2 // vivado reco. = 250MHz
|
297 |
|
|
`define XILINX7CLK 1
|
298 |
|
|
`define INVRES 1
|
299 |
|
|
`endif
|
300 |
|
|
|
301 |
2 |
marcelos |
// Fallback when no board block has defined BOARD_ID: use board id 0 and a
// fixed board clock of 100000000 Hz (100 MHz).
// BOARD_CK_REF is not defined here, so BOARD_CK keeps this fixed value unless
// BOARD_CK_REF was defined elsewhere.
`ifndef BOARD_ID
|
302 |
|
|
`define BOARD_ID 0
|
303 |
|
|
`define BOARD_CK 100000000
|
304 |
|
|
`endif
|
305 |
|
|
|
306 |
|
|
// Derived board clock: when a board supplies a reference clock, BOARD_CK is
// defined as REF * MUL / DIV. The macro is plain text substitution, so the
// expression is evaluated wherever BOARD_CK is expanded; the outer parentheses
// keep it safe inside larger expressions.
// Boards that define BOARD_CK directly do not define BOARD_CK_REF and skip this.
// NOTE(review): REF * MUL is computed before the division; the largest product
// among the boards in this file is 100000000 * 12 = 1.2e9, which still fits a
// 32-bit signed integer -- re-check when adding boards with larger factors.
`ifdef BOARD_CK_REF
|
307 |
|
|
`define BOARD_CK (`BOARD_CK_REF * `BOARD_CK_MUL / `BOARD_CK_DIV)
|
308 |
|
|
`endif
|
309 |
|
|
|
310 |
6 |
marcelos |
// the 3-stage pipeline is required when the threading mode is enabled,
|
311 |
|
|
// also, we need a non-null number of threads (default 2)
|
312 |
|
|
|
313 |
|
|
// Threading prerequisites, applied only when __THREADING__ is defined:
//   1. force __3STAGE__ on if it is not already defined;
//   2. default NTHREADS to 2 if it is not already defined.
// NTHREADS is defined unconditionally as 4 earlier in this file, so the
// default of 2 only takes effect if that earlier define is removed.
`ifdef __THREADING__
|
314 |
|
|
`ifndef __3STAGE__
|
315 |
|
|
`define __3STAGE__
|
316 |
|
|
`endif
|
317 |
|
|
`ifndef NTHREADS
|
318 |
|
|
`define NTHREADS 2
|
319 |
|
|
`endif
|
320 |
|
|
`endif
|
321 |
|
|
|
322 |
2 |
marcelos |
// darkuart baudrate automatically calculated according to board clock:
|
323 |
|
|
|
324 |
|
|
// UART speed fallback: 115200 bps when __UARTSPEED__ is not defined.
// __UARTSPEED__ is already defined as 115200 earlier in this file, so this
// fallback only applies if that define is removed.
`ifndef __UARTSPEED__
|
325 |
|
|
`define __UARTSPEED__ 115200
|
326 |
|
|
`endif
|
327 |
|
|
|
328 |
|
|
// __BAUD__ is the ratio BOARD_CK / __UARTSPEED__, i.e. board clock cycles per
// UART bit. With integer operands the division truncates.
// NOTE(review): how the UART consumes this value is not visible here -- confirm.
`define __BAUD__ ((`BOARD_CK/`__UARTSPEED__))
|