1 |
21 |
dgisselq |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
2 |
|
|
%%
|
3 |
|
|
%% Filename: spec.tex
|
4 |
|
|
%%
|
5 |
|
|
%% Project: Zip CPU -- a small, lightweight, RISC CPU soft core
|
6 |
|
|
%%
|
7 |
|
|
%% Purpose: This LaTeX file contains all of the documentation/description
|
8 |
33 |
dgisselq |
%% currently provided with this Zip CPU soft core. It supersedes
|
9 |
21 |
dgisselq |
%% any information about the instruction set or CPUs found
|
10 |
|
|
%% elsewhere. It's not nearly as interesting, though, as the PDF
|
11 |
|
|
%% file it creates, so I'd recommend reading that before diving
|
12 |
|
|
%% into this file. You should be able to find the PDF file in
|
13 |
|
|
%%		the SVN distribution together with this LaTeX file and a copy of
|
14 |
|
|
%% the GPL-3.0 license this file is distributed under. If not,
|
15 |
|
|
%% just type 'make' in the doc directory and it (should) build
|
16 |
|
|
%% without a problem.
|
17 |
|
|
%%
|
18 |
|
|
%%
|
19 |
|
|
%% Creator: Dan Gisselquist
|
20 |
|
|
%% Gisselquist Technology, LLC
|
21 |
|
|
%%
|
22 |
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
23 |
|
|
%%
|
24 |
|
|
%% Copyright (C) 2015, Gisselquist Technology, LLC
|
25 |
|
|
%%
|
26 |
|
|
%% This program is free software (firmware): you can redistribute it and/or
|
27 |
|
|
%% modify it under the terms of the GNU General Public License as published
|
28 |
|
|
%% by the Free Software Foundation, either version 3 of the License, or (at
|
29 |
|
|
%% your option) any later version.
|
30 |
|
|
%%
|
31 |
|
|
%% This program is distributed in the hope that it will be useful, but WITHOUT
|
32 |
|
|
%% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
33 |
|
|
%% FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
34 |
|
|
%% for more details.
|
35 |
|
|
%%
|
36 |
|
|
%% You should have received a copy of the GNU General Public License along
|
37 |
|
|
%% with this program. (It's in the $(ROOT)/doc directory, run make with no
|
38 |
|
|
%% target there if the PDF file isn't present.) If not, see
|
39 |
|
|
%% <http://www.gnu.org/licenses/> for a copy.
|
40 |
|
|
%%
|
41 |
|
|
%% License: GPL, v3, as defined and found on www.gnu.org,
|
42 |
|
|
%% http://www.gnu.org/licenses/gpl.html
|
43 |
|
|
%%
|
44 |
|
|
%%
|
45 |
|
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
46 |
|
|
\documentclass{gqtekspec}
|
47 |
68 |
dgisselq |
\usepackage{import}
|
48 |
69 |
dgisselq |
\usepackage{bytefield}
|
49 |
68 |
dgisselq |
% \graphicspath{{../gfx}}
|
50 |
21 |
dgisselq |
\project{Zip CPU}
|
51 |
|
|
\title{Specification}
|
52 |
|
|
\author{Dan Gisselquist, Ph.D.}
|
53 |
|
|
\email{dgisselq (at) opencores.org}
|
54 |
92 |
dgisselq |
\revision{Rev.~0.8}
|
55 |
69 |
dgisselq |
\definecolor{webred}{rgb}{0.5,0,0}
|
56 |
|
|
\definecolor{webgreen}{rgb}{0,0.4,0}
|
57 |
36 |
dgisselq |
\usepackage[dvips,ps2pdf,colorlinks=true,
|
58 |
69 |
dgisselq |
anchorcolor=black,pdfpagelabels,hypertexnames,
|
59 |
36 |
dgisselq |
pdfauthor={Dan Gisselquist},
|
60 |
|
|
pdfsubject={Zip CPU}]{hyperref}
|
61 |
69 |
dgisselq |
\hypersetup{
|
62 |
|
|
colorlinks = true,
|
63 |
|
|
linkcolor = webred,
|
64 |
|
|
citecolor = webgreen
|
65 |
|
|
}
|
66 |
21 |
dgisselq |
\begin{document}
|
67 |
|
|
\pagestyle{gqtekspecplain}
|
68 |
|
|
\titlepage
|
69 |
|
|
\begin{license}
|
70 |
|
|
Copyright (C) \theyear\today, Gisselquist Technology, LLC
|
71 |
|
|
|
72 |
|
|
This project is free software (firmware): you can redistribute it and/or
|
73 |
|
|
modify it under the terms of the GNU General Public License as published
|
74 |
|
|
by the Free Software Foundation, either version 3 of the License, or (at
|
75 |
|
|
your option) any later version.
|
76 |
|
|
|
77 |
|
|
This program is distributed in the hope that it will be useful, but WITHOUT
|
78 |
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
79 |
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
80 |
|
|
for more details.
|
81 |
|
|
|
82 |
|
|
You should have received a copy of the GNU General Public License along
|
83 |
|
|
with this program. If not, see \hbox{\url{http://www.gnu.org/licenses/}} for a
|
84 |
|
|
copy.
|
85 |
|
|
\end{license}
|
86 |
|
|
\begin{revisionhistory}
|
87 |
92 |
dgisselq |
0.8 & 1/28/2016 & Gisselquist & Reduced complexity early branching \\\hline
|
88 |
69 |
dgisselq |
0.7 & 12/22/2015 & Gisselquist & New Instruction Set Architecture \\\hline
|
89 |
68 |
dgisselq |
0.6 & 11/17/2015 & Gisselquist & Added graphics to illustrate pipeline discussion.\\\hline
|
90 |
39 |
dgisselq |
0.5 & 9/29/2015 & Gisselquist & Added pipelined memory access discussion.\\\hline
|
91 |
36 |
dgisselq |
0.4 & 9/19/2015 & Gisselquist & Added DMA controller, improved stall information, and self--assessment info.\\\hline
|
92 |
33 |
dgisselq |
0.3 & 8/22/2015 & Gisselquist & First completed draft\\\hline
|
93 |
24 |
dgisselq |
0.2 & 8/19/2015 & Gisselquist & Still Draft, more complete \\\hline
|
94 |
21 |
dgisselq |
0.1 & 8/17/2015 & Gisselquist & Incomplete First Draft \\\hline
|
95 |
|
|
\end{revisionhistory}
|
96 |
|
|
% Revision History
|
97 |
|
|
% Table of Contents, named Contents
|
98 |
|
|
\tableofcontents
|
99 |
24 |
dgisselq |
\listoffigures
|
100 |
21 |
dgisselq |
\listoftables
|
101 |
|
|
\begin{preface}
|
102 |
|
|
Many people have asked me why I am building the Zip CPU. ARM processors are
|
103 |
|
|
good and effective. Xilinx makes and markets Microblaze, Altera Nios, and both
|
104 |
|
|
have better toolsets than the Zip CPU will ever have. OpenRISC is also
|
105 |
24 |
dgisselq |
available, RISC--V may be replacing it. Why build a new processor?
|
106 |
21 |
dgisselq |
|
107 |
|
|
The easiest, most obvious answer is the simple one: Because I can.
|
108 |
|
|
|
109 |
|
|
There's more to it, though. There's a lot that I would like to do with a
|
110 |
|
|
processor, and I want to be able to do it in a vendor independent fashion.
|
111 |
36 |
dgisselq |
First, I would like to be able to place this processor inside an FPGA. Without
|
112 |
|
|
paying royalties, ARM is out of the question. I would then like to be able to
|
113 |
|
|
generate Verilog code, both for the processor and the system it sits within,
|
114 |
|
|
that can run equivalently on both Xilinx and Altera chips, and that can be
|
115 |
|
|
easily ported from one manufacturer's chipsets to another. Even more, before
|
116 |
|
|
purchasing a chip or a board, I would like to know that my soft core works. I
|
117 |
|
|
would like to build a test bench to test components with, and Verilator is my
|
118 |
|
|
chosen test bench. This forces me to use all Verilog, and it prevents me from
|
119 |
|
|
using any proprietary cores. For this reason, Microblaze and Nios are out of
|
120 |
|
|
the question.
|
121 |
21 |
dgisselq |
|
122 |
|
|
Why not OpenRISC? That's a hard question. The OpenRISC team has done some
|
123 |
|
|
wonderful work on an amazing processor, and I'll have to admit that I am
|
124 |
|
|
envious of what they've accomplished. I would like to port binutils to the
|
125 |
|
|
Zip CPU, as I would like to port GCC and GDB. They are way ahead of me. The
|
126 |
|
|
OpenRISC processor, however, is complex and hefty at about 4,500 LUTs. It has
|
127 |
|
|
a lot of features of modern CPUs within it that ... well, let's just say it's
|
128 |
|
|
not the little guy on the block. The Zip CPU is lighter weight, costing only
|
129 |
32 |
dgisselq |
about 2,300 LUTs with no peripherals, and 3,200 LUTs with some very basic
|
130 |
21 |
dgisselq |
peripherals.
|
131 |
|
|
|
132 |
|
|
My final reason is that I'm building the Zip CPU as a learning experience. The
|
133 |
|
|
Zip CPU has allowed me to learn a lot about how CPUs work on a very micro
|
134 |
|
|
level. For the first time, I am beginning to understand many of the Computer
|
135 |
|
|
Architecture lessons from years ago.
|
136 |
|
|
|
137 |
|
|
To summarize: Because I can, because it is open source, because it is light
|
138 |
|
|
weight, and as an exercise in learning.
|
139 |
|
|
|
140 |
|
|
\end{preface}
|
141 |
|
|
|
142 |
|
|
\chapter{Introduction}
|
143 |
|
|
\pagenumbering{arabic}
|
144 |
|
|
\setcounter{page}{1}
|
145 |
|
|
|
146 |
|
|
|
147 |
36 |
dgisselq |
The original goal of the Zip CPU was to be a very simple CPU. You might
|
148 |
21 |
dgisselq |
think of it as a poor man's alternative to the OpenRISC architecture.
|
149 |
|
|
For this reason, all instructions have been designed to be as simple as
|
150 |
69 |
dgisselq |
possible, and the base instructions are all designed to be executed in one
|
151 |
|
|
instruction cycle per instruction, barring pipeline stalls. Indeed, even the
|
152 |
|
|
bus has been simplified to a constant 32-bit width, with no option for more
|
153 |
|
|
or less. This has resulted in the choice to drop push and pop instructions,
|
154 |
|
|
pre-increment and post-decrement addressing modes, and more.
|
155 |
21 |
dgisselq |
|
156 |
|
|
For those who like buzz words, the Zip CPU is:
|
157 |
|
|
\begin{itemize}
|
158 |
|
|
\item A 32-bit CPU: All registers are 32-bits, addresses are 32-bits,
|
159 |
|
|
instructions are 32-bits wide, etc.
|
160 |
24 |
dgisselq |
\item A RISC CPU. There is no microcode for executing instructions. All
|
161 |
|
|
instructions are designed to be completed in one clock cycle.
|
162 |
21 |
dgisselq |
\item A Load/Store architecture. (Only load and store instructions
|
163 |
|
|
can access memory.)
|
164 |
|
|
\item Wishbone compliant. All peripherals are accessed just like
|
165 |
|
|
memory across this bus.
|
166 |
|
|
\item A von Neumann architecture. (The instructions and data share a
|
167 |
|
|
common bus.)
|
168 |
|
|
\item A pipelined architecture, having stages for {\bf Prefetch},
|
169 |
69 |
dgisselq |
{\bf Decode}, {\bf Read-Operand}, a
|
170 |
|
|
combined stage containing the {\bf ALU},
|
171 |
|
|
{\bf Memory}, {\bf Divide}, and {\bf Floating Point}
|
172 |
|
|
units, and then the final {\bf Write-back} stage.
|
173 |
|
|
See Fig.~\ref{fig:cpu}
|
174 |
24 |
dgisselq |
\begin{figure}\begin{center}
|
175 |
|
|
\includegraphics[width=3.5in]{../gfx/cpu.eps}
|
176 |
|
|
\caption{Zip CPU internal pipeline architecture}\label{fig:cpu}
|
177 |
|
|
\end{center}\end{figure}
|
178 |
|
|
for a diagram of this structure.
|
179 |
21 |
dgisselq |
\item Completely open source, licensed under the GPL.\footnote{Should you
|
180 |
|
|
need a copy of the Zip CPU licensed under other terms, please
|
181 |
|
|
contact me.}
|
182 |
|
|
\end{itemize}
|
183 |
|
|
|
184 |
68 |
dgisselq |
The Zip CPU also has one very unique feature: the ability to do pipelined loads
|
185 |
|
|
and stores. This allows the CPU to access on-chip memory at one access per
|
186 |
|
|
clock, minus a stall for the initial access.
|
187 |
|
|
|
188 |
|
|
\section{Characteristics of a SwiC}
|
189 |
|
|
|
190 |
|
|
Here, we shall define a soft core internal to an FPGA as a ``System within a
|
191 |
|
|
Chip,'' or a SwiC. SwiCs have some very unique properties internal to them
|
192 |
|
|
that have influenced the design of the Zip CPU. Among these are the bus,
|
193 |
|
|
memory, and available peripherals.
|
194 |
|
|
|
195 |
|
|
Most other approaches to soft core CPUs employ a Harvard architecture.
|
196 |
|
|
This allows these other CPUs to have two separate bus structures: one for the
|
197 |
69 |
dgisselq |
program fetch, and the other for the memory. The Zip CPU is fairly unique in
|
198 |
68 |
dgisselq |
its approach because it uses a von Neumann architecture. This was done for
|
199 |
|
|
simplicity. By using a von Neumann architecture, only one bus needs to be
|
200 |
|
|
implemented within any FPGA. This helps to minimize real-estate, while
|
201 |
|
|
maintaining a high clock speed. The disadvantage is that it can severely
|
202 |
|
|
degrade the overall instructions per clock count.
|
203 |
|
|
|
204 |
|
|
Soft cores within an FPGA have an additional characteristic regarding
|
205 |
69 |
dgisselq |
memory access: it is slow. While memory on chip may be accessed at a single
|
206 |
|
|
cycle per access, small FPGAs often have only a limited amount of memory on
|
207 |
|
|
chip. Going off chip, however, is expensive. Two examples will prove this
|
208 |
|
|
point. On
|
209 |
68 |
dgisselq |
the XuLA2 board, Flash can be accessed at 128~cycles per 32--bit word,
|
210 |
|
|
or 64~cycles per subsequent word in a pipelined architecture. Likewise, the
|
211 |
69 |
dgisselq |
SDRAM chip on the XuLA2 board allows a 6~cycle access for a write, 10~cycles
|
212 |
68 |
dgisselq |
per read, and 2~cycles for any subsequent pipelined access read or write.
|
213 |
|
|
Either way you look at it, this memory access will be slow and this doesn't
|
214 |
|
|
account for any logic delays should the bus implementation logic get
|
215 |
|
|
complicated.
|
216 |
|
|
|
217 |
|
|
As may be noticed from the above discussion about memory speed, a second
|
218 |
|
|
characteristic of memory is that all memory accesses may be pipelined, and
|
219 |
|
|
that pipelined memory access is faster than non--pipelined access. Therefore,
|
220 |
|
|
a SwiC soft core should support pipelined operations, but it should also
|
221 |
|
|
allow a higher priority subsystem to get access to the bus (no starvation).
|
222 |
|
|
|
223 |
|
|
As a further characteristic of SwiC memory options, on-chip caches are
|
224 |
|
|
expensive. If you want to have a minimum of logic, cache logic may not be
|
225 |
|
|
the highest on the priority list.
|
226 |
|
|
|
227 |
|
|
In sum, memory is slow. While one processor on one FPGA may be able to fill
|
228 |
|
|
its pipeline, the same processor on another FPGA may struggle to get more than
|
229 |
|
|
one instruction at a time into the pipeline. Any SwiC must be able to deal
|
230 |
|
|
with both cases: fast and slow memories.
|
231 |
|
|
|
232 |
|
|
A final characteristic of SwiCs within FPGAs is the peripherals.
|
233 |
|
|
Specifically, FPGAs are highly reconfigurable. Soft peripherals can easily
|
234 |
|
|
be created on chip to support the SwiC if necessary. As an example, a simple
|
235 |
|
|
30-bit peripheral could easily support reversing 30-bit numbers: a read from
|
236 |
|
|
the peripheral returns its bit--reversed address. This is cheap within an
|
237 |
69 |
dgisselq |
FPGA, but expensive in instructions. Reading from another 16--bit peripheral
|
238 |
|
|
might calculate a sine function, where the 16--bit address internal to the
|
239 |
|
|
peripheral was the angle of the sine wave.
|
240 |
68 |
dgisselq |
|
241 |
|
|
Indeed, anything that must be done fast within an FPGA is likely to already
|
242 |
69 |
dgisselq |
be done---elsewhere in the fabric. This leaves the CPU with the simple role
|
243 |
|
|
of solely handling sequential tasks that need a lot of state.
|
244 |
68 |
dgisselq |
|
245 |
|
|
This means that the SwiC needs to live within a very unique environment,
|
246 |
|
|
separate and different from the traditional SoC. That isn't to say that a
|
247 |
|
|
SwiC cannot be turned into a SoC, just that this SwiC has not been designed
|
248 |
|
|
for that purpose.
|
249 |
|
|
|
250 |
|
|
\section{Lessons Learned}
|
251 |
|
|
|
252 |
21 |
dgisselq |
Now, however, that I've worked on the Zip CPU for a while, it is not nearly
|
253 |
|
|
as simple as I originally hoped. Worse, I've had to adjust to create
|
254 |
|
|
capabilities that I was never expecting to need. These include:
|
255 |
|
|
\begin{itemize}
|
256 |
33 |
dgisselq |
\item {\bf External Debug:} Once placed upon an FPGA, some external means is
|
257 |
21 |
dgisselq |
still necessary to debug this CPU. That means that there needs to be
|
258 |
|
|
an external register that can control the CPU: reset it, halt it, step
|
259 |
24 |
dgisselq |
it, and tell whether it is running or not. My chosen interface
|
260 |
|
|
includes a second register similar to this control register. This
|
261 |
|
|
second register allows the external controller or debugger to examine
|
262 |
21 |
dgisselq |
registers internal to the CPU.
|
263 |
|
|
|
264 |
|
|
\item {\bf Internal Debug:} Being able to run a debugger from within
|
265 |
|
|
a user process requires an ability to step a user process from
|
266 |
|
|
within a debugger. It also requires a break instruction that can
|
267 |
|
|
be substituted for any other instruction, and substituted back.
|
268 |
|
|
The break is actually difficult: the break instruction cannot be
|
269 |
|
|
allowed to execute. That way, upon a break, the debugger should
|
270 |
|
|
be able to jump back into the user process to step the instruction
|
271 |
|
|
that would've been at the break point initially, and then to
|
272 |
|
|
replace the break after passing it.
|
273 |
|
|
|
274 |
24 |
dgisselq |
Incidentally, this break messes with the prefetch cache and the
|
275 |
|
|
pipeline: if you change an instruction partially through the pipeline,
|
276 |
|
|
the whole pipeline needs to be cleansed. Likewise if you change
|
277 |
|
|
an instruction in memory, you need to make sure the cache is reloaded
|
278 |
|
|
with the new instruction.
|
279 |
|
|
|
280 |
69 |
dgisselq |
\item {\bf Prefetch Cache:} My original implementation, {\tt prefetch}, had
|
281 |
|
|
a very simple prefetch stage. Any time the PC changed the prefetch
|
282 |
|
|
would go and fetch the new instruction. While this was perhaps the
|
283 |
|
|
simplest approach, it cost roughly five clocks for every instruction.
|
284 |
|
|
This was deemed unacceptable, as I wanted a CPU that could execute
|
285 |
|
|
instructions in one cycle.
|
286 |
21 |
dgisselq |
|
287 |
69 |
dgisselq |
My second implementation, {\tt pipefetch}, attempted to make the most
|
288 |
|
|
use of pipelined memory. When a new CPU address was issued, it would
|
289 |
|
|
start reading
|
290 |
|
|
memory in a pipelined fashion, and issuing instructions as soon as they
|
291 |
|
|
were ready. This cache was a sliding window in memory. This suffered
|
292 |
|
|
from some difficult performance problems, though. If the CPU was
|
293 |
|
|
alternating between two diverse sections of code, both could never be
|
294 |
|
|
in the cache at the same time---causing lots of cache misses. Further,
|
295 |
|
|
the extra logic to implement this window cost an extra clock cycle
|
296 |
|
|
in the cache implementation, slowing down branches.
|
297 |
21 |
dgisselq |
|
298 |
69 |
dgisselq |
The Zip CPU now has a third cache implementation, {\tt pfcache}. This
|
299 |
|
|
new implementation takes only a single cycle per access, but costs a
|
300 |
|
|
full cache line miss on any miss. While configurable, a full cache
|
301 |
|
|
line miss might mean that the CPU needs to read 256~instructions from
|
302 |
|
|
memory before it can execute the first one of them.
|
303 |
|
|
|
304 |
21 |
dgisselq |
\item {\bf Operating System:} In order to support an operating system,
|
305 |
|
|
interrupts and so forth, the CPU needs to support supervisor and
|
306 |
|
|
user modes, as well as a means of switching between them. For example,
|
307 |
|
|
the user needs a means of executing a system call. This is the
|
308 |
|
|
purpose of the {\bf `trap'} instruction. This instruction needs to
|
309 |
|
|
place the CPU into supervisor mode (here equivalent to disabling
|
310 |
|
|
interrupts), as well as handing it a parameter such as identifying
|
311 |
|
|
which O/S function was called.
|
312 |
|
|
|
313 |
24 |
dgisselq |
My initial approach to building a trap instruction was to create an external
|
314 |
|
|
peripheral which, when written to, would generate an interrupt and could
|
315 |
|
|
return the last value written to it. In practice, this approach didn't work
|
316 |
|
|
at all: the CPU executed two instructions while waiting for the
|
317 |
|
|
trap interrupt to take place. Since then, I've decided to keep the rest of
|
318 |
|
|
the CC register for that purpose so that a write to the CC register, with the
|
319 |
|
|
GIE bit cleared, could be used to execute a trap. This has other problems,
|
320 |
|
|
though, primarily in the limitation of the uses of the CC register. In
|
321 |
|
|
particular, the CC register is the best place to put CPU state information and
|
322 |
|
|
to ``announce'' special CPU features (floating point, etc.). So the trap
|
323 |
|
|
instruction still switches to interrupt mode, but the CC register is not
|
324 |
|
|
nearly as useful for telling the supervisor mode processor what trap is being
|
325 |
|
|
executed.
|
326 |
21 |
dgisselq |
|
327 |
|
|
Modern timesharing systems also depend upon a {\bf Timer} interrupt
|
328 |
24 |
dgisselq |
to handle task swapping. For the Zip CPU, this interrupt is handled
|
329 |
|
|
external to the CPU as part of the CPU System, found in {\tt zipsystem.v}.
|
330 |
|
|
The timer module itself is found in {\tt ziptimer.v}.
|
331 |
21 |
dgisselq |
|
332 |
69 |
dgisselq |
\item {\bf Bus Errors:} My original implementation had no logic to handle
|
333 |
|
|
what would happen if the CPU attempted to read or write a non-existent
|
334 |
|
|
memory address. This changed after I needed to troubleshoot a failure
|
335 |
|
|
caused by a subroutine return to a non-existent address.
|
336 |
|
|
|
337 |
|
|
My next bus problem was caused by a misbehaving peripheral.
|
338 |
|
|
Whenever the CPU attempted to read from or write to this peripheral,
|
339 |
|
|
the peripheral would take control of the wishbone bus and not return
|
340 |
|
|
it. For example, it might never return an {\tt ACK} to signal
|
341 |
|
|
the end of the bus transaction. This led to the implementation of
|
342 |
|
|
a wishbone bus watchdog that would create a bus error if any particular
|
343 |
|
|
bus action didn't complete in a timely fashion.
|
344 |
|
|
|
345 |
21 |
dgisselq |
\item {\bf Pipeline Stalls:} My original plan was to not support pipeline
|
346 |
|
|
stalls at all, but rather to require the compiler to properly schedule
|
347 |
24 |
dgisselq |
all instructions so that stalls would never be necessary. After trying
|
348 |
21 |
dgisselq |
to build such an architecture, I gave up, having learned some things:
|
349 |
|
|
|
350 |
68 |
dgisselq |
First, an ideal pipeline might look something like
|
351 |
|
|
Fig.~\ref{fig:ideal-pipeline}.
|
352 |
|
|
\begin{figure}
|
353 |
|
|
\begin{center}
|
354 |
|
|
\includegraphics[width=4in]{../gfx/fullpline.eps}
|
355 |
|
|
\caption{An Ideal Pipeline: One instruction per clock cycle}\label{fig:ideal-pipeline}
|
356 |
|
|
\end{center}\end{figure}
|
357 |
|
|
Notice that, in this figure, all the pipeline stages are complete and
|
358 |
|
|
full. Every instruction takes one clock and there are no delays.
|
359 |
|
|
However, as the discussion above pointed out, the memory associated
|
360 |
|
|
with a SwiC may not allow single clock access. It may be instead
|
361 |
|
|
that you can only read every two clocks. In that case, what shall
|
362 |
|
|
the pipeline look like? Should it look like
|
363 |
|
|
Fig.~\ref{fig:waiting-pipeline},
|
364 |
|
|
\begin{figure}\begin{center}
|
365 |
|
|
\includegraphics[width=4in]{../gfx/stuttra.eps}
|
366 |
|
|
\caption{Instructions wait for each other}\label{fig:waiting-pipeline}
|
367 |
|
|
\end{center}\end{figure}
|
368 |
|
|
where instructions are held back until the pipeline is full, or should
|
369 |
|
|
it look like Fig.~\ref{fig:independent-pipeline},
|
370 |
|
|
\begin{figure}\begin{center}
|
371 |
|
|
\includegraphics[width=4in]{../gfx/stuttrb.eps}
|
372 |
|
|
\caption{Instructions proceed independently}\label{fig:independent-pipeline}
|
373 |
|
|
\end{center}\end{figure}
|
374 |
|
|
where each instruction is allowed to move through the pipeline
|
375 |
|
|
independently? For better or worse, the Zip CPU allows instructions
|
376 |
|
|
to move through the pipeline independently.
|
377 |
21 |
dgisselq |
|
378 |
68 |
dgisselq |
One approach to avoiding stalls is to use a branch delay slot,
|
379 |
|
|
such as is shown in Fig.~\ref{fig:brdelay}.
|
380 |
|
|
\begin{figure}\begin{center}
|
381 |
|
|
\includegraphics[width=4in]{../gfx/bdly.eps}
|
382 |
|
|
\caption{A typical branch delay slot approach}\label{fig:brdelay}
|
383 |
|
|
\end{center}\end{figure}
|
384 |
|
|
In this figure, instructions
|
385 |
|
|
{\tt BR} (a branch) and {\tt BD} (a branch delay instruction)
|
386 |
|
|
are followed by instructions after the branch: {\tt IA}, {\tt IB}, etc.
|
387 |
|
|
Since it takes a processor a clock cycle to execute a branch, the
|
388 |
|
|
delay slot allows the processor to do something useful in that
|
389 |
|
|
branch. The problem the Zip CPU has with this approach is, what
|
390 |
|
|
happens when the pipeline looks like Fig.~\ref{fig:brbroken}?
|
391 |
|
|
\begin{figure}\begin{center}
|
392 |
|
|
\includegraphics[width=4in]{../gfx/bdbroken.eps}
|
393 |
|
|
\caption{The branch delay slot breaks with a slow memory}\label{fig:brbroken}
|
394 |
|
|
\end{center}\end{figure}
|
395 |
|
|
In this case, the branch delay slot never gets filled in the first
|
396 |
|
|
place, and so the pipeline squashes it before it gets executed.
|
397 |
|
|
If not that, then what happens when handling interrupts or
|
398 |
|
|
debug stepping: when has the CPU finished an instruction?
|
399 |
|
|
When the {\tt BR} instruction has finished, or must {\tt BD}
|
400 |
|
|
follow every {\tt BR}? And, again, what if the pipeline isn't
|
401 |
|
|
full?
|
402 |
|
|
These thoughts killed any hopes of doing delayed branching.
|
403 |
|
|
|
404 |
21 |
dgisselq |
So I switched to a model of discrete execution: Once an instruction
|
405 |
|
|
enters into either the ALU or memory unit, the instruction is
|
406 |
|
|
guaranteed to complete. If the logic recognizes a branch or a
|
407 |
|
|
condition that would render the instruction entering into this stage
|
408 |
33 |
dgisselq |
possibly inappropriate (i.e. a conditional branch preceding a store
|
409 |
21 |
dgisselq |
instruction for example), then the pipeline stalls for one cycle
|
410 |
|
|
until the conditional branch completes. Then, if it generates a new
|
411 |
33 |
dgisselq |
PC address, the stages preceding are all wiped clean.
|
412 |
21 |
dgisselq |
|
413 |
68 |
dgisselq |
This model, however, generated too many pipeline stalls, so the
|
414 |
|
|
discrete execution model was modified to allow instructions to go
|
415 |
|
|
through the ALU unit and be canceled before writeback. This removed
|
416 |
|
|
the stall associated with ALU instructions before untaken branches.
|
417 |
|
|
|
418 |
|
|
The discrete execution model allows such things as sleeping, as
|
419 |
|
|
outlined in Fig.~\ref{fig:sleeping}.
|
420 |
|
|
\begin{figure}\begin{center}
|
421 |
|
|
\includegraphics[width=4in]{../gfx/sleep.eps}
|
422 |
|
|
\caption{How the CPU halts when sleeping}\label{fig:sleeping}
|
423 |
|
|
\end{center}\end{figure}
|
424 |
|
|
If the
|
425 |
24 |
dgisselq |
CPU is put to ``sleep,'' the ALU and memory stages stall and back up
|
426 |
21 |
dgisselq |
everything before them. Likewise, anything that has entered the ALU
|
427 |
|
|
or memory stage when the CPU is placed to sleep continues to completion.
|
428 |
|
|
To handle this logic, each pipeline stage has three control signals:
|
429 |
|
|
a valid signal, a stall signal, and a clock enable signal. In
|
430 |
|
|
general, a stage stalls if its contents are valid and the next step
|
431 |
|
|
is stalled. This allows the pipeline to fill any time a later stage
|
432 |
68 |
dgisselq |
stalls, as illustrated in Fig.~\ref{fig:stacking}.
|
433 |
|
|
\begin{figure}\begin{center}
|
434 |
|
|
\includegraphics[width=4in]{../gfx/stacking.eps}
|
435 |
|
|
\caption{Instructions can stack up behind a stalled instruction}\label{fig:stacking}
|
436 |
|
|
\end{center}\end{figure}
|
437 |
69 |
dgisselq |
However, if a pipeline hazard is detected, a stage can stall in order
|
438 |
|
|
to prevent the previous from moving forward.
|
439 |
21 |
dgisselq |
|
440 |
68 |
dgisselq |
This approach is also different from other pipeline approaches.
|
441 |
|
|
Instead of keeping the entire pipeline filled, each stage is treated
|
442 |
24 |
dgisselq |
independently. Therefore, individual stages may move forward as long
|
443 |
|
|
as the subsequent stage is available, regardless of whether the stage
|
444 |
|
|
behind it is filled.
|
445 |
21 |
dgisselq |
\end{itemize}
|
446 |
|
|
|
447 |
|
|
With that introduction out of the way, let's move on to the instruction
|
448 |
|
|
set.
|
449 |
|
|
|
450 |
|
|
\chapter{CPU Architecture}\label{chap:arch}
|
451 |
|
|
|
452 |
24 |
dgisselq |
The Zip CPU supports a set of two operand instructions, where the second operand
|
453 |
21 |
dgisselq |
(always a register) is the result. The only exception is the store instruction,
|
454 |
|
|
where the first operand (always a register) is the source of the data to be
|
455 |
|
|
stored.
|
456 |
|
|
|
457 |
24 |
dgisselq |
\section{Simplified Bus}
|
458 |
|
|
The bus architecture of the Zip CPU is that of a simplified WISHBONE bus.
|
459 |
|
|
It has been simplified in this fashion: all operations are 32--bit operations.
|
460 |
36 |
dgisselq |
The bus is neither little endian nor big endian. For this reason, all words
|
461 |
24 |
dgisselq |
are 32--bits. All instructions are also 32--bits wide. Everything has been
|
462 |
|
|
built around the 32--bit word.
|
463 |
|
|
|
464 |
21 |
dgisselq |
\section{Register Set}
|
465 |
|
|
The Zip CPU supports two sets of sixteen 32-bit registers, a supervisor
|
466 |
24 |
dgisselq |
and a user set as shown in Fig.~\ref{fig:regset}.
|
467 |
|
|
\begin{figure}\begin{center}
|
468 |
|
|
\includegraphics[width=3.5in]{../gfx/regset.eps}
|
469 |
|
|
\caption{Zip CPU Register File}\label{fig:regset}
|
470 |
|
|
\end{center}\end{figure}
|
471 |
|
|
The supervisor set is used in interrupt mode when interrupts are disabled,
|
472 |
|
|
whereas the user set is used otherwise. Of this register set, the Program
|
473 |
|
|
Counter (PC) is register 15, whereas the status register (SR) or condition
|
474 |
|
|
code register
|
475 |
21 |
dgisselq |
(CC) is register 14. By convention, the stack pointer will be register 13 and
|
476 |
24 |
dgisselq |
noted as (SP)---although there is nothing special about this register other
|
477 |
69 |
dgisselq |
than this convention. Also by convention register~12 will point to a global
|
478 |
|
|
offset table, and may be abbreviated as the (GBL) register.
|
479 |
21 |
dgisselq |
The CPU can access both register sets via move instructions from the
|
480 |
|
|
supervisor state, whereas the user state can only access the user registers.
|
481 |
|
|
|
482 |
36 |
dgisselq |
The status register is special, and bears further mention. As shown in
|
483 |
|
|
Table~\ref{tbl:cc-register},
|
484 |
|
|
\begin{table}\begin{center}
|
485 |
|
|
\begin{bitlist}
|
486 |
69 |
dgisselq |
31\ldots 13 & R/W & Reserved for future uses\\\hline
|
487 |
|
|
12 & R & (Reserved for) Floating Point Exception\\\hline
|
488 |
|
|
11 & R & Division by Zero Exception\\\hline
|
489 |
|
|
10 & R & Bus-Error Flag\\\hline
|
490 |
36 |
dgisselq |
9 & R & Trap, or user interrupt, Flag. Cleared on return to userspace.\\\hline
|
491 |
68 |
dgisselq |
8 & R & Illegal Instruction Flag\\\hline
|
492 |
36 |
dgisselq |
7 & R/W & Break--Enable\\\hline
|
493 |
|
|
6 & R/W & Step\\\hline
|
494 |
|
|
5 & R/W & Global Interrupt Enable (GIE)\\\hline
|
495 |
|
|
4 & R/W & Sleep. When GIE is also set, the CPU waits for an interrupt.\\\hline
|
496 |
|
|
3 & R/W & Overflow\\\hline
|
497 |
|
|
2 & R/W & Negative. The sign bit was set as a result of the last ALU instruction.\\\hline
|
498 |
|
|
1 & R/W & Carry\\\hline
0 & R/W & Zero. The last ALU operation produced a zero.\\\hline
|
499 |
|
|
|
500 |
|
|
\end{bitlist}
|
501 |
|
|
\caption{Condition Code Register Bit Assignment}\label{tbl:cc-register}
|
502 |
|
|
\end{center}\end{table}
|
503 |
|
|
the lower 13~bits of the status register form
|
504 |
|
|
a set of CPU state and condition codes. Writes to other bits of this register
|
505 |
|
|
are preserved.
|
506 |
21 |
dgisselq |
|
507 |
33 |
dgisselq |
Of the condition codes, the bottom four bits are the current flags:
|
508 |
21 |
dgisselq |
Zero (Z),
|
509 |
|
|
Carry (C),
|
510 |
|
|
Negative (N),
|
511 |
|
|
and Overflow (V).
|
512 |
69 |
dgisselq |
On those instructions that set the flags, these flags will be set based upon
|
513 |
|
|
the output of the instruction. If the result is zero, the Z flag will be set.
|
514 |
|
|
If the high order bit is set, the N flag will be set. If the instruction
|
515 |
|
|
caused a bit to fall off the end, the carry bit will be set. Finally, if
|
516 |
|
|
the instruction causes a signed integer overflow, the V flag will be set
|
517 |
|
|
afterwards.
|
518 |
21 |
dgisselq |
|
519 |
69 |
dgisselq |
The next bit is a sleep bit. Set this bit to one to disable instruction
|
520 |
|
|
execution and place the CPU to sleep, or to zero to keep the pipeline
|
521 |
|
|
running. Setting this bit will cause the CPU to wait for an interrupt
|
522 |
|
|
(if interrupts are enabled), or to completely halt (if interrupts are
|
523 |
|
|
disabled). In order to prevent users from halting the CPU, only the
|
524 |
|
|
supervisor is allowed to both put the CPU to sleep and disable
|
525 |
|
|
interrupts. Any user attempt to do so will simply result in a switch
|
526 |
|
|
to supervisor mode.
|
527 |
33 |
dgisselq |
|
528 |
21 |
dgisselq |
The sixth bit is a global interrupt enable bit (GIE). When this
|
529 |
32 |
dgisselq |
sixth bit is a `1' interrupts will be enabled, else disabled. When
|
530 |
21 |
dgisselq |
interrupts are disabled, the CPU will be in supervisor mode, otherwise
|
531 |
|
|
it is in user mode. Thus, to execute a context switch, one only
|
532 |
|
|
need enable or disable interrupts. (When an interrupt line goes
|
533 |
|
|
high, interrupts will automatically be disabled, as the CPU goes
|
534 |
32 |
dgisselq |
and deals with its context switch.) Special logic has been added to
|
535 |
|
|
keep the user mode from setting the sleep register and clearing the
|
536 |
|
|
GIE register at the same time, with clearing the GIE register taking
|
537 |
|
|
precedence.
|
538 |
21 |
dgisselq |
|
539 |
69 |
dgisselq |
The seventh bit is a step bit. This bit can be set from supervisor mode only.
|
540 |
|
|
After setting this bit, should the supervisor mode process switch to
|
541 |
|
|
user mode, it would then accomplish one instruction in user mode
|
542 |
|
|
before returning to supervisor mode. Then, upon return to supervisor
|
543 |
|
|
mode, this bit will be automatically cleared. This bit has no effect
|
544 |
|
|
on the CPU while in supervisor mode.
|
545 |
21 |
dgisselq |
|
546 |
|
|
This functionality was added to enable a userspace debugger
|
547 |
|
|
to operate on a user process, working through supervisor mode
|
548 |
|
|
of course.
|
549 |
|
|
|
550 |
|
|
|
551 |
24 |
dgisselq |
The eighth bit is a break enable bit. This controls whether a break
|
552 |
|
|
instruction in user mode will halt the processor for an external debugger
|
553 |
|
|
(break enabled), or whether the break instruction will simply send the
|
554 |
|
|
CPU into interrupt mode. Encountering a break in supervisor mode will
|
555 |
|
|
halt the CPU independent of the break enable bit. This bit can only be set
|
556 |
|
|
within supervisor mode.
|
557 |
21 |
dgisselq |
|
558 |
32 |
dgisselq |
% Should break enable be a supervisor mode bit, while the break enable bit
|
559 |
|
|
% in user mode is a break has taken place bit?
|
560 |
|
|
%
|
561 |
|
|
|
562 |
21 |
dgisselq |
This functionality was added to enable an external debugger to
|
563 |
|
|
set and manage breakpoints.
|
564 |
|
|
|
565 |
68 |
dgisselq |
The ninth bit is an illegal instruction bit. When the CPU
|
566 |
36 |
dgisselq |
tries to execute either a non-existent instruction, or an instruction from
|
567 |
68 |
dgisselq |
an address that produces a bus error, the CPU will (if implemented) switch
|
568 |
36 |
dgisselq |
to supervisor mode while setting this bit. The bit will automatically be
|
569 |
|
|
cleared upon any return to user mode.
|
570 |
21 |
dgisselq |
|
571 |
|
|
The tenth bit is a trap bit. It is set whenever the user requests a soft
|
572 |
|
|
interrupt, and cleared on any return to userspace command. This allows the
|
573 |
|
|
supervisor, in supervisor mode, to determine whether it got to supervisor
|
574 |
|
|
mode from a trap or from an external interrupt or both.
|
575 |
|
|
|
576 |
69 |
dgisselq |
\section{Instruction Format}
|
577 |
|
|
All Zip CPU instructions fit in one of the formats shown in
|
578 |
|
|
Fig.~\ref{fig:iset-format}.
|
579 |
|
|
\begin{figure}\begin{center}
|
580 |
|
|
\begin{bytefield}[endianness=big]{32}
|
581 |
|
|
\bitheader{0-31}\\
|
582 |
|
|
\begin{leftwordgroup}{Standard}\bitbox{1}{0}\bitbox{4}{DR}
|
583 |
|
|
\bitbox[lrt]{5}{OpCode}
|
584 |
|
|
\bitbox[lrt]{3}{Cnd}
|
585 |
|
|
\bitbox{1}{0}
|
586 |
|
|
\bitbox{18}{18-bit Signed Immediate} \\
|
587 |
|
|
\bitbox{1}{0}\bitbox{4}{DR}
|
588 |
|
|
\bitbox[lrb]{5}{}
|
589 |
|
|
\bitbox[lrb]{3}{}
|
590 |
|
|
\bitbox{1}{1}
|
591 |
|
|
\bitbox{4}{BR}
|
592 |
|
|
\bitbox{14}{14-bit Signed Immediate}\end{leftwordgroup} \\
|
593 |
|
|
\begin{leftwordgroup}{MOV}\bitbox{1}{0}\bitbox{4}{DR}
|
594 |
|
|
\bitbox[lrt]{5}{5'hf}
|
595 |
|
|
\bitbox[lrt]{3}{Cnd}
|
596 |
|
|
\bitbox{1}{A}
|
597 |
|
|
\bitbox{4}{BR}
|
598 |
|
|
\bitbox{1}{B}
|
599 |
|
|
\bitbox{13}{13-bit Signed Immediate}\end{leftwordgroup} \\
|
600 |
|
|
\begin{leftwordgroup}{LDI}\bitbox{1}{0}\bitbox{4}{DR}
|
601 |
|
|
\bitbox{4}{4'hb}
|
602 |
|
|
\bitbox{23}{23-bit Signed Immediate}\end{leftwordgroup} \\
|
603 |
|
|
\begin{leftwordgroup}{NOOP}\bitbox{1}{0}\bitbox{3}{3'h7}
|
604 |
|
|
\bitbox{1}{}
|
605 |
|
|
\bitbox{2}{11}
|
606 |
|
|
\bitbox{3}{xxx}
|
607 |
|
|
\bitbox{22}{Ignored}
|
608 |
|
|
\end{leftwordgroup} \\
|
609 |
|
|
\begin{leftwordgroup}{VLIW}\bitbox{1}{1}\bitbox[lrt]{4}{DR}
|
610 |
|
|
\bitbox[lrt]{5}{OpCode}
|
611 |
|
|
\bitbox[lrt]{3}{Cnd}
|
612 |
|
|
\bitbox{1}{0}
|
613 |
|
|
\bitbox{4}{Imm.}
|
614 |
|
|
\bitbox{14}{---} \\
|
615 |
|
|
\bitbox{1}{1}\bitbox[lr]{4}{}
|
616 |
|
|
\bitbox[lrb]{5}{}
|
617 |
|
|
\bitbox[lr]{3}{}
|
618 |
|
|
\bitbox{1}{1}
|
619 |
|
|
\bitbox{4}{BR}
|
620 |
|
|
\bitbox{14}{---} \\
|
621 |
|
|
\bitbox{1}{1}\bitbox[lrb]{4}{}
|
622 |
|
|
\bitbox{4}{4'hb}
|
623 |
|
|
\bitbox{1}{}
|
624 |
|
|
\bitbox[lrb]{3}{}
|
625 |
|
|
\bitbox{5}{5'b Imm}
|
626 |
|
|
\bitbox{14}{---} \\
|
627 |
|
|
\bitbox{1}{1}\bitbox{9}{---}
|
628 |
|
|
\bitbox[lrt]{3}{Cnd}
|
629 |
|
|
\bitbox{5}{---}
|
630 |
|
|
\bitbox[lrt]{4}{DR}
|
631 |
|
|
\bitbox[lrt]{5}{OpCode}
|
632 |
|
|
\bitbox{1}{0}
|
633 |
|
|
\bitbox{4}{Imm}
|
634 |
|
|
\\
|
635 |
|
|
\bitbox{1}{1}\bitbox{9}{---}
|
636 |
|
|
\bitbox[lr]{3}{}
|
637 |
|
|
\bitbox{5}{---}
|
638 |
|
|
\bitbox[lr]{4}{}
|
639 |
|
|
\bitbox[lrb]{5}{}
|
640 |
|
|
\bitbox{1}{1}
|
641 |
|
|
\bitbox{4}{Reg} \\
|
642 |
|
|
\bitbox{1}{1}\bitbox{9}{---}
|
643 |
|
|
\bitbox[lrb]{3}{}
|
644 |
|
|
\bitbox{5}{---}
|
645 |
|
|
\bitbox[lrb]{4}{}
|
646 |
|
|
\bitbox{4}{4'hb}
|
647 |
|
|
\bitbox{1}{}
|
648 |
|
|
\bitbox{5}{5'b Imm}
|
649 |
|
|
\end{leftwordgroup} \\
|
650 |
|
|
\end{bytefield}
|
651 |
|
|
\caption{Zip Instruction Set Format}\label{fig:iset-format}
|
652 |
|
|
\end{center}\end{figure}
|
653 |
|
|
The basic format is that some operation, defined by the OpCode, is applied
|
654 |
|
|
if a condition, Cnd, is true in order to produce a result which is placed in
|
655 |
|
|
the destination register, or DR. The Load 23--bit signed immediate instruction
|
656 |
|
|
is different in that it requires no conditions, and uses only a 4-bit opcode.
|
657 |
|
|
|
658 |
|
|
This is actually the second version of the instruction set definition, given certain
|
659 |
|
|
lessons learned. For example, the original instruction set had the following
|
660 |
|
|
problems:
|
661 |
|
|
\begin{enumerate}
|
662 |
|
|
\item No opcodes were available for divide or floating point extensions to be
|
663 |
|
|
made available. Although there was space in the instruction set to
|
664 |
|
|
add these types of instructions, this instruction space was going to
|
665 |
|
|
require extra logic to use.
|
666 |
|
|
\item The carveouts for instructions such as NOOP and LDIHI/LDILO required
|
667 |
|
|
extra logic to process.
|
668 |
|
|
\item The instruction set wasn't very compact. One bus operation was required
|
669 |
|
|
for every instruction.
|
670 |
|
|
\end{enumerate}
|
671 |
|
|
This second version was designed with two criteria. The first was that the
|
672 |
|
|
new instruction set needed to be compatible, at the assembly language level,
|
673 |
|
|
with the previous instruction set. Thus, it must be able to support all of
|
674 |
|
|
the previous mnemonics and more. This was achieved with the sole exception
|
675 |
|
|
that instruction immediates are generally two bits shorter than before.
|
676 |
|
|
(One bit was lost to the VLIW bit in front, another from changing from 4--bit
|
677 |
|
|
to 5--bit opcodes.) Second, the new instruction set needed to be a drop--in
|
678 |
|
|
replacement for the decoder, modifying nothing else. This was almost achieved,
|
679 |
|
|
save for two issues: the ALU unit needed to be replaced since the OpCodes
|
680 |
|
|
were reordered, and some condition code logic needed to be adjusted since the
|
681 |
|
|
condition codes were renumbered as well. In the end, maximum reuse of the
|
682 |
|
|
existing RTL (Verilog) code was achieved in this upgrade.
|
683 |
|
|
|
684 |
|
|
As of this second version of the Zip CPU instruction set, the Zip CPU also
|
685 |
|
|
supports a very long instruction word (VLIW) set of instructions. These
|
686 |
|
|
instruction formats pack two instructions into a single instruction word,
|
687 |
|
|
trading immediate instruction space to do this, but in just about all other
|
688 |
|
|
respects these are identical to two standard instructions. Other than
|
689 |
|
|
instruction format, the only basic difference is that the CPU will not switch
|
690 |
|
|
to interrupt mode in between the two instructions. Likewise a new job given
|
691 |
|
|
to the assembler is that of automatically packing as many instructions as
|
692 |
|
|
possible into the VLIW format. Where necessary to place both VLIW instructions
|
693 |
|
|
on the same line, they will be separated by a vertical bar.
|
694 |
|
|
|
695 |
|
|
\section{Instruction OpCodes}
|
696 |
|
|
With a 5--bit opcode field, there are 32 possible instructions as shown in
|
697 |
|
|
Tbl.~\ref{tbl:iset-opcodes}.
|
698 |
|
|
\begin{table}\begin{center}
|
699 |
|
|
\begin{tabular}{|l|l|l|c|} \hline \rowcolor[gray]{0.85}
|
700 |
|
|
OpCode & & Instruction &Sets CC \\\hline\hline
|
701 |
|
|
5'h00 & SUB & Subtract & \\\cline{1-3}
|
702 |
|
|
5'h01 & AND & Bitwise And & \\\cline{1-3}
|
703 |
|
|
5'h02 & ADD & Add two numbers & \\\cline{1-3}
|
704 |
|
|
5'h03 & OR & Bitwise Or & Y \\\cline{1-3}
|
705 |
|
|
5'h04 & XOR & Bitwise Exclusive Or & \\\cline{1-3}
|
706 |
|
|
5'h05 & LSR & Logical Shift Right & \\\cline{1-3}
|
707 |
|
|
5'h06 & LSL & Logical Shift Left & \\\cline{1-3}
|
708 |
|
|
5'h07 & ASR & Arithmetic Shift Right & \\\hline
|
709 |
|
|
5'h08 & LDIHI & Load Immediate High & N \\\cline{1-3}
|
710 |
|
|
5'h09 & LDILO & Load Immediate Low & \\\hline
|
711 |
|
|
5'h0a & MPYU & Unsigned 16--bit Multiply & \\\cline{1-3}
|
712 |
|
|
5'h0b & MPYS & Signed 16--bit Multiply & Y \\\cline{1-3}
|
713 |
|
|
5'h0c & BREV & Bit Reverse & \\\cline{1-3}
|
714 |
|
|
5'h0d & POPC& Population Count & \\\cline{1-3}
|
715 |
|
|
5'h0e & ROL & Rotate left & \\\hline
|
716 |
|
|
5'h0f & MOV & Move register & N \\\hline
|
717 |
|
|
5'h10 & CMP & Compare & Y \\\cline{1-3}
|
718 |
|
|
5'h11 & TST & Test (AND w/o setting result) & \\\hline
|
719 |
|
|
5'h12 & LOD & Load from memory & N \\\cline{1-3}
|
720 |
|
|
5'h13 & STO & Store a register into memory & \\\hline\hline
|
721 |
|
|
5'h14 & DIVU & Divide, unsigned & Y \\\cline{1-3}
|
722 |
|
|
5'h15 & DIVS & Divide, signed & \\\hline\hline
|
723 |
|
|
5'h16/7 & LDI & Load 23--bit signed immediate & N \\\hline\hline
|
724 |
|
|
5'h18 & FPADD & Floating point add & \\\cline{1-3}
|
725 |
|
|
5'h19 & FPSUB & Floating point subtract & \\\cline{1-3}
|
726 |
|
|
5'h1a & FPMPY & Floating point multiply & Y \\\cline{1-3}
|
727 |
|
|
5'h1b & FPDIV & Floating point divide & \\\cline{1-3}
|
728 |
|
|
5'h1c & FPCVT & Convert integer to floating point & \\\cline{1-3}
|
729 |
|
|
5'h1d & FPINT & Convert to integer & \\\hline
|
730 |
|
|
5'h1e & & {\em Reserved for future use} &\\\hline
|
731 |
|
|
5'h1f & & {\em Reserved for future use} &\\\hline
|
732 |
39 |
dgisselq |
\end{tabular}
|
733 |
69 |
dgisselq |
\caption{Zip CPU OpCodes}\label{tbl:iset-opcodes}
|
734 |
39 |
dgisselq |
\end{center}\end{table}
|
735 |
69 |
dgisselq |
%
|
736 |
|
|
Of these opcodes, the {\tt BREV} and {\tt POPC} are experimental, and may be
|
737 |
|
|
replaced later, and two floating point instruction opcodes are reserved for
|
738 |
|
|
future use.
|
739 |
39 |
dgisselq |
|
740 |
21 |
dgisselq |
\section{Conditional Instructions}
|
741 |
69 |
dgisselq |
Most, although not quite all, instructions may be conditionally executed.
|
742 |
|
|
The 23--bit load immediate instruction, together with the {\tt NOOP},
|
743 |
|
|
{\tt BREAK}, and {\tt LOCK} instructions are the only exception to this rule.
|
744 |
|
|
|
745 |
|
|
From the four condition code flags, eight conditions are defined for standard
|
746 |
|
|
instructions. These are shown in Tbl.~\ref{tbl:conditions}.
|
747 |
|
|
\begin{table}\begin{center}
|
748 |
21 |
dgisselq |
\begin{tabular}{l|l|l}
|
749 |
|
|
Code & Mnemonic & Condition \\\hline
|
750 |
|
|
3'h0 & None & Always execute the instruction \\
|
751 |
69 |
dgisselq |
3'h1 & {\tt .LT} & Less than ('N' set) \\
|
752 |
|
|
3'h2 & {\tt .Z} & Only execute when 'Z' is set \\
|
753 |
|
|
3'h3 & {\tt .NZ} & Only execute when 'Z' is not set \\
|
754 |
21 |
dgisselq |
3'h4 & {\tt .GT} & Greater than ('N' not set, 'Z' not set) \\
|
755 |
69 |
dgisselq |
3'h5 & {\tt .GE} & Greater than or equal ('N' not set, 'Z' irrelevant) \\
|
756 |
21 |
dgisselq |
3'h6 & {\tt .C} & Carry set\\
|
757 |
|
|
3'h7 & {\tt .V} & Overflow set\\
|
758 |
|
|
\end{tabular}
|
759 |
|
|
\caption{Conditions for conditional operand execution}\label{tbl:conditions}
|
760 |
69 |
dgisselq |
\end{center}\end{table}
|
761 |
|
|
There is no condition code for less than or equal, not C or not V---there
|
762 |
|
|
just wasn't enough space in 3--bits. Conditioning on a non--supported
|
763 |
|
|
condition is still possible, but it will take an extra instruction and a
|
764 |
|
|
pipeline stall. (Ex: \hbox{\em (Stall)}; \hbox{\tt TST \$4,CC;} \hbox{\tt
|
765 |
|
|
STO.NZ R0,(R1)}) As an alternative, it is often possible to reverse the
|
766 |
|
|
condition, thus recovering those extra two clocks. Thus instead of
|
767 |
|
|
\hbox{\tt CMP Rx,Ry;} \hbox{\tt BNV label} you can issue a
|
768 |
|
|
\hbox{\tt CMP Ry,Rx;} \hbox{\tt BV label}.
|
769 |
21 |
dgisselq |
|
770 |
69 |
dgisselq |
Conditionally executed instructions will not further adjust the
|
771 |
68 |
dgisselq |
condition codes, with the exception of \hbox{\tt CMP} and \hbox{\tt TST}
|
772 |
|
|
instructions. Conditional \hbox{\tt CMP} or \hbox{\tt TST} instructions
|
773 |
69 |
dgisselq |
will adjust conditions whenever they are executed. In this way,
|
774 |
68 |
dgisselq |
multiple conditions may be evaluated without branches. For example, to do
|
775 |
|
|
something if \hbox{\tt R0} is one and \hbox{\tt R1} is two, one might try
|
776 |
|
|
code such as Tbl.~\ref{tbl:dbl-condition}.
|
777 |
|
|
\begin{table}\begin{center}
|
778 |
|
|
\begin{tabular}{l}
|
779 |
|
|
{\tt CMP 1,R0} \\
|
780 |
|
|
{;\em Condition codes are now set based upon R0-1} \\
|
781 |
|
|
{\tt CMP.Z 2,R1} \\
|
782 |
|
|
{;\em If R0 $\neq$ 1, conditions are unchanged.} \\
|
783 |
|
|
{;\em If R0 $=$ 1, conditions are set based upon R1-2.} \\
|
784 |
|
|
{;\em Now do something based upon the conjunction of both conditions.} \\
|
785 |
|
|
{;\em While we use the example of a STO, it could be any instruction.} \\
|
786 |
|
|
{\tt STO.Z R0,(R2)} \\
|
787 |
|
|
\end{tabular}
|
788 |
|
|
\caption{An example of a double conditional}\label{tbl:dbl-condition}
|
789 |
|
|
\end{center}\end{table}
|
790 |
36 |
dgisselq |
|
791 |
69 |
dgisselq |
In the case of VLIW instructions, only four conditions are defined as shown
|
792 |
|
|
in Tbl.~\ref{tbl:vliw-conditions}.
|
793 |
|
|
\begin{table}\begin{center}
|
794 |
|
|
\begin{tabular}{l|l|l}
|
795 |
|
|
Code & Mnemonic & Condition \\\hline
|
796 |
|
|
2'h0 & None & Always execute the instruction \\
|
797 |
|
|
2'h1 & {\tt .LT} & Less than ('N' set) \\
|
798 |
|
|
2'h2 & {\tt .Z} & Only execute when 'Z' is set \\
|
799 |
|
|
2'h3 & {\tt .NZ} & Only execute when 'Z' is not set \\
|
800 |
|
|
\end{tabular}
|
801 |
|
|
\caption{VLIW Conditions}\label{tbl:vliw-conditions}
|
802 |
|
|
\end{center}\end{table}
|
803 |
|
|
Further, the first bit is given a special meaning. If the first bit is set,
|
804 |
|
|
the conditions apply to the second half of the instruction, otherwise the
|
805 |
|
|
conditions will only apply to the first half of a conditional instruction.
|
806 |
68 |
dgisselq |
|
807 |
21 |
dgisselq |
\section{Operand B}
|
808 |
69 |
dgisselq |
Many instruction forms have a 19-bit source ``Operand B'' associated with them.
|
809 |
|
|
This ``Operand B'' is shown in Fig.~\ref{fig:iset-format} as part of the
|
810 |
|
|
standard instructions. This Operand B is either equal to a register plus a
|
811 |
|
|
14--bit signed immediate offset, or an 18--bit signed immediate offset by
|
812 |
|
|
itself. This value is encoded as shown in Tbl.~\ref{tbl:opb}.
|
813 |
21 |
dgisselq |
\begin{table}\begin{center}
|
814 |
69 |
dgisselq |
\begin{bytefield}[endianness=big]{19}
|
815 |
|
|
\bitheader{0-18} \\
|
816 |
|
|
\bitbox{1}{0}\bitbox{18}{18-bit Signed Immediate} \\
|
817 |
|
|
\bitbox{1}{1}\bitbox{4}{Reg}\bitbox{14}{14-bit Signed Immediate}
|
818 |
|
|
\end{bytefield}
|
819 |
21 |
dgisselq |
\caption{Bit allocation for Operand B}\label{tbl:opb}
|
820 |
|
|
\end{center}\end{table}
|
821 |
24 |
dgisselq |
|
822 |
69 |
dgisselq |
Fourteen and eighteen bit immediate values don't make sense for all
|
823 |
|
|
instructions. For example, what is the point of an 18--bit immediate when
|
824 |
|
|
executing a 16--bit multiply? Or a 16--bit load--immediate? In these cases,
|
825 |
|
|
the extra bits are simply ignored.
|
826 |
24 |
dgisselq |
|
827 |
69 |
dgisselq |
VLIW instructions still use the same operand B, only there was no room for any
|
828 |
|
|
register plus immediate addressing. Therefore, VLIW instructions have either
|
829 |
|
|
a register or a 4--bit signed immediate as their operand B. The only exception
|
830 |
|
|
is the load immediate instruction, which permits a 5--bit signed operand
|
831 |
|
|
B.\footnote{Although the space exists to extend this VLIW load immediate
|
832 |
|
|
instruction to six bits, the 5--bit limit was chosen to simplify the
|
833 |
|
|
disassembler. This may change in the future.}
|
834 |
|
|
|
835 |
21 |
dgisselq |
\section{Address Modes}
|
836 |
36 |
dgisselq |
The Zip CPU supports two addressing modes: register plus immediate, and
|
837 |
21 |
dgisselq |
immediate address. Addresses are therefore encoded in the same fashion as
|
838 |
69 |
dgisselq |
Operand B's, shown above. Practically, the VLIW instruction set only offers
|
839 |
|
|
register addressing, necessitating a non--VLIW instruction for most memory
|
840 |
|
|
operations.
|
841 |
21 |
dgisselq |
|
842 |
|
|
A lot of long hard thought was put into whether to allow pre/post increment
|
843 |
|
|
and decrement addressing modes. Finding no way to use these operators without
|
844 |
32 |
dgisselq |
taking two or more clocks per instruction,\footnote{The two clocks figure
|
845 |
|
|
comes from the design of the register set, allowing only one write per clock.
|
846 |
|
|
That write is either from the memory unit or the ALU, but never both.} these
|
847 |
|
|
addressing modes have been
|
848 |
21 |
dgisselq |
removed from the realm of possibilities. This means that the Zip CPU has no
|
849 |
|
|
native way of executing push, pop, return, or jump to subroutine operations.
|
850 |
24 |
dgisselq |
Each of these instructions can be emulated with a set of instructions from the
|
851 |
|
|
existing set.
|
852 |
21 |
dgisselq |
|
853 |
|
|
\section{Move Operands}
|
854 |
|
|
The previous set of operands would be perfect and complete, save only that
|
855 |
24 |
dgisselq |
the CPU needs access to non--supervisory registers while in supervisory mode.
|
856 |
|
|
Therefore, the MOV instruction is special and offers access to these registers
|
857 |
|
|
\ldots when in supervisory mode. To keep the compiler simple, the extra bits
|
858 |
|
|
are ignored in non-supervisory mode (as though they didn't exist), rather than
|
859 |
|
|
being mapped to new instructions or additional capabilities. The bits
|
860 |
69 |
dgisselq |
indicating which register set each register lies within are the A-User, marked
|
861 |
|
|
`A' in Fig.~\ref{fig:iset-format}, and B-User bits, marked as `B'. When set
|
862 |
|
|
to a one, these refer to a user mode register. When set to a zero, these
|
863 |
|
|
refer to a register in the current mode, whether user or supervisor. Further,
|
864 |
|
|
because a load immediate instruction exists, there is no move capability
|
865 |
|
|
between an immediate and a register: all moves come from either a register or
|
866 |
|
|
a register plus an offset.
|
867 |
21 |
dgisselq |
|
868 |
69 |
dgisselq |
This actually leads to a bit of a problem: since the {\tt MOV} instruction
|
869 |
|
|
encodes which register set each register is coming from or moving to, how shall
|
870 |
|
|
a compiler or assembler know how to compile a MOV instruction without knowing
|
871 |
24 |
dgisselq |
the mode of the CPU at the time? For this reason, the compiler will assume
|
872 |
|
|
all MOV registers are supervisor registers, and display them as normal.
|
873 |
69 |
dgisselq |
Anything with the user bit set will be treated as a user register and displayed
|
874 |
|
|
specially. Since the CPU quietly ignores the supervisor bits while in user mode,
|
875 |
|
|
anything marked as a user register will always be specific.
|
876 |
21 |
dgisselq |
|
877 |
|
|
\section{Multiply Operations}
|
878 |
36 |
dgisselq |
The Zip CPU supports two Multiply operations, a 16x16 bit signed multiply
|
879 |
69 |
dgisselq |
({\tt MPYS}) and a 16x16 bit unsigned multiply ({\tt MPYU}). A 32--bit
|
880 |
|
|
multiply, should it be desired, needs to be created via software from this
|
881 |
|
|
16x16 bit multiply.
|
882 |
21 |
dgisselq |
|
883 |
69 |
dgisselq |
\section{Divide Unit}
|
884 |
|
|
The Zip CPU also has a divide unit which can be built alongside the ALU.
|
885 |
|
|
This divide unit provides the Zip CPU with its first two instructions that
|
886 |
|
|
cannot be executed in a single cycle: {\tt DIVS}, or signed divide, and
|
887 |
|
|
{\tt DIVU}, the unsigned divide. These are both 32--bit divide instructions,
|
888 |
|
|
dividing one 32--bit number by another. In this case, the Operand B field,
|
889 |
|
|
whether it be register or register plus immediate, constitutes the denominator,
|
890 |
|
|
whereas the numerator is given by the other register.
|
891 |
21 |
dgisselq |
|
892 |
69 |
dgisselq |
The Divide is also a multi--clock instruction. While the divide is running,
|
893 |
|
|
the ALU, memory unit, and floating point unit (if installed) will be idle.
|
894 |
|
|
Once the divide completes, other units may continue.
|
895 |
21 |
dgisselq |
|
896 |
69 |
dgisselq |
Of course, divides can have errors: division by zero. In the case of division
|
897 |
|
|
by zero, an exception will be caused that will either send the CPU from
|
898 |
|
|
user mode to supervisor mode, or halt the CPU if it is already in supervisor
|
899 |
|
|
mode.
|
900 |
32 |
dgisselq |
|
901 |
69 |
dgisselq |
\section{NOOP, BREAK, and Bus Lock Instructions}
|
902 |
|
|
Three instructions are not listed in the opcode list in
|
903 |
|
|
Tbl.~\ref{tbl:iset-opcodes}, yet fit in the NOOP type instruction format of
|
904 |
|
|
Fig.~\ref{fig:iset-format}. These are the {\tt NOOP}, {\tt BREAK}, and
|
905 |
|
|
bus {\tt LOCK} instructions. These are encoded according to
|
906 |
|
|
Fig.~\ref{fig:iset-noop}, and have the following meanings:
|
907 |
|
|
\begin{figure}\begin{center}
|
908 |
|
|
\begin{bytefield}[endianness=big]{32}
|
909 |
|
|
\bitheader{0-31}\\
|
910 |
|
|
\begin{leftwordgroup}{NOOP}
|
911 |
|
|
\bitbox{1}{0}\bitbox{3}{3'h7}\bitbox{1}{}
|
912 |
|
|
\bitbox{2}{11}\bitbox{3}{001}\bitbox{22}{Ignored} \\
|
913 |
|
|
\bitbox{1}{1}\bitbox{3}{3'h7}\bitbox{1}{}
|
914 |
|
|
\bitbox{2}{11}\bitbox{3}{001}\bitbox{22}{---} \\
|
915 |
|
|
\bitbox{1}{1}\bitbox{9}{---}\bitbox{3}{---}\bitbox{5}{---}
|
916 |
|
|
\bitbox{3}{3'h7}\bitbox{1}{}\bitbox{2}{11}\bitbox{3}{001}
|
917 |
|
|
\bitbox{5}{Ignored}
|
918 |
|
|
\end{leftwordgroup} \\
|
919 |
|
|
\begin{leftwordgroup}{BREAK}
|
920 |
|
|
\bitbox{1}{0}\bitbox{3}{3'h7}
|
921 |
|
|
\bitbox{1}{}\bitbox{2}{11}\bitbox{3}{010}\bitbox{22}{Ignored}
|
922 |
|
|
\end{leftwordgroup} \\
|
923 |
|
|
\begin{leftwordgroup}{LOCK}
|
924 |
|
|
\bitbox{1}{0}\bitbox{3}{3'h7}
|
925 |
|
|
\bitbox{1}{}\bitbox{2}{11}\bitbox{3}{100}\bitbox{22}{Ignored}
|
926 |
|
|
\end{leftwordgroup} \\
|
927 |
|
|
\end{bytefield}
|
928 |
|
|
\caption{NOOP/Break/LOCK Instruction Format}\label{fig:iset-noop}
|
929 |
|
|
\end{center}\end{figure}
|
930 |
32 |
dgisselq |
|
931 |
69 |
dgisselq |
The {\tt NOOP} instruction is just that: an instruction that does not perform
|
932 |
|
|
any operation. While many other instructions, such as a move from a register to
|
933 |
|
|
itself, could also fit these roles, only the NOOP instruction guarantees that
|
934 |
|
|
it will not stall waiting for a register to be available. For this reason,
|
935 |
|
|
it gets its own place in the instruction set.
|
936 |
32 |
dgisselq |
|
937 |
69 |
dgisselq |
The {\tt BREAK} instruction is useful for creating a debug instruction that
|
938 |
|
|
will halt the CPU without executing. If in user mode, depending upon the
|
939 |
|
|
setting of the break enable bit, it will either switch to supervisor mode or
|
940 |
|
|
halt the CPU---depending upon where the user wishes to do his debugging.
|
941 |
21 |
dgisselq |
|
942 |
69 |
dgisselq |
Finally, the {\tt LOCK} instruction was added in order to make a test and
|
943 |
|
|
set multi--CPU operation possible. Following a LOCK instruction, the next
|
944 |
|
|
two instructions, if they are memory LOD/STO instructions, will execute without
|
945 |
|
|
dropping the wishbone {\tt CYC} line between the instructions. Thus a
|
946 |
|
|
{\tt LOCK} followed by {\tt LOD (Rx),Ry} and a {\tt STO Rz,(Rx)}, where Rz
|
947 |
|
|
is initially set, can be used to set an address while guaranteeing that Ry
|
948 |
|
|
was the value before setting the address to Rz. This is a useful instruction
|
949 |
|
|
while trying to achieve concurrency among multiple CPUs.
|
950 |
21 |
dgisselq |
|
951 |
69 |
dgisselq |
\section{Floating Point}
|
952 |
|
|
Although the Zip CPU does not (yet) have a floating point unit, the current
|
953 |
|
|
instruction set offers eight opcodes for floating point operations, and treats
|
954 |
|
|
floating point exceptions like divide by zero errors. Once this unit is built
|
955 |
|
|
and integrated together with the rest of the CPU, the Zip CPU will support
|
956 |
|
|
32--bit floating point instructions natively. Any 64--bit floating point
|
957 |
|
|
instructions will still need to be emulated in software.
|
958 |
|
|
|
959 |
21 |
dgisselq |
\section{Derived Instructions}
|
960 |
36 |
dgisselq |
The Zip CPU supports many other common instructions, but not all of them
|
961 |
24 |
dgisselq |
are single cycle instructions. The derived instruction tables,
|
962 |
36 |
dgisselq |
Tbls.~\ref{tbl:derived-1}, \ref{tbl:derived-2}, \ref{tbl:derived-3}
|
963 |
|
|
and~\ref{tbl:derived-4},
|
964 |
21 |
dgisselq |
help to capture some of how these other instructions may be implemented on
|
965 |
36 |
dgisselq |
the Zip CPU. Many of these instructions will have assembly equivalents,
|
966 |
21 |
dgisselq |
such as the branch instructions, to facilitate working with the CPU.
|
967 |
|
|
\begin{table}\begin{center}
|
968 |
|
|
\begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
|
969 |
|
|
Mapped & Actual & Notes \\\hline
|
970 |
39 |
dgisselq |
{\tt ABS Rx}
|
971 |
|
|
& \parbox[t]{1.5in}{\tt TST -1,Rx\\NEG.LT Rx}
|
972 |
36 |
dgisselq |
& Absolute value, depends upon derived NEG.\\\hline
|
973 |
39 |
dgisselq |
\parbox[t]{1.4in}{\tt ADD Ra,Rx\\ADDC Rb,Ry}
|
974 |
|
|
& \parbox[t]{1.5in}{\tt Add Ra,Rx\\ADD.C \$1,Ry\\Add Rb,Ry}
|
975 |
21 |
dgisselq |
& Add with carry \\\hline
|
976 |
39 |
dgisselq |
{\tt BRA.Cond +/-\$Addr}
|
977 |
92 |
dgisselq |
& \hbox{\tt ADD.cond \$Addr+PC,PC}
|
978 |
|
|
& Branch or jump on condition. Works for 18--bit
|
979 |
24 |
dgisselq |
signed address offsets.\\\hline
|
980 |
39 |
dgisselq |
{\tt BRA.Cond +/-\$Addr}
|
981 |
|
|
& \parbox[t]{1.5in}{\tt LDI \$Addr,Rx \\ ADD.cond Rx,PC}
|
982 |
73 |
dgisselq |
& Branch/jump on condition. Works for 23 bit address offsets, but
|
983 |
|
|
costs a register and an extra instruction. With LDIHI and LDILO
|
984 |
|
|
this can be made to work anywhere in the 32-bit address space, but will
|
985 |
|
|
cost an additional instruction still. \\\hline
|
986 |
39 |
dgisselq |
{\tt BNC PC+\$Addr}
|
987 |
92 |
dgisselq |
& \parbox[t]{1.5in}{\tt Test \$Carry,CC \\ ADD.Z PC+\$Addr,PC}
|
988 |
21 |
dgisselq |
& Example of a branch on an unsupported
|
989 |
|
|
condition, in this case a branch on not carry \\\hline
|
990 |
92 |
dgisselq |
{\tt BUSY } & {\tt ADD \$-1,PC} & Execute an infinite loop \\\hline
|
991 |
39 |
dgisselq |
{\tt CLRF.NZ Rx }
|
992 |
|
|
& {\tt XOR.NZ Rx,Rx}
|
993 |
21 |
dgisselq |
& Clear Rx, and flags, if the Z-bit is not set \\\hline
|
994 |
39 |
dgisselq |
{\tt CLR Rx }
|
995 |
|
|
& {\tt LDI \$0,Rx}
|
996 |
21 |
dgisselq |
& Clears Rx, leaves flags untouched. This instruction cannot be
|
997 |
|
|
conditional. \\\hline
|
998 |
39 |
dgisselq |
{\tt EXCH.W Rx }
|
999 |
|
|
& {\tt ROL \$16,Rx}
|
1000 |
21 |
dgisselq |
& Exchanges the top and bottom 16--bit words of Rx \\\hline
|
1001 |
39 |
dgisselq |
{\tt HALT }
|
1002 |
|
|
& {\tt Or \$SLEEP,CC}
|
1003 |
|
|
& This only works when issued in interrupt/supervisor mode. In user
|
1004 |
|
|
mode this is simply a wait until interrupt instruction. \\\hline
|
1005 |
69 |
dgisselq |
{\tt INT } & {\tt LDI \$0,CC} & This is also known as a trap instruction\\\hline
|
1006 |
39 |
dgisselq |
{\tt IRET}
|
1007 |
|
|
& {\tt OR \$GIE,CC}
|
1008 |
|
|
& Also known as an RTU instruction (Return to Userspace) \\\hline
|
1009 |
92 |
dgisselq |
{\tt JMP R6+\$Offset}
|
1010 |
|
|
& {\tt MOV \$Offset(R6),PC}
|
1011 |
21 |
dgisselq |
& \\\hline
|
1012 |
69 |
dgisselq |
{\tt LJMP \$Addr}
|
1013 |
|
|
& \parbox[t]{1.5in}{\tt LOD (PC),PC \\ {\em Address }}
|
1014 |
|
|
& Although this only works for an unconditional jump, and it only
|
1015 |
|
|
works in a Von Neumann architecture, this instruction combination makes
|
1016 |
|
|
for a nice combination that can be adjusted by a linker at a later
|
1017 |
|
|
time.\\\hline
|
1018 |
92 |
dgisselq |
{\tt JSR PC+\$Offset }
|
1019 |
|
|
& \parbox[t]{1.5in}{\tt MOV \$1+PC,R0 \\ ADD \$Offset,PC}
|
1020 |
69 |
dgisselq |
& This is similar to the jump and link instructions from other
|
1021 |
|
|
architectures, save only that it requires a specific link
|
1022 |
|
|
instruction, also known as the {\tt MOV} instruction on the
|
1023 |
|
|
left.\\\hline
|
1024 |
|
|
\end{tabular}
|
1025 |
|
|
\caption{Derived Instructions}\label{tbl:derived-1}
|
1026 |
|
|
\end{center}\end{table}
|
1027 |
|
|
\begin{table}\begin{center}
|
1028 |
|
|
\begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
|
1029 |
|
|
Mapped & Actual & Notes \\\hline
|
1030 |
39 |
dgisselq |
{\tt LDI.l \$val,Rx }
|
1031 |
|
|
& \parbox[t]{1.8in}{\tt LDIHI (\$val$>>$16)\&0x0ffff, Rx \\
|
1032 |
|
|
LDILO (\$val\&0x0ffff),Rx}
|
1033 |
69 |
dgisselq |
& \parbox[t]{3.0in}{Sadly, there's not enough instruction
|
1034 |
21 |
dgisselq |
space to load a complete immediate value into any register.
|
1035 |
|
|
Therefore, fully loading any register takes two cycles.
|
1036 |
|
|
The LDIHI (load immediate high) and LDILO (load immediate low)
|
1037 |
69 |
dgisselq |
instructions have been created to facilitate this.
|
1038 |
|
|
\\
|
1039 |
|
|
This is also the appropriate means for setting a register value
|
1040 |
|
|
to an arbitrary 32--bit value in a post--assembly link
|
1041 |
|
|
operation.}\\\hline
|
1042 |
39 |
dgisselq |
{\tt LOD.b \$addr,Rx}
|
1043 |
|
|
& \parbox[t]{1.5in}{\tt %
|
1044 |
21 |
dgisselq |
LDI \$addr,Ra \\
|
1045 |
|
|
LDI \$addr,Rb \\
|
1046 |
|
|
LSR \$2,Ra \\
|
1047 |
|
|
AND \$3,Rb \\
|
1048 |
|
|
LOD (Ra),Rx \\
|
1049 |
|
|
LSL \$3,Rb \\
|
1050 |
|
|
SUB \$32,Rb \\
|
1051 |
|
|
ROL Rb,Rx \\
|
1052 |
|
|
AND \$0ffh,Rx}
|
1053 |
|
|
& \parbox[t]{3in}{This CPU is designed for 32--bit word
|
1054 |
|
|
length instructions. Byte addressing is not supported by the CPU or
|
1055 |
|
|
the bus, so it therefore takes more work to do.
|
1056 |
|
|
|
1057 |
|
|
Note also that in this example, \$Addr is a byte-wise address, where
|
1058 |
24 |
dgisselq |
all other addresses in this document are 32-bit wordlength addresses.
|
1059 |
|
|
For this reason,
|
1060 |
21 |
dgisselq |
we needed to drop the bottom two bits. This also limits the address
|
1061 |
|
|
space of character accesses using this method from 16 MB down to 4MB.}
|
1062 |
|
|
\\\hline
|
1063 |
39 |
dgisselq |
\parbox[t]{1.5in}{\tt LSL \$1,Rx\\ LSLC \$1,Ry}
|
1064 |
|
|
& \parbox[t]{1.5in}{\tt LSL \$1,Ry \\
|
1065 |
21 |
dgisselq |
LSL \$1,Rx \\
|
1066 |
|
|
OR.C \$1,Ry}
|
1067 |
|
|
& Logical shift left with carry. Note that the
|
1068 |
|
|
instruction order is now backwards, to keep the conditions valid.
|
1069 |
33 |
dgisselq |
That is, LSL sets the carry flag, so if we did this the other way
|
1070 |
21 |
dgisselq |
with Rx before Ry, then the condition flag wouldn't have been right
|
1071 |
|
|
for an OR correction at the end. \\\hline
|
1072 |
39 |
dgisselq |
\parbox[t]{1.5in}{\tt LSR \$1,Rx \\ LSRC \$1,Ry}
|
1073 |
|
|
& \parbox[t]{1.5in}{\tt CLR Rz \\
|
1074 |
21 |
dgisselq |
LSR \$1,Ry \\
|
1075 |
|
|
LDIHI.C \$8000h,Rz \\
|
1076 |
|
|
LSR \$1,Rx \\
|
1077 |
|
|
OR Rz,Rx}
|
1078 |
|
|
& Logical shift right with carry \\\hline
|
1079 |
39 |
dgisselq |
{\tt NEG Rx} & \parbox[t]{1.5in}{\tt XOR \$-1,Rx \\ ADD \$1,Rx} & \\\hline
|
1080 |
|
|
{\tt NEG.C Rx} & \parbox[t]{1.5in}{\tt MOV.C \$-1+Rx,Rx\\XOR.C \$-1,Rx} & \\\hline
|
1081 |
|
|
{\tt NOOP} & {\tt NOOP} & While there are many
|
1082 |
21 |
dgisselq |
operations that do nothing, such as MOV Rx,Rx, or OR \$0,Rx, these
|
1083 |
|
|
operations have consequences in that they might stall the bus if
|
1084 |
|
|
Rx isn't ready yet. For this reason, we have a dedicated NOOP
|
1085 |
|
|
instruction. \\\hline
|
1086 |
39 |
dgisselq |
{\tt NOT Rx } & {\tt XOR \$-1,Rx } & \\\hline
|
1087 |
|
|
{\tt POP Rx }
|
1088 |
69 |
dgisselq |
& \parbox[t]{1.5in}{\tt LOD \$(SP),Rx \\ ADD \$1,SP}
|
1089 |
|
|
& \\\hline
|
1090 |
36 |
dgisselq |
\end{tabular}
|
1091 |
|
|
\caption{Derived Instructions, continued}\label{tbl:derived-2}
|
1092 |
|
|
\end{center}\end{table}
|
1093 |
|
|
\begin{table}\begin{center}
|
1094 |
|
|
\begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
|
1095 |
39 |
dgisselq |
{\tt PUSH Rx}
|
1096 |
69 |
dgisselq |
& \parbox[t]{1.5in}{\hbox{\tt SUB \$1,SP}
|
1097 |
|
|
\hbox{\tt STO Rx,\$(SP)}}
|
1098 |
39 |
dgisselq |
& Note that for pipelined operation, it helps to coalesce all the
|
1099 |
|
|
{\tt SUB}'s into one command, and place the {\tt STO}'s right
|
1100 |
69 |
dgisselq |
after each other. Further, to avoid a pipeline stall, the
|
1101 |
|
|
immediate value for the store must be zero.
|
1102 |
|
|
\\\hline
|
1103 |
39 |
dgisselq |
{\tt PUSH Rx-Ry}
|
1104 |
69 |
dgisselq |
& \parbox[t]{1.5in}{\tt SUB \$$n$,SP \\
|
1105 |
|
|
STO Rx,\$(SP)
|
1106 |
36 |
dgisselq |
\ldots \\
|
1107 |
69 |
dgisselq |
STO Ry,\$$\left(n-1\right)$(SP)}
|
1108 |
36 |
dgisselq |
& Multiple pushes at once only need the single subtract from the
|
1109 |
|
|
stack pointer. This derived instruction is analogous to a similar one
|
1110 |
|
|
on the Motorola 68k architecture, although the Zip Assembler
|
1111 |
39 |
dgisselq |
does not support this instruction (yet). This instruction
|
1112 |
|
|
also supports pipelined memory access.\\\hline
|
1113 |
|
|
{\tt RESET}
|
1114 |
|
|
& \parbox[t]{1in}{\tt STO \$1,\$watchdog(R12)\\NOOP\\NOOP}
|
1115 |
|
|
& This depends upon the peripheral base address being
|
1116 |
69 |
dgisselq |
preloaded into R12.
|
1117 |
21 |
dgisselq |
|
1118 |
|
|
Another opportunity might be to jump to the reset address from within
|
1119 |
39 |
dgisselq |
supervisor mode.\\\hline
|
1120 |
69 |
dgisselq |
{\tt RET} & {\tt MOV R0,PC}
|
1121 |
|
|
& This depends upon the form of the {\tt JSR} given on the previous
|
1122 |
|
|
page that stores the return address into R0.
|
1123 |
21 |
dgisselq |
\\\hline
|
1124 |
39 |
dgisselq |
{\tt STEP Rr,Rt}
|
1125 |
|
|
& \parbox[t]{1.5in}{\tt LSR \$1,Rr \\ XOR.C Rt,Rr}
|
1126 |
21 |
dgisselq |
& Step a Galois implementation of a Linear Feedback Shift Register, Rr,
|
1127 |
|
|
using taps Rt \\\hline
|
1128 |
39 |
dgisselq |
{\tt STO.b Rx,\$addr}
|
1129 |
|
|
& \parbox[t]{1.5in}{\tt %
|
1130 |
21 |
dgisselq |
LDI \$addr,Ra \\
|
1131 |
|
|
LDI \$addr,Rb \\
|
1132 |
|
|
LSR \$2,Ra \\
|
1133 |
|
|
AND \$3,Rb \\
|
1134 |
|
|
SUB \$32,Rb \\
|
1135 |
|
|
LOD (Ra),Ry \\
|
1136 |
|
|
AND \$0ffh,Rx \\
|
1137 |
39 |
dgisselq |
AND \~{}\$0ffh,Ry \\
|
1138 |
21 |
dgisselq |
ROL Rb,Rx \\
|
1139 |
|
|
OR Rx,Ry \\
|
1140 |
|
|
STO Ry,(Ra) }
|
1141 |
|
|
& \parbox[t]{3in}{This CPU and its bus are {\em not} optimized
|
1142 |
|
|
for byte-wise operations.
|
1143 |
|
|
|
1144 |
|
|
Note that in this example, \$addr is a
|
1145 |
|
|
byte-wise address, whereas in all of our other examples it is a
|
1146 |
|
|
32-bit word address. This also limits the address space
|
1147 |
|
|
of character accesses from 16 MB down to 4MB.
|
1148 |
|
|
Further, this instruction implies a byte ordering,
|
1149 |
|
|
such as big or little endian.} \\\hline
|
1150 |
39 |
dgisselq |
{\tt SWAP Rx,Ry }
|
1151 |
69 |
dgisselq |
& \parbox[t]{1.5in}{\tt XOR Ry,Rx \\ XOR Rx,Ry \\ XOR Ry,Rx}
|
1152 |
21 |
dgisselq |
& While no extra registers are needed, this example
|
1153 |
|
|
does take 3 clocks. \\\hline
|
1154 |
69 |
dgisselq |
\end{tabular}
|
1155 |
|
|
\caption{Derived Instructions, continued}\label{tbl:derived-3}
|
1156 |
|
|
\end{center}\end{table}
|
1157 |
|
|
\begin{table}\begin{center}
|
1158 |
|
|
\begin{tabular}{p{1.4in}p{1.5in}p{3in}}\\\hline
|
1159 |
39 |
dgisselq |
{\tt TRAP \#X}
|
1160 |
|
|
& \parbox[t]{1.5in}{\tt LDI \$x,R0 \\ AND \~{}\$GIE,CC }
|
1161 |
36 |
dgisselq |
& This works because whenever a user lowers the \$GIE flag, it sets
|
1162 |
|
|
a TRAP bit within the CC register. Therefore, upon entering the
|
1163 |
|
|
supervisor state, the CPU only need check this bit to know that it
|
1164 |
|
|
got there via a TRAP. The trap could be made conditional by making
|
1165 |
|
|
the LDI and the AND conditional. In that case, the assembler would
|
1166 |
|
|
quietly turn the LDI instruction into an LDILO and LDIHI pair,
|
1167 |
37 |
dgisselq |
but the effect would be the same. \\\hline
|
1168 |
69 |
dgisselq |
{\tt TS Rx,Ry,(Rz)}
|
1169 |
|
|
& \hbox{\tt LDI 1,Rx}
|
1170 |
|
|
\hbox{\tt LOCK}
|
1171 |
|
|
\hbox{\tt LOD (Rz),Ry}
|
1172 |
|
|
\hbox{\tt STO Rx,(Rz)}
|
1173 |
|
|
& A test and set instruction. The {\tt LOCK} instruction ensures
|
1174 |
|
|
that the next two instructions lock the bus between the instructions,
|
1175 |
|
|
so no one else can use it. This guarantees that the operation is
|
1176 |
|
|
atomic.
|
1177 |
|
|
\\\hline
|
1178 |
39 |
dgisselq |
{\tt TST Rx}
|
1179 |
|
|
& {\tt TST \$-1,Rx}
|
1180 |
21 |
dgisselq |
& Set the condition codes based upon Rx. Could also do a CMP \$0,Rx,
|
1181 |
|
|
ADD \$0,Rx, SUB \$0,Rx, etc, AND \$-1,Rx, etc. The TST and CMP
|
1182 |
|
|
approaches won't stall future pipeline stages looking for the value
|
1183 |
69 |
dgisselq |
of Rx. (Future versions of the assembler may shorten this to a
|
1184 |
|
|
{\tt TST Rx} instruction.)\\\hline
|
1185 |
39 |
dgisselq |
{\tt WAIT}
|
1186 |
|
|
& {\tt Or \$GIE | \$SLEEP,CC}
|
1187 |
|
|
& Wait until the next interrupt, then jump to supervisor/interrupt
|
1188 |
|
|
mode. \\\hline
|
1189 |
21 |
dgisselq |
\end{tabular}
|
1190 |
36 |
dgisselq |
\caption{Derived Instructions, continued}\label{tbl:derived-4}
|
1191 |
21 |
dgisselq |
\end{center}\end{table}
|
1192 |
69 |
dgisselq |
|
1193 |
|
|
\section{Interrupt Handling}
|
1194 |
|
|
The Zip CPU does not maintain any interrupt vector tables. If an interrupt
|
1195 |
|
|
takes place, the CPU simply switches to interrupt mode. The supervisor code
|
1196 |
|
|
continues in this interrupt mode from where it left off before, after
|
1197 |
|
|
executing a return to userspace {\tt RTU} instruction.
|
1198 |
|
|
|
1199 |
|
|
At this point, the supervisor code needs to determine first whether an
|
1200 |
|
|
interrupt has occurred, and then whether it is in interrupt mode due to
|
1201 |
|
|
an exception and handle each case appropriately.
|
1202 |
|
|
|
1203 |
21 |
dgisselq |
\section{Pipeline Stages}
|
1204 |
32 |
dgisselq |
As mentioned in the introduction, and highlighted in Fig.~\ref{fig:cpu},
|
1205 |
|
|
the Zip CPU supports a five stage pipeline.
|
1206 |
21 |
dgisselq |
\begin{enumerate}
|
1207 |
36 |
dgisselq |
\item {\bf Prefetch}: Reads instruction from memory and into a cache, if so
|
1208 |
|
|
configured. This
|
1209 |
21 |
dgisselq |
stage is actually pipelined itself, and so it will stall if the PC
|
1210 |
|
|
ever changes. Stalls are also created here if the instruction isn't
|
1211 |
|
|
in the prefetch cache.
|
1212 |
36 |
dgisselq |
|
1213 |
69 |
dgisselq |
The Zip CPU supports one of three prefetch methods, depending upon a
|
1214 |
|
|
flag set at build time within the {\tt cpudefs.v} file. The simplest
|
1215 |
|
|
is a non--cached implementation of a prefetch. This implementation is
|
1216 |
|
|
fairly small, and ideal for users of the Zip CPU who need the extra
|
1217 |
|
|
space on the FPGA fabric. However, because this non--cached version
|
1218 |
|
|
has no cache, the maximum number of instructions per clock is limited
|
1219 |
|
|
to about one instruction every five clocks.
|
1220 |
36 |
dgisselq |
|
1221 |
|
|
The second prefetch module is a pipelined prefetch with a cache. This
|
1222 |
|
|
module tries to keep the instruction address within a window of valid
|
1223 |
|
|
instruction addresses. While effective, it is not a traditional
|
1224 |
|
|
cache implementation. One unique feature of this cache implementation,
|
1225 |
|
|
however, is that it can be cleared in a single clock. A disappointing
|
1226 |
|
|
feature, though, was that it needs an extra internal pipeline stage
|
1227 |
|
|
to be implemented.
|
1228 |
|
|
|
1229 |
69 |
dgisselq |
The third prefetch and cache module implements a more traditional cache.
|
1230 |
|
|
While the resulting code tends to be twice as fast as the pipelined
|
1231 |
|
|
cache architecture, this implementation uses a large amount of
|
1232 |
|
|
distributed FPGA RAM to be successful. This then inflates the Zip CPU's
|
1233 |
|
|
FPGA usage statistics.
|
1234 |
|
|
|
1235 |
|
|
\item {\bf Decode}: Decodes an instruction into OpCode, register(s) to read,
|
1236 |
|
|
and immediate offset. This stage also determines whether the flags
|
1237 |
|
|
will be set or whether the result will be written back.
|
1238 |
|
|
|
1239 |
21 |
dgisselq |
\item {\bf Read Operands}: Read registers and apply any immediate values to
|
1240 |
24 |
dgisselq |
them. There is no means of detecting or flagging arithmetic overflow
|
1241 |
|
|
or carry when adding the immediate to the operand. This stage will
|
1242 |
|
|
stall if any source operand is pending.
|
1243 |
69 |
dgisselq |
|
1244 |
|
|
\item Split into one of four tracks: An {\bf ALU} track which will accomplish
|
1245 |
|
|
a simple instruction, the {\bf MemOps} stage which handles {\tt LOD}
|
1246 |
|
|
(load) and {\tt STO} (store) instructions, the {\bf divide} unit,
|
1247 |
|
|
and the {\bf floating point} unit.
|
1248 |
21 |
dgisselq |
\begin{itemize}
|
1249 |
69 |
dgisselq |
\item Loads will stall instructions in the decode stage, and hence the
|
1250 |
|
|
entire pipeline until complete, lest a register be read in
|
1251 |
|
|
the read operands stage only to be updated unseen by the
|
1252 |
|
|
Load.
|
1253 |
|
|
\item Condition codes are available upon completion of the ALU,
|
1254 |
|
|
divide, or FPU stage.
|
1255 |
|
|
\item Issuing a non--pipelined memory instruction to the memory unit
|
1256 |
|
|
while the memory unit is busy will stall the entire pipeline.
|
1257 |
21 |
dgisselq |
\end{itemize}
|
1258 |
32 |
dgisselq |
\item {\bf Write-Back}: Conditionally write back the result to the register
|
1259 |
69 |
dgisselq |
set, applying the condition. This routine is quad-entrant: either the
|
1260 |
|
|
ALU, the memory, the divide, or the FPU may write back a register.
|
1261 |
|
|
The only design rule is that no more than a single register may be
|
1262 |
|
|
written back in any given clock.
|
1263 |
21 |
dgisselq |
\end{enumerate}
|
1264 |
|
|
|
1265 |
24 |
dgisselq |
The Zip CPU does not support out of order execution. Therefore, if the memory
|
1266 |
69 |
dgisselq |
unit stalls, every other instruction stalls. The same is true for divide or
|
1267 |
|
|
floating point instructions--all other instructions will stall while waiting
|
1268 |
|
|
for these to complete. Memory stores, however, can take place concurrently
|
1269 |
|
|
with non--memory operations, although memory reads (loads) cannot.
|
1270 |
24 |
dgisselq |
|
1271 |
32 |
dgisselq |
\section{Pipeline Stalls}
|
1272 |
|
|
The processing pipeline can and will stall for a variety of reasons. Some of
|
1273 |
|
|
these are obvious, some less so. These reasons are listed below:
|
1274 |
|
|
\begin{itemize}
|
1275 |
|
|
\item When the prefetch cache is exhausted
|
1276 |
21 |
dgisselq |
|
1277 |
36 |
dgisselq |
This reason should be obvious. If the prefetch cache doesn't have the
|
1278 |
69 |
dgisselq |
instruction in memory, the entire pipeline must stall until an instruction
|
1279 |
|
|
can be made ready. In the case of the {\tt pipefetch} windowed approach
|
1280 |
|
|
to the prefetch cache, this means the pipeline will stall until enough of the
|
1281 |
|
|
prefetch cache is loaded to support the next instruction. In the case
|
1282 |
|
|
of the more traditional {\tt pfcache} approach, the entire cache line must
|
1283 |
|
|
fill before instruction execution can continue.
|
1284 |
21 |
dgisselq |
|
1285 |
32 |
dgisselq |
\item While waiting for the pipeline to load following any taken branch, jump,
|
1286 |
69 |
dgisselq |
return from interrupt or switch to interrupt context (4 stall cycles)
|
1287 |
32 |
dgisselq |
|
1288 |
68 |
dgisselq |
Fig.~\ref{fig:bcstalls}
|
1289 |
|
|
\begin{figure}\begin{center}
|
1290 |
|
|
\includegraphics[width=3.5in]{../gfx/bc.eps}
|
1291 |
69 |
dgisselq |
\caption{A conditional branch generates 4 stall cycles}\label{fig:bcstalls}
|
1292 |
68 |
dgisselq |
\end{center}\end{figure}
|
1293 |
|
|
illustrates the situation for a conditional branch. In this case, the branch
|
1294 |
69 |
dgisselq |
instruction, {\tt BC}, is nominally followed by instructions {\tt I1} and so
|
1295 |
68 |
dgisselq |
forth. However, since the branch is taken, the next instruction must be
|
1296 |
|
|
{\tt IA}. Therefore, the pipeline needs to be cleared and reloaded.
|
1297 |
|
|
Given that there are five stages to the pipeline, that accounts
|
1298 |
69 |
dgisselq |
for the four stalls. (Were the {\tt pipefetch} cache chosen, there would
|
1299 |
|
|
be another stall internal to the {\tt pipefetch} cache.)
|
1300 |
32 |
dgisselq |
|
1301 |
92 |
dgisselq |
The Zip CPU handles the {\tt ADD \$X,PC} and
|
1302 |
36 |
dgisselq |
{\tt LDI \$X,PC} instructions specially, however. These instructions, when
|
1303 |
69 |
dgisselq |
not conditioned on the flags, can execute with only a single stall cycle,
|
1304 |
|
|
such as is shown in Fig.~\ref{fig:branch}.\footnote{Note that when using the
|
1305 |
|
|
{\tt pipefetch} cache, this requires an additional stall cycle due to that
|
1306 |
|
|
cache's implementation.}
|
1307 |
68 |
dgisselq |
\begin{figure}\begin{center}
|
1308 |
69 |
dgisselq |
\includegraphics[width=4in]{../gfx/bra.eps} %0.4in per clock
|
1309 |
|
|
\caption{An expedited branch costs a single stall cycle}\label{fig:branch}
|
1310 |
68 |
dgisselq |
\end{center}\end{figure}
|
1311 |
|
|
In this example, {\tt BR} is a branch always taken, {\tt I1} is the instruction
|
1312 |
|
|
following the branch in memory, while {\tt IA} is the first instruction at the
|
1313 |
|
|
branch address. ({\tt CLR} denotes a clear--pipeline operation, and does
|
1314 |
|
|
not represent any instruction.)
|
1315 |
36 |
dgisselq |
|
1316 |
32 |
dgisselq |
\item When reading from a prior register while also adding an immediate offset
|
1317 |
|
|
\begin{enumerate}
|
1318 |
|
|
\item\ {\tt OPCODE ?,RA}
|
1319 |
|
|
\item\ {\em (stall)}
|
1320 |
|
|
\item\ {\tt OPCODE I+RA,RB}
|
1321 |
|
|
\end{enumerate}
|
1322 |
|
|
|
1323 |
|
|
Since the addition of the immediate register within OpB decoding gets applied
|
1324 |
|
|
during the read operand stage so that it can be nicely settled before the ALU,
|
1325 |
|
|
any instruction that will write back an operand must be separated from the
|
1326 |
|
|
opcode that will read and apply an immediate offset by one instruction. The
|
1327 |
|
|
good news is that this stall can easily be mitigated by proper scheduling.
|
1328 |
36 |
dgisselq |
That is, any instruction that does not add an immediate to {\tt RA} may be
|
1329 |
|
|
scheduled into the stall slot.
|
1330 |
32 |
dgisselq |
|
1331 |
69 |
dgisselq |
This is also the reason why, when setting up a stack frame, the top of the
|
1332 |
|
|
stack frame is used first: it eliminates this stall cycle. Hence, to save
|
1333 |
|
|
registers at the top of a procedure, one would write:
|
1334 |
32 |
dgisselq |
\begin{enumerate}
|
1335 |
69 |
dgisselq |
\item\ {\tt SUB 2,SP}
|
1336 |
|
|
\item\ {\tt STO R1,(SP)}
|
1337 |
|
|
\item\ {\tt STO R2,1(SP)}
|
1338 |
32 |
dgisselq |
\end{enumerate}
|
1339 |
69 |
dgisselq |
Had {\tt R1} instead been stored at {\tt 1(SP)} as the top of the stack,
|
1340 |
|
|
there would've been an extra stall in setting up the stack frame.
|
1341 |
32 |
dgisselq |
|
1342 |
|
|
\item When reading from the CC register after setting the flags
|
1343 |
|
|
\begin{enumerate}
|
1344 |
69 |
dgisselq |
\item\ {\tt ALUOP RA,RB} {\em ; Ex: a compare opcode}
|
1345 |
36 |
dgisselq |
\item\ {\em (stall)}
|
1346 |
32 |
dgisselq |
\item\ {\tt TST sys.ccv,CC}
|
1347 |
|
|
\item\ {\tt BZ somewhere}
|
1348 |
|
|
\end{enumerate}
|
1349 |
|
|
|
1350 |
68 |
dgisselq |
The reason for this stall is simply performance: many of the flags are
|
1351 |
|
|
determined via combinatorial logic {\em during} the writeback cycle.
|
1352 |
|
|
Trying to then place these into the input for one of the operands for an
|
1353 |
|
|
ALU instruction during the same cycle
|
1354 |
32 |
dgisselq |
created a time delay loop that would no longer execute in a single 100~MHz
|
1355 |
|
|
clock cycle. (The time delay of the multiply within the ALU wasn't helping
|
1356 |
|
|
either \ldots).
|
1357 |
|
|
|
1358 |
33 |
dgisselq |
This stall may be eliminated via proper scheduling, by placing an instruction
|
1359 |
|
|
that does not set flags in between the ALU operation and the instruction
|
1360 |
|
|
that references the CC register. For example, {\tt MOV \$addr+PC,uPC}
|
1361 |
|
|
followed by an {\tt RTU} ({\tt OR \$GIE,CC}) instruction will not incur
|
1362 |
|
|
this stall, whereas an {\tt OR \$BREAKEN,CC} followed by an {\tt OR \$STEP,CC}
|
1363 |
68 |
dgisselq |
will incur the stall, while a {\tt LDI \$BREAKEN|\$STEP,CC} will not since
|
1364 |
69 |
dgisselq |
it doesn't read the condition codes before executing.
|
1365 |
33 |
dgisselq |
|
1366 |
32 |
dgisselq |
\item When waiting for a memory read operation to complete
|
1367 |
|
|
\begin{enumerate}
|
1368 |
|
|
\item\ {\tt LOD address,RA}
|
1369 |
36 |
dgisselq |
\item\ {\em (multiple stalls, bus dependent, 4 clocks best)}
|
1370 |
32 |
dgisselq |
\item\ {\tt OPCODE I+RA,RB}
|
1371 |
|
|
\end{enumerate}
|
1372 |
|
|
|
1373 |
36 |
dgisselq |
Remember, the Zip CPU does not support out of order execution. Therefore,
|
1374 |
32 |
dgisselq |
anytime the memory unit becomes busy both the memory unit and the ALU must
|
1375 |
68 |
dgisselq |
stall until the memory unit is cleared. This is illustrated in
|
1376 |
|
|
Fig.~\ref{fig:memrd},
|
1377 |
|
|
\begin{figure}\begin{center}
|
1378 |
69 |
dgisselq |
\includegraphics[width=5.6in]{../gfx/memrd.eps}
|
1379 |
68 |
dgisselq |
\caption{Pipeline handling of a load instruction}\label{fig:memrd}
|
1380 |
|
|
\end{center}\end{figure}
|
1381 |
|
|
since it is especially true of a load
|
1382 |
69 |
dgisselq |
instruction, which must still write its operand back to the register file.
|
1383 |
|
|
Further, note that on a pipelined memory operation, the instruction must
|
1384 |
|
|
stall in the decode operand stage, lest it try to read a result from the
|
1385 |
|
|
register file before the load result has been written to it. Finally, note
|
1386 |
|
|
that there is an extra stall at the end of the memory cycle, so that
|
1387 |
|
|
the memory unit will be idle for two clocks before an instruction will be
|
1388 |
|
|
accepted into the ALU. Store instructions are different, as shown in
|
1389 |
|
|
Fig.~\ref{fig:memwr},
|
1390 |
68 |
dgisselq |
\begin{figure}\begin{center}
|
1391 |
69 |
dgisselq |
\includegraphics[width=4in]{../gfx/memwr.eps}
|
1392 |
68 |
dgisselq |
\caption{Pipeline handling of a store instruction}\label{fig:memwr}
|
1393 |
|
|
\end{center}\end{figure}
|
1394 |
|
|
since they can be busy with the bus without impacting later write back
|
1395 |
|
|
pipeline stages. Hence, only loads stall the pipeline.
|
1396 |
32 |
dgisselq |
|
1397 |
68 |
dgisselq |
This, of course, also assumes that the memory being accessed is a single cycle
|
1398 |
|
|
memory and that there are no stalls to get to the memory.
|
1399 |
32 |
dgisselq |
Slower memories, such as the Quad SPI flash, will take longer--perhaps even
|
1400 |
33 |
dgisselq |
as long as forty clocks. During this time the CPU and the external bus
|
1401 |
68 |
dgisselq |
will be busy, and unable to do anything else. Likewise, if it takes a couple
|
1402 |
|
|
of clock cycles for the bus to be free, as shown in both Figs.~\ref{fig:memrd}
|
1403 |
|
|
and~\ref{fig:memwr}, there will be stalls.
|
1404 |
32 |
dgisselq |
|
1405 |
|
|
\item Memory operation followed by a memory operation
|
1406 |
|
|
\begin{enumerate}
|
1407 |
|
|
\item\ {\tt STO RA,address}
|
1408 |
36 |
dgisselq |
\item\ {\em (multiple stalls, bus dependent, 4 clocks best)}
|
1409 |
32 |
dgisselq |
\item\ {\tt LOD address,RB}
|
1410 |
36 |
dgisselq |
\item\ {\em (multiple stalls, bus dependent, 4 clocks best)}
|
1411 |
32 |
dgisselq |
\end{enumerate}
|
1412 |
|
|
|
1413 |
68 |
dgisselq |
In this case, the LOD instruction cannot start until the STO is finished,
|
1414 |
|
|
as illustrated by Fig.~\ref{fig:mstld}.
|
1415 |
|
|
\begin{figure}\begin{center}
|
1416 |
|
|
\includegraphics[width=5.5in]{../gfx/mstld.eps}
|
1417 |
|
|
\caption{Pipeline handling of a store followed by a load instruction}\label{fig:mstld}
|
1418 |
|
|
\end{center}\end{figure}
|
1419 |
32 |
dgisselq |
With proper scheduling, it is possible to do something in the ALU while the
|
1420 |
36 |
dgisselq |
memory unit is busy with the STO instruction, but otherwise this pipeline will
|
1421 |
68 |
dgisselq |
stall while waiting for it to complete before the load instruction can
|
1422 |
|
|
start.
|
1423 |
32 |
dgisselq |
|
1424 |
39 |
dgisselq |
The Zip CPU does have the capability of supporting pipelined memory access,
|
1425 |
|
|
but only under the following conditions: all accesses within the pipeline
|
1426 |
|
|
must all be reads or all be writes, all must use the same register for their
|
1427 |
|
|
address, and there can be no stalls or other instructions between pipelined
|
1428 |
|
|
memory access instructions. Further, the offset to memory must be increasing
|
1429 |
|
|
by one address each instruction. These conditions work well for saving or
|
1430 |
68 |
dgisselq |
storing registers to the stack. Indeed, if you noticed, both
|
1431 |
|
|
Fig.~\ref{fig:memrd} and Fig.~\ref{fig:memwr} illustrated pipelined memory
|
1432 |
|
|
accesses.
|
1433 |
36 |
dgisselq |
|
1434 |
32 |
dgisselq |
\end{itemize}
|
1435 |
|
|
|
1436 |
|
|
|
1437 |
21 |
dgisselq |
\chapter{Peripherals}\label{chap:periph}
|
1438 |
24 |
dgisselq |
|
1439 |
|
|
While the previous chapter describes a CPU in isolation, the Zip System
|
1440 |
|
|
includes a minimum set of peripherals as well. These peripherals are shown
|
1441 |
|
|
in Fig.~\ref{fig:zipsystem}
|
1442 |
|
|
\begin{figure}\begin{center}
|
1443 |
|
|
\includegraphics[width=3.5in]{../gfx/system.eps}
|
1444 |
|
|
\caption{Zip System Peripherals}\label{fig:zipsystem}
|
1445 |
|
|
\end{center}\end{figure}
|
1446 |
|
|
and described here. They are designed to make
|
1447 |
|
|
the Zip CPU more useful in an Embedded Operating System environment.
|
1448 |
|
|
|
1449 |
68 |
dgisselq |
\section{Interrupt Controller}\label{sec:pic}
|
1450 |
24 |
dgisselq |
|
1451 |
|
|
Perhaps the most important peripheral within the Zip System is the interrupt
|
1452 |
|
|
controller. While the Zip CPU itself can only handle one interrupt, and has
|
1453 |
|
|
only the one interrupt state: disabled or enabled, the interrupt controller
|
1454 |
|
|
can make things more interesting.
|
1455 |
|
|
|
1456 |
|
|
The Zip System interrupt controller module supports up to 15 interrupts, all
|
1457 |
|
|
controlled from one register. Bit~31 of the interrupt controller controls
|
1458 |
|
|
overall whether interrupts are enabled (1'b1) or disabled (1'b0). Bits~16--30
|
1459 |
68 |
dgisselq |
control whether individual interrupts are enabled (1'b1) or disabled (1'b0).
|
1460 |
24 |
dgisselq |
Bit~15 is an indicator showing whether or not any interrupt is active, and
|
1461 |
|
|
bits~0--14 indicate whether or not an individual interrupt is active.
|
1462 |
|
|
|
1463 |
|
|
The interrupt controller has been designed so that bits can be controlled
|
1464 |
|
|
individually without having any knowledge of the rest of the controller
|
1465 |
|
|
setting. To enable an interrupt, write to the register with the high order
|
1466 |
|
|
global enable bit set and the respective interrupt enable bit set. No other
|
1467 |
|
|
bits will be affected. To disable an interrupt, write to the register with
|
1468 |
|
|
the high order global enable bit cleared and the respective interrupt enable
|
1469 |
|
|
bit set. To clear an interrupt, write a `1' to that interrupt's status pin.
|
1470 |
|
|
Zeros written to the register have no effect, save that a zero written to the
|
1471 |
|
|
master enable will disable all interrupts.
|
1472 |
|
|
|
1473 |
|
|
As an example, suppose you wished to enable interrupt \#4. You would then
|
1474 |
|
|
write to the register a {\tt 0x80100010} to enable interrupt \#4 and to clear
|
1475 |
|
|
any past active state. When you later wish to disable this interrupt, you would
|
1476 |
|
|
write a {\tt 0x00100010} to the register. As before, this both disables the
|
1477 |
|
|
interrupt and clears the active indicator. This also has the side effect of
|
1478 |
|
|
disabling all interrupts, so a second write of {\tt 0x80000000} may be necessary
|
1479 |
|
|
to re-enable any other interrupts.
|
1480 |
|
|
|
1481 |
|
|
The Zip System currently hosts two interrupt controllers, a primary and a
|
1482 |
69 |
dgisselq |
secondary. The primary interrupt controller has one (or more) interrupt line(s)
|
1483 |
|
|
which may come from an external interrupt source, and one interrupt line from
|
1484 |
|
|
the secondary controller. Other primary interrupts include the system timers,
|
1485 |
|
|
the jiffies interrupt, and the manual cache interrupt. The secondary interrupt
|
1486 |
|
|
controller maintains an interrupt state for all of the processor accounting
|
1487 |
|
|
counters.
|
1488 |
24 |
dgisselq |
|
1489 |
21 |
dgisselq |
\section{Counter}
|
1490 |
|
|
|
1491 |
|
|
The Zip Counter is a very simple counter: it just counts. It cannot be
|
1492 |
|
|
halted. When it rolls over, it issues an interrupt. Writing a value to the
|
1493 |
|
|
counter just sets the current value, and it starts counting again from that
|
1494 |
|
|
value.
|
1495 |
|
|
|
1496 |
|
|
Eight counters are implemented in the Zip System for process accounting.
|
1497 |
|
|
This may change in the future, as nothing as yet uses these counters.
|
1498 |
|
|
|
1499 |
|
|
\section{Timer}
|
1500 |
|
|
|
1501 |
|
|
The Zip Timer is also very simple: it simply counts down to zero. When it
|
1502 |
|
|
transitions from a one to a zero it creates an interrupt.
|
1503 |
|
|
|
1504 |
|
|
Writing any non-zero value to the timer starts the timer. If the high order
|
1505 |
|
|
bit is set when writing to the timer, the timer becomes an interval timer and
|
1506 |
|
|
reloads its last start time on any interrupt. Hence, to mark seconds, one
|
1507 |
|
|
might set the timer to 100~million (the number of clocks per second), and
|
1508 |
|
|
set the high bit. Ever after, the timer will interrupt the CPU once per
|
1509 |
24 |
dgisselq |
second (assuming a 100~MHz clock). This reload capability also limits the
|
1510 |
68 |
dgisselq |
maximum timer value to $2^{31}-1$ (about 21~seconds using a 100~MHz clock),
|
1511 |
|
|
rather than $2^{32}-1$.
|
1512 |
21 |
dgisselq |
|
1513 |
|
|
\section{Watchdog Timer}
|
1514 |
|
|
|
1515 |
|
|
The watchdog timer is no different from any of the other timers, save for one
|
1516 |
|
|
critical difference: the interrupt line from the watchdog
|
1517 |
|
|
timer is tied to the reset line of the CPU. Hence writing a `1' to the
|
1518 |
|
|
watchdog timer will always reset the CPU.
|
1519 |
32 |
dgisselq |
To stop the Watchdog timer, write a `0' to it. To start it,
|
1520 |
21 |
dgisselq |
write any other number to it---as with the other timers.
|
1521 |
|
|
|
1522 |
|
|
While the watchdog timer supports interval mode, it doesn't make as much sense
|
1523 |
|
|
as it did with the other timers.
|
1524 |
|
|
|
1525 |
68 |
dgisselq |
\section{Bus Watchdog}
|
1526 |
|
|
There is an additional watchdog timer on the Wishbone bus. This timer,
|
1527 |
|
|
however, is hardware configured and not software configured. The timer is
|
1528 |
|
|
reset at the beginning of any bus transaction, and only counts clocks during
|
1529 |
|
|
such bus transactions. If the bus transaction takes longer than the number
|
1530 |
|
|
of counts the timer allots, it will raise a bus error flag to terminate the
|
1531 |
|
|
transaction. This is useful in the case of any peripherals that are
|
1532 |
|
|
misbehaving. If the bus watchdog terminates a bus transaction, the CPU may
|
1533 |
|
|
then read from its port to find out which memory location created the problem.
|
1534 |
|
|
|
1535 |
|
|
Aside from its unusual configuration, the bus watchdog is just another
|
1536 |
69 |
dgisselq |
implementation of the fundamental timer described above--stripped down
|
1537 |
|
|
for simplicity.
|
1538 |
68 |
dgisselq |
|
1539 |
21 |
dgisselq |
\section{Jiffies}
|
1540 |
|
|
|
1541 |
|
|
This peripheral is motivated by the Linux use of `jiffies' whereby a process
|
1542 |
|
|
can request to be put to sleep until a certain number of `jiffies' have
|
1543 |
|
|
elapsed. Using this interface, the CPU can read the number of `jiffies'
|
1544 |
|
|
from the peripheral (it only has the one location in address space), add the
|
1545 |
69 |
dgisselq |
sleep length to it, and write the result back to the peripheral. The
|
1546 |
|
|
{\tt zipjiffies}
|
1547 |
21 |
dgisselq |
peripheral will record the value written to it only if it is nearer the current
|
1548 |
|
|
counter value than the currently pending interrupt time. If no other
|
1549 |
|
|
interrupts are waiting, and this time is in the future, it will be enabled.
|
1550 |
|
|
(There is currently no way to disable a jiffie interrupt once set, other
|
1551 |
24 |
dgisselq |
than to disable the interrupt line in the interrupt controller.) The processor
|
1552 |
21 |
dgisselq |
may then place this sleep request into a list among other sleep requests.
|
1553 |
|
|
Once the timer expires, it would write the next Jiffy request to the peripheral
|
1554 |
|
|
and wake up the process whose timer had expired.
|
1555 |
|
|
|
1556 |
|
|
Indeed, the Jiffies register is nothing more than a glorified counter with
|
1557 |
|
|
an interrupt. Unlike the other counters, the Jiffies register cannot be set.
|
1558 |
|
|
Writes to the jiffies register create an interrupt time. When the Jiffies
|
1559 |
|
|
register later equals the value written to it, an interrupt will be asserted
|
1560 |
|
|
and the register then continues counting as though no interrupt had taken
|
1561 |
|
|
place.
|
1562 |
|
|
|
1563 |
|
|
The purpose of this register is to support alarm times within a CPU. To
|
1564 |
|
|
set an alarm for a particular process $N$ clocks in advance, read the current
|
1565 |
|
|
Jiffies value, add $N$, and write it back to the Jiffies register. The
|
1566 |
|
|
O/S must also keep track of values written to the Jiffies register. Thus,
|
1567 |
32 |
dgisselq |
when an `alarm' trips, it should be removed from the list of alarms, the list
|
1568 |
69 |
dgisselq |
should be resorted, and the next alarm in terms of Jiffies should be written
|
1569 |
|
|
to the register---possibly for a second time.
|
1570 |
21 |
dgisselq |
|
1571 |
36 |
dgisselq |
\section{Direct Memory Access Controller}
|
1572 |
24 |
dgisselq |
|
1573 |
36 |
dgisselq |
The Direct Memory Access (DMA) controller can be used to either move memory
|
1574 |
|
|
from one location to another, to read from a peripheral into memory, or to
|
1575 |
|
|
write from memory to a peripheral, all without CPU intervention. Further,
|
1576 |
|
|
since the DMA controller can issue (and does issue) pipeline wishbone accesses,
|
1577 |
|
|
any DMA memory move will by nature be faster than a corresponding program
|
1578 |
|
|
accomplishing the same move. To put this to numbers, it may take a program
|
1579 |
|
|
18~clocks per word transferred, whereas this DMA controller can move one
|
1580 |
69 |
dgisselq |
word in two clocks---provided it has bus access. (The CPU gets priority over
|
1581 |
|
|
the bus.)
|
1582 |
24 |
dgisselq |
|
1583 |
36 |
dgisselq |
When copying memory from one location to another, the DMA controller will
|
1584 |
|
|
copy in units of a given transfer length---up to 1024 words at a time. It will
|
1585 |
|
|
read that transfer length into its internal buffer, and then write to the
|
1586 |
69 |
dgisselq |
destination address from that buffer.
|
1587 |
24 |
dgisselq |
|
1588 |
36 |
dgisselq |
When coupled with a peripheral, the DMA controller can be configured to start
|
1589 |
69 |
dgisselq |
a memory copy when any interrupt line goes high. Further, the controller can
|
1590 |
|
|
be configured to issue reads from (or writes to) the same address instead of
|
1591 |
|
|
incrementing the address at each clock. The DMA completes once the total
|
1592 |
|
|
number of items specified (not the transfer length) have been transferred.
|
1593 |
36 |
dgisselq |
|
1594 |
|
|
In each case, once the transfer is complete and the DMA unit returns to
|
1595 |
|
|
idle, the DMA will issue an interrupt.
|
1596 |
|
|
|
1597 |
|
|
|
1598 |
21 |
dgisselq |
\chapter{Operation}\label{chap:ops}
|
1599 |
|
|
|
1600 |
33 |
dgisselq |
The Zip CPU, and even the Zip System, is not a System on a Chip (SoC). It
|
1601 |
|
|
needs to be connected to its operational environment in order to be used.
|
1602 |
|
|
Specifically, some per system adjustments need to be made:
|
1603 |
|
|
\begin{enumerate}
|
1604 |
|
|
\item The Zip System depends upon an external 32-bit Wishbone bus. This
|
1605 |
|
|
must exist, and must be connected to the Zip CPU for it to work.
|
1606 |
|
|
\item The Zip System needs to be told of its {\tt RESET\_ADDRESS}. This is
|
1607 |
|
|
the program counter of the first instruction following a reset.
|
1608 |
69 |
dgisselq |
\item To conserve logic, you'll want to set the {\tt ADDRESS\_WIDTH} parameter
|
1609 |
|
|
to the number of address bits on your wishbone bus.
|
1610 |
|
|
\item Likewise, the {\tt LGICACHE} parameter sets the number of bits in
|
1611 |
|
|
the instruction cache address. This means that the instruction cache
|
1612 |
|
|
will have $2^{\mbox{\tiny\tt LGICACHE}}$ locations within it.
|
1613 |
33 |
dgisselq |
\item If you want the Zip System to start up on its own, you will need to
|
1614 |
|
|
set the {\tt START\_HALTED} parameter to zero. Otherwise, if you
|
1615 |
|
|
wish to manually start the CPU, that is if upon reset you want the
|
1616 |
|
|
CPU to start in its halted, reset state, then set this parameter to
|
1617 |
69 |
dgisselq |
one. This latter configuration is useful for a CPU that should be
|
1618 |
|
|
idle (i.e. halted) until given an explicit instruction from somewhere
|
1619 |
|
|
else to start.
|
1620 |
33 |
dgisselq |
\item The next parameter to set is the number of interrupts you will be
|
1621 |
|
|
providing from external to the CPU. This can be anything from one
|
1622 |
69 |
dgisselq |
to sixteen, but it cannot be zero. (Set this to 1 and wire the single
|
1623 |
|
|
interrupt line to a 1'b0 if you do not wish to support any external
|
1624 |
|
|
interrupts.)
|
1625 |
33 |
dgisselq |
\item Finally, you need to place into some wishbone accessible address, whether
|
1626 |
|
|
RAM or (more likely) ROM, the initial instructions for the CPU.
|
1627 |
|
|
\end{enumerate}
|
1628 |
|
|
If you have enabled your CPU to start automatically, then upon power up the
|
1629 |
69 |
dgisselq |
CPU will immediately start executing your instructions, starting at the given
|
1630 |
|
|
{\tt RESET\_ADDRESS}.
|
1631 |
33 |
dgisselq |
|
1632 |
|
|
This is, however, not how I have used the Zip CPU. I have instead used the
|
1633 |
36 |
dgisselq |
Zip CPU in a more controlled environment. For me, the CPU starts in a
|
1634 |
33 |
dgisselq |
halted state, and waits to be told to start. Further, the RESET address is a
|
1635 |
|
|
location in RAM. After bringing up the board I am using, and further the
|
1636 |
|
|
bus that is on it, the RAM memory is then loaded externally with the program
|
1637 |
|
|
I wish the Zip System to run. Once the RAM is loaded, I release the CPU.
|
1638 |
69 |
dgisselq |
The CPU then runs until either its halt condition or an exception occurs in
|
1639 |
|
|
supervisor mode, at which point its task is complete.
|
1640 |
33 |
dgisselq |
|
1641 |
|
|
Eventually, I intend to place an operating system onto the ZipSystem; I'm
|
1642 |
|
|
just not there yet.
|
1643 |
|
|
|
1644 |
68 |
dgisselq |
The rest of this chapter examines some common programming models, and how they
|
1645 |
|
|
might be applied to the Zip System, and then finishes with a couple of examples.
|
1646 |
33 |
dgisselq |
|
1647 |
68 |
dgisselq |
\section{System High}
|
1648 |
|
|
The easiest and simplest way to run the Zip CPU is in the system high mode.
|
1649 |
|
|
In this mode, the CPU runs your program in supervisor mode from reboot to
|
1650 |
|
|
power down, and is never interrupted. You will need to poll the interrupt
|
1651 |
|
|
controller to determine when any external condition has become active. This
|
1652 |
|
|
mode is useful, and can handle many microcontroller tasks.
|
1653 |
|
|
|
1654 |
|
|
Even better, in system high mode, all of the user registers are available
|
1655 |
|
|
to the system high program as variables. Accessing these registers can be
|
1656 |
|
|
done in a single clock cycle, which would move them to the active register
|
1657 |
|
|
set or move them back. While this may seem like a load or store instruction,
|
1658 |
|
|
none of these register accesses will suffer from memory delays.
|
1659 |
|
|
|
1660 |
|
|
The one thing that cannot be done in supervisor mode is a wait for interrupt
|
1661 |
|
|
instruction. This, however, is easily rectified by jumping to a user task
|
1662 |
|
|
within the supervisor's memory space, such as Tbl.~\ref{tbl:shi-idle}.
|
1663 |
|
|
\begin{table}\begin{center}
|
1664 |
|
|
\begin{tabbing}
|
1665 |
|
|
{\tt supervisor\_idle:} \\
|
1666 |
|
|
\hbox to 0.25in{}\={\em ; While not strictly required, the following move helps to} \\
|
1667 |
|
|
\> {\em ; ensure that the prefetch doesn't try to fetch an instruction} \\
|
1668 |
|
|
\> {\em ; outside of the CPU's address space when it switches to user} \\
|
1669 |
|
|
\> {\em ; mode.} \\
|
1670 |
|
|
\> {\tt MOV supervisor\_idle\_continue,uPC} \\
|
1671 |
|
|
\> {\em ; Put the processor into user mode and to sleep in the same} \\
|
1672 |
|
|
\> {\em ; instruction. } \\
|
1673 |
|
|
\> {\tt OR \$SLEEP|\$GIE,CC} \\
|
1674 |
|
|
{\tt supervisor\_idle\_continue:} \\
|
1675 |
|
|
\> {\em ; Now, if we haven't done this inline, we need to return} \\
|
1676 |
|
|
\> {\em ; to whatever function called us.} \\
|
1677 |
|
|
\> {\tt RETN} \\
|
1678 |
|
|
\end{tabbing}
|
1679 |
|
|
\caption{Executing an idle from supervisor mode}\label{tbl:shi-idle}
|
1680 |
|
|
\end{center}\end{table}
|
1681 |
|
|
|
1682 |
|
|
\section{Traditional Interrupt Handling}
|
1683 |
|
|
Although the Zip CPU does not have a traditional interrupt architecture,
|
1684 |
|
|
it is possible to create the more traditional interrupt approach via software.
|
1685 |
|
|
In this mode, the programmable interrupt controller is used together with the
|
1686 |
|
|
supervisor state to create the illusion of more traditional interrupt handling.
|
1687 |
|
|
|
1688 |
|
|
To set this up, upon reboot the supervisor task:
|
1689 |
|
|
\begin{enumerate}
|
1690 |
|
|
\item Creates a (single) user context, a user stack, and sets the user
|
1691 |
|
|
program counter to the entry of the user task
|
1692 |
|
|
\item Creates a task table of ISR entries
|
1693 |
|
|
\item Enables the master interrupt enable via the interrupt controller, albeit
|
1694 |
|
|
without enabling any of the fifteen potential underlying interrupts.
|
1695 |
|
|
\item Switches to user mode, as the first part of the while loop in
|
1696 |
|
|
Tbl.~\ref{tbl:traditional-isr}.
|
1697 |
|
|
\end{enumerate}
|
1698 |
|
|
\begin{table}\begin{center}
|
1699 |
|
|
\begin{tabbing}
|
1700 |
|
|
{\tt while(true) \{} \\
|
1701 |
|
|
\hbox to 0.25in{}\= {\tt rtu();}\\
|
1702 |
|
|
\> {\tt if (trap) \{} {\em // Here, we allow users to install ISRs, or} \\
|
1703 |
|
|
\>\hbox to 0.25in{}\= {\em // whatever else they may wish to do in supervisor mode.} \\
|
1704 |
|
|
\> {\tt \} else \{} \\
|
1705 |
|
|
\> \> {\tt volatile int *pic = PIC\_ADDRESS;} \\
|
1706 |
|
|
\\
|
1707 |
|
|
\> \> {\em // Save the user context before running any ISRs. This could easily be}\\
|
1708 |
|
|
\> \> {\em // implemented as an inline assembly routine or macro}\\
|
1709 |
|
|
\> \> {\tt SAVE\_PARTIAL\_CONTEXT; }\\
|
1710 |
|
|
\> \> {\em // At this point, we know an interrupt has taken place: Ask the programmable}\\
|
1711 |
|
|
\> \> {\em // interrupt controller (PIC) which interrupts are enabled and which are active.}\\
|
1712 |
|
|
\> \> {\tt int picv = *pic;}\\
|
1713 |
|
|
\> \> {\em // Turn off all active interrupts}\\
|
1714 |
|
|
\> \> {\em // Globally disable interrupt generation in the process}\\
|
1715 |
|
|
\> \> {\tt int active = (picv >> 16) \& picv \& 0x07fff;}\\
|
1716 |
|
|
\> \> {\tt *pic = (active<<16);}\\
|
1717 |
|
|
\> \> {\em // We build a mask of interrupts to re-enable in picv.}\\
|
1718 |
|
|
\> \> {\tt picv = 0;}\\
|
1719 |
|
|
\> \> {\tt for(int i=0,msk=1; i<15; i++, msk<<=1) \{}\\
|
1720 |
|
|
\> \>\hbox to 0.25in{}\={\tt if ((active \& msk)\&\&(isr\_table[i])) \{}\\
|
1721 |
|
|
\> \>\>\hbox to 0.25in{}\= {\tt mov(isr\_table[i],uPC); }\\
|
1722 |
|
|
\> \>\>\> {\em // Acknowledge this particular interrupt. While we could acknowledge all}\\
|
1723 |
|
|
\> \>\>\> {\em // interrupts at once, by acknowledging only those with ISR's we allow}\\
|
1724 |
|
|
\> \>\>\> {\em // the user process to use peripherals manually, and to manually check}\\
|
1725 |
|
|
\> \>\>\> {\em // whether or not those other interrupts had occurred.}\\
|
1726 |
|
|
\> \>\>\> {\tt *pic = msk; }\\
|
1727 |
|
|
\> \>\>\> {\tt rtu(); }\\
|
1728 |
|
|
\> \>\>\> {\em // The ISR will only exit on a trap in the Zip architecture. There is}\\
|
1729 |
|
|
\> \>\>\> {\em // no {\tt RETI} instruction. Since the PIC holds all interrupts disabled,}\\
|
1730 |
|
|
\> \>\>\> {\em // there is no need to check for further interrupts.}\\
|
1731 |
|
|
\> \>\>\> {\em // }\\
|
1732 |
|
|
\> \>\>\> {\em // The tricky part is that, because of how the PIC is built, the ISR cannot}\\
|
1733 |
|
|
\>\>\>\> {\em // re-enable its own interrupt without re-enabling all interrupts. Hence, we}\\
|
1734 |
|
|
\>\>\>\> {\em // look at R0 upon ISR completion to know if an interrupt needs to be }\\
|
1735 |
|
|
\> \>\>\> {\em // re-enabled. }\\
|
1736 |
|
|
\> \>\>\> {\tt mov(uR0,tmp); }\\
|
1737 |
|
|
\> \>\>\> {\tt picv |= (tmp \& 0x7fff) << 16; }\\
|
1738 |
|
|
\> \>\> {\tt \} }\\
|
1739 |
|
|
\> \> {\tt \} }\\
|
1740 |
|
|
\> \> {\tt RESTORE\_PARTIAL\_CONTEXT; }\\
|
1741 |
|
|
\> \> {\em // Re-activate all (requested) interrupts }\\
|
1742 |
|
|
\> \> {\tt *pic = picv | 0x80000000; }\\
|
1743 |
|
|
\>{\tt \} }\\
|
1744 |
|
|
{\tt \}}\\
|
1745 |
|
|
\end{tabbing}
|
1746 |
|
|
\caption{Traditional Interrupt handling}\label{tbl:traditional-isr}
|
1747 |
|
|
\end{center}\end{table}
|
1748 |
|
|
|
1749 |
|
|
We can work through the interrupt handling process by examining
|
1750 |
|
|
Tbl.~\ref{tbl:traditional-isr}. First, remember, the CPU is always running
|
1751 |
|
|
either the user or the supervisor context. Once the supervisor switches to
|
1752 |
|
|
user mode, control does not return until either an interrupt or a trap
|
1753 |
|
|
has taken place. (Okay, there's also the possibility of a bus error, or an
|
1754 |
|
|
illegal instruction such as an unimplemented floating point instruction---but
|
1755 |
|
|
for now we'll just focus on the trap instruction.) Therefore, if the trap bit
|
1756 |
|
|
isn't set, then we know an interrupt has taken place.
|
1757 |
|
|
|
1758 |
|
|
To process an interrupt, we steal the user's stack: the PC and CC registers
|
1759 |
|
|
are saved on the stack, as outlined in Tbl.~\ref{tbl:save-partial}.
|
1760 |
|
|
\begin{table}\begin{center}
|
1761 |
|
|
\begin{tabbing}
|
1762 |
|
|
SAVE\_PARTIAL\_CONTEXT: \\
|
1763 |
|
|
\hbox to 0.25in{}\= {\em ; We save R0, CC, and PC only} \\
|
1764 |
|
|
\> {\tt MOV -3(uSP),R3} \\
|
1765 |
|
|
\> {\tt MOV uR0,R0} \\
|
1766 |
|
|
\> {\tt MOV uCC,R1} \\
|
1767 |
|
|
\> {\tt MOV uPC,R2} \\
|
1768 |
69 |
dgisselq |
\> {\tt STO R0,(R3)} {\em ; Exploit memory pipelining: }\\
|
1769 |
|
|
\> {\tt STO R1,1(R3)} {\em ; All instructions write to stack }\\
|
1770 |
|
|
\> {\tt STO R2,2(R3)} {\em ; All offsets increment by one }\\
|
1771 |
68 |
dgisselq |
\> {\tt MOV R3,uSP} {\em ; Return the updated stack pointer } \\
|
1772 |
|
|
\end{tabbing}
|
1773 |
|
|
\caption{Example Saving Minimal User Context}\label{tbl:save-partial}
|
1774 |
|
|
\end{center}\end{table}
|
1775 |
|
|
This is much cheaper than the full context swap of a preemptive multitasking
|
1776 |
|
|
kernel, but it also depends upon the ISR saving any state it uses. Further,
|
1777 |
|
|
if multiple ISR's get called at once, this loses its optimality property
|
1778 |
|
|
very quickly.
|
1779 |
|
|
|
1780 |
|
|
As Sec.~\ref{sec:pic} discusses, the top of the PIC register stores which
|
1781 |
|
|
interrupts are enabled, and the bottom stores which have tripped. (Interrupts
|
1782 |
|
|
may trip without being enabled, they just will not generate an interrupt to the
|
1783 |
|
|
CPU.) Our first step is to query the register to find out our interrupt
|
1784 |
|
|
state, and then to disable any interrupts that have tripped. To do
|
1785 |
|
|
that, we write a one to the enable half of the register while also clearing
|
1786 |
|
|
the top bit (master interrupt enable). This has the consequence of disabling
|
1787 |
|
|
any and all further interrupts, not just the ones that have tripped. Hence,
|
1788 |
|
|
upon completion, we re--enable the master interrupt bit again. Finally,
|
1789 |
|
|
we keep track of which interrupts have tripped.
|
1790 |
|
|
|
1791 |
|
|
Using the bit mask of interrupts that have tripped, we walk through all fifteen
|
1792 |
|
|
possible interrupts. If there is an ISR installed, we acknowledge and reset
|
1793 |
|
|
the interrupt within the PIC, and then call the ISR. The ISR, however, cannot
|
1794 |
|
|
re--enable its interrupt without re-enabling the master interrupt bit. Thus,
|
1795 |
|
|
to keep things simple, when the ISR is finished it places its interrupt
|
1796 |
|
|
mask back into R0, or clears R0. This tells the supervisor mode process which
|
1797 |
|
|
interrupts to re--enable. Any other registers that the ISR uses must be
|
1798 |
|
|
saved and restored. (This is only truly optimal if only a single ISR is
|
1799 |
|
|
called.) As a final instruction, the ISR clears the GIE bit, executing a user
|
1800 |
|
|
trap. (Remember, the Zip CPU has no {\tt RETI} instruction to restore the
|
1801 |
|
|
stack and return to userland. It needs to go through the supervisor mode to
|
1802 |
|
|
get there.)
|
1803 |
|
|
|
1804 |
|
|
Then, once all interrupts are handled, the user context is restored in a
|
1805 |
|
|
fashion similar to Tbl.~\ref{tbl:restore-partial}.
|
1806 |
|
|
\begin{table}\begin{center}
|
1807 |
|
|
\begin{tabbing}
|
1808 |
|
|
RESTORE\_PARTIAL\_CONTEXT: \\
|
1809 |
|
|
\hbox to 0.25in{}\= {\em ; We restore R0, CC, and PC only} \\
|
1810 |
|
|
\> {\tt MOV uSP,R3} {\em ; Fetch the user stack pointer } \\
|
1811 |
69 |
dgisselq |
\> {\tt LOD (R3),R0} {\em ; Exploit memory pipelining: }\\
|
1812 |
|
|
\> {\tt LOD 1(R3),R1} {\em ; All instructions read from stack }\\
|
1813 |
|
|
\> {\tt LOD 2(R3),R2} {\em ; All offsets increment by one }\\
|
1814 |
68 |
dgisselq |
\> {\tt MOV R0,uR0} \\
|
1815 |
|
|
\> {\tt MOV R1,uCC} \\
|
1816 |
|
|
\> {\tt MOV R2,uPC} \\
|
1817 |
|
|
\> {\tt MOV 3(R3),uSP} \\
|
1818 |
|
|
\end{tabbing}
|
1819 |
|
|
\caption{Example Restoring Minimal User Context}\label{tbl:restore-partial}
|
1820 |
|
|
\end{center}\end{table}
|
1821 |
|
|
Again, this is short and sweet simply because any other registers that needed
|
1822 |
|
|
saving were saved within the ISR.
|
1823 |
|
|
|
1824 |
|
|
There you have it: the Zip CPU, with its non-traditional interrupt architecture,
|
1825 |
|
|
can still process interrupts in a very traditional fashion.
|
1826 |
|
|
|
1827 |
36 |
dgisselq |
\section{Example: Idle Task}
|
1828 |
|
|
One task every operating system needs is the idle task, the task that takes
|
1829 |
|
|
place when nothing else can run. On the Zip CPU, this task is quite simple,
|
1830 |
|
|
and it is shown in assembly in Tbl.~\ref{tbl:idle-asm}.
|
1831 |
|
|
\begin{table}\begin{center}
|
1832 |
|
|
\begin{tabular}{ll}
|
1833 |
|
|
{\tt idle\_task:} \\
|
1834 |
|
|
& {\em ; Wait for the next interrupt, then switch to supervisor task} \\
|
1835 |
|
|
& {\tt WAIT} \\
|
1836 |
|
|
& {\em ; When we come back, it's because the supervisor wishes to} \\
|
1837 |
|
|
& {\em ; wait for an interrupt again, so go back to the top.} \\
|
1838 |
|
|
& {\tt BRA idle\_task} \\
|
1839 |
|
|
\end{tabular}
|
1840 |
|
|
\caption{Example Idle Loop}\label{tbl:idle-asm}
|
1841 |
|
|
\end{center}\end{table}
|
1842 |
|
|
When this task runs, the CPU will fill up all of the pipeline stages up to the
|
1843 |
|
|
ALU. The {\tt WAIT} instruction, upon leaving the ALU, places the CPU into
|
1844 |
|
|
a sleep state where nothing more moves. Sure, there may be some more settling,
|
1845 |
|
|
the pipe cache may continue to read until full, other instructions may issue until
|
1846 |
|
|
the pipeline fills, but then everything will stall. Then, once an interrupt
|
1847 |
|
|
takes place, control passes to the supervisor task to handle the interrupt.
|
1848 |
|
|
When control passes back to this task, it will be on the next instruction.
|
1849 |
|
|
Since that next instruction sends us back to the top of the task, the idle
|
1850 |
|
|
task thus does nothing but wait for an interrupt.
|
1851 |
|
|
|
1852 |
|
|
This should be the lowest priority task, the task that runs when nothing else
|
1853 |
|
|
can. It will help lower the FPGA power usage overall---at least its dynamic
|
1854 |
|
|
power usage.
|
1855 |
|
|
|
1856 |
|
|
\section{Example: Memory Copy}
|
1857 |
|
|
One common operation is that of a memory move or copy. Consider the C code
|
1858 |
|
|
shown in Tbl.~\ref{tbl:memcp-c}.
|
1859 |
|
|
\begin{table}\begin{center}
|
1860 |
|
|
\parbox{4in}{\begin{tabbing}
|
1861 |
|
|
{\tt void} \= {\tt memcp(void *dest, void *src, int len) \{} \\
|
1862 |
|
|
\> {\tt for(int i=0; i<len; i++)} \\
|
1863 |
|
|
\> \hspace{0.2in} {\tt *dest++ = *src++;} \\
|
1864 |
|
|
\}
|
1865 |
|
|
\end{tabbing}}
|
1866 |
|
|
\caption{Example Memory Copy code in C}\label{tbl:memcp-c}
|
1867 |
|
|
\end{center}\end{table}
|
1868 |
|
|
This same code can be translated in Zip Assembly as shown in
|
1869 |
|
|
Tbl.~\ref{tbl:memcp-asm}.
|
1870 |
|
|
\begin{table}\begin{center}
|
1871 |
|
|
\begin{tabular}{ll}
|
1872 |
|
|
memcp: \\
|
1873 |
69 |
dgisselq |
& {\em ; R0 = *dest, R1 = *src, R2 = LEN, R3 = return addr} \\
|
1874 |
|
|
& {\em ; The following will operate in $12N+19$ clocks.} \\
|
1875 |
|
|
& {\tt CMP 0,R2} \\ % 8 clocks per setup
|
1876 |
|
|
& {\tt MOV.Z R3,PC} {\em ; A conditional return }\\
|
1877 |
|
|
& {\tt SUB 1,SP} {\em ; Create a stack frame}\\
|
1878 |
|
|
& {\tt STO R4,(SP)} {\em ; and a local variable}\\
|
1879 |
|
|
& {\em ; (4 stalls, cannot be further scheduled away)} \\
|
1880 |
|
|
loop: \\ % 12 clocks per loop
|
1881 |
|
|
& {\tt LOD (R1),R4} \\
|
1882 |
36 |
dgisselq |
& {\em ; (4 stalls, cannot be scheduled away)} \\
|
1883 |
69 |
dgisselq |
& {\tt STO R4,(R0)} {\em ; (4 schedulable stalls, has no impact now)} \\
|
1884 |
|
|
& {\tt SUB 1,R2} \\
|
1885 |
|
|
& {\tt BZ memcpend} \\
|
1886 |
|
|
& {\tt ADD 1,R0} \\
|
1887 |
36 |
dgisselq |
& {\tt ADD 1,R1} \\
|
1888 |
69 |
dgisselq |
& {\tt BRA loop} \\
|
1889 |
|
|
& {\em ; (1 stall on a BRA instruction)} \\
|
1890 |
|
|
memcpend: % 11 clocks
|
1891 |
|
|
& {\tt LOD (SP),R4} \\
|
1892 |
|
|
& {\em ; (4 stalls, cannot be further scheduled away)} \\
|
1893 |
|
|
& {\tt ADD 1,SP} \\
|
1894 |
|
|
& {\tt JMP R3} \\
|
1895 |
|
|
& {\em ; (4 stalls)} \\
|
1896 |
36 |
dgisselq |
\end{tabular}
|
1897 |
|
|
\caption{Example Memory Copy code in Zip Assembly}\label{tbl:memcp-asm}
|
1898 |
|
|
\end{center}\end{table}
|
1899 |
|
|
This example points out several things associated with the Zip CPU. First,
|
1900 |
|
|
a straightforward implementation of a for loop is not the fastest loop
|
1901 |
|
|
structure. For this reason, we have placed the test to continue at the
|
1902 |
|
|
end. Second, all pointers are {\tt void} pointers to arbitrary 32--bit
|
1903 |
|
|
data types. The Zip CPU does not have explicit support for smaller or larger
|
1904 |
|
|
data types, and so this memory copy cannot be applied at a byte level.
|
1905 |
|
|
Third, we've optimized the conditional jump to a return instruction into a
|
1906 |
|
|
conditional return instruction.
|
1907 |
|
|
|
1908 |
68 |
dgisselq |
\section{Example: Context Switch}
|
1909 |
36 |
dgisselq |
|
1910 |
|
|
Fundamental to any multiprocessing system is the ability to switch from one
|
1911 |
|
|
task to the next. In the ZipSystem, this is accomplished in one of a couple
|
1912 |
|
|
ways. The first step is that an interrupt happens. Anytime an interrupt
|
1913 |
|
|
happens, the CPU needs to execute the following tasks in supervisor mode:
|
1914 |
|
|
\begin{enumerate}
|
1915 |
69 |
dgisselq |
\item Check for a trap instruction, or other user exception such as a break,
|
1916 |
|
|
bus error, division by zero error, or floating point exception. That
|
1917 |
|
|
is, if the user process needs attending then we may not wish to adjust
|
1918 |
|
|
the context, check interrupts, or call the scheduler.
|
1919 |
|
|
Tbl.~\ref{tbl:trap-check}
|
1920 |
36 |
dgisselq |
\begin{table}\begin{center}
|
1921 |
|
|
\begin{tabular}{ll}
|
1922 |
|
|
{\tt return\_to\_user:} \\
|
1923 |
|
|
& {\em; The instruction before the context switch processing must} \\
|
1924 |
|
|
& {\em; be the RTU instruction that enacted user mode in the first} \\
|
1925 |
|
|
& {\em; place. We show it here just for reference.} \\
|
1926 |
|
|
& {\tt RTU} \\
|
1927 |
|
|
{\tt trap\_check:} \\
|
1928 |
|
|
& {\tt MOV uCC,R0} \\
|
1929 |
69 |
dgisselq |
& {\tt TST \$TRAP \textbar \$BUSERR \textbar \$DIVE \textbar \$FPE,R0} \\
|
1930 |
36 |
dgisselq |
& {\tt BNZ swap\_out} \\
|
1931 |
|
|
& {; \em Do something here to execute the trap} \\
|
1932 |
|
|
& {; \em Don't need to call the scheduler, so we can just return} \\
|
1933 |
|
|
& {\tt BRA return\_to\_user} \\
|
1934 |
|
|
\end{tabular}
|
1935 |
69 |
dgisselq |
\caption{Checking for whether the user task needs our attention}\label{tbl:trap-check}
|
1936 |
36 |
dgisselq |
\end{center}\end{table}
|
1937 |
|
|
shows the rudiments of this code, while showing nothing of how the
|
1938 |
|
|
actual trap would be implemented.
|
1939 |
|
|
|
1940 |
|
|
You may also wish to note that the instruction before the first instruction
|
1941 |
|
|
in our context swap {\em must be} a return to userspace instruction.
|
1942 |
|
|
Remember, the supervisor process is re--entered where it left off. This is
|
1943 |
|
|
different from many other processors that enter interrupt mode at some vector
|
1944 |
|
|
or other. In this case, we always enter supervisor mode right where we last
|
1945 |
|
|
left.\footnote{The one exception to this rule is upon reset where supervisor
|
1946 |
|
|
mode is entered at a pre--programmed wishbone memory address.}
|
1947 |
|
|
|
1948 |
|
|
\item Capture user counters. If the operating system is keeping track of
|
1949 |
|
|
system usage via the accounting counters, those counters need to be
|
1950 |
|
|
copied and accumulated into some master counter at this point.
|
1951 |
|
|
|
1952 |
|
|
\item Preserve the old context. This involves pushing all the user registers
|
1953 |
|
|
onto the user stack and then copying the resulting stack address
|
1954 |
|
|
into the task's task structure, as shown in Tbl.~\ref{tbl:context-out}.
|
1955 |
|
|
\begin{table}\begin{center}
|
1956 |
|
|
\begin{tabular}{ll}
|
1957 |
|
|
{\tt swap\_out:} \\
|
1958 |
39 |
dgisselq |
& {\tt MOV -15(uSP),R5} \\
|
1959 |
|
|
& {\tt STO R5,stack(R12)} \\
|
1960 |
|
|
& {\tt MOV uR0,R0} \\
|
1961 |
|
|
& {\tt MOV uR1,R1} \\
|
1962 |
|
|
& {\tt MOV uR2,R2} \\
|
1963 |
|
|
& {\tt MOV uR3,R3} \\
|
1964 |
|
|
& {\tt MOV uR4,R4} \\
|
1965 |
69 |
dgisselq |
& {\tt STO R0,(R5)} {\em ; Exploit memory pipelining: }\\
|
1966 |
|
|
& {\tt STO R1,1(R5)} {\em ; All instructions write to stack }\\
|
1967 |
|
|
& {\tt STO R2,2(R5)} {\em ; All offsets increment by one }\\
|
1968 |
|
|
& {\tt STO R3,3(R5)} {\em ; Longest pipeline is 5 cycles.}\\
|
1969 |
|
|
& {\tt STO R4,4(R5)} \\
|
1970 |
39 |
dgisselq |
& \ldots {\em ; Need to repeat for all user registers} \\
|
1971 |
|
|
\iffalse
|
1972 |
|
|
& {\tt MOV uR5,R0} \\
|
1973 |
|
|
& {\tt MOV uR6,R1} \\
|
1974 |
|
|
& {\tt MOV uR7,R2} \\
|
1975 |
|
|
& {\tt MOV uR8,R3} \\
|
1976 |
|
|
& {\tt MOV uR9,R4} \\
|
1977 |
69 |
dgisselq |
& {\tt STO R0,5(R5) }\\
|
1978 |
|
|
& {\tt STO R1,6(R5) }\\
|
1979 |
|
|
& {\tt STO R2,7(R5) }\\
|
1980 |
|
|
& {\tt STO R3,8(R5) }\\
|
1981 |
|
|
& {\tt STO R4,9(R5)} \\
|
1982 |
39 |
dgisselq |
\fi
|
1983 |
|
|
& {\tt MOV uR10,R0} \\
|
1984 |
|
|
& {\tt MOV uR11,R1} \\
|
1985 |
|
|
& {\tt MOV uR12,R2} \\
|
1986 |
|
|
& {\tt MOV uCC,R3} \\
|
1987 |
|
|
& {\tt MOV uPC,R4} \\
|
1988 |
69 |
dgisselq |
& {\tt STO R0,10(R5)}\\
|
1989 |
|
|
& {\tt STO R1,11(R5)}\\
|
1990 |
|
|
& {\tt STO R2,12(R5)}\\
|
1991 |
|
|
& {\tt STO R3,13(R5)}\\
|
1992 |
|
|
& {\tt STO R4,14(R5)} \\
|
1993 |
36 |
dgisselq |
& {\em ; We can skip storing the stack, uSP, since it'll be stored}\\
|
1994 |
|
|
& {\em ; elsewhere (in the task structure) }\\
|
1995 |
|
|
\end{tabular}
|
1996 |
|
|
\caption{Example Storing User Task Context}\label{tbl:context-out}
|
1997 |
|
|
\end{center}\end{table}
|
1998 |
|
|
For the sake of discussion, we assume the supervisor maintains a
|
1999 |
|
|
pointer to the current task's structure in supervisor register
|
2000 |
|
|
{\tt R12}, and that {\tt stack} is an offset to the beginning of this
|
2001 |
|
|
structure indicating where the stack pointer is to be kept within it.
|
2002 |
|
|
|
2003 |
|
|
For those who are still interested, the full code for this context
|
2004 |
|
|
save can be found as an assembler macro within the assembler
|
2005 |
|
|
include file, {\tt sys.i}.
|
2006 |
|
|
|
2007 |
|
|
\item Reset the watchdog timer. If you are using the watchdog timer, it should
|
2008 |
|
|
be reset on a context swap, to know that things are still working.
|
2009 |
|
|
Example code for this is shown in Tbl.~\ref{tbl:reset-watchdog}.
|
2010 |
|
|
\begin{table}\begin{center}
|
2011 |
|
|
\begin{tabular}{ll}
|
2012 |
|
|
\multicolumn{2}{l}{{\tt `define WATCHDOG\_ADDRESS 32'hc000\_0002}}\\
|
2013 |
|
|
\multicolumn{2}{l}{{\tt `define WATCHDOG\_TICKS 32'd1\_000\_000} {; \em = 10 ms}}\\
|
2014 |
|
|
& {\tt LDI WATCHDOG\_ADDRESS,R0} \\
|
2015 |
|
|
& {\tt LDI WATCHDOG\_TICKS,R1} \\
|
2016 |
|
|
& {\tt STO R1,(R0)}
|
2017 |
|
|
\end{tabular}
|
2018 |
|
|
\caption{Example Watchdog Reset}\label{tbl:reset-watchdog}
|
2019 |
|
|
\end{center}\end{table}
|
2020 |
|
|
|
2021 |
|
|
\item Interrupt handling. An interrupt handler within the Zip System is nothing
|
2022 |
|
|
more than a task. At context swap time, the supervisor needs to
|
2023 |
|
|
disable all of the interrupts that have tripped, and then enable
|
2024 |
|
|
all of the tasks that would deal with each of these interrupts.
|
2025 |
|
|
These can be user tasks, run at higher priority than any other user
|
2026 |
|
|
tasks. Either way, they will need to re--enable their own interrupt
|
2027 |
|
|
themselves, if the interrupt is still relevant.
|
2028 |
|
|
|
2029 |
|
|
An example of this master interrupt handling is shown in
|
2030 |
|
|
Tbl.~\ref{tbl:pre-handler}.
|
2031 |
|
|
\begin{table}\begin{center}
|
2032 |
|
|
\begin{tabular}{ll}
|
2033 |
|
|
{\tt pre\_handler:} \\
|
2034 |
|
|
& {\tt LDI PIC\_ADDRESS,R0 } \\
|
2035 |
|
|
& {\em ; Start by grabbing the interrupt state from the interrupt}\\
|
2036 |
|
|
& {\em ; controller. We'll store this into the register R7 so that }\\
|
2037 |
|
|
& {\em ; we can keep and preserve this information for the scheduler}\\
|
2038 |
|
|
& {\em ; to use later. }\\
|
2039 |
|
|
& {\tt LOD (R0),R1} \\
|
2040 |
|
|
& {\tt MOV R1,R7 } \\
|
2041 |
|
|
& {\em ; As a next step, we need to acknowledge and disable all active}\\
|
2042 |
|
|
& {\em ; interrupts. We'll start by calculating all of our active}\\
|
2043 |
|
|
& {\em ; interrupts.}\\
|
2044 |
|
|
& {\tt AND 0x07fff,R1 } \\
|
2045 |
|
|
& {\em ; Put the active interrupts into the upper half of R1} \\
|
2046 |
|
|
& {\tt ROL 16,R1 } \\
|
2047 |
|
|
& {\tt LDILO 0x0ffff,R1 } \\
|
2048 |
|
|
& {\tt AND R7,R1}\\
|
2049 |
|
|
& {\em ; Acknowledge and disable active interrupts}\\
|
2050 |
|
|
& {\em ; This also disables all interrupts from the controller, so}\\
|
2051 |
|
|
& {\em ; we'll need to re-enable interrupts in general shortly } \\
|
2052 |
|
|
& {\tt STO R1,(R0) } \\
|
2053 |
|
|
& {\em ; We leave our active interrupt mask in R7 so the scheduler can}\\
|
2054 |
|
|
& {\em ; release any tasks that depended upon them. } \\
|
2055 |
|
|
\end{tabular}
|
2056 |
|
|
\caption{Example checking for active interrupts}\label{tbl:pre-handler}
|
2057 |
|
|
\end{center}\end{table}
|
2058 |
|
|
|
2059 |
|
|
\item Calling the scheduler. This needs to be done to pick the next task
|
2060 |
|
|
to switch to. It may be an interrupt handler, or it may be a normal
|
2061 |
|
|
user task. From a priority standpoint, it would make sense that the
|
2062 |
|
|
interrupt handlers all have a higher priority than the user tasks,
|
2063 |
|
|
and that once they have been called the user tasks may then be called
|
2064 |
|
|
again. If no task is ready to run, run the idle task to wait for an
|
2065 |
|
|
interrupt.
|
2066 |
|
|
|
2067 |
|
|
This suggests a minimum of four task priorities:
|
2068 |
|
|
\begin{enumerate}
|
2069 |
|
|
\item Interrupt handlers, executed with their interrupts disabled
|
2070 |
|
|
\item Device drivers, executed with interrupts re-enabled
|
2071 |
|
|
\item User tasks
|
2072 |
|
|
\item The idle task, executed when nothing else is able to execute
|
2073 |
|
|
\end{enumerate}
|
2074 |
|
|
|
2075 |
|
|
For our purposes here, we'll just assume that a pointer to the current
|
2076 |
|
|
task is maintained in {\tt R12}, that a {\tt JSR scheduler} is
|
2077 |
|
|
called, and that the next current task is likewise placed into
|
2078 |
|
|
{\tt R12}.
|
2079 |
|
|
|
2080 |
|
|
\item Restore the new task's context. Given that the scheduler has returned a
|
2081 |
|
|
task that can be run at this time, the stack pointer needs to be
|
2082 |
|
|
pulled out of the task's task structure, placed into the user
|
2083 |
|
|
register, and then the rest of the user registers need to be popped
|
2084 |
|
|
back off of the stack to run this task. An example of this is
|
2085 |
|
|
shown in Tbl.~\ref{tbl:context-in},
|
2086 |
|
|
\begin{table}\begin{center}
|
2087 |
|
|
\begin{tabular}{ll}
|
2088 |
|
|
{\tt swap\_in:} \\
|
2089 |
39 |
dgisselq |
& {\tt LOD stack(R12),R5} \\
|
2090 |
36 |
dgisselq |
& {\tt MOV 15(R5),uSP} \\
|
2091 |
39 |
dgisselq |
& {\em ; Be sure to exploit the memory pipelining capability} \\
|
2092 |
69 |
dgisselq |
& {\tt LOD (R5),R0} \\
|
2093 |
|
|
& {\tt LOD 1(R5),R1} \\
|
2094 |
|
|
& {\tt LOD 2(R5),R2} \\
|
2095 |
|
|
& {\tt LOD 3(R5),R3} \\
|
2096 |
|
|
& {\tt LOD 4(R5),R4} \\
|
2097 |
39 |
dgisselq |
& {\tt MOV R0,uR0} \\
|
2098 |
|
|
& {\tt MOV R1,uR1} \\
|
2099 |
|
|
& {\tt MOV R2,uR2} \\
|
2100 |
|
|
& {\tt MOV R3,uR3} \\
|
2101 |
|
|
& {\tt MOV R4,uR4} \\
|
2102 |
36 |
dgisselq |
& \ldots {\em ; Need to repeat for all user registers} \\
|
2103 |
69 |
dgisselq |
& {\tt LOD 10(R5),R0} \\
|
2104 |
|
|
& {\tt LOD 11(R5),R1} \\
|
2105 |
|
|
& {\tt LOD 12(R5),R2} \\
|
2106 |
|
|
& {\tt LOD 13(R5),R3} \\
|
2107 |
|
|
& {\tt LOD 14(R5),R4} \\
|
2108 |
39 |
dgisselq |
& {\tt MOV R0,uR10} \\
|
2109 |
|
|
& {\tt MOV R1,uR11} \\
|
2110 |
|
|
& {\tt MOV R2,uR12} \\
|
2111 |
|
|
& {\tt MOV R3,uCC} \\
|
2112 |
|
|
& {\tt MOV R4,uPC} \\
|
2113 |
|
|
|
2114 |
36 |
dgisselq |
& {\tt BRA return\_to\_user} \\
|
2115 |
|
|
\end{tabular}
|
2116 |
|
|
\caption{Example Restoring User Task Context}\label{tbl:context-in}
|
2117 |
|
|
\end{center}\end{table}
|
2118 |
|
|
assuming as before that the task
|
2119 |
|
|
pointer is found in supervisor register {\tt R12}.
|
2120 |
|
|
As with storing the user context, the full code associated with
|
2121 |
|
|
restoring the user context can be found in the assembler include
|
2122 |
|
|
file, {\tt sys.i}.
|
2123 |
|
|
|
2124 |
|
|
\item Clear the userspace accounting registers. In order to keep track of
|
2125 |
|
|
per process system usage, these registers need to be cleared before
|
2126 |
|
|
reactivating the userspace process. That way, upon the next
|
2127 |
|
|
interrupt, we'll know how many clocks the userspace program has
|
2128 |
|
|
encountered, and how many instructions it was able to issue in
|
2129 |
|
|
those many clocks.
|
2130 |
|
|
|
2131 |
|
|
\item Jump back to the instruction just before saving the last task's context,
|
2132 |
|
|
because that location in memory contains the return from interrupt
|
2133 |
|
|
command that we are going to need to execute, in order to guarantee
|
2134 |
|
|
that we return back here again.
|
2135 |
|
|
\end{enumerate}
|
2136 |
|
|
|
2137 |
21 |
dgisselq |
\chapter{Registers}\label{chap:regs}
|
2138 |
|
|
|
2139 |
24 |
dgisselq |
The ZipSystem registers fall into two categories, ZipSystem internal registers
|
2140 |
|
|
accessed via the ZipCPU shown in Tbl.~\ref{tbl:zpregs},
|
2141 |
|
|
\begin{table}[htbp]
|
2142 |
|
|
\begin{center}\begin{reglist}
|
2143 |
32 |
dgisselq |
PIC & \scalebox{0.8}{\tt 0xc0000000} & 32 & R/W & Primary Interrupt Controller \\\hline
|
2144 |
|
|
WDT & \scalebox{0.8}{\tt 0xc0000001} & 32 & R/W & Watchdog Timer \\\hline
|
2145 |
69 |
dgisselq |
& \scalebox{0.8}{\tt 0xc0000002} & 32 & R & Address of last bus error \\\hline
|
2146 |
32 |
dgisselq |
CTRIC & \scalebox{0.8}{\tt 0xc0000003} & 32 & R/W & Secondary Interrupt Controller \\\hline
|
2147 |
|
|
TMRA & \scalebox{0.8}{\tt 0xc0000004} & 32 & R/W & Timer A\\\hline
|
2148 |
|
|
TMRB & \scalebox{0.8}{\tt 0xc0000005} & 32 & R/W & Timer B\\\hline
|
2149 |
|
|
TMRC & \scalebox{0.8}{\tt 0xc0000006} & 32 & R/W & Timer C\\\hline
|
2150 |
|
|
JIFF & \scalebox{0.8}{\tt 0xc0000007} & 32 & R/W & Jiffies \\\hline
|
2151 |
|
|
MTASK & \scalebox{0.8}{\tt 0xc0000008} & 32 & R/W & Master Task Clock Counter \\\hline
|
2152 |
|
|
MMSTL & \scalebox{0.8}{\tt 0xc0000009} & 32 & R/W & Master Stall Counter \\\hline
|
2153 |
|
|
MPSTL & \scalebox{0.8}{\tt 0xc000000a} & 32 & R/W & Master Pre--Fetch Stall Counter \\\hline
|
2154 |
|
|
MICNT & \scalebox{0.8}{\tt 0xc000000b} & 32 & R/W & Master Instruction Counter\\\hline
|
2155 |
|
|
UTASK & \scalebox{0.8}{\tt 0xc000000c} & 32 & R/W & User Task Clock Counter \\\hline
|
2156 |
|
|
UMSTL & \scalebox{0.8}{\tt 0xc000000d} & 32 & R/W & User Stall Counter \\\hline
|
2157 |
|
|
UPSTL & \scalebox{0.8}{\tt 0xc000000e} & 32 & R/W & User Pre--Fetch Stall Counter \\\hline
|
2158 |
|
|
UICNT & \scalebox{0.8}{\tt 0xc000000f} & 32 & R/W & User Instruction Counter\\\hline
|
2159 |
36 |
dgisselq |
DMACTRL & \scalebox{0.8}{\tt 0xc0000010} & 32 & R/W & DMA Control Register\\\hline
|
2160 |
|
|
DMALEN & \scalebox{0.8}{\tt 0xc0000011} & 32 & R/W & DMA total transfer length\\\hline
|
2161 |
|
|
DMASRC & \scalebox{0.8}{\tt 0xc0000012} & 32 & R/W & DMA source address\\\hline
|
2162 |
|
|
DMADST & \scalebox{0.8}{\tt 0xc0000013} & 32 & R/W & DMA destination address\\\hline
|
2163 |
32 |
dgisselq |
% Cache & \scalebox{0.8}{\tt 0xc0100000} & & & Base address of the Cache memory\\\hline
|
2164 |
24 |
dgisselq |
\end{reglist}
|
2165 |
|
|
\caption{Zip System Internal/Peripheral Registers}\label{tbl:zpregs}
|
2166 |
|
|
\end{center}\end{table}
|
2167 |
33 |
dgisselq |
and the two debug registers shown in Tbl.~\ref{tbl:dbgregs}.
|
2168 |
24 |
dgisselq |
\begin{table}[htbp]
|
2169 |
|
|
\begin{center}\begin{reglist}
|
2170 |
|
|
ZIPCTRL & 0 & 32 & R/W & Debug Control Register \\\hline
|
2171 |
|
|
ZIPDATA & 1 & 32 & R/W & Debug Data Register \\\hline
|
2172 |
|
|
\end{reglist}
|
2173 |
|
|
\caption{Zip System Debug Registers}\label{tbl:dbgregs}
|
2174 |
|
|
\end{center}\end{table}
|
2175 |
|
|
|
2176 |
33 |
dgisselq |
\section{Peripheral Registers}
|
2177 |
|
|
The peripheral registers, listed in Tbl.~\ref{tbl:zpregs}, are shown in the
|
2178 |
|
|
CPU's address space. These may be accessed by the CPU at these addresses,
|
2179 |
|
|
and when so accessed will respond as described in Chapt.~\ref{chap:periph}.
|
2180 |
|
|
These registers will be discussed briefly again here.
|
2181 |
24 |
dgisselq |
|
2182 |
69 |
dgisselq |
\subsection{Interrupt Controller(s)}
|
2183 |
33 |
dgisselq |
The Zip CPU Interrupt controller has four different types of bits, as shown in
|
2184 |
|
|
Tbl.~\ref{tbl:picbits}.
|
2185 |
|
|
\begin{table}\begin{center}
|
2186 |
|
|
\begin{bitlist}
|
2187 |
|
|
31 & R/W & Master Interrupt Enable\\\hline
|
2188 |
69 |
dgisselq |
30\ldots 16 & R/W & Interrupt Enables, write `1' to change\\\hline
|
2189 |
33 |
dgisselq |
15 & R & Current Master Interrupt State\\\hline
|
2190 |
69 |
dgisselq |
14\ldots 0 & R/W & Input Interrupt states, write `1' to clear\\\hline
|
2191 |
33 |
dgisselq |
\end{bitlist}
|
2192 |
|
|
\caption{Interrupt Controller Register Bits}\label{tbl:picbits}
|
2193 |
|
|
\end{center}\end{table}
|
2194 |
|
|
The high order bit, or bit--31, is the master interrupt enable bit. When this
|
2195 |
|
|
bit is set, then any time an interrupt occurs the CPU will be interrupted and
|
2196 |
|
|
will switch to supervisor mode, etc.
|
2197 |
|
|
|
2198 |
|
|
Bits 30~\ldots 16 are interrupt enable bits. Should the interrupt line go
|
2199 |
69 |
dgisselq |
high while enabled, an interrupt will be generated. (All interrupts are positive
|
2200 |
|
|
edge triggered.) To set an interrupt enable bit, one needs to write the
|
2201 |
|
|
master interrupt enable while writing a `1' to the bit. To clear, one
|
2202 |
|
|
need only write a `0' to the master interrupt enable, while leaving this line
|
2203 |
|
|
high.
|
2204 |
33 |
dgisselq |
|
2205 |
|
|
Bits 14\ldots 0 are the current state of the interrupt vector. Interrupt lines
|
2206 |
|
|
trip when they go high, and remain tripped until they are acknowledged. If
|
2207 |
|
|
the interrupt goes high for longer than one pulse, it may be high when a clear
|
2208 |
|
|
is requested. If so, the interrupt will not clear. The line must go low
|
2209 |
|
|
again before the status bit can be cleared.
|
2210 |
|
|
|
2211 |
|
|
As an example, consider the following scenario where the Zip CPU supports four
|
2212 |
|
|
interrupts, 3\ldots0.
|
2213 |
|
|
\begin{enumerate}
|
2214 |
|
|
\item The Supervisor will first, while in the interrupts disabled mode,
|
2215 |
|
|
write a {\tt 32'h800f000f} to the controller. The supervisor may then
|
2216 |
|
|
switch to the user state with interrupts enabled.
|
2217 |
|
|
\item When an interrupt occurs, the supervisor will switch to the interrupt
|
2218 |
|
|
state. It will then cycle through the interrupt bits to learn which
|
2219 |
|
|
interrupt handler to call.
|
2220 |
|
|
\item If the interrupt handler expects more interrupts, it will clear its
|
2221 |
|
|
current interrupt when it is done handling the interrupt in question.
|
2222 |
69 |
dgisselq |
To do this, it will write a `1' to the low order interrupt mask,
|
2223 |
|
|
such as writing a {\tt 32'h0000\_0001}.
|
2224 |
33 |
dgisselq |
\item If the interrupt handler does not expect any more interrupts, it will
|
2225 |
|
|
instead clear the interrupt from the controller by writing a
|
2226 |
69 |
dgisselq |
{\tt 32'h0001\_0001} to the controller.
|
2227 |
33 |
dgisselq |
\item Once all interrupts have been handled, the supervisor will write a
|
2228 |
69 |
dgisselq |
{\tt 32'h8000\_0000} to the interrupt register to re-enable interrupt
|
2229 |
33 |
dgisselq |
generation.
|
2230 |
|
|
\item The supervisor should also check the user trap bit, and possible soft
|
2231 |
|
|
interrupt bits here, but this action has nothing to do with the
|
2232 |
|
|
interrupt control register.
|
2233 |
|
|
\item The supervisor will then leave interrupt mode, possibly adjusting
|
2234 |
|
|
whichever task is running, by executing a return from interrupt
|
2235 |
|
|
command.
|
2236 |
|
|
\end{enumerate}
|
2237 |
|
|
|
2238 |
69 |
dgisselq |
\subsection{Timer Register}
|
2239 |
|
|
|
2240 |
33 |
dgisselq |
Leaving the interrupt controller, we show the timer register's bit definitions
|
2241 |
|
|
in Tbl.~\ref{tbl:tmrbits}.
|
2242 |
|
|
\begin{table}\begin{center}
|
2243 |
|
|
\begin{bitlist}
|
2244 |
|
|
31 & R/W & Auto-Reload\\\hline
|
2245 |
|
|
30\ldots 0 & R/W & Current timer value\\\hline
|
2246 |
|
|
\end{bitlist}
|
2247 |
|
|
\caption{Timer Register Bits}\label{tbl:tmrbits}
|
2248 |
|
|
\end{center}\end{table}
|
2249 |
|
|
As you may recall, the timer just counts down to zero and then trips an
|
2250 |
|
|
interrupt. Writing to the current timer value sets that value, and reading
|
2251 |
|
|
from it returns that value. Writing to the current timer value while also
|
2252 |
|
|
setting the auto--reload bit will send the timer into an auto--reload mode.
|
2253 |
|
|
In this mode, upon setting its interrupt bit for one cycle, the timer will
|
2254 |
|
|
also reset itself back to the value of the timer that was written to it when
|
2255 |
|
|
the auto--reload option was written to it. To clear and stop the timer,
|
2256 |
|
|
just simply write a `32'h00' to this register.
|
2257 |
|
|
|
2258 |
69 |
dgisselq |
\subsection{Jiffies}
|
2259 |
|
|
|
2260 |
33 |
dgisselq |
The Jiffies register is somewhat similar in that the register always changes.
|
2261 |
|
|
In this case, the register counts up, whereas the timer always counted down.
|
2262 |
|
|
Reads from this register, as shown in Tbl.~\ref{tbl:jiffybits},
|
2263 |
|
|
\begin{table}\begin{center}
|
2264 |
|
|
\begin{bitlist}
|
2265 |
|
|
31\ldots 0 & R & Current jiffy value\\\hline
|
2266 |
|
|
31\ldots 0 & W & Value/time of next interrupt\\\hline
|
2267 |
|
|
\end{bitlist}
|
2268 |
|
|
\caption{Jiffies Register Bits}\label{tbl:jiffybits}
|
2269 |
|
|
\end{center}\end{table}
|
2270 |
|
|
always return the time value contained in the register. Writes greater than
|
2271 |
|
|
the current Jiffy value, that is where the new value minus the old value is
|
2272 |
|
|
greater than zero while ignoring truncation, will set a new Jiffy interrupt
|
2273 |
|
|
time. At that time, the Jiffy vector will clear, and another interrupt time
|
2274 |
|
|
may either be written to it, or it will just continue counting without
|
2275 |
|
|
activating any more interrupts.
|
2276 |
|
|
|
2277 |
69 |
dgisselq |
\subsection{Performance Counters}
|
2278 |
|
|
|
2279 |
33 |
dgisselq |
The Zip CPU also supports several counter peripherals, mostly in the way of
|
2280 |
|
|
process accounting. These peripherals have a single register associated with
|
2281 |
|
|
them, shown in Tbl.~\ref{tbl:ctrbits}.
|
2282 |
|
|
\begin{table}\begin{center}
|
2283 |
|
|
\begin{bitlist}
|
2284 |
|
|
31\ldots 0 & R/W & Current counter value\\\hline
|
2285 |
|
|
\end{bitlist}
|
2286 |
|
|
\caption{Counter Register Bits}\label{tbl:ctrbits}
|
2287 |
|
|
\end{center}\end{table}
|
2288 |
|
|
Writes to this register set the new counter value. Reads read the current
|
2289 |
|
|
counter value.
|
2290 |
|
|
|
2291 |
|
|
The current design operation of these counters is that of performance counting.
|
2292 |
|
|
Two sets of four registers are available for keeping track of performance.
|
2293 |
|
|
The first is a task counter. This just counts clock ticks. The second
|
2294 |
|
|
counter is a prefetch stall counter, then a master stall counter. These
|
2295 |
|
|
allow the CPU to be evaluated as to how efficient it is. The fourth and
|
2296 |
|
|
final counter is an instruction counter, which counts how many instructions the
|
2297 |
|
|
CPU has issued.
|
2298 |
|
|
|
2299 |
|
|
It is envisioned that these counters will be used as follows: First, every time
|
2300 |
|
|
a master counter rolls over, the supervisor (Operating System) will record
|
2301 |
|
|
the fact. Second, whenever activating a user task, the Operating System will
|
2302 |
|
|
set the four user counters to zero. When the user task has completed, the
|
2303 |
|
|
Operating System will read the timers back off, to determine how much of the
|
2304 |
69 |
dgisselq |
CPU the process had consumed. To keep this accurate, the user counters will
|
2305 |
|
|
only increment when the GIE bit is set to indicate that the processor is
|
2306 |
|
|
in user mode.
|
2307 |
33 |
dgisselq |
|
2308 |
69 |
dgisselq |
\subsection{DMA Controller}
|
2309 |
|
|
|
2310 |
36 |
dgisselq |
The final peripheral to discuss is the DMA controller. This controller
|
2311 |
|
|
has four registers. Of these four, the length, source and destination address
|
2312 |
|
|
registers should need no further explanation. They are full 32--bit registers
|
2313 |
|
|
specifying the entire transfer length, the starting address to read from, and
|
2314 |
|
|
the starting address to write to. The registers can be written to when the
|
2315 |
|
|
DMA is idle, and read at any time. The control register, however, will need
|
2316 |
|
|
some more explanation.
|
2317 |
|
|
|
2318 |
|
|
The bit allocation of the control register is shown in Tbl.~\ref{tbl:dmacbits}.
|
2319 |
|
|
\begin{table}\begin{center}
|
2320 |
|
|
\begin{bitlist}
|
2321 |
|
|
31 & R & DMA Active\\\hline
|
2322 |
39 |
dgisselq |
30 & R & Wishbone error, transaction aborted. This bit is cleared the next time
|
2323 |
|
|
this register is written to.\\\hline
|
2324 |
69 |
dgisselq |
29 & R/W & Set to `1' to prevent the controller from incrementing the source address, `0' for normal memory copy. \\\hline
|
2325 |
|
|
28 & R/W & Set to `1' to prevent the controller from incrementing the
|
2326 |
|
|
destination address, `0' for normal memory copy. \\\hline
|
2327 |
36 |
dgisselq |
27 \ldots 16 & W & The DMA Key. Write a 12'hfed to these bits to
|
2328 |
|
|
activate any DMA transfer. \\\hline
|
2329 |
69 |
dgisselq |
27 & R & Always reads `0', to force the deliberate writing of the key. \\\hline
|
2330 |
36 |
dgisselq |
26 \ldots 16 & R & Indicates the number of items in the transfer buffer that
|
2331 |
|
|
have yet to be written. \\\hline
|
2332 |
69 |
dgisselq |
15 & R/W & Set to `1' to trigger on an interrupt, or `0' to start immediately
|
2333 |
36 |
dgisselq |
upon receiving a valid key.\\\hline
|
2334 |
|
|
14\ldots 10 & R/W & Select among one of 32~possible interrupt lines.\\\hline
|
2335 |
|
|
9\ldots 0 & R/W & Intermediate transfer length minus one. Thus, to transfer
|
2336 |
|
|
one item at a time set this value to 0. To transfer 1024 at a time,
|
2337 |
|
|
set it to 1023.\\\hline
|
2338 |
|
|
\end{bitlist}
|
2339 |
|
|
\caption{DMA Control Register Bits}\label{tbl:dmacbits}
|
2340 |
|
|
\end{center}\end{table}
|
2341 |
|
|
This control register has been designed so that the common case of memory
|
2342 |
|
|
access need only set the key and the transfer length. Hence, writing a
|
2343 |
|
|
\hbox{32'h0fed03ff} to the control register will start any memory transfer.
|
2344 |
|
|
On the other hand, if you wished to read from a serial port (constant address)
|
2345 |
|
|
and put the result into a buffer every time a word was available, you
|
2346 |
|
|
might wish to write \hbox{32'h2fed8000}---this assumes, of course, that you
|
2347 |
|
|
have a serial port wired to the zero bit of this interrupt control. (The
|
2348 |
|
|
DMA controller does not use the interrupt controller, and cannot clear
|
2349 |
|
|
interrupts.) As a third example, if you wished to write to an external
|
2350 |
|
|
FIFO anytime it was less than half full (had fewer than 512 items), and
|
2351 |
|
|
interrupt line 3 indicated this condition, you might wish to issue a
|
2352 |
|
|
\hbox{32'h1fed8dff} to this port.
|
2353 |
|
|
|
2354 |
33 |
dgisselq |
\section{Debug Port Registers}
|
2355 |
|
|
Accessing the Zip System via the debug port isn't as straightforward as
|
2356 |
|
|
accessing the system via the wishbone bus. The debug port itself has been
|
2357 |
|
|
reduced to two addresses, as outlined earlier in Tbl.~\ref{tbl:dbgregs}.
|
2358 |
|
|
Access to the Zip System begins with the Debug Control register, shown in
|
2359 |
|
|
Tbl.~\ref{tbl:dbgctrl}.
|
2360 |
|
|
\begin{table}\begin{center}
|
2361 |
|
|
\begin{bitlist}
|
2362 |
69 |
dgisselq |
31\ldots 14 & R & External interrupt state. Bit 14 is valid for one
|
2363 |
|
|
interrupt only, bit 15 for two, etc.\\\hline
|
2364 |
33 |
dgisselq |
13 & R & CPU GIE setting\\\hline
|
2365 |
|
|
12 & R & CPU is sleeping\\\hline
|
2366 |
|
|
11 & W & Command clear PF cache\\\hline
|
2367 |
69 |
dgisselq |
10 & R/W & Command HALT, Set to `1' to halt the CPU\\\hline
|
2368 |
|
|
9 & R & Stall Status, `1' if CPU is busy (i.e., not halted yet)\\\hline
|
2369 |
|
|
8 & R/W & Step Command, set to `1' to step the CPU, also sets the halt bit\\\hline
|
2370 |
|
|
7 & R & Interrupt Request Pending\\\hline
|
2371 |
33 |
dgisselq |
6 & R/W & Command RESET \\\hline
|
2372 |
|
|
5\ldots 0 & R/W & Debug Register Address \\\hline
|
2373 |
|
|
\end{bitlist}
|
2374 |
|
|
\caption{Debug Control Register Bits}\label{tbl:dbgctrl}
|
2375 |
|
|
\end{center}\end{table}
|
2376 |
|
|
|
2377 |
|
|
The first step in debugging access is to determine whether or not the CPU
|
2378 |
69 |
dgisselq |
is halted, and to halt it if not. To do this, first write a `1' to the
|
2379 |
33 |
dgisselq |
Command HALT bit. This will halt the CPU and place it into debug mode.
|
2380 |
|
|
Once the CPU is halted, the stall status bit will drop to zero. Thus,
|
2381 |
|
|
if bit 10 is high and bit 9 low, the debug port is open to examine the
|
2382 |
|
|
internal state of the CPU.
|
2383 |
|
|
|
2384 |
|
|
At this point, the external debugger may examine internal state information
|
2385 |
|
|
from within the CPU. To do this, first write again to the command register
|
2386 |
|
|
a value (with command halt still high) containing the address of an internal
|
2387 |
|
|
register of interest in the bottom 6~bits. Internal registers that may be
|
2388 |
|
|
accessed this way are listed in Tbl.~\ref{tbl:dbgaddrs}.
|
2389 |
|
|
\begin{table}\begin{center}
|
2390 |
|
|
\begin{reglist}
|
2391 |
|
|
sR0 & 0 & 32 & R/W & Supervisor Register R0 \\\hline
|
2392 |
|
|
sR1 & 1 & 32 & R/W & Supervisor Register R1 \\\hline
|
2393 |
|
|
sSP & 13 & 32 & R/W & Supervisor Stack Pointer\\\hline
|
2394 |
|
|
sCC & 14 & 32 & R/W & Supervisor Condition Code Register \\\hline
|
2395 |
|
|
sPC & 15 & 32 & R/W & Supervisor Program Counter\\\hline
|
2396 |
|
|
uR0 & 16 & 32 & R/W & User Register R0 \\\hline
|
2397 |
|
|
uR1 & 17 & 32 & R/W & User Register R1 \\\hline
|
2398 |
|
|
uSP & 29 & 32 & R/W & User Stack Pointer\\\hline
|
2399 |
|
|
uCC & 30 & 32 & R/W & User Condition Code Register \\\hline
|
2400 |
|
|
uPC & 31 & 32 & R/W & User Program Counter\\\hline
|
2401 |
|
|
PIC & 32 & 32 & R/W & Primary Interrupt Controller \\\hline
|
2402 |
|
|
WDT & 33 & 32 & R/W & Watchdog Timer\\\hline
|
2403 |
69 |
dgisselq |
BUS & 34 & 32 & R & Last Bus Error\\\hline
|
2404 |
33 |
dgisselq |
CTRIC & 35 & 32 & R/W & Secondary Interrupt Controller\\\hline
|
2405 |
|
|
TMRA & 36 & 32 & R/W & Timer A\\\hline
|
2406 |
|
|
TMRB & 37 & 32 & R/W & Timer B\\\hline
|
2407 |
|
|
TMRC & 38 & 32 & R/W & Timer C\\\hline
|
2408 |
|
|
JIFF & 39 & 32 & R/W & Jiffies peripheral\\\hline
|
2409 |
|
|
MTASK & 40 & 32 & R/W & Master task clock counter\\\hline
|
2410 |
|
|
MMSTL & 41 & 32 & R/W & Master memory stall counter\\\hline
|
2411 |
|
|
MPSTL & 42 & 32 & R/W & Master Pre-Fetch Stall counter\\\hline
|
2412 |
|
|
MICNT & 43 & 32 & R/W & Master instruction counter\\\hline
|
2413 |
|
|
UTASK & 44 & 32 & R/W & User task clock counter\\\hline
|
2414 |
|
|
UMSTL & 45 & 32 & R/W & User memory stall counter\\\hline
|
2415 |
|
|
UPSTL & 46 & 32 & R/W & User Pre-Fetch Stall counter\\\hline
|
2416 |
|
|
UICNT & 47 & 32 & R/W & User instruction counter\\\hline
|
2417 |
39 |
dgisselq |
DMACMD & 48 & 32 & R/W & DMA command and status register\\\hline
|
2418 |
|
|
DMALEN & 49 & 32 & R/W & DMA transfer length\\\hline
|
2419 |
|
|
DMARD & 50 & 32 & R/W & DMA read address\\\hline
|
2420 |
|
|
DMAWR & 51 & 32 & R/W & DMA write address\\\hline
|
2421 |
33 |
dgisselq |
\end{reglist}
|
2422 |
|
|
\caption{Debug Register Addresses}\label{tbl:dbgaddrs}
|
2423 |
|
|
\end{center}\end{table}
|
2424 |
|
|
Primarily, these ``registers'' include access to the entire CPU register
|
2425 |
36 |
dgisselq |
set, as well as the internal peripherals. To read one of these registers
|
2426 |
33 |
dgisselq |
once the address is set, simply issue a read from the data port. To write
|
2427 |
|
|
one of these registers or peripheral ports, simply write to the data port
|
2428 |
|
|
after setting the proper address.
|
2429 |
|
|
|
2430 |
|
|
In this manner, all of the CPU's internal state may be read and adjusted.
|
2431 |
|
|
|
2432 |
|
|
As an example of how to use this, consider what would happen in the case
|
2433 |
|
|
of an external break point. If and when the CPU hits a break point that
|
2434 |
|
|
causes it to halt, the Command HALT bit will activate on its own, the CPU
|
2435 |
|
|
will then raise an external interrupt line and wait for a debugger to examine
|
2436 |
|
|
its state. After examining the state, the debugger will need to remove
|
2437 |
|
|
the breakpoint by writing a different instruction into memory and by writing
|
2438 |
|
|
to the command register while holding the clear cache, command halt, and
|
2439 |
|
|
step CPU bits high, (32'hd00). The debugger may then replace the breakpoint
|
2440 |
|
|
now that the CPU has gone beyond it, and clear the cache again (32'h500).
|
2441 |
|
|
|
2442 |
|
|
To leave this debug mode, simply write a `32'h0' value to the command register.
|
2443 |
|
|
|
2444 |
|
|
\chapter{Wishbone Datasheets}\label{chap:wishbone}
|
2445 |
32 |
dgisselq |
The Zip System supports two wishbone ports, a slave debug port and a master
|
2446 |
21 |
dgisselq |
port for the system itself. These are shown in Tbl.~\ref{tbl:wishbone-slave}
|
2447 |
|
|
\begin{table}[htbp]
|
2448 |
|
|
\begin{center}
|
2449 |
|
|
\begin{wishboneds}
|
2450 |
|
|
Revision level of wishbone & WB B4 spec \\\hline
|
2451 |
|
|
Type of interface & Slave, Read/Write, single words only \\\hline
|
2452 |
24 |
dgisselq |
Address Width & 1--bit \\\hline
|
2453 |
21 |
dgisselq |
Port size & 32--bit \\\hline
|
2454 |
|
|
Port granularity & 32--bit \\\hline
|
2455 |
|
|
Maximum Operand Size & 32--bit \\\hline
|
2456 |
|
|
Data transfer ordering & (Irrelevant) \\\hline
|
2457 |
69 |
dgisselq |
Clock constraints & Works at 100~MHz on a Basys--3 board, and 80~MHz on a
|
2458 |
|
|
XuLA2--LX25\\\hline
|
2459 |
21 |
dgisselq |
Signal Names & \begin{tabular}{ll}
|
2460 |
|
|
Signal Name & Wishbone Equivalent \\\hline
|
2461 |
|
|
{\tt i\_clk} & {\tt CLK\_I} \\
|
2462 |
|
|
{\tt i\_dbg\_cyc} & {\tt CYC\_I} \\
|
2463 |
|
|
{\tt i\_dbg\_stb} & {\tt STB\_I} \\
|
2464 |
|
|
{\tt i\_dbg\_we} & {\tt WE\_I} \\
|
2465 |
|
|
{\tt i\_dbg\_addr} & {\tt ADR\_I} \\
|
2466 |
|
|
{\tt i\_dbg\_data} & {\tt DAT\_I} \\
|
2467 |
|
|
{\tt o\_dbg\_ack} & {\tt ACK\_O} \\
|
2468 |
|
|
{\tt o\_dbg\_stall} & {\tt STALL\_O} \\
|
2469 |
|
|
{\tt o\_dbg\_data} & {\tt DAT\_O}
|
2470 |
|
|
\end{tabular}\\\hline
|
2471 |
|
|
\end{wishboneds}
|
2472 |
22 |
dgisselq |
\caption{Wishbone Datasheet for the Debug Interface}\label{tbl:wishbone-slave}
|
2473 |
21 |
dgisselq |
\end{center}\end{table}
|
2474 |
|
|
and Tbl.~\ref{tbl:wishbone-master} respectively.
|
2475 |
|
|
\begin{table}[htbp]
|
2476 |
|
|
\begin{center}
|
2477 |
|
|
\begin{wishboneds}
|
2478 |
|
|
Revision level of wishbone & WB B4 spec \\\hline
|
2479 |
24 |
dgisselq |
Type of interface & Master, Read/Write, single cycle or pipelined\\\hline
|
2480 |
69 |
dgisselq |
Address Width & (Zip System parameter, can be up to 32~bits) \\\hline
|
2481 |
21 |
dgisselq |
Port size & 32--bit \\\hline
|
2482 |
|
|
Port granularity & 32--bit \\\hline
|
2483 |
|
|
Maximum Operand Size & 32--bit \\\hline
|
2484 |
|
|
Data transfer ordering & (Irrelevant) \\\hline
|
2485 |
69 |
dgisselq |
Clock constraints & Works at 100~MHz on a Basys--3 board, and 80~MHz on a
|
2486 |
|
|
XuLA2--LX25\\\hline
|
2487 |
21 |
dgisselq |
Signal Names & \begin{tabular}{ll}
|
2488 |
|
|
Signal Name & Wishbone Equivalent \\\hline
|
2489 |
|
|
{\tt i\_clk} & {\tt CLK\_O} \\
|
2490 |
|
|
{\tt o\_wb\_cyc} & {\tt CYC\_O} \\
|
2491 |
|
|
{\tt o\_wb\_stb} & {\tt STB\_O} \\
|
2492 |
|
|
{\tt o\_wb\_we} & {\tt WE\_O} \\
|
2493 |
|
|
{\tt o\_wb\_addr} & {\tt ADR\_O} \\
|
2494 |
|
|
{\tt o\_wb\_data} & {\tt DAT\_O} \\
|
2495 |
|
|
{\tt i\_wb\_ack} & {\tt ACK\_I} \\
|
2496 |
|
|
{\tt i\_wb\_stall} & {\tt STALL\_I} \\
|
2497 |
69 |
dgisselq |
{\tt i\_wb\_data} & {\tt DAT\_I} \\
|
2498 |
|
|
{\tt i\_wb\_err} & {\tt ERR\_I}
|
2499 |
21 |
dgisselq |
\end{tabular}\\\hline
|
2500 |
|
|
\end{wishboneds}
|
2501 |
22 |
dgisselq |
\caption{Wishbone Datasheet for the CPU as Master}\label{tbl:wishbone-master}
|
2502 |
21 |
dgisselq |
\end{center}\end{table}
|
2503 |
|
|
I do not recommend that you connect these together through the interconnect.
|
2504 |
24 |
dgisselq |
Rather, the debug port of the CPU should be accessible regardless of the state
|
2505 |
|
|
of the master bus.
|
2506 |
21 |
dgisselq |
|
2507 |
69 |
dgisselq |
You may wish to notice that neither the {\tt LOCK} nor the {\tt RTY} (retry)
|
2508 |
|
|
wires have been connected to the CPU's master interface. If necessary, a
|
2509 |
|
|
rudimentary {\tt LOCK} may be created by tying the wire to the {\tt wb\_cyc}
|
2510 |
|
|
line. As for the {\tt RTY}, all the CPU recognizes at this point are bus
|
2511 |
|
|
errors---it cannot tell the difference between a temporary and a permanent bus
|
2512 |
|
|
error.
|
2513 |
21 |
dgisselq |
|
2514 |
|
|
\chapter{Clocks}\label{chap:clocks}
|
2515 |
|
|
|
2516 |
32 |
dgisselq |
This core is based upon the Basys--3 development board sold by Digilent.
|
2517 |
|
|
The Basys--3 development board contains one external 100~MHz clock, which is
|
2518 |
36 |
dgisselq |
sufficient to run the Zip CPU core.
|
2519 |
21 |
dgisselq |
\begin{table}[htbp]
|
2520 |
|
|
\begin{center}
|
2521 |
|
|
\begin{clocklist}
|
2522 |
|
|
i\_clk & External & 100~MHz & 100~MHz & System clock.\\\hline
|
2523 |
|
|
\end{clocklist}
|
2524 |
|
|
\caption{List of Clocks}\label{tbl:clocks}
|
2525 |
|
|
\end{center}\end{table}
|
2526 |
|
|
I hesitate to suggest that the core can run faster than 100~MHz, since I have
|
2527 |
|
|
struggled with various timing violations to keep it at 100~MHz. So, for
|
2528 |
|
|
now, I will only state that it can run at 100~MHz.
|
2529 |
|
|
|
2530 |
69 |
dgisselq |
On a SPARTAN 6, the clock can run successfully at 80~MHz.
|
2531 |
21 |
dgisselq |
|
2532 |
|
|
\chapter{I/O Ports}\label{chap:ioports}
|
2533 |
33 |
dgisselq |
The I/O ports to the Zip CPU may be grouped into three categories. The first
|
2534 |
|
|
is that of the master wishbone used by the CPU, then the slave wishbone used
|
2535 |
|
|
to command the CPU via a debugger, and then the rest. The first two of these
|
2536 |
|
|
were already discussed in the wishbone chapter. They are listed here
|
2537 |
|
|
for completeness in Tbl.~\ref{tbl:iowb-master}
|
2538 |
|
|
\begin{table}
|
2539 |
|
|
\begin{center}\begin{portlist}
|
2540 |
|
|
{\tt o\_wb\_cyc} & 1 & Output & Indicates an active Wishbone cycle\\\hline
|
2541 |
|
|
{\tt o\_wb\_stb} & 1 & Output & WB Strobe signal\\\hline
|
2542 |
|
|
{\tt o\_wb\_we} & 1 & Output & Write enable\\\hline
|
2543 |
|
|
{\tt o\_wb\_addr} & 32 & Output & Bus address \\\hline
|
2544 |
|
|
{\tt o\_wb\_data} & 32 & Output & Data on WB write\\\hline
|
2545 |
|
|
{\tt i\_wb\_ack} & 1 & Input & Slave has completed a R/W cycle\\\hline
|
2546 |
|
|
{\tt i\_wb\_stall} & 1 & Input & WB bus slave not ready\\\hline
|
2547 |
|
|
{\tt i\_wb\_data} & 32 & Input & Incoming bus data\\\hline
|
2548 |
69 |
dgisselq |
{\tt i\_wb\_err} & 1 & Input & Bus Error indication\\\hline
|
2549 |
33 |
dgisselq |
\end{portlist}\caption{CPU Master Wishbone I/O Ports}\label{tbl:iowb-master}\end{center}\end{table}
|
2550 |
|
|
and~\ref{tbl:iowb-slave} respectively.
|
2551 |
|
|
\begin{table}
|
2552 |
|
|
\begin{center}\begin{portlist}
|
2553 |
|
|
{\tt i\_wb\_cyc} & 1 & Input & Indicates an active Wishbone cycle\\\hline
|
2554 |
|
|
{\tt i\_wb\_stb} & 1 & Input & WB Strobe signal\\\hline
|
2555 |
|
|
{\tt i\_wb\_we} & 1 & Input & Write enable\\\hline
|
2556 |
|
|
{\tt i\_wb\_addr} & 1 & Input & Bus address, command or data port \\\hline
|
2557 |
|
|
{\tt i\_wb\_data} & 32 & Input & Data on WB write\\\hline
|
2558 |
|
|
{\tt o\_wb\_ack} & 1 & Output & Slave has completed a R/W cycle\\\hline
|
2559 |
|
|
{\tt o\_wb\_stall} & 1 & Output & WB bus slave not ready\\\hline
|
2560 |
|
|
{\tt o\_wb\_data} & 32 & Output & Outgoing bus data\\\hline
|
2561 |
|
|
\end{portlist}\caption{CPU Debug Wishbone I/O Ports}\label{tbl:iowb-slave}\end{center}\end{table}
|
2562 |
21 |
dgisselq |
|
2563 |
33 |
dgisselq |
There are only four other lines to the CPU: the external clock, external
|
2564 |
|
|
reset, incoming external interrupt line(s), and the outgoing debug interrupt
|
2565 |
|
|
line. These are shown in Tbl.~\ref{tbl:ioports}.
|
2566 |
|
|
\begin{table}
|
2567 |
|
|
\begin{center}\begin{portlist}
|
2568 |
|
|
{\tt i\_clk} & 1 & Input & The master CPU clock \\\hline
|
2569 |
|
|
{\tt i\_rst} & 1 & Input & Active high reset line \\\hline
|
2570 |
69 |
dgisselq |
{\tt i\_ext\_int} & 1\ldots 16 & Input & Incoming external interrupts, actual
|
2571 |
|
|
value set by implementation parameter \\\hline
|
2572 |
33 |
dgisselq |
{\tt o\_ext\_int} & 1 & Output & CPU Halted interrupt \\\hline
|
2573 |
|
|
\end{portlist}\caption{I/O Ports}\label{tbl:ioports}\end{center}\end{table}
|
2574 |
|
|
The clock line was discussed briefly in Chapt.~\ref{chap:clocks}. We
|
2575 |
69 |
dgisselq |
typically run it at 100~MHz, although we've needed to slow it down to 80~MHz
|
2576 |
|
|
for some implementations. The reset line is an active high reset. When
|
2577 |
33 |
dgisselq |
asserted, the CPU will start running again from its reset address in
|
2578 |
69 |
dgisselq |
memory. Further, depending upon how the CPU is configured and specifically
|
2579 |
|
|
based upon how the {\tt START\_HALTED} parameter is set, the CPU may or may
|
2580 |
|
|
not start running automatically following a reset. The {\tt i\_ext\_int}
|
2581 |
|
|
line is for an external interrupt. This line may actually be as wide as
|
2582 |
|
|
16~external interrupts, depending upon the setting of
|
2583 |
|
|
the {\tt EXTERNAL\_INTERRUPTS} parameter. Finally, the Zip System produces one
|
2584 |
|
|
external interrupt whenever the entire CPU halts to wait for the debugger.
|
2585 |
33 |
dgisselq |
|
2586 |
36 |
dgisselq |
\chapter{Initial Assessment}\label{chap:assessment}
|
2587 |
|
|
|
2588 |
|
|
Having now worked with the Zip CPU for a while, it is worth offering an
|
2589 |
|
|
honest assessment of how well it works and how well it was designed. At the
|
2590 |
|
|
end of this assessment, I will propose some changes that may take place in a
|
2591 |
|
|
later version of this Zip CPU to make it better.
|
2592 |
|
|
|
2593 |
|
|
\section{The Good}
|
2594 |
|
|
\begin{itemize}
|
2595 |
69 |
dgisselq |
\item The Zip CPU can be configured to be relatively light weight and fully
|
2596 |
|
|
featured as it exists today. For anyone who wishes to build a general
|
2597 |
|
|
purpose CPU and then to experiment with building and adding particular
|
2598 |
|
|
features, the Zip CPU makes a good starting point---it is fairly simple.
|
2599 |
|
|
Modifications should be simple enough. Indeed, a non-pipelined
|
2600 |
|
|
version of the bare ZipBones (with no peripherals) has been built that
|
2601 |
|
|
only uses 1.1k~LUTs. When using pipelining, the full cache, and all
|
2602 |
|
|
of the peripherals, the ZipSystem can top 5k~LUTs. Where it fits
|
2603 |
|
|
in between is a function of your needs.
|
2604 |
36 |
dgisselq |
\item The Zip CPU was designed to be an implementable soft core that could be
|
2605 |
|
|
placed within an FPGA, controlling actions internal to the FPGA. It
|
2606 |
|
|
fits this role rather nicely. It does not fit the role of a system on
|
2607 |
|
|
a chip very well, but then it was never intended to be a system on a
|
2608 |
|
|
chip but rather a system within a chip.
|
2609 |
|
|
\item The extremely simplified instruction set of the Zip CPU was a good
|
2610 |
|
|
choice. Although it does not have many of the commonly used
|
2611 |
|
|
instructions, PUSH, POP, JSR, and RET among them, the simplified
|
2612 |
|
|
instruction set has demonstrated an amazing versatility. I will contend
|
2613 |
|
|
therefore and for anyone who will listen, that this instruction set
|
2614 |
|
|
offers a full and complete capability for whatever a user might wish
|
2615 |
|
|
to do with two exceptions: bytewise character access and accelerated
|
2616 |
|
|
floating-point support.
|
2617 |
|
|
\item This simplified instruction set is easy to decode.
|
2618 |
|
|
\item The simplified bus transactions (32-bit words only) were also very easy
|
2619 |
|
|
to implement.
|
2620 |
68 |
dgisselq |
\item The pipelined load/store approach is novel, and can be used to greatly
|
2621 |
|
|
increase the speed of the processor.
|
2622 |
36 |
dgisselq |
\item The novel approach of having a single interrupt vector, which just
|
2623 |
|
|
brings the CPU back to the instruction it left off at within the last
|
2624 |
|
|
interrupt context doesn't appear to have been that much of a problem.
|
2625 |
|
|
If most modern systems handle interrupt vectoring in software anyway,
|
2626 |
|
|
why maintain hardware support for it?
|
2627 |
|
|
\item My goal of a high rate of instructions per clock may not be the proper
|
2628 |
|
|
measure. For example, if instructions are being read from a SPI flash
|
2629 |
|
|
device, such as is common among FPGA implementations, these same
|
2630 |
|
|
instructions may suffer stalls of between 64 and 128 cycles per
|
2631 |
|
|
instruction just to read the instruction from the flash. Executing the
|
2632 |
|
|
instruction in a single clock cycle is no longer the appropriate
|
2633 |
|
|
measure. At the same time, it should be possible to use the DMA
|
2634 |
|
|
peripheral to copy instructions from the FLASH to a temporary memory
|
2635 |
|
|
location, after which they may be executed at a single instruction
|
2636 |
|
|
cycle per access again.
|
2637 |
|
|
\end{itemize}
|
2638 |
|
|
|
2639 |
|
|
\section{The Not so Good}
|
2640 |
|
|
\begin{itemize}
|
2641 |
|
|
\item The CPU has no character support. This is both good and bad.
|
2642 |
|
|
Realistically, the CPU works just fine without it. Characters can be
|
2643 |
|
|
supported as subsets of 32-bit words without any problem. Practically,
|
2644 |
|
|
though, it will make compiling non-Zip CPU code difficult---especially
|
2645 |
|
|
anything that assumes sizeof(int)=4*sizeof(char), or that tries to
|
2646 |
|
|
create unions with characters and integers and then attempts to
|
2647 |
|
|
reference the address of the characters within that union.
|
2648 |
|
|
|
2649 |
|
|
\item The Zip CPU does not support a data cache. One can still be built
|
2650 |
|
|
externally, but this is a limitation of the CPU proper as built.
|
2651 |
|
|
Further, under the theory of the Zip CPU design (that of an embedded
|
2652 |
|
|
soft-core processor within an FPGA, where any ``address'' may reference
|
2653 |
|
|
either memory or a peripheral that may have side-effects), any data
|
2654 |
|
|
cache would need to be based upon an initial knowledge of whether or
|
2655 |
|
|
not it is supporting memory (cachable) or peripherals. This knowledge
|
2656 |
|
|
must exist somewhere, and that somewhere is currently (and by design)
|
2657 |
|
|
external to the CPU.
|
2658 |
|
|
|
2659 |
|
|
This may also be written off as a ``feature'' of the Zip CPU, since
|
2660 |
|
|
the addition of a data cache can greatly increase the LUT count of
|
2661 |
|
|
a soft core.
|
2662 |
|
|
|
2663 |
68 |
dgisselq |
The Zip CPU compensates for this via its pipelined load and store
|
2664 |
|
|
instructions.
|
2665 |
|
|
|
2666 |
36 |
dgisselq |
\item Many other instruction sets offer three operand instructions, whereas
|
2667 |
|
|
the Zip CPU only offers two operand instructions. This means that it
|
2668 |
|
|
takes the Zip CPU more instructions to do many of the same operations.
|
2669 |
|
|
The good part of this is that it gives the Zip CPU a greater amount of
|
2670 |
|
|
flexibility in its immediate operand mode, although that increased
|
2671 |
|
|
flexibility isn't necessarily as valuable as one might like.
|
2672 |
|
|
|
2673 |
|
|
\item The Zip CPU doesn't support out of order execution. I suppose it could
|
2674 |
|
|
be modified to do so, but then it would no longer be the ``simple''
|
2675 |
|
|
and low LUT count CPU it was designed to be. The three primary results
|
2676 |
|
|
are that 1) loads may unnecessarily stall the CPU, even if other
|
2677 |
|
|
things could be done while waiting for the load to complete, 2)
|
2678 |
|
|
bus errors on stores will never be caught at the point of the error,
|
2679 |
|
|
and 3) branch prediction becomes more difficult.
|
2680 |
|
|
|
2681 |
|
|
\item Although switching to an interrupt context in the Zip CPU design doesn't
|
2682 |
|
|
require a tremendous swapping of registers, in reality it still
|
2683 |
|
|
does---since any task swap still requires saving and restoring all
|
2684 |
|
|
16~user registers. That's a lot of memory movement just to service
|
2685 |
|
|
an interrupt.
|
2686 |
|
|
|
2687 |
|
|
\item The Zip CPU is by no means generic: it will never handle addresses
|
2688 |
|
|
larger than 32-bits (16GB) without a complete and total redesign.
|
2689 |
|
|
This may limit its utility as a generic CPU in the future, although
|
2690 |
|
|
as an embedded CPU within an FPGA this isn't really much of a limit
|
2691 |
|
|
or restriction.
|
2692 |
|
|
|
2693 |
|
|
\item While the Zip CPU has its own assembler, it has no linker and does not
|
2694 |
|
|
(yet) support a compiler. The standard C library is an even longer
|
2695 |
|
|
shot. My dream of having binutils and gcc support has not been
|
2696 |
|
|
realized and at this rate may not be realized. (I've been intimidated
|
2697 |
|
|
by the challenge every time I've looked through those codes.)
|
2698 |
|
|
\end{itemize}
|
2699 |
|
|
|
2700 |
|
|
\section{The Next Generation}
|
2701 |
69 |
dgisselq |
This section could also be labeled as my ``To do'' list. Today's list is
|
2702 |
|
|
much different than it was for the last version of this document, as much of
|
2703 |
|
|
the prior to do list (such as VLIW instructions, and a more traditional
|
2704 |
|
|
instruction cache) has now been implemented. The only things really and
|
2705 |
|
|
truly waiting on my list today are assembler support for the VLIW instruction
|
2706 |
|
|
set, linker and compiler support.
|
2707 |
36 |
dgisselq |
|
2708 |
69 |
dgisselq |
Stay tuned, these are likely to be coming next.
|
2709 |
36 |
dgisselq |
|
2710 |
21 |
dgisselq |
% Appendices
|
2711 |
|
|
% Index
|
2712 |
|
|
\end{document}
|
2713 |
|
|
|
2714 |
68 |
dgisselq |
%
|
2715 |
|
|
%
|
2716 |
|
|
% Symbol table relocation types:
|
2717 |
|
|
%
|
2718 |
|
|
% Only 3-types of instructions truly need relocations: those that modify the
|
2719 |
|
|
% PC register, and those that access memory.
|
2720 |
|
|
%
|
2721 |
|
|
% - LDI Addr,Rx // Loads an absolute address into Rx, 24 bits
|
2722 |
|
|
%
|
2723 |
|
|
% - LDILO Addr,Rx // Loads an absolute address into Rx, 32 bits
|
2724 |
|
|
% LDIHI Addr,Rx // requires two instructions
|
2725 |
|
|
%
|
2726 |
|
|
% - JMP Rx // Jump to any address in Rx
|
2727 |
|
|
% // Can be prefixed with two instructions to load Rx
|
2728 |
|
|
% // from any 32-bit immediate
|
2729 |
|
|
% - JMP #Addr // Jump to any 24'bit (signed) address, 23'b uns
|
2730 |
|
|
%
|
2731 |
|
|
% - ADD x,PC // Any PC relative jump (20 bits)
|
2732 |
|
|
%
|
2733 |
|
|
% - ADD.C x,PC // Any PC relative conditional jump (20 bits)
|
2734 |
|
|
%
|
2735 |
|
|
% - LDIHI Addr,Rx // Load from any 32-bit address, clobbers Rx,
|
2736 |
|
|
% LOD Addr(Rx),Rx // unconditional, requires second instruction
|
2737 |
|
|
%
|
2738 |
|
|
% - LOD.C Addr(Ry),Rx // Any 16-bit relative address load, poss. cond
|
2739 |
|
|
%
|
2740 |
|
|
% - STO.C Rx,Addr(Ry) // Any 16-bit rel addr, Rx and Ry must be valid
|
2741 |
|
|
%
|
2742 |
|
|
% - FARJMP #Addr: // Arbitrary 32-bit jumps require a jump table
|
2743 |
|
|
% BRA +1 // memory address. The BRA +1 can be skipped,
|
2744 |
|
|
% .WORD Addr // but only if the address is placed at the end
|
2745 |
|
|
% LOD -2(PC),PC // of an executable section
|
2746 |
|
|
%
|