1 |
15 |
hellwig |
% This file is part of the MMIXware package (c) Donald E Knuth 1999
|
2 |
|
|
@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
|
3 |
|
|
|
4 |
|
|
\def\title{MMIX-PIPE}
|
5 |
|
|
\def\MMIX{\.{MMIX}}
|
6 |
|
|
\def\NNIX{\hbox{\mc NNIX}}
|
7 |
|
|
\def\Hex#1{\hbox{$^{\scriptscriptstyle\#}$\tt#1}} % experimental hex constant
|
8 |
|
|
@s and normal @q unreserve a C++ keyword @>
|
9 |
|
|
@s or normal @q unreserve a C++ keyword @>
|
10 |
|
|
@s bool normal @q unreserve a C++ keyword @>
|
11 |
|
|
@s xor normal @q unreserve a C++ keyword @>
|
12 |
|
|
|
13 |
|
|
@* Introduction. This program is the heart of the meta-simulator for the
|
14 |
|
|
ultra-configurable \MMIX\ pipeline: It defines the |MMIX_run| routine, which
|
15 |
|
|
does most of the
|
16 |
|
|
work. Another routine, |MMIX_init|, is also defined here, and so is a header
|
17 |
|
|
file called \.{mmix-pipe.h}. The header file is used by the main routine and
|
18 |
|
|
by other routines like |MMIX_config|, which are compiled separately.
|
19 |
|
|
|
20 |
|
|
Readers of this program should be familiar with the explanation of \MMIX\
|
21 |
|
|
architecture as presented in the main program module for {\mc MMMIX}.
|
22 |
|
|
|
23 |
|
|
A lot of subtle things can happen when instructions are executed in parallel.
|
24 |
|
|
Therefore this simulator ranks among the most interesting and instructive
|
25 |
|
|
programs in the author's experience. The author has tried his best to make
|
26 |
|
|
everything correct \dots\ but the chances for error are great. Anyone who
|
27 |
|
|
discovers a bug is therefore urged to report it as soon as possible to
|
28 |
|
|
\.{knuth-bug@@cs.stanford.edu}; then the program will be as useful as
|
29 |
|
|
possible. Rewards will be paid to bug-finders! (Except for bugs in version~0.)
|
30 |
|
|
|
31 |
|
|
It sort of boggles the mind when one realizes that the present program might
|
32 |
|
|
someday be translated by a \CEE/~compiler for \MMIX\ and used to simulate
|
33 |
|
|
{\it itself}.
|
34 |
|
|
|
35 |
|
|
@ This high-performance prototype of \MMIX\ achieves its efficiency by
|
36 |
|
|
means of ``pipelining,'' a technique of overlapping that is explained
|
37 |
|
|
for the related \.{DLX} computer in Chapter~3 of Hennessy \char`\&\ Patterson's
|
38 |
|
|
book {\sl Computer Architecture\/} (second edition). Other techniques
|
39 |
|
|
such as ``dynamic scheduling'' and ``multiple issue,'' explained in
|
40 |
|
|
Chapter~4 of that book, are used too.
|
41 |
|
|
|
42 |
|
|
One good way to visualize the procedure is to imagine that somebody has
|
43 |
|
|
organized a high-tech car repair shop according to similar principles.
|
44 |
|
|
There are eight independent functional units, which we can think of as
|
45 |
|
|
eight groups of auto mechanics, each specializing in a particular task;
|
46 |
|
|
each group has its own workspace with room to deal with one car at a time.
|
47 |
|
|
Group~F (the ``fetch'' group) is in charge of rounding up customers and
|
48 |
|
|
getting them to enter the assembly-line garage in an orderly fashion.
|
49 |
|
|
Group~D (the ``decode and dispatch'' group) does the initial vehicle
|
50 |
|
|
inspection and
|
51 |
|
|
writes up an order that explains what kind of servicing is required.
|
52 |
|
|
The vehicles go next to one of the four ``execution'' groups:
|
53 |
|
|
Group~X handles routine maintenance, while groups XF, XM, and XD are
|
54 |
|
|
specialists in more complex tasks that tend to take longer. (The XF
|
55 |
|
|
people are good at floating the points, while the XM and XD groups are
|
56 |
|
|
experts in multilink suspensions and differentials.) When the relevant X~group
|
57 |
|
|
has finished its work, cars drive to M~station, where they send or receive
|
58 |
|
|
messages and possibly pay money to members of the ``memory'' group. Finally
|
59 |
|
|
all necessary parts are installed by members of group~W, the ``write''
|
60 |
|
|
group, and the car leaves the shop. Everything is tightly organized so
|
61 |
|
|
that in most cases the cars move in synchronized fashion from station
|
62 |
|
|
to station, at regular 100-nanocentury intervals. % about 5.3 minutes
|
63 |
|
|
|
64 |
|
|
In a similar way, most \MMIX\ instructions can be handled in a five-stage
|
65 |
|
|
pipeline, F--D--X--M--W, with X replaced by XF for floating-point
|
66 |
|
|
addition or conversion, or by XM for multiplication, or by XD for
|
67 |
|
|
division or square root. Each stage ideally takes one clock cycle,
|
68 |
|
|
although XF, XM, and (especially) XD are slower. If the instructions enter
|
69 |
|
|
in a suitable pattern, we might see one instruction being fetched,
|
70 |
|
|
another being decoded, and up to four being executed, while another is accessing
|
71 |
|
|
memory, and yet another is finishing up by writing new information into
|
72 |
|
|
registers; all this is going on simultaneously during one clock cycle. Pipelining
|
73 |
|
|
with eight separate stages might therefore make the machine run
|
74 |
|
|
up to 8 times as fast as it could if each instruction were being dealt with
|
75 |
|
|
individually and without overlap. (Well, perfect speedup turns out to
|
76 |
|
|
be impossible, because of the shared M and~W stages; the theory of
|
77 |
|
|
knapsack programming, to be discussed in Section~7.7 of {\sl The Art
|
78 |
|
|
of Computer Programming}, tells us that the maximal achievable speedup is
|
79 |
|
|
at most $8-1/p-1/q-1/r$ when XF, XM, and~XD have delays bounded by $p$,
|
80 |
|
|
$q$, and~$r$ cycles. But we can achieve a factor of more than~7
|
81 |
|
|
if we are very lucky.)
|
82 |
|
|
|
83 |
|
|
Consider, for example, the \.{ADD} instruction. This instruction enters
|
84 |
|
|
the computer's processing unit in F stage, taking only one clock cycle if
|
85 |
|
|
it is in the cache of instructions recently seen. Then the D~stage
|
86 |
|
|
recognizes the command as an \.{ADD} and acquires the current values
|
87 |
|
|
of \$Y and \$Z; meanwhile, of course, another instruction is being fetched
|
88 |
|
|
by~F.
|
89 |
|
|
On the next clock cycle, the X stage adds the values together.
|
90 |
|
|
This prepares the way for the M stage to watch for overflow and to
|
91 |
|
|
get ready for any exceptional action that might be needed with respect
|
92 |
|
|
to the settings of special register~rA\null.
|
93 |
|
|
Finally, on the fifth clock cycle, the sum is either written into~\$X
|
94 |
|
|
or the trip handler for integer overflow is invoked.
|
95 |
|
|
Although this process has taken five clock
|
96 |
|
|
cycles (that is, $5\upsilon$),
|
97 |
|
|
the net increase in running time has been only~$1\upsilon$.
|
98 |
|
|
|
99 |
|
|
Of course congestion can occur, inside a computer as in a repair shop.
|
100 |
|
|
For example, auto parts might not be readily available; or a car might
|
101 |
|
|
have to sit in D station while waiting to move to XM, thereby blocking
|
102 |
|
|
somebody else from moving from F to~D. Sometimes there won't
|
103 |
|
|
necessarily be a steady stream of customers. In such cases the
|
104 |
|
|
employees in some parts of the shop will occasionally be idle. But we
|
105 |
|
|
assume that they always do their jobs as fast as possible, given the
|
106 |
|
|
sequence of customers that they encounter. With a clever person
|
107 |
|
|
setting up appointments---translation: with a clever
|
108 |
|
|
programmer and/or compiler arranging \MMIX\ instructions---the
|
109 |
|
|
organization can often be expected to run at nearly peak capacity.
|
110 |
|
|
|
111 |
|
|
In fact, this program is designed for experiments with many kinds of
|
112 |
|
|
pipelines, potentially using additional functional units (such as
|
113 |
|
|
several independent X~groups), and potentially fetching, dispatching, and
|
114 |
|
|
executing several nonconflicting instructions simultaneously.
|
115 |
|
|
Such complications
|
116 |
|
|
make this program more difficult than a simple pipeline simulator
|
117 |
|
|
would be, but they also make it a lot more instructive because we
|
118 |
|
|
can get a better understanding of the issues involved if we are
|
119 |
|
|
required to treat them in greater generality.
|
120 |
|
|
|
121 |
|
|
@ Here's the overall structure of the present program module.
|
122 |
|
|
|
123 |
|
|
@c
|
124 |
|
|
#include
|
125 |
|
|
#include
|
126 |
|
|
#include
|
127 |
|
|
#include "abstime.h"
|
128 |
|
|
@h@#
|
129 |
|
|
@
|
130 |
|
|
@@;
|
131 |
|
|
@@;
|
132 |
|
|
@@;
|
133 |
|
|
@@;
|
134 |
|
|
@@;
|
135 |
|
|
@@;
|
136 |
|
|
@@;
|
137 |
|
|
|
138 |
|
|
@ The identifier \&{Extern} is used in {\mc MMIX-PIPE} to
|
139 |
|
|
declare variables that are accessed in other modules. Actually
|
140 |
|
|
all appearances of `\&{Extern}' are defined to be blank here, but
|
141 |
|
|
`\&{Extern}' will become `\&{extern}' in the header file.
|
142 |
|
|
|
143 |
|
|
@d Extern /* blank for us, \&{extern} for them */
|
144 |
|
|
@f Extern extern
|
145 |
|
|
|
146 |
|
|
@=
|
147 |
|
|
Extern int verbose; /* controls the level of diagnostic output */
|
148 |
|
|
|
149 |
|
|
@ The header file repeats the basic definitions and declarations.
|
150 |
|
|
|
151 |
|
|
@(mmix-pipe.h@>=
|
152 |
|
|
#define Extern extern
|
153 |
|
|
@
|
154 |
|
|
@@;
|
155 |
|
|
@@;
|
156 |
|
|
@@;
|
157 |
|
|
|
158 |
|
|
@ Subroutines of this program are declared first with a prototype,
|
159 |
|
|
as in {\mc ANSI C}, then with an old-style \CEE/ function definition.
|
160 |
|
|
The following preprocessor commands make this work correctly with both
|
161 |
|
|
new-style and old-style compilers.
|
162 |
|
|
@^prototypes for functions@>
|
163 |
|
|
|
164 |
|
|
@
|
165 |
|
|
#ifdef __STDC__
|
166 |
|
|
#define ARGS(list) list
|
167 |
|
|
#else
|
168 |
|
|
#define ARGS(list) ()
|
169 |
|
|
#endif
|
170 |
|
|
|
171 |
|
|
@ Some of the names that are natural for this program are in
|
172 |
|
|
conflict with library names on at least
|
173 |
|
|
one of the host computers in the author's tests. So we
|
174 |
|
|
bypass the library names here.
|
175 |
|
|
|
176 |
|
|
@
|
177 |
|
|
#define random my_random
|
178 |
|
|
#define fsqrt my_fsqrt
|
179 |
|
|
#define div my_div
|
180 |
|
|
|
181 |
|
|
@ The amount of verbosity depends on the following bit codes.
|
182 |
|
|
|
183 |
|
|
@
|
184 |
|
|
#define issue_bit (1<<0)
|
185 |
|
|
/* show control blocks when issued, deissued, committed */
|
186 |
|
|
#define pipe_bit (1<<1)
|
187 |
|
|
/* show the pipeline and locks on every cycle */
|
188 |
|
|
#define coroutine_bit (1<<2)
|
189 |
|
|
/* show the coroutines when started on every cycle */
|
190 |
|
|
#define schedule_bit (1<<3)
|
191 |
|
|
/* show the coroutines when scheduled */
|
192 |
|
|
#define uninit_mem_bit (1<<4)
|
193 |
|
|
/* complain when reading from an uninitialized chunk of memory */
|
194 |
|
|
#define interactive_read_bit (1<<5)
|
195 |
|
|
/* prompt user when reading from I/O location */
|
196 |
|
|
#define show_spec_bit (1<<6)
|
197 |
|
|
/* display special read/write transactions as they happen */
|
198 |
|
|
#define show_pred_bit (1<<7)
|
199 |
|
|
/* display branch prediction details */
|
200 |
|
|
#define show_wholecache_bit (1<<8)
|
201 |
|
|
/* display cache blocks even when their key tag is invalid */
|
202 |
|
|
|
203 |
|
|
@ The |MMIX_init()| routine should be called exactly once, after
|
204 |
|
|
|MMIX_config()| has done its work but before the simulator starts to execute
|
205 |
|
|
any programs. Then |MMIX_run| can be called as often as the user likes.
|
206 |
|
|
|
207 |
|
|
@s octa int
|
208 |
|
|
|
209 |
|
|
@=
|
210 |
|
|
Extern void MMIX_init @,@,@[ARGS((void))@];
|
211 |
|
|
Extern void MMIX_run @,@,@[ARGS((int cycs, octa breakpoint))@];
|
212 |
|
|
|
213 |
|
|
@ @=
|
214 |
|
|
void MMIX_init() /* one-time setup: call exactly once, after |MMIX_config| and before the first |MMIX_run| */
|
215 |
|
|
{
|
216 |
|
|
register int i,j; /* scratch indices, presumably for the initialization code below --- TODO confirm */
|
217 |
|
|
@; /* NOTE(review): a section reference stood here; its name is missing from this copy */
|
218 |
|
|
}
|
219 |
|
|
@#
|
220 |
|
|
void MMIX_run(cycs,breakpoint) /* simulate clock cycles until |cycs| reaches zero, a breakpoint instruction is fetched, or the machine halts */
|
221 |
|
|
int cycs; /* number of cycles to run; decremented once per completed cycle */
|
222 |
|
|
octa breakpoint; /* not referenced in the statements visible here; presumably used by the per-cycle section --- TODO confirm */
|
223 |
|
|
{
|
224 |
|
|
@; /* NOTE(review): a section reference stood here (name missing from this copy); |breakpoint_hit| and |halted|, tested below, are presumably declared by it */
|
225 |
|
|
while (cycs) {
|
226 |
|
|
if (verbose&(issue_bit|pipe_bit|coroutine_bit|schedule_bit)) /* any per-cycle tracing enabled? */
|
227 |
|
|
printf("*** Cycle %d\n", ticks.l); /* then head this cycle's output with the low half of the clock */
|
228 |
|
|
@; /* NOTE(review): another section reference with its name missing; presumably the per-cycle work --- TODO confirm */
|
229 |
|
|
if (verbose&pipe_bit) { /* dump the pipeline and all held locks after the cycle */
|
230 |
|
|
print_pipe();@+ print_locks();
|
231 |
|
|
}
|
232 |
|
|
if (breakpoint_hit||halted) { /* stop early, saying why */
|
233 |
|
|
if (breakpoint_hit)
|
234 |
|
|
printf("Breakpoint instruction fetched at time %d\n",ticks.l-1); /* reports |ticks.l-1|; presumably the clock has already advanced --- TODO confirm */
|
235 |
|
|
if (halted) printf("Halted at time %d\n", ticks.l-1);
|
236 |
|
|
break;
|
237 |
|
|
}
|
238 |
|
|
cycs--; /* one more cycle completed */
|
239 |
|
|
}
|
240 |
|
|
cease:; /* exit label; nothing in the statements visible here jumps to it */
|
241 |
|
|
}
|
242 |
|
|
|
243 |
|
|
@ @=
|
244 |
|
|
typedef enum {@!false, @!true, @!wow}@+bool; /* slightly extended booleans */
|
245 |
|
|
|
246 |
|
|
@ @=
|
247 |
|
|
register int i,j,m;
|
248 |
|
|
bool breakpoint_hit=false;
|
249 |
|
|
bool halted=false;
|
250 |
|
|
|
251 |
|
|
@ Error messages that abort this program are called panic messages.
|
252 |
|
|
The macro called |confusion| will never be needed unless this program is
|
253 |
|
|
internally inconsistent.
|
254 |
|
|
|
255 |
|
|
@d errprint0(f) fprintf(stderr,f)
|
256 |
|
|
@d errprint1(f,a) fprintf(stderr,f,a)
|
257 |
|
|
@d errprint2(f,a,b) fprintf(stderr,f,a,b)
|
258 |
|
|
@d panic(x)@+ {@+errprint0("Panic: ");@+x;@+errprint0("!\n");@+expire();@+}
|
259 |
|
|
@d confusion(m) errprint1("This can't happen: %s",m)
|
260 |
|
|
@.This can't happen@>
|
261 |
|
|
|
262 |
|
|
@=
|
263 |
|
|
static void expire @,@,@[ARGS((void))@];
|
264 |
|
|
|
265 |
|
|
@ @=
|
266 |
|
|
static void expire() /* the last gasp before dying: report the simulated clock on |stderr|, then exit */
|
267 |
|
|
{
|
268 |
|
|
if (ticks.h) errprint2("(Clock time is %dH+%d.)\n",ticks.h,ticks.l); /* the high half is shown only when it is nonzero */
|
269 |
|
|
else errprint1("(Clock time is %d.)\n",ticks.l);
|
270 |
|
|
@.Clock time is...@>
|
271 |
|
|
exit(-2); /* terminate the whole program with status $-2$ */
|
272 |
|
|
}
|
273 |
|
|
|
274 |
|
|
@ The data structures of this program are not precisely equivalent to
|
275 |
|
|
logical gates that could be implemented directly in silicon;
|
276 |
|
|
we will use data structures and
|
277 |
|
|
algorithms appropriate to the \CEE/ programming language. For example,
|
278 |
|
|
we'll use pointers and arrays, instead of buses and ports and latches. However,
|
279 |
|
|
the net effect of our data structures and algorithms is intended to
|
280 |
|
|
be equivalent to the net effect of a silicon implementation. The methods
|
281 |
|
|
used below are essentially equivalent to those used in real machines today,
|
282 |
|
|
except that diagnostic facilities are added so that we can readily
|
283 |
|
|
watch what is happening.
|
284 |
|
|
|
285 |
|
|
Each functional unit in the \MMIX\ pipeline is programmed here as a coroutine
|
286 |
|
|
in~\CEE/. At every clock cycle, we will call on each active coroutine to do one
|
287 |
|
|
phase of its operation; in terms of the repair-station analogy
|
288 |
|
|
described in the main program,
|
289 |
|
|
this corresponds to getting each group of
|
290 |
|
|
auto mechanics to do one unit of operation on a car.
|
291 |
|
|
The coroutines are performed sequentially, although
|
292 |
|
|
a real pipeline would have them act in parallel.
|
293 |
|
|
We will not ``cheat'' by letting one coroutine access a value early in its
|
294 |
|
|
cycle that another one computes late in its cycle, unless computer hardware
|
295 |
|
|
could ``cheat'' in an equivalent way.
|
296 |
|
|
|
297 |
|
|
@* Low-level routines. Where should we begin? It is tempting to start with a
|
298 |
|
|
global view of the simulator and then to break it down into component parts.
|
299 |
|
|
But that task is too daunting, because there are so many unknowns about what
|
300 |
|
|
basic ingredients ought to be combined when we construct the larger
|
301 |
|
|
components. So let us look first at the primitive operations on which
|
302 |
|
|
the superstructure will be built. Once we have created some infrastructure,
|
303 |
|
|
we'll be able to proceed with confidence to the larger tasks ahead.
|
304 |
|
|
|
305 |
|
|
@ This program for the 64-bit \MMIX\ architecture is based on 32-bit integer
|
306 |
|
|
arithmetic, because nearly every computer available to the author at the time
|
307 |
|
|
of writing (1998--1999) was limited in that way.
|
308 |
|
|
Details of the basic arithmetic appear in a separate program module
|
309 |
|
|
called {\mc MMIX-ARITH}, because the same routines are needed also
|
310 |
|
|
for the assembler and for the non-pipelined simulator. The
|
311 |
|
|
definition of type \&{tetra} should be changed, if necessary, to conform with
|
312 |
|
|
the definitions found there.
|
313 |
|
|
@^system dependencies@>
|
314 |
|
|
|
315 |
|
|
@=
|
316 |
|
|
typedef unsigned int tetra;
|
317 |
|
|
/* must hold exactly 32 bits; true for systems conforming to the LP-64 data model */
|
318 |
|
|
typedef struct { tetra h,l;} octa; /* two tetrabytes make one octabyte: |h| is the high-order half, |l| the low-order half */
|
319 |
|
|
|
320 |
|
|
@ @=
|
321 |
|
|
static void print_octa @,@,@[ARGS((octa))@];
|
322 |
|
|
|
323 |
|
|
@ @=
|
324 |
|
|
static void print_octa(o) /* print |o| in hexadecimal on standard output, without leading zeros */
|
325 |
|
|
octa o; /* the octabyte to print */
|
326 |
|
|
{ /* when the high half is nonzero, the low half is padded to eight digits */
|
327 |
|
|
if (o.h) printf("%x%08x",o.h,o.l);@+
|
328 |
|
|
else printf("%x",o.l); /* high half is zero: print the low half alone */
|
329 |
|
|
}
|
330 |
|
|
|
331 |
|
|
@ @=
|
332 |
|
|
extern octa zero_octa; /* |zero_octa.h=zero_octa.l=0| */
|
333 |
|
|
extern octa neg_one; /* |neg_one.h=neg_one.l=-1| */
|
334 |
|
|
extern octa aux; /* auxiliary output of a subroutine */
|
335 |
|
|
extern bool overflow; /* set by certain subroutines for signed arithmetic */
|
336 |
|
|
extern int exceptions; /* bits set by floating point operations */
|
337 |
|
|
extern int cur_round; /* the current rounding mode */
|
338 |
|
|
|
339 |
|
|
@ Most of the subroutines in {\mc MMIX-ARITH} return an octabyte as
|
340 |
|
|
a function of two octabytes; for example, |oplus(y,z)| returns the
|
341 |
|
|
sum of octabytes |y| and~|z|. Multiplication returns the high
|
342 |
|
|
half of a product in the global variable~|aux|; division returns
|
343 |
|
|
the remainder in~|aux|.
|
344 |
|
|
|
345 |
|
|
@=
|
346 |
|
|
extern octa oplus @,@,@[ARGS((octa y,octa z))@];
|
347 |
|
|
/* unsigned $y+z$ */
|
348 |
|
|
extern octa ominus @,@,@[ARGS((octa y,octa z))@];
|
349 |
|
|
/* unsigned $y-z$ */
|
350 |
|
|
extern octa incr @,@,@[ARGS((octa y,int delta))@];
|
351 |
|
|
/* unsigned $y+\delta$ ($\delta$ is signed) */
|
352 |
|
|
extern octa oand @,@,@[ARGS((octa y,octa z))@];
|
353 |
|
|
/* $y\land z$ */
|
354 |
|
|
extern octa oandn @,@,@[ARGS((octa y,octa z))@];
|
355 |
|
|
/* $y\land \bar z$ */
|
356 |
|
|
extern octa shift_left @,@,@[ARGS((octa y,int s))@];
|
357 |
|
|
/* $y\LL s$, $0\le s\le64$ */
|
358 |
|
|
extern octa shift_right @,@,@[ARGS((octa y,int s,int uns))@];
|
359 |
|
|
/* $y\GG s$, signed if |!uns| */
|
360 |
|
|
extern octa omult @,@,@[ARGS((octa y,octa z))@];
|
361 |
|
|
/* unsigned $(|aux|,x)=y\times z$ */
|
362 |
|
|
extern octa signed_omult @,@,@[ARGS((octa y,octa z))@];
|
363 |
|
|
/* signed $x=y\times z$, setting |overflow| */
|
364 |
|
|
extern octa odiv @,@,@[ARGS((octa x,octa y,octa z))@];
|
365 |
|
|
/* unsigned $(x,y)/z$; $|aux|=(x,y)\bmod z$ */
|
366 |
|
|
extern octa signed_odiv @,@,@[ARGS((octa y,octa z))@];
|
367 |
|
|
/* signed $y/z$, when $z\ne0$; $|aux|=y\bmod z$ */
|
368 |
|
|
extern int count_bits @,@,@[ARGS((tetra z))@];
|
369 |
|
|
/* $x=\nu(z)$ */
|
370 |
|
|
extern tetra byte_diff @,@,@[ARGS((tetra y,tetra z))@];
|
371 |
|
|
/* half of \.{BDIF} */
|
372 |
|
|
extern tetra wyde_diff @,@,@[ARGS((tetra y,tetra z))@];
|
373 |
|
|
/* half of \.{WDIF} */
|
374 |
|
|
extern octa bool_mult @,@,@[ARGS((octa y,octa z,bool xor))@];
|
375 |
|
|
/* \.{MOR} or \.{MXOR} */
|
376 |
|
|
extern octa load_sf @,@,@[ARGS((tetra z))@];
|
377 |
|
|
/* load short float */
|
378 |
|
|
extern tetra store_sf @,@,@[ARGS((octa x))@];
|
379 |
|
|
/* store short float */
|
380 |
|
|
extern octa fplus @,@,@[ARGS((octa y,octa z))@];
|
381 |
|
|
/* floating point $x=y\oplus z$ */
|
382 |
|
|
extern octa fmult @,@,@[ARGS((octa y ,octa z))@];
|
383 |
|
|
/* floating point $x=y\otimes z$ */
|
384 |
|
|
extern octa fdivide @,@,@[ARGS((octa y,octa z))@];
|
385 |
|
|
/* floating point $x=y\oslash z$ */
|
386 |
|
|
extern octa froot @,@,@[ARGS((octa,int))@];
|
387 |
|
|
/* floating point $x=\sqrt z$ */
|
388 |
|
|
extern octa fremstep @,@,@[ARGS((octa y,octa z,int delta))@];
|
389 |
|
|
/* floating point $x\,{\rm rem}\,z=y\,{\rm rem}\,z$ */
|
390 |
|
|
extern octa fintegerize @,@,@[ARGS((octa z,int mode))@];
|
391 |
|
|
/* floating point $x={\rm round}(z)$ */
|
392 |
|
|
extern int fcomp @,@,@[ARGS((octa y,octa z))@];
|
393 |
|
|
/* $-1$, 0, 1, or 2 if $y<z$, $y=z$, $y>z$, $y\parallel z$ */
|
394 |
|
|
extern int fepscomp @,@,@[ARGS((octa y,octa z,octa eps,int sim))@];
|
395 |
|
|
/* $x=|sim|?\ [y\sim z\ (\epsilon)]:\ [y\approx z\ (\epsilon)]$ */
|
396 |
|
|
extern octa floatit @,@,@[ARGS((octa z,int mode,int unsgnd,int shrt))@];
|
397 |
|
|
/* fix to float */
|
398 |
|
|
extern octa fixit @,@,@[ARGS((octa z,int mode))@];
|
399 |
|
|
/* float to fix */
|
400 |
|
|
|
401 |
|
|
@ We had better check that our 32-bit assumption holds.
|
402 |
|
|
|
403 |
|
|
@=
|
404 |
|
|
if (shift_left(neg_one,1).h!=0xffffffff) /* all ones shifted left by one must still show 32 one-bits in the high half */
|
405 |
|
|
panic(errprint0("Incorrect implementation of type tetra")); /* otherwise |tetra| is not a 32-bit type, so give up at once */
|
406 |
|
|
@.Incorrect implementation...@>
|
407 |
|
|
|
408 |
|
|
@* Coroutines. As stated earlier, this program can be regarded as a system of
|
409 |
|
|
interacting coroutines. Coroutines---sometimes called threads---are more or
|
410 |
|
|
less independent processes that share and pass data and control back and
|
411 |
|
|
forth. They correspond to the individual workers in an organization.
|
412 |
|
|
|
413 |
|
|
We don't need the full power of recursive coroutines, in which new threads are
|
414 |
|
|
spawned dynamically and have independent stacks for computation; we are, after
|
415 |
|
|
all, simulating a fixed piece of hardware. The total number of coroutines we
|
416 |
|
|
deal with is established once and for all by the |MMIX_config| routine, and
|
417 |
|
|
each coroutine has a fixed amount of local data.
|
418 |
|
|
|
419 |
|
|
The simulation operates one clock tick at a time, by executing all
|
420 |
|
|
coroutines scheduled for time~$t$ before advancing to time~$t+1$. The
|
421 |
|
|
coroutines at time~$t$ may decide to become dormant or they may reschedule
|
422 |
|
|
themselves and/or other coroutines for future times.
|
423 |
|
|
|
424 |
|
|
Each coroutine has a symbolic |name| for diagnostic purposes (e.g.,
|
425 |
|
|
\.{ALU1}); a nonnegative |stage| number (e.g., 2~for the second stage
|
426 |
|
|
of a pipeline); a pointer to the next coroutine scheduled at the same time (or
|
427 |
|
|
|NULL| if the coroutine is unscheduled); a pointer to a lock variable
|
428 |
|
|
(or |NULL| if no lock is currently relevant);
|
429 |
|
|
and a reference to a control block containing the data to be processed.
|
430 |
|
|
|
431 |
|
|
@s control_struct int
|
432 |
|
|
|
433 |
|
|
@=
|
434 |
|
|
typedef struct coroutine_struct {
|
435 |
|
|
char *name; /* symbolic identification of a coroutine, used in diagnostic output */
|
436 |
|
|
int stage; /* its rank: a nonnegative stage number; higher stages have scheduling priority */
|
437 |
|
|
struct coroutine_struct *next; /* its successor in a queue; |NULL| when the coroutine is unscheduled */
|
438 |
|
|
struct coroutine_struct **lockloc; /* what it might be locking: address of a lock variable, or |NULL| */
|
439 |
|
|
struct control_struct *ctl; /* its data: the control block being processed */
|
440 |
|
|
} coroutine;
|
441 |
|
|
|
442 |
|
|
@ @=
|
443 |
|
|
static void print_coroutine_id @,@,@[ARGS((coroutine*))@];
|
444 |
|
|
static void errprint_coroutine_id @,@,@[ARGS((coroutine*))@];
|
445 |
|
|
|
446 |
|
|
@ @=
|
447 |
|
|
static void print_coroutine_id(c) /* print the name and stage of |c|, separated by a colon, on standard output */
|
448 |
|
|
coroutine *c; /* the coroutine to identify; may be |NULL| */
|
449 |
|
|
{
|
450 |
|
|
if (c) printf("%s:%d",c->name,c->stage);
|
451 |
|
|
else printf("??"); /* a null pointer is shown as two question marks */
|
452 |
|
|
}
|
453 |
|
|
@#
|
454 |
|
|
static void errprint_coroutine_id(c) /* like |print_coroutine_id|, but the output goes to |stderr| */
|
455 |
|
|
coroutine *c; /* the coroutine to identify; may be |NULL| */
|
456 |
|
|
{
|
457 |
|
|
if (c) errprint2("%s:%d",c->name,c->stage);
|
458 |
|
|
else errprint0("??"); /* a null pointer is shown as two question marks */
|
459 |
|
|
@.??@>
|
460 |
|
|
}
|
461 |
|
|
|
462 |
|
|
@ Coroutine control is masterminded by a ring of queues, one each for
|
463 |
|
|
times $t$, $t+1$, \dots, $t+|ring_size|-1$, when $t$ is the current
|
464 |
|
|
clock time.
|
465 |
|
|
|
466 |
|
|
All scheduling is first-come-first-served, except that coroutines with higher
|
467 |
|
|
|stage| numbers have priority. We want to process the later stages of a
|
468 |
|
|
pipeline first, in this sequential implementation, for the same reason that a
|
469 |
|
|
car must drive from M~station into W~station before another car can enter
|
470 |
|
|
M~station.
|
471 |
|
|
|
472 |
|
|
Each queue is a circular list of \&{coroutine} nodes, linked together by their
|
473 |
|
|
|next| fields. A list head~$h$ with |stage=max_stage| comes at the end and the
|
474 |
|
|
beginning of the queue. (All |stage| numbers of legitimate coroutines
|
475 |
|
|
are less than~|max_stage|.) The queued items are |h->next|, |h->next->next|,
|
476 |
|
|
etc., from back to front, and we have |c->stage<=c->next->stage| unless |c=h|.
|
477 |
|
|
|
478 |
|
|
Initially all queues are empty.
|
479 |
|
|
|
480 |
|
|
@=
|
481 |
|
|
{@+register coroutine *p; /* runs through the |ring_size| list heads */
|
482 |
|
|
for (p=ring;p<ring+ring_size;p++) p->next=p; /* a head that points to itself is an empty circular queue */
|
483 |
|
|
}
|
484 |
|
|
|
485 |
|
|
@ To schedule a coroutine |c| with positive delay |d<ring_size|, we call
|
486 |
|
|
|schedule(c,d,s)|. (The |s| parameter is used only if scheduling is
|
487 |
|
|
being logged; it does not affect the computation, but we will
|
488 |
|
|
generally set |s| to the state at which the scheduled coroutine will begin.)
|
489 |
|
|
|
490 |
|
|
@=
|
491 |
|
|
static void schedule @,@,@[ARGS((coroutine*,int,int))@];
|
492 |
|
|
|
493 |
|
|
@ @=
|
494 |
|
|
static void schedule(c,d,s) /* put |c| into the queue for |d| ticks from now, keeping that queue ordered by stage */
|
495 |
|
|
coroutine *c; /* the coroutine to be queued */
|
496 |
|
|
int d,s; /* |d| is the delay, $0<d<|ring_size|$; |s| appears only in the log message */
|
497 |
|
|
{
|
498 |
|
|
register int tt=(cur_time+d)%ring_size; /* index of the ring slot for the target time */
|
499 |
|
|
register coroutine *p=&ring[tt]; /* start at the list head */
|
500 |
|
|
if (d<=0 || d>=ring_size) /* do a sanity check */
|
501 |
|
|
panic(confusion("Scheduling ");errprint_coroutine_id(c);
|
502 |
|
|
errprint1(" with delay %d",d));
|
503 |
|
|
while (p->next->stage<c->stage) p=p->next; /* pass over queued items of lower stage; the list head has |stage=max_stage|, so the scan stops there */
|
504 |
|
|
c->next = p->next;
|
505 |
|
|
p->next = c; /* now |c| is linked in just after |p| */
|
506 |
|
|
if (verbose&schedule_bit) { /* log the event when schedule tracing is on */
|
507 |
|
|
printf(" scheduling ");@+print_coroutine_id(c);
|
508 |
|
|
printf(" at time %d, state %d\n",ticks.l+d,s);
|
509 |
|
|
}
|
510 |
|
|
}
|
511 |
|
|
|
512 |
|
|
@ @=
|
513 |
|
|
Extern int ring_size; /* set by |MMIX_config|, must be sufficiently large */
|
514 |
|
|
Extern coroutine *ring;
|
515 |
|
|
Extern int cur_time;
|
516 |
|
|
|
517 |
|
|
@ The all-important |ctl| field of a coroutine, which contains the
|
518 |
|
|
data being manipulated, will be explained below. One of its key
|
519 |
|
|
components is the |state| field, which helps to specify the next
|
520 |
|
|
actions the coroutine will perform. When we schedule a coroutine for
|
521 |
|
|
a new task, we often want it to begin in state~0.
|
522 |
|
|
|
523 |
|
|
@=
|
524 |
|
|
static void startup @,@,@[ARGS((coroutine*,int))@];
|
525 |
|
|
|
526 |
|
|
@ @=
|
527 |
|
|
static void startup(c,d) /* schedule |c| with delay |d|, beginning in state 0 */
|
528 |
|
|
coroutine *c; /* the coroutine to start; its |ctl| field is dereferenced, so it must not be |NULL| */
|
529 |
|
|
int d; /* the delay, passed unchanged to |schedule| */
|
530 |
|
|
{
|
531 |
|
|
c->ctl->state=0; /* reset the control block to its initial state */
|
532 |
|
|
schedule(c,d,0); /* the final 0 is the state shown in the scheduling log */
|
533 |
|
|
}
|
534 |
|
|
|
535 |
|
|
@ The following routine removes a coroutine from whatever queue it's in.
|
536 |
|
|
The case |c->next=c| is also permitted; such a self-loop can occur when a
|
537 |
|
|
coroutine goes to sleep and expects to be awakened (that is, scheduled)
|
538 |
|
|
by another coroutine. Sleeping coroutines have important data in their
|
539 |
|
|
|ctl| field; they are therefore quite different from unscheduled
|
540 |
|
|
or ``unemployed'' coroutines, which have |c->next=NULL|. An unemployed
|
541 |
|
|
coroutine is not assumed to have any valid data in its |ctl| field.
|
542 |
|
|
|
543 |
|
|
@=
|
544 |
|
|
static void unschedule @,@,@[ARGS((coroutine*))@];
|
545 |
|
|
|
546 |
|
|
@ @=
|
547 |
|
|
static void unschedule(c) /* remove |c| from whatever circular list it is in; do nothing if |c->next| is already |NULL| */
|
548 |
|
|
coroutine *c; /* the coroutine to remove */
|
549 |
|
|
{@+register coroutine *p; /* will become the predecessor of |c| */
|
550 |
|
|
if (c->next) {
|
551 |
|
|
for (p=c; p->next!=c; p=p->next) ; /* go around the circle until reaching the node that points to |c| */
|
552 |
|
|
p->next = c->next; /* bypass |c|; when |c->next=c| we have |p=c| and this changes nothing */
|
553 |
|
|
c->next=NULL; /* mark |c| as unscheduled */
|
554 |
|
|
if (verbose&schedule_bit) { /* log the event when schedule tracing is on */
|
555 |
|
|
printf(" unscheduling ");@+print_coroutine_id(c);@+printf("\n");
|
556 |
|
|
}
|
557 |
|
|
}
|
558 |
|
|
}
|
559 |
|
|
|
560 |
|
|
@ When it is time to process all coroutines that have queued up for a
|
561 |
|
|
particular time~|t|, we empty the queue called |ring[t]| and link its items in
|
562 |
|
|
the opposite order (from front to back). The following subroutine uses the
|
563 |
|
|
well known algorithm discussed in exercise 2.2.3--7 of {\sl The Art
|
564 |
|
|
of Computer Programming}.
|
565 |
|
|
|
566 |
|
|
@=
|
567 |
|
|
static coroutine *queuelist @,@,@[ARGS((int))@];
|
568 |
|
|
|
569 |
|
|
@ @=
|
570 |
|
|
static coroutine* queuelist(t) /* empty the queue |ring[t]| and return its items linked in the opposite order */
|
571 |
|
|
int t; /* index of the ring slot to be emptied */
|
572 |
|
|
{@+register coroutine *p, *q=&sentinel, *r; /* |p| is the current node, |q| the reversed list built so far, |r| the saved successor */
|
573 |
|
|
for (p=ring[t].next;p!=&ring[t];p=r) { /* visit every queued item, stopping at the list head */
|
574 |
|
|
r=p->next; /* remember the old successor before overwriting the link */
|
575 |
|
|
p->next=q; /* make |p| point back to the node visited before it */
|
576 |
|
|
q=p;
|
577 |
|
|
}
|
578 |
|
|
ring[t].next=&ring[t]; /* the queue is now empty */
|
579 |
|
|
sentinel.next=q; /* close the circle: the reversed list ends at |sentinel|, which points to its first item */
|
580 |
|
|
return q; /* first item of the reversed list, or |&sentinel| if the queue was empty */
|
581 |
|
|
}
|
582 |
|
|
|
583 |
|
|
@ @=
|
584 |
|
|
coroutine sentinel; /* dummy coroutine at origin of circular list */
|
585 |
|
|
|
586 |
|
|
@ Coroutines often start working on tasks that are {\it speculative}, in the
|
587 |
|
|
sense that we want certain results to be ready if they prove to be
|
588 |
|
|
useful; we understand that speculative computations might not actually
|
589 |
|
|
be needed. Therefore a coroutine might need to be aborted before it
|
590 |
|
|
has finished its work.
|
591 |
|
|
|
592 |
|
|
All coroutines must be written in such a way that important data structures
|
593 |
|
|
remain intact even when the coroutine is abruptly terminated. In particular,
|
594 |
|
|
we need to be sure that ``locks'' on shared resources are restored to
|
595 |
|
|
an unlocked state when a coroutine holding the lock is aborted.
|
596 |
|
|
|
597 |
|
|
A \&{lockvar} variable is |NULL| when it is unlocked; otherwise it
|
598 |
|
|
points to the coroutine responsible for unlocking~it.
|
599 |
|
|
|
600 |
|
|
@d set_lock(c,l) {@+l=c;@+(c)->lockloc=&(l);@+} /* lock |l| now points to |c|, and |c| records the address of |l| */
|
601 |
|
|
@d release_lock(c,l) {@+l=NULL;@+ (c)->lockloc=NULL;@+} /* lock |l| becomes unlocked, and |c| forgets it */
|
602 |
|
|
|
603 |
|
|
@=
|
604 |
|
|
typedef coroutine *lockvar;
|
605 |
|
|
|
606 |
|
|
@ @=
|
607 |
|
|
Extern void print_locks @,@,@[ARGS((void))@];
|
608 |
|
|
|
609 |
|
|
@ @=
|
610 |
|
|
void print_locks() /* report on standard output every lock that is currently held */
|
611 |
|
|
{
|
612 |
|
|
print_cache_locks(ITcache); /* the five caches are handled by |print_cache_locks|, which is defined elsewhere */
|
613 |
|
|
print_cache_locks(DTcache);
|
614 |
|
|
print_cache_locks(Icache);
|
615 |
|
|
print_cache_locks(Dcache);
|
616 |
|
|
print_cache_locks(Scache);
|
617 |
|
|
if (mem_lock) printf("mem locked by %s:%d\n",mem_lock->name,mem_lock->stage); /* each remaining lock is shown only when it is not |NULL|, with the name and stage of the coroutine it points to */
|
618 |
|
|
if (dispatch_lock) printf("dispatch locked by %s:%d\n",
|
619 |
|
|
dispatch_lock->name,dispatch_lock->stage);
|
620 |
|
|
if (wbuf_lock) printf("head of write buffer locked by %s:%d\n",
|
621 |
|
|
wbuf_lock->name,wbuf_lock->stage);
|
622 |
|
|
if (clean_lock) printf("cleaner locked by %s:%d\n",
|
623 |
|
|
clean_lock->name,clean_lock->stage);
|
624 |
|
|
if (speed_lock) printf("write buffer flush locked by %s:%d\n",
|
625 |
|
|
speed_lock->name,speed_lock->stage);
|
626 |
|
|
}
|
627 |
|
|
|
628 |
|
|
@ Many of the quantities we deal with are speculative values
|
629 |
|
|
that might not yet have been certified as part of the ``real''
|
630 |
|
|
calculation; in fact, they might not yet have been calculated.
|
631 |
|
|
|
632 |
|
|
A \&{spec} consists of a 64-bit quantity |o| and a pointer~|p| to
|
633 |
|
|
a \&{specnode}. The value~|o| is meaningful only if the
|
634 |
|
|
pointer~|p| is~|NULL|; otherwise |p| points to a source of further information.
|
635 |
|
|
|
636 |
|
|
A \&{specnode} is a 64-bit quantity |o| together with links to other
|
637 |
|
|
\&{specnode}s
|
638 |
|
|
that are above it or below it in a doubly linked list. An additional
|
639 |
|
|
|known| bit tells whether the |o|~field has been calculated. There also is
|
640 |
|
|
a 64-bit |addr| field, to identify the list and give further information.
|
641 |
|
|
A \&{specnode} list keeps track of speculative values related to a specific
|
642 |
|
|
register or to all of main memory; we will discuss such lists in detail~later.
|
643 |
|
|
|
644 |
|
|
@s specnode_struct int
|
645 |
|
|
|
646 |
|
|
@=
|
647 |
|
|
typedef struct {
|
648 |
|
|
octa o;
|
649 |
|
|
struct specnode_struct *p;
|
650 |
|
|
} spec;
|
651 |
|
|
@#
|
652 |
|
|
typedef struct specnode_struct {
|
653 |
|
|
octa o;
|
654 |
|
|
bool known;
|
655 |
|
|
octa addr;
|
656 |
|
|
struct specnode_struct *up,*down;
|
657 |
|
|
} specnode;
|
658 |
|
|
|
659 |
|
|
@ @=
|
660 |
|
|
spec zero_spec; /* an all-zero \&{spec}, declared without an explicit initializer: |zero_spec.o.h=zero_spec.o.l=0| and |zero_spec.p=NULL| */
|
661 |
|
|
|
662 |
|
|
@ @=
|
663 |
|
|
static void print_spec @,@,@[ARGS((spec))@]; /* prints the value, or \.{>} and a specnode id if |p| is non-|NULL| */
|
664 |
|
|
|
665 |
|
|
@ @=
|
666 |
|
|
static void print_spec(s)
|
667 |
|
|
spec s;
|
668 |
|
|
{
|
669 |
|
|
if (!s.p) print_octa(s.o);
|
670 |
|
|
else {
|
671 |
|
|
printf(">");@+ print_specnode_id(s.p->addr);
|
672 |
|
|
}
|
673 |
|
|
}
|
674 |
|
|
@#
|
675 |
|
|
static void print_specnode(s)
|
676 |
|
|
specnode s;
|
677 |
|
|
{
|
678 |
|
|
if (s.known) {@+print_octa(s.o);@+printf("!");@+}
|
679 |
|
|
else if (s.o.h || s.o.l) {@+print_octa(s.o);@+printf("?");@+}
|
680 |
|
|
else printf("?");
|
681 |
|
|
print_specnode_id(s.addr);
|
682 |
|
|
}
|
683 |
|
|
|
684 |
|
|
@ The analog of an automobile in our simulator is a block of data called
\&{control}, which represents all the relevant facts about an \MMIX\
instruction. We can think of it as the work order attached to a car's
windshield. Each group of employees updates the work order as the car moves
through the shop.
|
689 |
|
|
|
690 |
|
|
A \&{control} record contains the original location of an instruction,
and its four bytes OP~X~Y~Z. An instruction has up to four inputs, which are
\&{spec} records called |y|, |z|, |b|, and~|ra|; it also has up to three
outputs, which are \&{specnode} records called |x|, |a|, and~|rl|.
(We usually don't mention the special input~|ra| or the special output~|rl|,
which refer to \.{MMIX}'s internal registers rA and~rL.) For example, the
main inputs to a \.{DIVU} command are \$Y, \$Z, and~rD; the outputs are the
quotient~\$X and the remainder~rR. The inputs to a
\.{STO} command are \$Y, \$Z, and~\$X; there is one ``output,'' and
the field~|x.addr| will be set to the physical address of the memory location
corresponding to virtual address $\rm \$Y+\$Z$.
|
701 |
|
|
|
702 |
|
|
Each \&{control} block also points to the coroutine that owns it, if any.
And it has various other fields that contain other tidbits of information;
for example, we have already mentioned
the |state|~field, which often governs a coroutine's actions. The |i|~field,
which contains an internal operation code number, is generally used together
with |state| to switch between alternative computational steps. If, for
example, the |op|~field is \.{SUB} or \.{SUBI} or \.{NEG} or \.{NEGI},
the internal opcode~|i| will be simply~|sub|.
We shall define all the fields of \&{control} records
now and discuss them later.
|
712 |
|
|
|
713 |
|
|
An actual hardware implementation of \MMIX\ wouldn't need all the information
we are putting into a \&{control} block. Some of that information would
typically be latched between stages of a pipeline; other portions would
probably appear in so-called ``rename registers.''
@^rename registers@>
We simulate rename registers only indirectly,
by counting how many registers of that
kind would be in use if we were mimicking low-level hardware details more
precisely. The |go| field is a \&{specnode} for convenience in programming,
although we use only its |known| and |o| subfields. It generally contains
the address of the subsequent instruction.
|
724 |
|
|
|
725 |
|
|
@s mmix_opcode int
|
726 |
|
|
@s internal_opcode int
|
727 |
|
|
|
728 |
|
|
@=
|
729 |
|
|
@@;
|
730 |
|
|
typedef struct control_struct {
|
731 |
|
|
octa loc; /* virtual address where an instruction originated */
|
732 |
|
|
mmix_opcode op;@+ unsigned char xx,yy,zz; /* the original instruction bytes */
|
733 |
|
|
spec y,z,b,ra; /* inputs */
|
734 |
|
|
specnode x,a,go,rl; /* outputs */
|
735 |
|
|
coroutine *owner; /* a coroutine whose |ctl| this is */
|
736 |
|
|
internal_opcode i; /* internal opcode */
|
737 |
|
|
int state; /* internal mindset */
|
738 |
|
|
bool usage; /* should rU be increased? */
|
739 |
|
|
bool need_b; /* should we stall until |b.p==NULL|? */
|
740 |
|
|
bool need_ra; /* should we stall until |ra.p==NULL|? */
|
741 |
|
|
bool ren_x; /* does |x| correspond to a rename register? */
|
742 |
|
|
bool mem_x; /* does |x| correspond to a memory write? */
|
743 |
|
|
bool ren_a; /* does |a| correspond to a rename register? */
|
744 |
|
|
bool set_l; /* does |rl| correspond to a new value of rL? */
|
745 |
|
|
bool interim; /* does this instruction need to be reissued on interrupt? */
|
746 |
|
|
unsigned int arith_exc; /* arithmetic exceptions for event bits of rA */
|
747 |
|
|
unsigned int hist; /* history bits for use in branch prediction */
|
748 |
|
|
int denin,denout; /* execution time penalties for subnormal handling */
|
749 |
|
|
octa cur_O,cur_S; /* speculative rO and rS before this instruction */
|
750 |
|
|
unsigned int interrupt; /* does this instruction generate an interrupt? */
|
751 |
|
|
void *ptr_a, *ptr_b, *ptr_c; /* generic pointers for miscellaneous use */
|
752 |
|
|
} control;
|
753 |
|
|
|
754 |
|
|
@ @=
|
755 |
|
|
static void print_control_block @,@,@[ARGS((control*))@]; /* diagnostic summary of a \&{control} record */
|
756 |
|
|
|
757 |
|
|
@ @=
|
758 |
|
|
static void print_control_block(c)
|
759 |
|
|
control *c;
|
760 |
|
|
{
|
761 |
|
|
octa default_go;
|
762 |
|
|
if (c->loc.h || c->loc.l || c->op || c->xx || c->yy || c->zz || c->owner) {
|
763 |
|
|
print_octa(c->loc);
|
764 |
|
|
printf(": %02x%02x%02x%02x(%s)",c->op,c->xx,c->yy,c->zz,
|
765 |
|
|
internal_op_name[c->i]);
|
766 |
|
|
}
|
767 |
|
|
if (c->usage) printf("*");
|
768 |
|
|
if (c->interim) printf("+");
|
769 |
|
|
if (c->y.o.h || c->y.o.l || c->y.p) {@+printf(" y=");@+print_spec(c->y);@+}
|
770 |
|
|
if (c->z.o.h || c->z.o.l || c->z.p) {@+printf(" z=");@+print_spec(c->z);@+}
|
771 |
|
|
if (c->b.o.h || c->b.o.l || c->b.p || c->need_b) {
|
772 |
|
|
printf(" b=");@+print_spec(c->b);
|
773 |
|
|
if (c->need_b) printf("*");
|
774 |
|
|
}
|
775 |
|
|
if (c->need_ra) {@+printf(" rA=");@+print_spec(c->ra);@+}
|
776 |
|
|
if (c->ren_x || c->mem_x) {@+printf(" x=");@+print_specnode(c->x);@+}
|
777 |
|
|
else if (c->x.o.h || c->x.o.l) {
|
778 |
|
|
printf(" x=");@+print_octa(c->x.o);@+printf("%c",c->x.known? '!': '?');
|
779 |
|
|
}
|
780 |
|
|
if (c->ren_a) {@+printf(" a=");@+print_specnode(c->a);@+}
|
781 |
|
|
if (c->set_l) {@+printf(" rL=");@+print_specnode(c->rl);@+}
|
782 |
|
|
if (c->interrupt) {@+printf(" int=");@+print_bits(c->interrupt);@+}
|
783 |
|
|
if (c->arith_exc) {@+printf(" exc=");@+print_bits(c->arith_exc<<8);@+}
|
784 |
|
|
default_go=incr(c->loc,4);
|
785 |
|
|
if (c->go.o.l!=default_go.l || c->go.o.h!=default_go.h) {
|
786 |
|
|
printf(" ->");@+print_octa(c->go.o);
|
787 |
|
|
}
|
788 |
|
|
if (verbose&show_pred_bit) printf(" hist=%x",c->hist);
|
789 |
|
|
if (c->i==pop) {
|
790 |
|
|
printf(" rS="); print_octa(c->cur_S);
|
791 |
|
|
printf(" rO="); print_octa(c->cur_O);
|
792 |
|
|
}
|
793 |
|
|
printf(" state=%d",c->state);
|
794 |
|
|
}
|
795 |
|
|
|
796 |
|
|
@* Lists. Here is a (boring) list of all the \MMIX\ opcodes, in order.
|
797 |
|
|
|
798 |
|
|
@=
|
799 |
|
|
/* the 256 opcodes in numerical order, eight per row, so that |TRAP| is 0 and |TRIP| is 255 */
typedef enum{@/
@!TRAP,@!FCMP,@!FUN,@!FEQL,@!FADD,@!FIX,@!FSUB,@!FIXU,@/
@!FLOT,@!FLOTI,@!FLOTU,@!FLOTUI,@!SFLOT,@!SFLOTI,@!SFLOTU,@!SFLOTUI,@/
@!FMUL,@!FCMPE,@!FUNE,@!FEQLE,@!FDIV,@!FSQRT,@!FREM,@!FINT,@/
@!MUL,@!MULI,@!MULU,@!MULUI,@!DIV,@!DIVI,@!DIVU,@!DIVUI,@/
@!ADD,@!ADDI,@!ADDU,@!ADDUI,@!SUB,@!SUBI,@!SUBU,@!SUBUI,@/
@!IIADDU,@!IIADDUI,@!IVADDU,@!IVADDUI,@!VIIIADDU,@!VIIIADDUI,@!XVIADDU,@!XVIADDUI,@/
@!CMP,@!CMPI,@!CMPU,@!CMPUI,@!NEG,@!NEGI,@!NEGU,@!NEGUI,@/
@!SL,@!SLI,@!SLU,@!SLUI,@!SR,@!SRI,@!SRU,@!SRUI,@/
@!BN,@!BNB,@!BZ,@!BZB,@!BP,@!BPB,@!BOD,@!BODB,@/
@!BNN,@!BNNB,@!BNZ,@!BNZB,@!BNP,@!BNPB,@!BEV,@!BEVB,@/
@!PBN,@!PBNB,@!PBZ,@!PBZB,@!PBP,@!PBPB,@!PBOD,@!PBODB,@/
@!PBNN,@!PBNNB,@!PBNZ,@!PBNZB,@!PBNP,@!PBNPB,@!PBEV,@!PBEVB,@/
@!CSN,@!CSNI,@!CSZ,@!CSZI,@!CSP,@!CSPI,@!CSOD,@!CSODI,@/
@!CSNN,@!CSNNI,@!CSNZ,@!CSNZI,@!CSNP,@!CSNPI,@!CSEV,@!CSEVI,@/
@!ZSN,@!ZSNI,@!ZSZ,@!ZSZI,@!ZSP,@!ZSPI,@!ZSOD,@!ZSODI,@/
@!ZSNN,@!ZSNNI,@!ZSNZ,@!ZSNZI,@!ZSNP,@!ZSNPI,@!ZSEV,@!ZSEVI,@/
@!LDB,@!LDBI,@!LDBU,@!LDBUI,@!LDW,@!LDWI,@!LDWU,@!LDWUI,@/
@!LDT,@!LDTI,@!LDTU,@!LDTUI,@!LDO,@!LDOI,@!LDOU,@!LDOUI,@/
@!LDSF,@!LDSFI,@!LDHT,@!LDHTI,@!CSWAP,@!CSWAPI,@!LDUNC,@!LDUNCI,@/
@!LDVTS,@!LDVTSI,@!PRELD,@!PRELDI,@!PREGO,@!PREGOI,@!GO,@!GOI,@/
@!STB,@!STBI,@!STBU,@!STBUI,@!STW,@!STWI,@!STWU,@!STWUI,@/
@!STT,@!STTI,@!STTU,@!STTUI,@!STO,@!STOI,@!STOU,@!STOUI,@/
@!STSF,@!STSFI,@!STHT,@!STHTI,@!STCO,@!STCOI,@!STUNC,@!STUNCI,@/
@!SYNCD,@!SYNCDI,@!PREST,@!PRESTI,@!SYNCID,@!SYNCIDI,@!PUSHGO,@!PUSHGOI,@/
@!OR,@!ORI,@!ORN,@!ORNI,@!NOR,@!NORI,@!XOR,@!XORI,@/
@!AND,@!ANDI,@!ANDN,@!ANDNI,@!NAND,@!NANDI,@!NXOR,@!NXORI,@/
@!BDIF,@!BDIFI,@!WDIF,@!WDIFI,@!TDIF,@!TDIFI,@!ODIF,@!ODIFI,@/
@!MUX,@!MUXI,@!SADD,@!SADDI,@!MOR,@!MORI,@!MXOR,@!MXORI,@/
@!SETH,@!SETMH,@!SETML,@!SETL,@!INCH,@!INCMH,@!INCML,@!INCL,@/
@!ORH,@!ORMH,@!ORML,@!ORL,@!ANDNH,@!ANDNMH,@!ANDNML,@!ANDNL,@/
@!JMP,@!JMPB,@!PUSHJ,@!PUSHJB,@!GETA,@!GETAB,@!PUT,@!PUTI,@/
@!POP,@!RESUME,@!SAVE,@!UNSAVE,@!SYNC,@!SWYM,@!GET,@!TRIP}@+@!mmix_opcode;
|
832 |
|
|
|
833 |
|
|
@ @=
|
834 |
|
|
char *opcode_name[]={
|
835 |
|
|
"TRAP","FCMP","FUN","FEQL","FADD","FIX","FSUB","FIXU",@/
|
836 |
|
|
"FLOT","FLOTI","FLOTU","FLOTUI","SFLOT","SFLOTI","SFLOTU","SFLOTUI",@/
|
837 |
|
|
"FMUL","FCMPE","FUNE","FEQLE","FDIV","FSQRT","FREM","FINT",@/
|
838 |
|
|
"MUL","MULI","MULU","MULUI","DIV","DIVI","DIVU","DIVUI",@/
|
839 |
|
|
"ADD","ADDI","ADDU","ADDUI","SUB","SUBI","SUBU","SUBUI",@/
|
840 |
|
|
"2ADDU","2ADDUI","4ADDU","4ADDUI","8ADDU","8ADDUI","16ADDU","16ADDUI",@/
|
841 |
|
|
"CMP","CMPI","CMPU","CMPUI","NEG","NEGI","NEGU","NEGUI",@/
|
842 |
|
|
"SL","SLI","SLU","SLUI","SR","SRI","SRU","SRUI",@/
|
843 |
|
|
"BN","BNB","BZ","BZB","BP","BPB","BOD","BODB",@/
|
844 |
|
|
"BNN","BNNB","BNZ","BNZB","BNP","BNPB","BEV","BEVB",@/
|
845 |
|
|
"PBN","PBNB","PBZ","PBZB","PBP","PBPB","PBOD","PBODB",@/
|
846 |
|
|
"PBNN","PBNNB","PBNZ","PBNZB","PBNP","PBNPB","PBEV","PBEVB",@/
|
847 |
|
|
"CSN","CSNI","CSZ","CSZI","CSP","CSPI","CSOD","CSODI",@/
|
848 |
|
|
"CSNN","CSNNI","CSNZ","CSNZI","CSNP","CSNPI","CSEV","CSEVI",@/
|
849 |
|
|
"ZSN","ZSNI","ZSZ","ZSZI","ZSP","ZSPI","ZSOD","ZSODI",@/
|
850 |
|
|
"ZSNN","ZSNNI","ZSNZ","ZSNZI","ZSNP","ZSNPI","ZSEV","ZSEVI",@/
|
851 |
|
|
"LDB","LDBI","LDBU","LDBUI","LDW","LDWI","LDWU","LDWUI",@/
|
852 |
|
|
"LDT","LDTI","LDTU","LDTUI","LDO","LDOI","LDOU","LDOUI",@/
|
853 |
|
|
"LDSF","LDSFI","LDHT","LDHTI","CSWAP","CSWAPI","LDUNC","LDUNCI",@/
|
854 |
|
|
"LDVTS","LDVTSI","PRELD","PRELDI","PREGO","PREGOI","GO","GOI",@/
|
855 |
|
|
"STB","STBI","STBU","STBUI","STW","STWI","STWU","STWUI",@/
|
856 |
|
|
"STT","STTI","STTU","STTUI","STO","STOI","STOU","STOUI",@/
|
857 |
|
|
"STSF","STSFI","STHT","STHTI","STCO","STCOI","STUNC","STUNCI",@/
|
858 |
|
|
"SYNCD","SYNCDI","PREST","PRESTI","SYNCID","SYNCIDI","PUSHGO","PUSHGOI",@/
|
859 |
|
|
"OR","ORI","ORN","ORNI","NOR","NORI","XOR","XORI",@/
|
860 |
|
|
"AND","ANDI","ANDN","ANDNI","NAND","NANDI","NXOR","NXORI",@/
|
861 |
|
|
"BDIF","BDIFI","WDIF","WDIFI","TDIF","TDIFI","ODIF","ODIFI",@/
|
862 |
|
|
"MUX","MUXI","SADD","SADDI","MOR","MORI","MXOR","MXORI",@/
|
863 |
|
|
"SETH","SETMH","SETML","SETL","INCH","INCMH","INCML","INCL",@/
|
864 |
|
|
"ORH","ORMH","ORML","ORL","ANDNH","ANDNMH","ANDNML","ANDNL",@/
|
865 |
|
|
"JMP","JMPB","PUSHJ","PUSHJB","GETA","GETAB","PUT","PUTI",@/
|
866 |
|
|
"POP","RESUME","SAVE","UNSAVE","SYNC","SWYM","GET","TRIP"};
|
867 |
|
|
|
868 |
|
|
@ And here is a (likewise boring) list of all the internal opcodes.
The smallest numbers, less than or equal to |max_pipe_op|, correspond
to operations for which arbitrary pipeline delays can be configured
with |MMIX_config|. The largest numbers, greater than |max_real_command|,
correspond to internally
generated operations that have no official OP code; for example,
there are internal operations to shift the $\gamma$ pointer in the
register stack, and to compute page table entries.
|
876 |
|
|
|
877 |
|
|
@=
|
878 |
|
|
#define max_pipe_op feps /* internal opcodes up to this one can be given pipeline delays by |MMIX_config| */
#define max_real_command trip /* internal opcodes beyond this one are generated internally */
|
880 |
|
|
|
881 |
|
|
typedef enum{@/
@!mul0, /* multiplication by zero */
@!mul1, /* multiplication by 1--8 bits */
@!mul2, /* multiplication by 9--16 bits */
@!mul3, /* multiplication by 17--24 bits */
@!mul4, /* multiplication by 25--32 bits */
@!mul5, /* multiplication by 33--40 bits */
@!mul6, /* multiplication by 41--48 bits */
@!mul7, /* multiplication by 49--56 bits */
@!mul8, /* multiplication by 57--64 bits */
@!div, /* \.{DIV[U][I]} */
@!sh, /* \.{S[L,R][U][I]} */
@!mux, /* \.{MUX[I]} */
@!sadd, /* \.{SADD[I]} */
@!mor, /* \.{M[X]OR[I]} */
@!fadd, /* \.{FADD}, \.{FSUB} */
@!fmul, /* \.{FMUL} */
@!fdiv, /* \.{FDIV} */
@!fsqrt, /* \.{FSQRT} */
@!fint, /* \.{FINT} */
@!fix, /* \.{FIX[U]} */
@!flot, /* \.{[S]FLOT[U][I]} */
@!feps, /* \.{FCMPE}, \.{FUNE}, \.{FEQLE} */
@!fcmp, /* \.{FCMP} */
@!funeq, /* \.{FUN}, \.{FEQL} */
@!fsub, /* \.{FSUB} */
@!frem, /* \.{FREM} */
@!mul, /* \.{MUL[I]} */
@!mulu, /* \.{MULU[I]} */
@!divu, /* \.{DIVU[I]} */
@!add, /* \.{ADD[I]} */
@!addu, /* \.{[2,4,8,16,]ADDU[I]}, \.{INC[M][H,L]} */
@!sub, /* \.{SUB[I]}, \.{NEG[I]} */
@!subu, /* \.{SUBU[I]}, \.{NEGU[I]} */
@!set, /* \.{SET[M][H,L]}, \.{GETA[B]} */
@!or, /* \.{OR[I]}, \.{OR[M][H,L]} */
@!orn, /* \.{ORN[I]} */
@!nor, /* \.{NOR[I]} */
@!and, /* \.{AND[I]} */
@!andn, /* \.{ANDN[I]}, \.{ANDN[M][H,L]} */
@!nand, /* \.{NAND[I]} */
@!xor, /* \.{XOR[I]} */
@!nxor, /* \.{NXOR[I]} */
@!shlu, /* \.{SLU[I]} */
@!shru, /* \.{SRU[I]} */
@!shl, /* \.{SL[I]} */
@!shr, /* \.{SR[I]} */
@!cmp, /* \.{CMP[I]} */
@!cmpu, /* \.{CMPU[I]} */
@!bdif, /* \.{BDIF[I]} */
@!wdif, /* \.{WDIF[I]} */
@!tdif, /* \.{TDIF[I]} */
@!odif, /* \.{ODIF[I]} */
@!zset, /* \.{ZS[N][N,Z,P][I]}, \.{ZSEV[I]}, \.{ZSOD[I]} */
@!cset, /* \.{CS[N][N,Z,P][I]}, \.{CSEV[I]}, \.{CSOD[I]} */
@!get, /* \.{GET} */
@!put, /* \.{PUT[I]} */
@!ld, /* \.{LD[B,W,T,O][U][I]}, \.{LDHT[I]}, \.{LDSF[I]} */
@!ldptp, /* load page table pointer */
@!ldpte, /* load page table entry */
@!ldunc, /* \.{LDUNC[I]} */
@!ldvts, /* \.{LDVTS[I]} */
@!preld, /* \.{PRELD[I]} */
@!prest, /* \.{PREST[I]} */
@!st, /* \.{STO[U][I]}, \.{STCO[I]}, \.{STUNC[I]} */
@!syncd, /* \.{SYNCD[I]} */
@!syncid, /* \.{SYNCID[I]} */
@!pst, /* \.{ST[B,W,T][U][I]}, \.{STHT[I]} */
@!stunc, /* \.{STUNC[I]}, in write buffer */
@!cswap, /* \.{CSWAP[I]} */
@!br, /* \.{B[N][N,Z,P][B]} */
@!pbr, /* \.{PB[N][N,Z,P][B]} */
@!pushj, /* \.{PUSHJ[B]} */
@!go, /* \.{GO[I]} */
@!prego, /* \.{PREGO[I]} */
@!pushgo, /* \.{PUSHGO[I]} */
@!pop, /* \.{POP} */
@!resume, /* \.{RESUME} */
@!save, /* \.{SAVE} */
@!unsave, /* \.{UNSAVE} */
@!sync, /* \.{SYNC} */
@!jmp, /* \.{JMP[B]} */
@!noop, /* \.{SWYM} */
@!trap, /* \.{TRAP} */
@!trip, /* \.{TRIP} */
@!incgamma, /* increase $\gamma$ pointer */
@!decgamma, /* decrease $\gamma$ pointer */
@!incrl, /* increase rL and $\beta$ */
@!sav, /* intermediate stage of \.{SAVE} */
@!unsav, /* intermediate stage of \.{UNSAVE} */
@!resum /* intermediate stage of \.{RESUME} */
}@! internal_opcode;
|
973 |
|
|
|
974 |
|
|
@ @=
|
975 |
|
|
/* printable names of the internal opcodes, in the same order as the |internal_opcode| enum
   (90 entries); |print_control_block| indexes this table by |c->i| */
char *internal_op_name[]={
"mul0","mul1","mul2","mul3","mul4","mul5","mul6","mul7","mul8",
"div","sh","mux","sadd","mor",
"fadd","fmul","fdiv","fsqrt","fint","fix","flot","feps",
"fcmp","funeq","fsub","frem",
"mul","mulu","divu",
"add","addu","sub","subu","set",
"or","orn","nor","and","andn","nand","xor","nxor",
"shlu","shru","shl","shr",
"cmp","cmpu","bdif","wdif","tdif","odif",
"zset","cset","get","put",
"ld","ldptp","ldpte","ldunc","ldvts","preld","prest","st",
"syncd","syncid","pst","stunc","cswap",
"br","pbr","pushj","go","prego","pushgo",
"pop","resume","save","unsave","sync","jmp","noop","trap","trip",
"incgamma","decgamma","incrl","sav","unsav","resum"};
|
1066 |
|
|
|
1067 |
|
|
@ We need a table to convert the external opcodes to
internal ones.
|
1069 |
|
|
|
1070 |
|
|
@=
|
1071 |
|
|
/* indexed by |mmix_opcode|, eight entries per row in the same layout as that enum */
internal_opcode internal_op[256]={@/
  trap,fcmp,funeq,funeq,fadd,fix,fsub,fix,@/
  flot,flot,flot,flot,flot,flot,flot,flot,@/
  fmul,feps,feps,feps,fdiv,fsqrt,frem,fint,@/
  mul,mul,mulu,mulu,div,div,divu,divu,@/
  add,add,addu,addu,sub,sub,subu,subu,@/
  addu,addu,addu,addu,addu,addu,addu,addu,@/
  cmp,cmp,cmpu,cmpu,sub,sub,subu,subu,@/
  shl,shl,shlu,shlu,shr,shr,shru,shru,@/
  br,br,br,br,br,br,br,br,@/
  br,br,br,br,br,br,br,br,@/
  pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@/
  pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@/
  cset,cset,cset,cset,cset,cset,cset,cset,@/
  cset,cset,cset,cset,cset,cset,cset,cset,@/
  zset,zset,zset,zset,zset,zset,zset,zset,@/
  zset,zset,zset,zset,zset,zset,zset,zset,@/
  ld,ld,ld,ld,ld,ld,ld,ld,@/
  ld,ld,ld,ld,ld,ld,ld,ld,@/
  ld,ld,ld,ld,cswap,cswap,ldunc,ldunc,@/
  ldvts,ldvts,preld,preld,prego,prego,go,go,@/
  pst,pst,pst,pst,pst,pst,pst,pst,@/
  pst,pst,pst,pst,st,st,st,st,@/
  pst,pst,pst,pst,st,st,st,st,@/
  syncd,syncd,prest,prest,syncid,syncid,pushgo,pushgo,@/
  or,or,orn,orn,nor,nor,xor,xor,@/
  and,and,andn,andn,nand,nand,nxor,nxor,@/
  bdif,bdif,wdif,wdif,tdif,tdif,odif,odif,@/
  mux,mux,sadd,sadd,mor,mor,mor,mor,@/
  set,set,set,set,addu,addu,addu,addu,@/
  or,or,or,or,andn,andn,andn,andn,@/
  jmp,jmp,pushj,pushj,set,set,put,put,@/
  pop,resume,save,unsave,sync,noop,get,trip};
|
1104 |
|
|
|
1105 |
|
|
@ While we're into boring lists, we might as well define all the
special register numbers, together with an inverse table for
use in diagnostic outputs. These codes have been designed so that
special registers 0--7 are unencumbered, 8--11 can't be \.{PUT} by anybody,
and 12--18 can't be \.{PUT} by the user. Pipeline delays might occur
when \.{GET} is applied to special registers 21--31 or when
\.{PUT} is applied to special registers 15--20. The \.{SAVE} and
\.{UNSAVE} commands store and restore special registers 0--6 and 23--27.
|
1113 |
|
|
|
1114 |
|
|
@
|
1115 |
|
|
#define rA 21 /* arithmetic status register */
#define rB 0  /* bootstrap register (trip) */
#define rC 8  /* cycle counter */
#define rD 1  /* dividend register */
#define rE 2  /* epsilon register */
#define rF 22 /* failure location register */
#define rG 19 /* global threshold register */
#define rH 3  /* himult register */
#define rI 12 /* interval counter */
#define rJ 4  /* return-jump register */
#define rK 15 /* interrupt mask register */
#define rL 20 /* local threshold register */
#define rM 5  /* multiplex mask register */
#define rN 9  /* serial number */
#define rO 10 /* register stack offset */
#define rP 23 /* prediction register */
#define rQ 16 /* interrupt request register */
#define rR 6  /* remainder register */
#define rS 11 /* register stack pointer */
#define rT 13 /* trap address register */
#define rU 17 /* usage counter */
#define rV 18 /* virtual translation register */
#define rW 24 /* where-interrupted register (trip) */
#define rX 25 /* execution register (trip) */
#define rY 26 /* Y operand (trip) */
#define rZ 27 /* Z operand (trip) */
#define rBB 7  /* bootstrap register (trap) */
#define rTT 14 /* dynamic trap address register */
#define rWW 28 /* where-interrupted register (trap) */
#define rXX 29 /* execution register (trap) */
#define rYY 30 /* Y operand (trap) */
#define rZZ 31 /* Z operand (trap) */
|
1147 |
|
|
|
1148 |
|
|
@ @=
|
1149 |
|
|
/* inverse of the special register numbers: entry |k| names the register whose code is |k| */
char *special_name[32]={"rB","rD","rE","rH","rJ","rM","rR","rBB",
 "rC","rN","rO","rS","rI","rT","rTT","rK","rQ","rU","rV","rG","rL",
 "rA","rF","rP","rW","rX","rY","rZ","rWW","rXX","rYY","rZZ"};
|
1152 |
|
|
|
1153 |
|
|
@ Here are the bit codes that affect trips and traps. The first eight
cases also apply to the upper half of~rQ; the next eight apply to~rA.
|
1155 |
|
|
|
1156 |
|
|
@d P_BIT (1<<0) /* instruction in privileged location */
@d S_BIT (1<<1) /* security violation */
@d B_BIT (1<<2) /* instruction breaks the rules */
@d K_BIT (1<<3) /* instruction for kernel only */
@d N_BIT (1<<4) /* virtual translation bypassed */
@d PX_BIT (1<<5) /* permission lacking to execute from page */
@d PW_BIT (1<<6) /* permission lacking to write on page */
@d PR_BIT (1<<7) /* permission lacking to read from page */
@d PROT_OFFSET 5 /* distance from |PR_BIT| to protection code position */
@d X_BIT (1<<8) /* floating inexact */
@d Z_BIT (1<<9) /* floating division by zero */
@d U_BIT (1<<10) /* floating underflow */
@d O_BIT (1<<11) /* floating overflow */
@d I_BIT (1<<12) /* floating invalid operation */
@d W_BIT (1<<13) /* float-to-fix overflow */
@d V_BIT (1<<14) /* integer overflow */
@d D_BIT (1<<15) /* integer divide check */
@d H_BIT (1<<16) /* trip handler bit */
@d F_BIT (1<<17) /* forced trap bit */
@d E_BIT (1<<18) /* external (dynamic) trap bit */
|
1176 |
|
|
|
1177 |
|
|
@=
|
1178 |
|
|
char bit_code_map[]="EFHDVWIOUZXrwxnkbsp"; /* character |j| is printed for bit |E_BIT>>j|, so the 19 letters run from |E_BIT| down to |P_BIT| */
|
1179 |
|
|
|
1180 |
|
|
@ @=
|
1181 |
|
|
static void print_bits @,@,@[ARGS((int))@]; /* prints the |bit_code_map| letter of each bit that is set */
|
1182 |
|
|
|
1183 |
|
|
@ @=
|
1184 |
|
|
static void print_bits(x)
|
1185 |
|
|
int x;
|
1186 |
|
|
{
|
1187 |
|
|
register int b,j;
|
1188 |
|
|
for (j=0,b=E_BIT;(x&(b+b-1))&&b;j++,b>>=1)
|
1189 |
|
|
if (x&b) printf("%c",bit_code_map[j]);
|
1190 |
|
|
}
|
1191 |
|
|
|
1192 |
|
|
@ The lower half of rQ holds external interrupts of highest priority.
Most of them are implementation-dependent, but a few are defined in general.
|
1194 |
|
|
|
1195 |
|
|
@
|
1196 |
|
|
#define POWER_FAILURE (1<<0) /* try to shut down calmly and quickly */
#define PARITY_ERROR (1<<1) /* try to save the file systems */
#define NONEXISTENT_MEMORY (1<<2) /* a memory address can't be used */
#define REBOOT_SIGNAL (1<<4) /* it's time to start over */
#define INTERVAL_TIMEOUT (1<<7) /* the timer register, rI, has reached zero */
|
1201 |
|
|
|
1202 |
|
|
@* Dynamic speculation.
Now that we understand some basic low-level structures,
we're ready to look at the larger picture.
|
1205 |
|
|
|
1206 |
|
|
This simulator is based on the idea of ``dynamic scheduling with register
renaming,'' as introduced in the 1960s by R.~M. Tomasulo [{\sl IBM Journal
@^Tomasulo, Robert Marco@>
of Research and Development\/ \bf11} (1967), 25--33]. Moreover, the dynamic
scheduling method is extended here to ``speculative execution,'' as
implemented in several processors of the 1990s and described in section~4.6 of
Hennessy and Patterson's {\sl Computer Architecture}, second edition (1995).
@^Hennessy, John LeRoy@>
@^Patterson, David Andrew@>
The essential idea is to keep track of the pipeline contents by recording all
dependencies between unfinished computations in a queue called the {\it
reorder buffer}. An entry in the reorder buffer might, for example, correspond
to an instruction that adds together two numbers whose values are still being
computed; those numbers have been allocated space in earlier positions of the
reorder buffer. The addition will take place as soon as both of its operands
are known, but the sum won't be written immediately into the destination
register. It will stay in the reorder buffer until reaching the {\it hot
seat\/} at the front of the queue. Finally, the addition leaves the
hot seat and is said to be {\it committed}.
|
1225 |
|
|
|
1226 |
|
|
Some instructions in the reorder buffer may in fact be executed only
on speculation, meaning that they won't really be called for unless a prior
branch instruction has the predicted outcome. Indeed, we can say that
all instructions not yet in the hot seat are being executed speculatively,
because an external interrupt might occur at any time and change the entire
course of computation. Organizing the pipeline as a reorder buffer allows us
to look ahead and keep busy computing values that have a good chance of being
needed later, instead of waiting for slow instructions or slow memory
references to be completed.
|
1235 |
|
|
|
1236 |
|
|
The reorder buffer is in fact a queue of \&{control} records, conceptually
forming part of a circle of such records inside the simulator, corresponding
to all instructions that have been dispatched or {\it issued\/} but not yet
committed, in strict program order.
|
1240 |
|
|
|
1241 |
|
|
The best way to get an understanding of speculative execution is perhaps to
imagine that the reorder buffer is large enough to hold hundreds of
instructions in various stages of execution, and to think of an implementation
of \MMIX\ that has dozens of functional units---more than would ever actually
@^thinking big@>
be built into a chip. Then one can readily visualize the kinds of control
structures and checks that must be made to ensure correct execution. Without
such a broad viewpoint, a programmer or hardware designer will be inclined to
think only of the simple cases and to devise algorithms that lack the proper
generality. Thus we have a somewhat paradoxical situation in which a difficult
general problem turns out to be easier to solve than its simpler special cases,
because it enforces clarity of thinking.
|
1253 |
|
|
|
1254 |
|
|
Instructions that have completed execution and have not yet been committed are
analogous to cars that have gone through our hypothetical repair shop and are
waiting for their owners to pick them up. However, all analogies break down,
and the world of automobiles does not have a natural counterpart for the
notion of speculative execution. That notion corresponds roughly to situations
in which people are led to believe that their cars need a new piece of
equipment, but they suddenly change their mind once they see the price tag,
and they insist on having the equipment removed even after it has been
partially or completely installed.
|
1263 |
|
|
|
1264 |
|
|
Speculatively executed instructions might make no sense: They might divide
by zero or refer to protected memory areas, etc. Such anomalies are not
considered catastrophic or even exceptional until the instruction reaches the
hot~seat.
|
1268 |
|
|
|
1269 |
|
|
The person who designs a computer with speculative execution is an optimist,
who has faith that the vast majority of the machine's predictions will come
true. The person who designs a reliable implementation of such a computer
is a pessimist, who understands that all predictions might come to naught.
The pessimist does, however, take pains to optimize the cases that do turn out
well.
|
1275 |
|
|
|
1276 |
|
|
@ Let's consider what happens to a single instruction, say
|
1277 |
|
|
\.{ADD} \.{\$1,\$2,\$3}, as it travels through the pipeline in a normal
|
1278 |
|
|
situation. The first time this instruction is encountered, it is placed into
|
1279 |
|
|
the I-cache (that is, the instruction cache), so that we won't have to access
|
1280 |
|
|
memory when we need to perform it again. We will assume for simplicity in this
|
1281 |
|
|
discussion that each I-cache access takes one clock cycle, although other
|
1282 |
|
|
possibilities are allowed by |MMIX_config|.
|
1283 |
|
|
|
1284 |
|
|
Suppose the simulated machine fetches the example \.{ADD} instruction
|
1285 |
|
|
at time 1000. Fetching is done by a coroutine whose |stage| number is~0.
|
1286 |
|
|
A cache block typically contains 8 or 16 instructions. The fetch unit
|
1287 |
|
|
of our machine is able to fetch up to |fetch_max| instructions on each clock
|
1288 |
|
|
cycle and place them in the fetch buffer, provided that there is room in the
|
1289 |
|
|
buffer and that all the instructions belong to the same cache block.
|
1290 |
|
|
|
1291 |
|
|
The dispatch unit of our simulator is able to issue up to |dispatch_max|
|
1292 |
|
|
instructions on each clock cycle and move them from the fetch buffer to the
|
1293 |
|
|
reorder buffer, provided that functional units are available for those
|
1294 |
|
|
instructions and there is room in the reorder buffer. A functional unit that
|
1295 |
|
|
handles \.{ADD} is usually called an ALU (arithmetic logic unit), and our
|
1296 |
|
|
simulated machine might have several of them. If they aren't all stalled
|
1297 |
|
|
in stage~1 of their pipelines, and if the reorder buffer isn't full, and if
|
1298 |
|
|
the machine isn't in the process of deissuing instructions that were
|
1299 |
|
|
mispredicted, and if
|
1300 |
|
|
fewer than |dispatch_max| instructions are ahead of the \.{ADD} in the fetch
|
1301 |
|
|
buffer, and if all such prior instructions can be issued without using up all
|
1302 |
|
|
the free ALUs, our \.{ADD} instruction will be issued at time 1001.
|
1303 |
|
|
(In fact, all of these conditions are usually true.)
|
1304 |
|
|
|
1305 |
|
|
We assume that $\rm L>3$, so that \$1, \$2, and~\$3 are local registers.
|
1306 |
|
|
For simplicity we'll assume in fact that the register stack is empty, so that
|
1307 |
|
|
the \.{ADD} instruction is supposed to set $\rm l[1]\gets l[2]+l[3]$. The
|
1308 |
|
|
operands l[2] and~l[3] might not be known at time 1001; they are \&{spec}
|
1309 |
|
|
values, which might point to \&{specnode} entries in the reorder buffer for
|
1310 |
|
|
previous instructions whose destinations are l[2] and~l[3].
|
1311 |
|
|
The dispatcher fills the next available control block of the reorder buffer
|
1312 |
|
|
with information for the \.{ADD}, containing appropriate \&{spec} values
|
1313 |
|
|
corresponding to l[2] and~l[3] in its |y| and~|z| fields. The |x|~field of
|
1314 |
|
|
this control block will be inserted into a doubly linked list of \&{specnode}
|
1315 |
|
|
records, corresponding to l[1] and to all instructions in the reorder buffer
|
1316 |
|
|
that have l[1] as a destination. The boolean value |x.known| will be set to
|
1317 |
|
|
|false|, meaning that this speculative value still needs to be
|
1318 |
|
|
computed. Subsequent instructions that need l[1] as a source will point to
|
1319 |
|
|
|x|, if they are issued before the sum |x.o| has been computed. Double
|
1320 |
|
|
linking is used in the \&{specnode} list because the \.{ADD} instruction might
|
1321 |
|
|
be cancelled before it is finally committed; thus deletions might occur
|
1322 |
|
|
at either end of the list for~l[1].
|
1323 |
|
|
|
1324 |
|
|
At time 1002, the ALU handling the \.{ADD} will stall if its inputs |y|
|
1325 |
|
|
and~|z| are not both known (namely if |y.p!=NULL| or |z.p!=NULL|).
|
1326 |
|
|
In fact, it will also stall if its third input rA is not known;
|
1327 |
|
|
the current speculative value of rA, except for its event bits,
|
1328 |
|
|
is represented in the |ra|~field of the control block, and we must
|
1329 |
|
|
have |ra.p==NULL|. In such a case the ALU will look to see if the
|
1330 |
|
|
\&{spec} values pointed to by |y.p| and/or |z.p| and/or |ra.p| become
|
1331 |
|
|
defined on this clock cycle, and it will update its own input values
|
1332 |
|
|
accordingly.
|
1333 |
|
|
|
1334 |
|
|
But let's assume that |y|, |z|, and |ra| are already known at time 1002.
|
1335 |
|
|
Then |x.o| will be set to |y.o+z.o| and |x.known| will become~|true|.
|
1336 |
|
|
This will make the result destined for~l[1] available to be used in other
|
1337 |
|
|
commands at time~1003.
|
1338 |
|
|
|
1339 |
|
|
If no overflow occurs when adding |y.o| to |z.o|, the |interrupt| and
|
1340 |
|
|
|arith_exc| fields of the control block for \.{ADD} are set to zero. But when
|
1341 |
|
|
overflow does occur (shudder), there are two cases, based on the V-enable bit
|
1342 |
|
|
of rA, which is found in field |b.o| of the control block. If this bit is~0,
|
1343 |
|
|
the V-bit of the |arith_exc| field in the control block is set to~1; the
|
1344 |
|
|
|arith_exc| field will be ored into~rA when the \.{ADD} instruction is
|
1345 |
|
|
eventually committed. But if the V-enable bit is~1, the trip handler should
|
1346 |
|
|
be called, interrupting the normal sequence. In such a case, the |interrupt|
|
1347 |
|
|
field of the control block is set to specify a trip, and the fetcher and
|
1348 |
|
|
dispatcher are told to forget what they have been doing; all instructions
|
1349 |
|
|
following the \.{ADD} in the reorder buffer must now be deissued. The virtual starting
|
1350 |
|
|
address of the overflow trip handler, namely location~32, is hastily passed to
|
1351 |
|
|
the fetch routine, and instructions will be fetched from that location
|
1352 |
|
|
as soon as possible. (Of course the overflow and the trip handler are
|
1353 |
|
|
still speculative until the \.{ADD} instruction is committed. Other exceptional
|
1354 |
|
|
conditions might cause the \.{ADD} itself to be terminated before it
|
1355 |
|
|
gets to the hot seat. But the pipeline keeps charging ahead, always trying to
|
1356 |
|
|
guess the most probable outcome.)
|
1357 |
|
|
|
1358 |
|
|
The commission unit of this simulator is able to commit and/or deissue up to
|
1359 |
|
|
|commit_max| instructions on each clock cycle. With luck, fewer than
|
1360 |
|
|
|commit_max| instructions will be ahead of our \.{ADD} instruction at
|
1361 |
|
|
time~1003, and they will all be completed normally. Then l[1]~can be set
|
1362 |
|
|
to |x.o|, and the event bits of~rA can be updated from |arith_exc|,
|
1363 |
|
|
and the \.{ADD} command can pass through the hot seat and out of the
|
1364 |
|
|
reorder buffer.
|
1365 |
|
|
|
1366 |
|
|
@=
|
1367 |
|
|
Extern int fetch_max, dispatch_max, peekahead, commit_max;
|
1368 |
|
|
/* limits on instructions that can be handled per clock cycle */
|
1369 |
|
|
|
1370 |
|
|
@ The instruction currently occupying the hot seat is the only
|
1371 |
|
|
issued-but-not-yet-committed instruction that is guaranteed to be truly
|
1372 |
|
|
essential to the machine's computation. All other instructions in the reorder
|
1373 |
|
|
buffer are being executed on speculation; if they prove to be needed, well and
|
1374 |
|
|
good, but we might want to jettison them all if, say, an external interrupt
|
1375 |
|
|
occurs.
|
1376 |
|
|
|
1377 |
|
|
Thus all instructions that change the global state in complicated ways---like
|
1378 |
|
|
\.{LDVTS}, which changes the virtual address translation caches---are
|
1379 |
|
|
performed only when they reach the hot seat. Fortunately the vast majority
|
1380 |
|
|
of instructions are sufficiently simple that we can deal with them more
|
1381 |
|
|
efficiently while other computations are taking place.
|
1382 |
|
|
|
1383 |
|
|
In this implementation the reorder buffer is simply housed in an array of
|
1384 |
|
|
control records. The first array element is |reorder_bot|, and the last is
|
1385 |
|
|
|reorder_top|. Variable |hot| points to the control block in the hot seat, and
|
1386 |
|
|
|hot-1| to its predecessor, etc. Variable |cool| points to the next control
|
1387 |
|
|
block that will be filled in the reorder buffer. If |hot==cool| the reorder
|
1388 |
|
|
buffer is empty; otherwise it contains the control records |hot|, |hot-1|,
|
1389 |
|
|
\dots,~|cool+1|, except of course that we wrap around from |reorder_bot| to
|
1390 |
|
|
|reorder_top| when moving down in the buffer.
|
1391 |
|
|
|
1392 |
|
|
@=
|
1393 |
|
|
Extern control *reorder_bot, *reorder_top; /* least and greatest
|
1394 |
|
|
entries in the ring containing the reorder buffer */
|
1395 |
|
|
Extern control *hot, *cool; /* front and rear of the reorder buffer */
|
1396 |
|
|
Extern control *old_hot; /* value of |hot| at beginning of cycle */
|
1397 |
|
|
Extern int deissues; /* the number of instructions that need to be deissued */
|
1398 |
|
|
|
1399 |
|
|
@ @=
|
1400 |
|
|
hot=cool=reorder_top;
|
1401 |
|
|
deissues=0;
|
1402 |
|
|
|
1403 |
|
|
@ @=
|
1404 |
|
|
static void print_reorder_buffer @,@,@[ARGS((void))@];
|
1405 |
|
|
|
1406 |
|
|
@ @=
|
1407 |
|
|
static void print_reorder_buffer()
{
  register control *q; /* runs from |hot| toward |cool| */
  printf("Reorder buffer");
  if (hot!=cool) {
    if (deissues) printf(" (%d to be deissued)",deissues);
    if (doing_interrupt) printf(" (interrupt state %d)",doing_interrupt);
    printf(":\n");
    q=hot;
    while (q!=cool) {
      print_control_block(q);
      if (q->owner) { /* show which coroutine is working on this block */
        printf(" ");
        print_coroutine_id(q->owner);
      }
      printf("\n");
      if (q==reorder_bot) q=reorder_top; /* wrap around the ring */
      else q--;
    }
  }
  else printf(" (empty)\n");
  printf(" %d available rename register%s, %d memory slot%s\n",
      rename_regs, (rename_regs==1? "": "s"),
      mem_slots, (mem_slots==1? "": "s"));
}
|
1427 |
|
|
|
1428 |
|
|
@ Here is an overview of what happens on each clock cycle.
|
1429 |
|
|
|
1430 |
|
|
@=
|
1431 |
|
|
{
  @;
  old_tail=tail; /* the fetch buffer contents as of the start of this cycle */
  old_hot=hot; /* the hot seat position as of the start of this cycle */
  dispatch_count=0; /* nothing has been dispatched yet on this cycle */
  suppress_dispatch=(deissues || dispatch_lock);
    /* no dispatching while deissues are pending or the dispatcher is locked */
  if (doing_interrupt) @@;
  else @;
  @;
  if (!suppress_dispatch) @;
  ticks=incr(ticks,1); /* the clock advances */
  dispatch_stat[dispatch_count]++; /* tally how many were dispatched */
}
|
1444 |
|
|
|
1445 |
|
|
@ @=
|
1446 |
|
|
int dispatch_count; /* how many dispatched on this cycle */
|
1447 |
|
|
bool suppress_dispatch; /* should dispatching be bypassed? */
|
1448 |
|
|
int doing_interrupt; /* how many cycles of interrupt preparations remain */
|
1449 |
|
|
lockvar dispatch_lock; /* lock to prevent instruction issues */
|
1450 |
|
|
|
1451 |
|
|
@ @=
|
1452 |
|
|
Extern int *dispatch_stat;
|
1453 |
|
|
/* how often did we dispatch 0, 1, ... instructions? */
|
1454 |
|
|
Extern bool security_disabled; /* omit security checks for testing purposes? */
|
1455 |
|
|
|
1456 |
|
|
@ @=
|
1457 |
|
|
{
  /* deissues come first; each one uses up part of the |commit_max| budget */
  for (m=commit_max;m>0 && deissues>0; m--)
    @;
  /* whatever remains of the budget is available for commits */
  for (;m>0;m--) {
    if (hot==cool) break; /* the reorder buffer holds nothing */
    if (!security_disabled) @;
    if (hot->owner) break; /* the hot seat instruction has not finished */
    @;
    i=hot->i; /* remember the internal opcode before |hot| moves */
    hot=(hot==reorder_bot? reorder_top: hot-1); /* advance, wrapping around the ring */
    if (i==resum) break; /* let the resumed instruction see the new rK */
  }
}
|
1471 |
|
|
|
1472 |
|
|
@* The dispatch stage. It would be nice to present the parts of this simulator
|
1473 |
|
|
by dealing with the fetching, dispatching, executing, and committing
|
1474 |
|
|
stages in that order. After all, instructions are first fetched,
|
1475 |
|
|
then dispatched, then executed, and finally committed.
|
1476 |
|
|
However, the fetch stage depends heavily on difficult questions of
|
1477 |
|
|
memory management that are best deferred until we have looked at
|
1478 |
|
|
the simpler parts of simulation. Therefore we will take our initial
|
1479 |
|
|
plunge into the details of this program by looking first at the dispatch phase,
|
1480 |
|
|
assuming that instructions have somehow appeared magically in the fetch buffer.
|
1481 |
|
|
|
1482 |
|
|
The fetch buffer, like the circular priority queue of all coroutines
|
1483 |
|
|
and the circular queue used for the reorder buffer, lives in an
|
1484 |
|
|
array that is best regarded as a ring of elements. The elements
|
1485 |
|
|
are structures of type \&{fetch}, which have five fields:
|
1486 |
|
|
A 32-bit |inst|, which is an \MMIX\ instruction; a 64-bit |loc|,
|
1487 |
|
|
which is the virtual address of that instruction; an |interrupt| field,
|
1488 |
|
|
which is nonzero if, for example, the protection bits in the relevant page
|
1489 |
|
|
table entry for this address do not permit execution access; a boolean
|
1490 |
|
|
|noted| field, which becomes |true| after the dispatch unit has peeked
|
1491 |
|
|
at the instruction to see whether it is a jump or probable branch;
|
1492 |
|
|
and a |hist| field, which records the recent branch history.
|
1493 |
|
|
(The least significant bits of~|hist| correspond to the most recent branches.)
|
1494 |
|
|
|
1495 |
|
|
@=
|
1496 |
|
|
typedef struct {
  octa loc; /* where the instruction lives (virtual address) */
  tetra inst; /* the 32-bit instruction word */
  unsigned int interrupt; /* bit codes that could cause an interruption */
  bool noted; /* |true| once the dispatcher has peeked at this entry */
  unsigned int hist; /* the |peek_hist| in effect when we peeked */
} fetch;
|
1503 |
|
|
|
1504 |
|
|
@ The oldest and youngest entries in the fetch buffer are pointed
|
1505 |
|
|
to by |head| and |tail|, just as the oldest and youngest entries in the
|
1506 |
|
|
reorder buffer are called |hot| and |cool|. The fetch coroutine will
|
1507 |
|
|
be adding entries at the |tail| position, which starts at |old_tail|
|
1508 |
|
|
when a cycle begins, in parallel with the actions simulated by
|
1509 |
|
|
the dispatcher. Therefore the dispatcher is allowed to look only at
|
1510 |
|
|
instructions in |head|, |head-1|, \dots,~|old_tail+1|, although a few
|
1511 |
|
|
more recently fetched instructions will usually be present in the fetch
|
1512 |
|
|
buffer by the time this part of the program is executed.
|
1513 |
|
|
|
1514 |
|
|
@=
|
1515 |
|
|
Extern fetch *fetch_bot, *fetch_top; /* least and greatest
|
1516 |
|
|
entries in the ring containing the fetch buffer */
|
1517 |
|
|
Extern fetch *head, *tail; /* front and rear of the fetch buffer */
|
1518 |
|
|
|
1519 |
|
|
@ @=
|
1520 |
|
|
fetch *old_tail; /* rear of the fetch buffer available on the current cycle */
|
1521 |
|
|
|
1522 |
|
|
@ @d UNKNOWN_SPEC ((specnode*)1)
|
1523 |
|
|
|
1524 |
|
|
@=
|
1525 |
|
|
head=tail=fetch_top;
|
1526 |
|
|
inst_ptr.p=UNKNOWN_SPEC;
|
1527 |
|
|
|
1528 |
|
|
@ @=
|
1529 |
|
|
static void print_fetch_buffer @,@,@[ARGS((void))@];
|
1530 |
|
|
|
1531 |
|
|
@ @=
|
1532 |
|
|
static void print_fetch_buffer()
{
  register fetch *q; /* runs from |head| toward |tail| */
  printf("Fetch buffer");
  if (head!=tail) {
    if (resuming) printf(" (resumption state %d)",resuming);
    printf(":\n");
    q=head;
    while (q!=tail) {
      print_octa(q->loc);
      printf(": %08x(%s)",q->inst,opcode_name[q->inst>>24]);
      if (q->interrupt) print_bits(q->interrupt);
      if (q->noted) printf("*"); /* the dispatcher has already peeked here */
      printf("\n");
      if (q==fetch_bot) q=fetch_top; /* wrap around the ring */
      else q--;
    }
  }
  else printf(" (empty)\n");
  printf("Instruction pointer is ");
  if (inst_ptr.p!=NULL) { /* the next fetch address is not yet known */
    printf("waiting for ");
    if (inst_ptr.p==UNKNOWN_SPEC) printf("dispatch");
    else if (inst_ptr.p->addr.h!=(tetra)-1)
      print_specnode_id(inst_ptr.p->addr);
    else print_coroutine_id(((control*)inst_ptr.p->up)->owner);
  }
  else print_octa(inst_ptr.o);
  printf("\n");
}
|
1558 |
|
|
|
1559 |
|
|
@ The best way to understand the dispatching process is once again
|
1560 |
|
|
to ``think big,'' by imagining a huge fetch buffer and the
|
1561 |
|
|
@^thinking big@>
|
1562 |
|
|
potential ability to issue dozens of instructions per cycle, although
|
1563 |
|
|
the actual numbers are typically quite small.
|
1564 |
|
|
|
1565 |
|
|
If the fetch buffer is not empty after |dispatch_max| instructions have
|
1566 |
|
|
been dispatched, the dispatcher also looks at up to |peekahead| further
|
1567 |
|
|
instructions to see if they are jumps or other commands that change the
|
1568 |
|
|
flow of control. Much of this action would happen in parallel on a
|
1569 |
|
|
real machine, but our simulator works sequentially.
|
1570 |
|
|
|
1571 |
|
|
In the following program, |true_head| records the head of the fetch buffer as
|
1572 |
|
|
instructions are actually dispatched, while |head| refers to the position
|
1573 |
|
|
currently being examined (possibly peeking into the future).
|
1574 |
|
|
|
1575 |
|
|
If the fetch buffer is empty at the beginning of the current clock
|
1576 |
|
|
cycle, a ``dispatch bypass'' allows the dispatcher to issue the
|
1577 |
|
|
first instruction that enters the fetch buffer on this cycle. Otherwise
|
1578 |
|
|
the dispatcher is restricted to previously fetched instructions.
|
1579 |
|
|
|
1580 |
|
|
@s func int
|
1581 |
|
|
|
1582 |
|
|
@=
|
1583 |
|
|
{@+register fetch *true_head, *new_head;
  true_head=head; /* |head| roams ahead while peeking; |true_head| marks
                     what has really been dispatched */
  if (head==old_tail && head!=tail) /* dispatch bypass: the buffer was empty
                     when the cycle began, but an instruction has arrived */
    old_tail=(head==fetch_bot? fetch_top: head-1);
  peek_hist=cool_hist;
  /* up to |dispatch_max| instructions may be issued, and up to |peekahead|
     more may be examined for changes in the flow of control */
  /* NOTE(review): the loop bound and the section name were lost in
     extraction and have been reconstructed; confirm against the master file */
  for (j=0;j<dispatch_max+peekahead;j++)
    @<Look at the |head| instruction, and try
               to dispatch it if |j<dispatch_max|@>;
  head=true_head; /* forget the peeking; resume at the true head next time */
}
|
1593 |
|
|
|
1594 |
|
|
@ @=
|
1595 |
|
|
{
  register mmix_opcode op; /* leading byte of the instruction */
  register int yz,f; /* the YZ field, and the flags for |op| */
  register bool freeze_dispatch=false;
  register func *u=NULL; /* the functional unit that will be assigned */
  if (head==old_tail) break; /* nothing more is visible in the fetch buffer */
  new_head=(head==fetch_bot? fetch_top: head-1); /* ring successor of |head| */
  op=head->inst>>24;
  yz=head->inst&0xffff;
  @;
  @;
  if (f&rel_addr_bit) @;
  if (head->noted) peek_hist=head->hist; /* already peeked at: reuse its history */
  else @;
  if (j>=dispatch_max || dispatch_lock || nullifying) {
    head=new_head; /* issuing is impossible now, but peeking may continue */
    continue;
  }
  new_cool=(cool==reorder_bot? reorder_top: cool-1); /* ring successor of |cool| */
  @
  otherwise |goto stall|@>;
  @;
  @;
  if ((op&0xe0)==0x40) @;
  @;
  cool=new_cool; /* the instruction has been issued, so the cool state advances */
  cool_O=new_O;
  cool_S=new_S;
  cool_hist=peek_hist;
  continue;
stall: @
  and |break|@>;
}
|
1623 |
|
|
|
1624 |
|
|
@ An instruction can be dispatched only if a functional unit
|
1625 |
|
|
is available to handle it. A functional unit consists of a 256-bit
|
1626 |
|
|
vector that specifies a subset of \MMIX's opcodes, and an array
|
1627 |
|
|
of coroutines for the pipeline stages. There are $k$ coroutines in the
|
1628 |
|
|
array, where $k$ is the maximum number of stages needed by any of the opcodes
|
1629 |
|
|
supported.
|
1630 |
|
|
|
1631 |
|
|
@=
|
1632 |
|
|
typedef struct func_struct{
  char name[16]; /* the name by which this unit is known */
  tetra ops[8]; /* 256-bit big-endian bitmap of the opcodes it handles */
  int k; /* how many pipeline stages it has */
  coroutine *co; /* the first of its $k$ consecutive stage coroutines */
} @!func;
|
1638 |
|
|
|
1639 |
|
|
@ @=
|
1640 |
|
|
Extern func *funit; /* pointer to array of functional units */
|
1641 |
|
|
Extern int funit_count; /* the number of functional units */
|
1642 |
|
|
|
1643 |
|
|
@ It is convenient to have
|
1644 |
|
|
a 256-bit vector of all the supported opcodes, because we need to
|
1645 |
|
|
shut off a lot of special actions when an opcode is not supported.
|
1646 |
|
|
|
1647 |
|
|
@=
|
1648 |
|
|
control *new_cool; /* the reorder position following |cool| */
|
1649 |
|
|
int resuming; /* set nonzero if resuming an interrupted instruction */
|
1650 |
|
|
tetra support[8]; /* big-endian bitmap for all opcodes supported */
|
1651 |
|
|
|
1652 |
|
|
@ @=
|
1653 |
|
|
{
  register int n; /* index of the functional unit being merged in */
  /* entries |funit[0]| through |funit[funit_count]| are all included */
  for (n=0;n<=funit_count;n++)
    for (i=0;i<8;i++)
      support[i]=support[i]|funit[n].ops[i];
}
|
1657 |
|
|
|
1658 |
|
|
@ @d sign_bit ((unsigned)0x80000000)
|
1659 |
|
|
|
1660 |
|
|
@=
|
1661 |
|
|
if (support[op>>5]&(sign_bit>>(op&31))) { /* some unit handles this opcode */
  f=flags[op];
  i=internal_op[op];
}
else { /* oops, no functional unit supports it: treat it like \.{TRAP} */
  f=flags[TRAP];
  i=trap;
}
if (i==trip && (head->loc.h&sign_bit)) { /* a trip at a negative address does nothing */
  f=0;
  i=noop;
}
|
1666 |
|
|
|
1667 |
|
|
@ @=
|
1668 |
|
|
if (cool->interim) {
  cool->usage=false;
  if (cool->op==SAVE) @@;
  else if (cool->op==UNSAVE) @@;
  else if (cool->i==preld || cool->i==prest)
    @@;
  else if (cool->i==prego) @@;
}
else if (cool->i<=max_real_command) {
  if (((flags[cool->op]&ctl_change_bit) || cool->i==pbr) &&
      inst_ptr.p==NULL && (inst_ptr.o.h&sign_bit) &&
      !(cool->loc.h&sign_bit) && cool->i!=trap)
    cool->interrupt|=P_BIT; /* control passes from a nonnegative address
                               to a negative one */
  true_head=head=new_head; /* the instruction now leaves the fetch buffer */
  resuming=0;
}
if (freeze_dispatch) set_lock(u->co,dispatch_lock);
cool->owner=u->co; /* tie the control block and its coroutine together */
u->co->ctl=cool;
startup(u->co,1); /* schedule execution of the newly issued instruction */
if (verbose&issue_bit) {
  printf("Issuing ");
  print_control_block(cool);
  printf(" ");
  print_coroutine_id(u->co);
  printf("\n");
}
dispatch_count++;
|
1692 |
|
|
|
1693 |
|
|
@ We assign the first functional unit that supports |op| and is
|
1694 |
|
|
totally unoccupied, if possible; otherwise we assign the first
|
1695 |
|
|
functional unit that supports |op| and has stage~1 unoccupied.
|
1696 |
|
|
|
1697 |
|
|
@=
|
1698 |
|
|
{@+register int t=op>>5; /* which word of the opcode bitmap */
  register unsigned int b=sign_bit>>(op&31); /* which bit of that word;
      kept unsigned so that the |sign_bit| case is not forced through |int| */
  if (cool->i==trap && op!=TRAP) { /* opcode needs to be emulated */
    u=funit+funit_count; /* this unit supports just \.{TRIP} and \.{TRAP} */
    goto unit_found;
  }
  /* first choice: a supporting unit none of whose stages is occupied */
  for (u=funit;u<=funit+funit_count;u++) if (u->ops[t]&b) {
    for (i=0;i<u->k;i++) if (u->co[i].next) goto unit_busy;
    goto unit_found;
  unit_busy: ;
  }
  /* second choice: a supporting unit whose first stage is unoccupied */
  /* NOTE(review): this bound was lost in extraction and is reconstructed
     as a strict inequality; confirm against the master file */
  for (u=funit;u<funit+funit_count;u++)
    if ((u->ops[t]&b) && (u->co->next==NULL)) goto unit_found;
  goto stall; /* all units for this |op| are busy */
}
unit_found:
|
1713 |
|
|
|
1714 |
|
|
@ The |flags| table records special properties of each operation code
|
1715 |
|
|
in binary notation: \Hex{1}~means Z~is an immediate value, \Hex{2}~means rZ is
|
1716 |
|
|
a source operand, \Hex{4}~means Y~is an immediate value, \Hex{8}~means rY is a
|
1717 |
|
|
source operand, \Hex{10}~means rX is a source operand, \Hex{20}~means
|
1718 |
|
|
rX is a destination, \Hex{40}~means YZ is part of a relative address,
|
1719 |
|
|
\Hex{80}~means the control changes at this point.
|
1720 |
|
|
|
1721 |
|
|
@d X_is_dest_bit 0x20
|
1722 |
|
|
@d rel_addr_bit 0x40
|
1723 |
|
|
@d ctl_change_bit 0x80
|
1724 |
|
|
|
1725 |
|
|
@=
|
1726 |
|
|
unsigned char flags[256]={
|
1727 |
|
|
0x8a, 0x2a, 0x2a, 0x2a, 0x2a, 0x26, 0x2a, 0x26, /* \.{TRAP}, \dots\ */
|
1728 |
|
|
0x26, 0x25, 0x26, 0x25, 0x26, 0x25, 0x26, 0x25, /* \.{FLOT}, \dots\ */
|
1729 |
|
|
0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x26, 0x2a, 0x26, /* \.{FMUL}, \dots\ */
|
1730 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{MUL}, \dots\ */
|
1731 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ADD}, \dots\ */
|
1732 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{2ADDU}, \dots\ */
|
1733 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x26, 0x25, 0x26, 0x25, /* \.{CMP}, \dots\ */
|
1734 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{SL}, \dots\ */
|
1735 |
|
|
0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{BN}, \dots\ */
|
1736 |
|
|
0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{BNN}, \dots\ */
|
1737 |
|
|
0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{PBN}, \dots\ */
|
1738 |
|
|
0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, 0x50, /* \.{PBNN}, \dots\ */
|
1739 |
|
|
0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, /* \.{CSN}, \dots\ */
|
1740 |
|
|
0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, 0x3a, 0x39, /* \.{CSNN}, \dots\ */
|
1741 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ZSN}, \dots\ */
|
1742 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{ZSNN}, \dots\ */
|
1743 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{LDB}, \dots\ */
|
1744 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{LDT}, \dots\ */
|
1745 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x1a, 0x19, 0x2a, 0x29, /* \.{LDSF}, \dots\ */
|
1746 |
|
|
0x2a, 0x29, 0x0a, 0x09, 0x0a, 0x09, 0xaa, 0xa9, /* \.{LDVTS}, \dots\ */
|
1747 |
|
|
0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, /* \.{STB}, \dots\ */
|
1748 |
|
|
0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, 0x1a, 0x19, /* \.{STT}, \dots\ */
|
1749 |
|
|
0x1a, 0x19, 0x1a, 0x19, 0x0a, 0x09, 0x1a, 0x19, /* \.{STSF}, \dots\ */
|
1750 |
|
|
0x0a, 0x09, 0x0a, 0x09, 0x0a, 0x09, 0xaa, 0xa9, /* \.{SYNCD}, \dots\ */
|
1751 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{OR}, \dots\ */
|
1752 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{AND}, \dots\ */
|
1753 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{BDIF}, \dots\ */
|
1754 |
|
|
0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, 0x2a, 0x29, /* \.{MUX}, \dots\ */
|
1755 |
|
|
0x20, 0x20, 0x20, 0x20, 0x30, 0x30, 0x30, 0x30, /* \.{SETH}, \dots\ */
|
1756 |
|
|
0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* \.{ORH}, \dots\ */
|
1757 |
|
|
0xc0, 0xc0, 0xe0, 0xe0, 0x60, 0x60, 0x02, 0x01, /* \.{JMP}, \dots\ */
|
1758 |
|
|
0x80, 0x80, 0x00, 0x02, 0x01, 0x00, 0x20, 0x8a}; /* \.{POP}, \dots\ */
|
1759 |
|
|
|
1760 |
|
|
@ @=
|
1761 |
|
|
{
  if (i==jmp) yz=head->inst&0xffffff; /* \.{JMP} has a 24-bit offset field */
  if (op&1) yz-=(i==jmp? 0x1000000: 0x10000); /* odd opcodes go backward */
  cool->y.o=incr(head->loc,4), cool->y.p=NULL; /* address of the following instruction */
  cool->z.o=incr(head->loc,yz*4), cool->z.p=NULL; /* the target address;
      we multiply instead of shifting, because |yz| can be negative here
      and a left shift of a negative |int| is undefined in \CEE/ */
}
|
1767 |
|
|
|
1768 |
|
|
@ The location of the next instruction to be fetched is in a \&{spec} variable
|
1769 |
|
|
called |inst_ptr|. A slightly tricky optimization of the \.{POP} instruction
|
1770 |
|
|
is made in the common case that the speculative value of~rJ is known.
|
1771 |
|
|
|
1772 |
|
|
@=
|
1773 |
|
|
{@+register int predicted=0;
  if ((op&0xe0)==0x40) @;
  head->noted=true; /* this instruction has now been peeked at */
  head->hist=peek_hist;
  if (predicted||(f&ctl_change_bit) || (i==syncid&&!(cool->loc.h&sign_bit))) {
    old_tail=tail=new_head; /* discard all remaining fetches */
    @;
    switch (i) {
   case jmp: case br: case pbr: case pushj: inst_ptr=cool->z;@+ break;
   /* NOTE(review): the tail of the following condition was lost in
      extraction; it is reconstructed as the negation of the test used
      when an instruction cannot be dispatched. Confirm against the master file */
   case pop:@+if (g[rJ].up->known &&
                  j<dispatch_max && !dispatch_lock && !nullifying) {
       inst_ptr.o=incr(g[rJ].up->o,yz<<2), inst_ptr.p=NULL;@+break;
     } /* otherwise fall through, will wait on |cool->go| */
   case go: case pushgo: case trap: case resume: case syncid:
       inst_ptr.p=UNKNOWN_SPEC;@+ break; /* the fetcher must wait */
   case trip: inst_ptr=zero_spec;@+ break;
    }
  }
}
|
1792 |
|
|
|
1793 |
|
|
@ At any given time the simulated machine is in two main states, the
|
1794 |
|
|
``hot state'' corresponding to instructions that have been committed and the
|
1795 |
|
|
``cool state'' corresponding to all the speculative changes currently
|
1796 |
|
|
being considered. The dispatcher works with cool instructions and puts them
|
1797 |
|
|
into the reorder buffer, where they gradually get warmer and warmer.
|
1798 |
|
|
Intermediate instructions, between |hot| and |cool|, have intermediate
|
1799 |
|
|
temperatures.
|
1800 |
|
|
|
1801 |
|
|
A machine register like l[101] or g[250] is represented by a specnode whose
|
1802 |
|
|
|o|~field is the current hot value of the register. If the |up| and |down|
|
1803 |
|
|
fields of this specnode point to the node itself,
|
1804 |
|
|
the hot and cool values of the register are
|
1805 |
|
|
identical. Otherwise |up| and |down| are pointers to the coolest and hottest
|
1806 |
|
|
ends of a doubly linked list of specnodes, representing intermediate
|
1807 |
|
|
speculative values (sometimes called ``rename registers'').
|
1808 |
|
|
@^rename registers@>
|
1809 |
|
|
The rename registers are implemented as the |x| or~|a| specnodes inside control
|
1810 |
|
|
blocks, for speculative instructions that use this register as a
|
1811 |
|
|
destination. Speculative instructions that use the register as a
|
1812 |
|
|
source operand point to the next-hottest specnode on the list, until
|
1813 |
|
|
the value becomes known. The doubly linked list of specnodes is an
|
1814 |
|
|
input-restricted deque: A node is inserted at the cool end when the
|
1815 |
|
|
dispatcher issues an instruction with this register as destination;
|
1816 |
|
|
a node is removed from the cool end if an instruction needs to be deissued;
|
1817 |
|
|
a node is removed from the hot end when an instruction is committed.
|
1818 |
|
|
|
1819 |
|
|
The special registers rA, rB, \dots\ occupy the same array as the
|
1820 |
|
|
global registers g[32], g[33], \dots~\thinspace. For example,
|
1821 |
|
|
rB is internally the same as g[0], because |rB=0|.
|
1822 |
|
|
|
1823 |
|
|
@=
|
1824 |
|
|
Extern specnode g[256]; /* global registers and special registers */
|
1825 |
|
|
Extern specnode *l; /* the ring of local registers */
|
1826 |
|
|
Extern int lring_size; /* the number of on-chip local registers
|
1827 |
|
|
(must be a power of~2) */
|
1828 |
|
|
Extern int max_rename_regs, max_mem_slots; /* capacity of reorder buffer */
|
1829 |
|
|
Extern int rename_regs, mem_slots; /* currently unused capacity */
|
1830 |
|
|
|
1831 |
|
|
@ @
|
1832 |
|
|
#define ticks @[g[rC].o@] /* the internal clock */
|
1833 |
|
|
|
1834 |
|
|
@ @=
|
1835 |
|
|
int lring_mask; /* for calculations modulo |lring_size| */
|
1836 |
|
|
|
1837 |
|
|
@ The |addr| fields in the specnode lists for registers are used
|
1838 |
|
|
to identify that register in diagnostic messages. Such addresses
|
1839 |
|
|
are negative; memory addresses are positive.
|
1840 |
|
|
|
1841 |
|
|
All registers are initially zero except rG, which is initially 255,
|
1842 |
|
|
and rN, which has a constant value identifying the time of compilation.
|
1843 |
|
|
(The macro \.{ABSTIME} is defined externally in the file \.{abstime.h},
|
1844 |
|
|
which should have just been created by {\mc ABSTIME}\kern.05em;
|
1845 |
|
|
{\mc ABSTIME} is
|
1846 |
|
|
a trivial program that computes the value of the standard library function
|
1847 |
|
|
|time(NULL)|. We assume that this number, which is the number of seconds in
|
1848 |
|
|
the ``{\mc UNIX} epoch,'' is less than~$2^{32}$. Beware: Our assumption will
|
1849 |
|
|
fail in February of 2106.)
|
1850 |
|
|
@^system dependencies@>
|
1851 |
|
|
|
1852 |
|
|
@d VERSION 1 /* version of the \MMIX\ architecture that we support */
|
1853 |
|
|
@d SUBVERSION 0 /* secondary byte of version number */
|
1854 |
|
|
@d SUBSUBVERSION 0 /* further qualification to version number */
|
1855 |
|
|
|
1856 |
|
|
@=
|
1857 |
|
|
rename_regs=max_rename_regs;
|
1858 |
|
|
mem_slots=max_mem_slots;
|
1859 |
|
|
lring_mask=lring_size-1;
|
1860 |
|
|
for (j=0;j<256;j++) {
|
1861 |
|
|
g[j].addr.h=sign_bit, g[j].addr.l=j, g[j].known=true;
|
1862 |
|
|
g[j].up=g[j].down=&g[j];
|
1863 |
|
|
}
|
1864 |
|
|
g[rG].o.l=255;
|
1865 |
|
|
g[rN].o.h=(VERSION<<24)+(SUBVERSION<<16)+(SUBSUBVERSION<<8);
|
1866 |
|
|
g[rN].o.l=ABSTIME; /* see comment and warning above */
|
1867 |
|
|
for (j=0;j
|
1868 |
|
|
l[j].addr.h=sign_bit, l[j].addr.l=256+j, l[j].known=true;
|
1869 |
|
|
l[j].up=l[j].down=&l[j];
|
1870 |
|
|
}
|
1871 |
|
|
|
1872 |
|
|
@ @=
|
1873 |
|
|
static void print_specnode_id @,@,@[ARGS((octa))@];
|
1874 |
|
|
|
1875 |
|
|
@ @=
|
1876 |
|
|
static void print_specnode_id(a)
|
1877 |
|
|
octa a;
|
1878 |
|
|
{
|
1879 |
|
|
if (a.h==sign_bit) {
|
1880 |
|
|
if (a.l<32) printf(special_name[a.l]);
|
1881 |
|
|
else if (a.l<256) printf("g[%d]",a.l);
|
1882 |
|
|
else printf("l[%d]",a.l-256);
|
1883 |
|
|
}@+else if (a.h!=(tetra)-1) {
|
1884 |
|
|
printf("m[");@+print_octa(a);@+printf("]");
|
1885 |
|
|
}
|
1886 |
|
|
}
|
1887 |
|
|
|
1888 |
|
|
@ The |specval| subroutine produces a \&{spec} corresponding to the
|
1889 |
|
|
currently coolest value of a given local or global register.
|
1890 |
|
|
|
1891 |
|
|
@=
|
1892 |
|
|
static spec specval @,@,@[ARGS((specnode*))@];
|
1893 |
|
|
|
1894 |
|
|
@ @=
|
1895 |
|
|
static spec specval(r)
|
1896 |
|
|
specnode *r;
|
1897 |
|
|
{@+spec res;
|
1898 |
|
|
if (r->up->known) res.o=r->up->o,res.p=NULL;
|
1899 |
|
|
else res.p=r->up;
|
1900 |
|
|
return res;
|
1901 |
|
|
}
|
1902 |
|
|
|
1903 |
|
|
@ The |spec_install| subroutine introduces a new speculative value at
|
1904 |
|
|
the cool end of a given doubly linked~list.
|
1905 |
|
|
|
1906 |
|
|
@=
|
1907 |
|
|
static void spec_install @,@,@[ARGS((specnode*,specnode*))@];
|
1908 |
|
|
|
1909 |
|
|
@ @=
|
1910 |
|
|
static void spec_install(r,t) /* insert |t| into list |r| */
|
1911 |
|
|
specnode *r,*t;
|
1912 |
|
|
{
|
1913 |
|
|
t->up=r->up;
|
1914 |
|
|
t->up->down=t;
|
1915 |
|
|
r->up=t;
|
1916 |
|
|
t->down=r;
|
1917 |
|
|
t->addr=r->addr;
|
1918 |
|
|
}
|
1919 |
|
|
|
1920 |
|
|
@ Conversely, |spec_rem| takes such a value out.
|
1921 |
|
|
|
1922 |
|
|
@=
|
1923 |
|
|
static void spec_rem @,@,@[ARGS((specnode*))@];
|
1924 |
|
|
|
1925 |
|
|
@ @=
|
1926 |
|
|
static void spec_rem(t) /* remove |t| from its list */
|
1927 |
|
|
specnode *t;
|
1928 |
|
|
{@+register specnode *u=t->up, *d=t->down;
|
1929 |
|
|
u->down=d;@+ d->up=u;
|
1930 |
|
|
}
|
1931 |
|
|
|
1932 |
|
|
@ Some special registers are so central to \MMIX's operation, they are
|
1933 |
|
|
carried along with each control block in the reorder buffer instead of being
|
1934 |
|
|
treated as source and destination registers of each instruction. For example,
|
1935 |
|
|
the register stack pointers rO and~rS are treated in this way.
|
1936 |
|
|
The normal specnodes for rO and~rS, namely |g[rO]| and~|g[rS]|,
|
1937 |
|
|
are not actually used;
|
1938 |
|
|
the cool values are called |cool_O| and |cool_S|.
|
1939 |
|
|
(Actually |cool_O| and |cool_S| correspond to the register
|
1940 |
|
|
values divided by~8, since rO and~rS are always multiples of~8.)
|
1941 |
|
|
|
1942 |
|
|
The arithmetic status register, rA, is also treated specially. Its
|
1943 |
|
|
event bits are kept up to date only at the ``hot'' end, by accumulating
|
1944 |
|
|
values of |arith_exc|; an instruction
|
1945 |
|
|
to \.{GET} the value of~rA will be executed only in the hot seat.
|
1946 |
|
|
The other bits of~rA, which are needed to control trip handlers and
|
1947 |
|
|
floating point rounding, are treated in the normal way.
|
1948 |
|
|
|
1949 |
|
|
@=
|
1950 |
|
|
Extern octa cool_O,cool_S; /* values of rO, rS before the |cool| instruction */
|
1951 |
|
|
|
1952 |
|
|
@ @=
|
1953 |
|
|
int cool_L,cool_G; /* values of rL and rG before the |cool| instruction */
|
1954 |
|
|
unsigned int cool_hist,peek_hist; /* history bits for branch prediction */
|
1955 |
|
|
octa new_O,new_S; /* values of rO, rS after |cool| */
|
1956 |
|
|
|
1957 |
|
|
@ @=
|
1958 |
|
|
cool->op=op; @+cool->i=i;
|
1959 |
|
|
cool->xx=(head->inst>>16)&0xff;@+
|
1960 |
|
|
cool->yy=(head->inst>>8)&0xff;@+
|
1961 |
|
|
cool->zz=(head->inst)&0xff;
|
1962 |
|
|
cool->loc=head->loc;
|
1963 |
|
|
cool->y=cool->z=cool->b=cool->ra=zero_spec;
|
1964 |
|
|
cool->x.o=cool->a.o=cool->rl.o=zero_octa;
|
1965 |
|
|
cool->x.known=false; cool->x.up=NULL;
|
1966 |
|
|
cool->a.known=false; cool->a.up=NULL;
|
1967 |
|
|
cool->rl.known=true; cool->rl.up=NULL;
|
1968 |
|
|
cool->need_b=cool->need_ra=
|
1969 |
|
|
cool->ren_x=cool->mem_x=cool->ren_a=cool->set_l=false;
|
1970 |
|
|
cool->arith_exc=cool->denin=cool->denout=0;
|
1971 |
|
|
if ((head->loc.h&sign_bit) && !(g[rU].o.h&0x8000)) cool->usage=false;
|
1972 |
|
|
else cool->usage=((op&(g[rU].o.h>>16))==g[rU].o.h>>24? true: false);
|
1973 |
|
|
new_O=cool->cur_O=cool_O;@+ new_S=cool->cur_S=cool_S;
|
1974 |
|
|
cool->interrupt=head->interrupt;
|
1975 |
|
|
cool->hist=peek_hist;
|
1976 |
|
|
cool->go.o=incr(cool->loc,4);
|
1977 |
|
|
cool->go.known=false, cool->go.addr.h=-1,cool->go.up=(specnode*)cool;
|
1978 |
|
|
cool->interim=false;
|
1979 |
|
|
|
1980 |
|
|
@ @=
|
1981 |
|
|
if (new_cool==hot) goto stall; /* reorder buffer is full */
|
1982 |
|
|
@;
|
1983 |
|
|
@;
|
1984 |
|
|
if (f&X_is_dest_bit) @
|
1985 |
|
|
an internal command and |goto dispatch_done| if X is marginal@>;
|
1986 |
|
|
switch (i) {
|
1987 |
|
|
@@;
|
1988 |
|
|
default: break;
|
1989 |
|
|
}
|
1990 |
|
|
dispatch_done:@;
|
1991 |
|
|
|
1992 |
|
|
@ The \.{UNSAVE} operation begins by loading register~rG from memory.
|
1993 |
|
|
We don't really need to know the value of~rG until twelve other registers
|
1994 |
|
|
have been unsaved, so we aren't fussy about it here.
|
1995 |
|
|
|
1996 |
|
|
@=
|
1997 |
|
|
if (!g[rL].up->known) goto stall;
|
1998 |
|
|
cool_L=g[rL].up->o.l;
|
1999 |
|
|
if (!g[rG].up->known && !(op==UNSAVE && cool->xx==1)) goto stall;
|
2000 |
|
|
cool_G=g[rG].up->o.l;
|
2001 |
|
|
|
2002 |
|
|
@ @=
|
2003 |
|
|
if (resuming)
|
2004 |
|
|
@@;
|
2005 |
|
|
else{
|
2006 |
|
|
if (f&0x10) @b| from register X@>@;
|
2007 |
|
|
if (third_operand[op] && (cool->i!=trap))
|
2008 |
|
|
@b| and/or |cool->ra| from special register@>;
|
2009 |
|
|
if (f&0x1) cool->z.o.l=cool->zz;
|
2010 |
|
|
else if (f&0x2) @z| from register Z@>@;
|
2011 |
|
|
else if ((op&0xf0)==0xe0) @z| as an immediate wyde@>;
|
2012 |
|
|
if (f&0x4) cool->y.o.l=cool->yy;
|
2013 |
|
|
else if (f&0x8) @y| from register Y@>@;
|
2014 |
|
|
}
|
2015 |
|
|
|
2016 |
|
|
@ @<Set |cool->z| from register Z@>=
|
2017 |
|
|
{
  if (cool->zz>=cool_G) cool->z=specval(&g[cool->zz]); /* a global register */
  else if (cool->zz<cool_L) /* a local register, found in the ring */
    cool->z=specval(&l[(cool_O.l+cool->zz)&lring_mask]);
}
|
2021 |
|
|
|
2022 |
|
|
@ @<Set |cool->y| from register Y@>=
|
2023 |
|
|
{
  if (cool->yy>=cool_G) cool->y=specval(&g[cool->yy]); /* a global register */
  else if (cool->yy<cool_L) /* a local register, found in the ring */
    cool->y=specval(&l[(cool_O.l+cool->yy)&lring_mask]);
}
|
2027 |
|
|
|
2028 |
|
|
@ @<Set |cool->b| from register X@>=
|
2029 |
|
|
{
  if (cool->xx>=cool_G) cool->b=specval(&g[cool->xx]); /* a global register */
  else if (cool->xx<cool_L) /* a local register, found in the ring */
    cool->b=specval(&l[(cool_O.l+cool->xx)&lring_mask]);
  if (f&rel_addr_bit) cool->need_b=true; /* |br|, |pbr| */
}
|
2035 |
|
|
|
2036 |
|
|
@ If an operation requires a special register as third operand,
|
2037 |
|
|
that register is listed in the |third_operand| table.
|
2038 |
|
|
|
2039 |
|
|
@=
|
2040 |
|
|
unsigned char third_operand[256]={@/
|
2041 |
|
|
0,rA,0,0,rA,rA,rA,rA, /* \.{TRAP}, \dots\ */
|
2042 |
|
|
rA,rA,rA,rA,rA,rA,rA,rA, /* \.{FLOT}, \dots\ */
|
2043 |
|
|
rA,rE,rE,rE,rA,rA,rA,rA, /* \.{FMUL}, \dots\ */
|
2044 |
|
|
rA,rA,0,0,rA,rA,rD,rD, /* \.{MUL}, \dots\ */
|
2045 |
|
|
rA,rA,0,0,rA,rA,0,0, /* \.{ADD}, \dots\ */
|
2046 |
|
|
0,0,0,0,0,0,0,0, /* \.{2ADDU}, \dots\ */
|
2047 |
|
|
0,0,0,0,rA,rA,0,0, /* \.{CMP}, \dots\ */
|
2048 |
|
|
rA,rA,0,0,0,0,0,0, /* \.{SL}, \dots\ */
|
2049 |
|
|
0,0,0,0,0,0,0,0, /* \.{BN}, \dots\ */
|
2050 |
|
|
0,0,0,0,0,0,0,0, /* \.{BNN}, \dots\ */
|
2051 |
|
|
0,0,0,0,0,0,0,0, /* \.{PBN}, \dots\ */
|
2052 |
|
|
0,0,0,0,0,0,0,0, /* \.{PBNN}, \dots\ */
|
2053 |
|
|
0,0,0,0,0,0,0,0, /* \.{CSN}, \dots\ */
|
2054 |
|
|
0,0,0,0,0,0,0,0, /* \.{CSNN}, \dots\ */
|
2055 |
|
|
0,0,0,0,0,0,0,0, /* \.{ZSN}, \dots\ */
|
2056 |
|
|
0,0,0,0,0,0,0,0, /* \.{ZSNN}, \dots\ */
|
2057 |
|
|
0,0,0,0,0,0,0,0, /* \.{LDB}, \dots\ */
|
2058 |
|
|
0,0,0,0,0,0,0,0, /* \.{LDT}, \dots\ */
|
2059 |
|
|
0,0,0,0,0,0,0,0, /* \.{LDSF}, \dots\ */
|
2060 |
|
|
0,0,0,0,0,0,0,0, /* \.{LDVTS}, \dots\ */
|
2061 |
|
|
rA,rA,0,0,rA,rA,0,0, /* \.{STB}, \dots\ */
|
2062 |
|
|
rA,rA,0,0,0,0,0,0, /* \.{STT}, \dots\ */
|
2063 |
|
|
rA,rA,0,0,0,0,0,0, /* \.{STSF}, \dots\ */
|
2064 |
|
|
0,0,0,0,0,0,0,0, /* \.{SYNCD}, \dots\ */
|
2065 |
|
|
0,0,0,0,0,0,0,0, /* \.{OR}, \dots\ */
|
2066 |
|
|
0,0,0,0,0,0,0,0, /* \.{AND}, \dots\ */
|
2067 |
|
|
0,0,0,0,0,0,0,0, /* \.{BDIF}, \dots\ */
|
2068 |
|
|
rM,rM,0,0,0,0,0,0, /* \.{MUX}, \dots\ */
|
2069 |
|
|
0,0,0,0,0,0,0,0, /* \.{SETH}, \dots\ */
|
2070 |
|
|
0,0,0,0,0,0,0,0, /* \.{ORH}, \dots\ */
|
2071 |
|
|
0,0,0,0,0,0,0,0, /* \.{JMP}, \dots\ */
|
2072 |
|
|
rJ,0,0,0,0,0,0,255}; /* \.{POP}, \dots\ */
|
2073 |
|
|
|
2074 |
|
|
@ The |cool->b| field is busy in operations like \.{STB} or \.{STSF},
|
2075 |
|
|
which need~rA. So we use |cool->ra| instead, when rA is needed.
|
2076 |
|
|
|
2077 |
|
|
@<Set |cool->b| and/or |cool->ra| from special register@>=
|
2078 |
|
|
{
|
2079 |
|
|
if (third_operand[op]==rA || third_operand[op]==rE)
|
2080 |
|
|
cool->need_ra=true, cool->ra=specval(&g[rA]);
|
2081 |
|
|
if (third_operand[op]!=rA)
|
2082 |
|
|
cool->need_b=true, cool->b=specval(&g[third_operand[op]]);
|
2083 |
|
|
}
|
2084 |
|
|
|
2085 |
|
|
@ @<Set |cool->z| as an immediate wyde@>=
|
2086 |
|
|
{ switch (op&3) {
|
2087 |
|
|
case 0: cool->z.o.h=yz<<16;@+break;
|
2088 |
|
|
case 1: cool->z.o.h=yz;@+break;
|
2089 |
|
|
case 2: cool->z.o.l=yz<<16;@+break;
|
2090 |
|
|
case 3: cool->z.o.l=yz;@+break;
|
2091 |
|
|
}
|
2092 |
|
|
if (i!=set) { /* register X should also be the Y operand */
|
2093 |
|
|
cool->y=cool->b; cool->b=zero_spec;
|
2094 |
|
|
}
|
2095 |
|
|
}
|
2096 |
|
|
|
2097 |
|
|
@ @=
|
2098 |
|
|
{
  if (cool->xx>=cool_G) { /* X is a global register */
    if (i!=pushgo && i!=pushj)
      cool->ren_x=true,spec_install(&g[cool->xx],&cool->x);
  }@+else if (cool->xx<cool_L) /* X is an existing local register */
    cool->ren_x=true,
      spec_install(&l[(cool_O.l+cool->xx)&lring_mask],&cool->x);
  else { /* we need to increase L before issuing |head->inst| */
 increase_L:@+ if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0)
      @@;
    else @;
  }
}
|
2111 |
|
|
|
2112 |
|
|
@ @=
|
2113 |
|
|
if (rename_regs<cool->ren_x+cool->ren_a) goto stall; /* too few rename registers are free */
|
2114 |
|
|
if (cool->mem_x)
|
2115 |
|
|
if (mem_slots) mem_slots--;@+else goto stall;
|
2116 |
|
|
rename_regs-=cool->ren_x+cool->ren_a;
|
2117 |
|
|
|
2118 |
|
|
@ The |incrl| instruction
|
2119 |
|
|
advances $\beta$ and~rL by~1 at a time when we know that $\beta\ne\gamma$,
|
2120 |
|
|
in the ring of local registers.
|
2121 |
|
|
|
2122 |
|
|
@=
|
2123 |
|
|
{
|
2124 |
|
|
cool->i=incrl;
|
2125 |
|
|
spec_install(&l[(cool_O.l+cool_L)&lring_mask],&cool->x);
|
2126 |
|
|
cool->need_b=cool->need_ra=false;
|
2127 |
|
|
cool->y=cool->z=zero_spec;
|
2128 |
|
|
cool->x.known=true; /* |cool->x.o=zero_octa| */
|
2129 |
|
|
spec_install(&g[rL],&cool->rl);
|
2130 |
|
|
cool->rl.o.l=cool_L+1;
|
2131 |
|
|
cool->ren_x=cool->set_l=true;
|
2132 |
|
|
op=SETH; /* this instruction to be handled by the simplest units */
|
2133 |
|
|
cool->interim=true;
|
2134 |
|
|
goto dispatch_done;
|
2135 |
|
|
}
|
2136 |
|
|
|
2137 |
|
|
@ The |incgamma| instruction advances $\gamma$ and rS by storing an octabyte
|
2138 |
|
|
from the local register ring to virtual memory location |cool_S<<3|.
|
2139 |
|
|
|
2140 |
|
|
@=
|
2141 |
|
|
{
|
2142 |
|
|
cool->need_b=cool->need_ra=false;
|
2143 |
|
|
cool->i=incgamma;
|
2144 |
|
|
new_S=incr(cool_S,1);
|
2145 |
|
|
cool->b=specval(&l[cool_S.l&lring_mask]);
|
2146 |
|
|
cool->y.p=NULL, cool->y.o=shift_left(cool_S,3);
|
2147 |
|
|
cool->z=zero_spec;
|
2148 |
|
|
cool->mem_x=true, spec_install(&mem,&cool->x);
|
2149 |
|
|
op=STOU; /* this instruction needs to be handled by load/store unit */
|
2150 |
|
|
cool->interim=true;
|
2151 |
|
|
goto dispatch_done;
|
2152 |
|
|
}
|
2153 |
|
|
|
2154 |
|
|
@ The |decgamma| instruction decreases $\gamma$ and rS by loading an octabyte
|
2155 |
|
|
from virtual memory location |(cool_S-1)<<3| into the local register ring.
|
2156 |
|
|
|
2157 |
|
|
@=
|
2158 |
|
|
{
|
2159 |
|
|
cool->i=decgamma;
|
2160 |
|
|
new_S=incr(cool_S,-1);
|
2161 |
|
|
cool->z=cool->b=zero_spec; cool->need_b=false;
|
2162 |
|
|
cool->y.p=NULL, cool->y.o=shift_left(new_S,3);
|
2163 |
|
|
cool->ren_x=true, spec_install(&l[new_S.l&lring_mask],&cool->x);
|
2164 |
|
|
op=LDOU; /* this instruction needs to be handled by load/store unit */
|
2165 |
|
|
cool->interim=true;
|
2166 |
|
|
cool->ptr_a=(void*)mem.up;
|
2167 |
|
|
goto dispatch_done;
|
2168 |
|
|
}
|
2169 |
|
|
|
2170 |
|
|
@ Storing into memory requires a doubly linked data list of specnodes
|
2171 |
|
|
like the lists we use for local and global registers. In this case
|
2172 |
|
|
the head of the list is called |mem|, and the |addr| fields are
|
2173 |
|
|
physical addresses in memory.
|
2174 |
|
|
|
2175 |
|
|
@=
|
2176 |
|
|
Extern specnode mem;
|
2177 |
|
|
|
2178 |
|
|
@ The |addr| field of a memory specnode
|
2179 |
|
|
is all 1s until the physical address has been computed.
|
2180 |
|
|
|
2181 |
|
|
@=
|
2182 |
|
|
mem.addr.h=mem.addr.l=-1;
|
2183 |
|
|
mem.up=mem.down=&mem;
|
2184 |
|
|
|
2185 |
|
|
@ The \.{CSWAP} operation is treated as a partial store, with \$X
|
2186 |
|
|
as a secondary output. Partial store (|pst|) commands read an octabyte
|
2187 |
|
|
from memory before they write it.
|
2188 |
|
|
|
2189 |
|
|
@=
|
2190 |
|
|
case cswap: cool->ren_a=true;
|
2191 |
|
|
spec_install(cool->xx>=cool_G? &g[cool->xx]:
|
2192 |
|
|
&l[(cool_O.l+cool->xx)&lring_mask],&cool->a);
|
2193 |
|
|
cool->i=pst;
|
2194 |
|
|
case st:@+ if ((op&0xfe)==STCO) cool->b.o.l=cool->xx;
|
2195 |
|
|
case pst:
|
2196 |
|
|
cool->mem_x=true, spec_install(&mem,&cool->x);@+ break;
|
2197 |
|
|
case ld: case ldunc: cool->ptr_a=(void *)mem.up;@+ break;
|
2198 |
|
|
|
2199 |
|
|
@ When new data is \.{PUT} into special registers 15--20 (namely rK,
|
2200 |
|
|
rQ, rU, rV, rG, or~rL) it can affect many things. Therefore we stop
|
2201 |
|
|
issuing further instructions until such \.{PUT}s are committed.
|
2202 |
|
|
Moreover, we will see later that such drastic \.{PUT}s defer execution until
|
2203 |
|
|
they reach the hot seat.
|
2204 |
|
|
|
2205 |
|
|
@=
|
2206 |
|
|
case put:@+ if (cool->yy!=0 || cool->xx>=32) goto illegal_inst;
|
2207 |
|
|
if (cool->xx>=8) {
|
2208 |
|
|
if (cool->xx<=11) goto illegal_inst;
|
2209 |
|
|
if (cool->xx<=18 && !(cool->loc.h&sign_bit)) goto privileged_inst;
|
2210 |
|
|
}
|
2211 |
|
|
if (cool->xx>=15 && cool->xx<=20) freeze_dispatch=true;
|
2212 |
|
|
cool->ren_x=true, spec_install(&g[cool->xx],&cool->x);@+break;
|
2213 |
|
|
@#
|
2214 |
|
|
case get:@+ if (cool->yy || cool->zz>=32) goto illegal_inst;
|
2215 |
|
|
if (cool->zz==rO) cool->z.o=shift_left(cool_O,3);
|
2216 |
|
|
else if (cool->zz==rS) cool->z.o=shift_left(cool_S,3);
|
2217 |
|
|
else cool->z=specval(&g[cool->zz]);@+break;
|
2218 |
|
|
illegal_inst: cool->interrupt |= B_BIT;@+goto noop_inst;
|
2219 |
|
|
case ldvts:@+ if (cool->loc.h&sign_bit) break;
|
2220 |
|
|
privileged_inst: cool->interrupt |= K_BIT;
|
2221 |
|
|
noop_inst: cool->i=noop;@+break;
|
2222 |
|
|
|
2223 |
|
|
@ A \.{PUSHGO} instruction with $\rm X\ge G$ causes L to increase
|
2224 |
|
|
momentarily by~1, even if $\rm L=G$.
|
2225 |
|
|
But the value of~L will be decreased before the \.{PUSHGO}
|
2226 |
|
|
is complete, so it will never actually exceed~G. Moreover, we needn't
|
2227 |
|
|
insert an~|incrl| command.
|
2228 |
|
|
|
2229 |
|
|
@=
|
2230 |
|
|
case pushgo: inst_ptr.p=&cool->go;
|
2231 |
|
|
case pushj: {@+register int x=cool->xx;
|
2232 |
|
|
if (x>=cool_G) {
|
2233 |
|
|
if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0)
|
2234 |
|
|
@@;
|
2235 |
|
|
x=cool_L;@+ cool_L++;
|
2236 |
|
|
cool->ren_x=true, spec_install(&l[(cool_O.l+x)&lring_mask],&cool->x);
|
2237 |
|
|
}
|
2238 |
|
|
cool->x.known=true, cool->x.o.h=0, cool->x.o.l=x;
|
2239 |
|
|
cool->ren_a=true, spec_install(&g[rJ],&cool->a);
|
2240 |
|
|
cool->a.known=true, cool->a.o=incr(cool->loc,4);
|
2241 |
|
|
cool->set_l=true, spec_install(&g[rL],&cool->rl);
|
2242 |
|
|
cool->rl.o.l=cool_L-x-1;
|
2243 |
|
|
new_O=incr(cool_O,x+1);
|
2244 |
|
|
}@+break;
|
2245 |
|
|
case syncid: if (cool->loc.h&sign_bit) break;
|
2246 |
|
|
case go: inst_ptr.p=&cool->go;@+break;
|
2247 |
|
|
|
2248 |
|
|
@ We need to know the topmost ``hidden'' element of the register stack
|
2249 |
|
|
when a \.{POP} instruction is dispatched. This element is usually
|
2250 |
|
|
present in the local register ring, unless $\gamma=\alpha$.
|
2251 |
|
|
|
2252 |
|
|
Once it is known, let $x$ be its least significant byte. We will
|
2253 |
|
|
be decreasing rO by $x+1$, so we may have to decrease $\gamma$ repeatedly
|
2254 |
|
|
in order to maintain the condition $\rm rS\le rO$.
|
2255 |
|
|
|
2256 |
|
|
@=
|
2257 |
|
|
case pop:@+if (cool->xx && cool_L>=cool->xx)
|
2258 |
|
|
cool->y=specval(&l[(cool_O.l+cool->xx-1)&lring_mask]);
|
2259 |
|
|
pop_unsave:@+if (cool_S.l==cool_O.l)
|
2260 |
|
|
@;
|
2261 |
|
|
{@+register tetra x; register int new_L;
|
2262 |
|
|
register specnode *p=l[(cool_O.l-1)&lring_mask].up;
|
2263 |
|
|
if (p->known) x=(p->o.l)&0xff;@+ else goto stall;
|
2264 |
|
|
if ((tetra)(cool_O.l-cool_S.l)<=x)
|
2265 |
|
|
@;
|
2266 |
|
|
new_O=incr(cool_O,-x-1);
|
2267 |
|
|
if (cool->i==pop) new_L=x+(cool->xx<=cool_L? cool->xx: cool_L+1);
|
2268 |
|
|
else new_L=x;
|
2269 |
|
|
if (new_L>cool_G) new_L=cool_G;
|
2270 |
|
|
if (x<new_L) /* position |x| lies below the new value of L */
  cool->ren_x=true, spec_install(&l[(cool_O.l-1)&lring_mask],&cool->x);
|
2272 |
|
|
cool->set_l=true, spec_install(&g[rL],&cool->rl);
|
2273 |
|
|
cool->rl.o.l=new_L;
|
2274 |
|
|
if (cool->i==pop) {
|
2275 |
|
|
cool->z.o.l=yz<<2;
|
2276 |
|
|
if (inst_ptr.p==UNKNOWN_SPEC && new_head==tail) inst_ptr.p=&cool->go;
|
2277 |
|
|
}
|
2278 |
|
|
break;
|
2279 |
|
|
}
|
2280 |
|
|
|
2281 |
|
|
@ @=
|
2282 |
|
|
case mulu: cool->ren_a=true, spec_install(&g[rH],&cool->a);@+break;
|
2283 |
|
|
case div: case divu: cool->ren_a=true, spec_install(&g[rR],&cool->a);@+break;
|
2284 |
|
|
|
2285 |
|
|
@ It's tempting to say that we could avoid taking up space in the reorder
|
2286 |
|
|
buffer when no operation needs to be done.
|
2287 |
|
|
A \.{JMP} instruction qualifies as a no-op in this sense,
|
2288 |
|
|
because the change of control occurs before the execution stage.
|
2289 |
|
|
However, even a no-op might have to be counted in the usage register~rU,
|
2290 |
|
|
so it might get into the execution stage for that reason.
|
2291 |
|
|
A no-op can also cause a protection interrupt, if it appears in a negative
|
2292 |
|
|
location. Even more importantly, a program might get into a loop that consists
|
2293 |
|
|
entirely of jumps and no-ops; then we wouldn't be able to interrupt it,
|
2294 |
|
|
because the interruption mechanism needs to find the current location
|
2295 |
|
|
in the reorder buffer! At least one functional unit therefore needs to provide
|
2296 |
|
|
explicit support for \.{JMP}, \.{JMPB}, and \.{SWYM}.
|
2297 |
|
|
|
2298 |
|
|
The \.{SWYM} instruction with |F_BIT| set is a special case: This is
|
2299 |
|
|
a request from the fetch coroutine for an update to the IT-cache,
|
2300 |
|
|
when the page table method isn't implemented in hardware.
|
2301 |
|
|
|
2302 |
|
|
@=
|
2303 |
|
|
case noop:@+if (cool->interrupt&F_BIT) {
|
2304 |
|
|
cool->go.o=cool->y.o=cool->loc;
|
2305 |
|
|
inst_ptr=specval(&g[rT]);
|
2306 |
|
|
}
|
2307 |
|
|
break;
|
2308 |
|
|
|
2309 |
|
|
@ @=
|
2310 |
|
|
if (cool->ren_x || cool->mem_x) spec_rem(&cool->x);
|
2311 |
|
|
if (cool->ren_a) spec_rem(&cool->a);
|
2312 |
|
|
if (cool->set_l) spec_rem(&cool->rl);
|
2313 |
|
|
if (inst_ptr.p==&cool->go) inst_ptr.p=UNKNOWN_SPEC;
|
2314 |
|
|
break;
|
2315 |
|
|
|
2316 |
|
|
@* The execution stages. \MMIX's {\it raison d'\^etre\/} is its ability
|
2317 |
|
|
to execute instructions. So now we want to simulate the behavior of its
|
2318 |
|
|
functional units.
|
2319 |
|
|
|
2320 |
|
|
Each coroutine scheduled for action at the current tick of the clock has a
|
2321 |
|
|
|stage| number corresponding to a particular subset of the \MMIX\ hardware.
|
2322 |
|
|
For example, the coroutines with |stage=2| are the second stages in the
|
2323 |
|
|
pipelines of the functional units. A coroutine with |stage=0| works
|
2324 |
|
|
in the fetch unit. Several artificially large stage numbers
|
2325 |
|
|
are used to control special coroutines that do things like write data
|
2326 |
|
|
from buffers into memory.
|
2327 |
|
|
|
2328 |
|
|
In this program the current coroutine of interest is called |self|; hence
|
2329 |
|
|
|self->stage| is the current stage number of interest. Another key variable,
|
2330 |
|
|
|self->ctl|, is called~|data|; this is the control block being operated on by
|
2331 |
|
|
the current coroutine. We typically are simulating an operation in which
|
2332 |
|
|
|data->x| is being computed as a function of |data->y| and |data->z|.
|
2333 |
|
|
The |data| record has many fields, as described earlier when we defined
|
2334 |
|
|
\&{control} structures; for example, |data->owner| is the same as
|
2335 |
|
|
|self|, during the execution stage, if it is nonnull.
|
2336 |
|
|
|
2337 |
|
|
This part of the simulator is written as if each functional unit is able to
|
2338 |
|
|
handle all 256 operations. In practice, of course, a functional unit tends to
|
2339 |
|
|
be much more specialized; the actual specialization is governed by the
|
2340 |
|
|
dispatcher, which issues an instruction only to a functional unit that
|
2341 |
|
|
supports it. Once an instruction has been dispatched, however, we can simulate
|
2342 |
|
|
it most easily if we imagine that its functional unit is universal.
|
2343 |
|
|
|
2344 |
|
|
Coroutines with higher |stage| numbers are processed first.
|
2345 |
|
|
The three most important variables that govern a coroutine's behavior, once
|
2346 |
|
|
|self->stage| is given, are the external operation code |data->op|, the
|
2347 |
|
|
internal operation code |data->i|, and the value of |data->state|. We
|
2348 |
|
|
typically have |data->state=0| when a coroutine is first fired~up.
|
2349 |
|
|
|
2350 |
|
|
@=
|
2351 |
|
|
register coroutine *self; /* the current coroutine being executed */
|
2352 |
|
|
register control *data; /* the |control| block of the current coroutine */
|
2353 |
|
|
|
2354 |
|
|
@ When a coroutine has done all it wants to on a single cycle,
|
2355 |
|
|
it says |goto done|. It will not be scheduled to do any further work
|
2356 |
|
|
unless the |schedule| routine has been called since it began execution.
|
2357 |
|
|
The |wait| macro is a convenient way to say ``Please schedule me to resume
|
2358 |
|
|
again at the current |data->state|'' after a specified time; for example,
|
2359 |
|
|
|wait(1)| will restart a coroutine on the next clock tick.
|
2360 |
|
|
|
2361 |
|
|
@d wait(t)@+ {@+schedule(self,t,data->state);@+ goto done;@+}
|
2362 |
|
|
@d pass_after(t) schedule(self+1,t,data->state)
|
2363 |
|
|
@d sleep@+ {@+self->next=self;@+ goto done;@+} /* wait forever */
|
2364 |
|
|
@d awaken(c,t) schedule(c,t,c->ctl->state)
|
2365 |
|
|
|
2366 |
|
|
@=
|
2367 |
|
|
cur_time++;@+ if (cur_time==ring_size) cur_time=0;
|
2368 |
|
|
for (self=queuelist(cur_time);self!=&sentinel;self=sentinel.next) {
|
2369 |
|
|
sentinel.next=self->next;@+self->next=NULL; /* unschedule this coroutine */
|
2370 |
|
|
data=self->ctl;
|
2371 |
|
|
if (verbose&coroutine_bit) {
|
2372 |
|
|
printf(" running ");@+print_coroutine_id(self);@+printf(" ");
|
2373 |
|
|
print_control_block(data);@+printf("\n");
|
2374 |
|
|
}
|
2375 |
|
|
switch(self->stage) {
|
2376 |
|
|
case 0:@;
|
2377 |
|
|
case 1:@;
|
2378 |
|
|
default:@;
|
2379 |
|
|
@t\4@>@;
|
2380 |
|
|
}
|
2381 |
|
|
terminate:@+if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
2382 |
|
|
done:;
|
2383 |
|
|
}
|
2384 |
|
|
|
2385 |
|
|
@ A special coroutine whose |stage| number is |vanish| simply goes away
|
2386 |
|
|
at its scheduled time.
|
2387 |
|
|
|
2388 |
|
|
@=
|
2389 |
|
|
case vanish: goto terminate;
|
2390 |
|
|
|
2391 |
|
|
@ @=
|
2392 |
|
|
coroutine mem_locker; /* trivial coroutine that vanishes */
|
2393 |
|
|
coroutine Dlocker; /* another */
|
2394 |
|
|
control vanish_ctl; /* such coroutines share a common control block */
|
2395 |
|
|
|
2396 |
|
|
@ @=
|
2397 |
|
|
mem_locker.name="Locker";
|
2398 |
|
|
mem_locker.ctl=&vanish_ctl;
|
2399 |
|
|
mem_locker.stage=vanish;
|
2400 |
|
|
Dlocker.name="Dlocker";
|
2401 |
|
|
Dlocker.ctl=&vanish_ctl;
|
2402 |
|
|
Dlocker.stage=vanish;
|
2403 |
|
|
vanish_ctl.go.o.l=4;
|
2404 |
|
|
for (j=0;jports;j++) DTcache->reader[j].ctl=&vanish_ctl;
|
2405 |
|
|
if (Dcache) for (j=0;jports;j++) Dcache->reader[j].ctl=&vanish_ctl;
|
2406 |
|
|
for (j=0;jports;j++) ITcache->reader[j].ctl=&vanish_ctl;
|
2407 |
|
|
if (Icache) for (j=0;jports;j++) Icache->reader[j].ctl=&vanish_ctl;
|
2408 |
|
|
|
2409 |
|
|
@ Here is a list of the |stage| numbers for special coroutines to be
|
2410 |
|
|
defined below.
|
2411 |
|
|
|
2412 |
|
|
@
|
2413 |
|
|
#define max_stage 99 /* exceeds all |stage| numbers */
|
2414 |
|
|
#define vanish 98 /* special coroutine that just goes away */
|
2415 |
|
|
#define flush_to_mem 97 /* coroutine for flushing from a cache to memory */
|
2416 |
|
|
#define flush_to_S 96 /* coroutine for flushing from a cache to the S-cache */
|
2417 |
|
|
#define fill_from_mem 95 /* coroutine for filling a cache from memory */
|
2418 |
|
|
#define fill_from_S 94 /* coroutine for filling a cache from the S-cache */
|
2419 |
|
|
#define fill_from_virt 93 /* coroutine for filling a translation cache */
|
2420 |
|
|
#define write_from_wbuf 92 /* coroutine for emptying the write buffer */
|
2421 |
|
|
#define cleanup 91 /* coroutine for cleaning the caches */
|
2422 |
|
|
|
2423 |
|
|
@ At the very beginning of stage 1, a functional unit will stall if necessary
|
2424 |
|
|
until its operands are available. As soon as the operands are all present, the
|
2425 |
|
|
|state| is set nonzero and execution proper begins.
|
2426 |
|
|
|
2427 |
|
|
@=
|
2428 |
|
|
switch1:@+ switch(data->state) {
|
2429 |
|
|
case 0: @;
|
2430 |
|
|
case 1: @;
|
2431 |
|
|
case 2: @;
|
2432 |
|
|
case 3: @;
|
2433 |
|
|
@;
|
2434 |
|
|
}
|
2435 |
|
|
|
2436 |
|
|
@ If some of our input data has been computed by another coroutine on the
|
2437 |
|
|
current cycle, we grab it now but wait for the next cycle. (An actual machine
|
2438 |
|
|
wouldn't have latched the data until then.)
|
2439 |
|
|
|
2440 |
|
|
@=
|
2441 |
|
|
j=0;
|
2442 |
|
|
if (data->y.p) {
|
2443 |
|
|
j++;
|
2444 |
|
|
if (data->y.p->known) data->y.o=data->y.p->o, data->y.p=NULL;
|
2445 |
|
|
else j+=10;
|
2446 |
|
|
}
|
2447 |
|
|
if (data->z.p) {
|
2448 |
|
|
j++;
|
2449 |
|
|
if (data->z.p->known) data->z.o=data->z.p->o, data->z.p=NULL;
|
2450 |
|
|
else j+=10;
|
2451 |
|
|
}
|
2452 |
|
|
if (data->b.p) {
|
2453 |
|
|
if (data->need_b) j++;
|
2454 |
|
|
if (data->b.p->known) data->b.o=data->b.p->o, data->b.p=NULL;
|
2455 |
|
|
else if (data->need_b) j+=10;
|
2456 |
|
|
}
|
2457 |
|
|
if (data->ra.p) {
|
2458 |
|
|
if (data->need_ra) j++;
|
2459 |
|
|
if (data->ra.p->known) data->ra.o=data->ra.p->o, data->ra.p=NULL;
|
2460 |
|
|
else if (data->need_ra) j+=10;
|
2461 |
|
|
}
|
2462 |
|
|
if (j<10) data->state=1;
|
2463 |
|
|
if (j) wait(1); /* otherwise we fall through to case 1 */
|
2464 |
|
|
|
2465 |
|
|
@ Simple register-to-register instructions like \.{ADD} are assumed to take
|
2466 |
|
|
just one cycle, but others like \.{FADD} almost certainly require more time.
|
2467 |
|
|
This simulator can be configured so that \.{FADD} might take, say, four
|
2468 |
|
|
pipeline stages of one cycle each ($1+1+1+1$), or two pipeline stages of two
|
2469 |
|
|
cycles each ($2+2$), or a single unpipelined stage lasting four cycles (4),
|
2470 |
|
|
etc. In any case the simulator computes the results now, for simplicity,
|
2471 |
|
|
placing them in |data->x| and possibly also in |data->a| and/or
|
2472 |
|
|
|data->interrupt|. The results will not be officially made |known| until
|
2473 |
|
|
the proper time.
|
2474 |
|
|
|
2475 |
|
|
@=
|
2476 |
|
|
switch (data->i) {
|
2477 |
|
|
@;
|
2478 |
|
|
@;
|
2479 |
|
|
@;
|
2480 |
|
|
}
|
2481 |
|
|
@;
|
2482 |
|
|
|
2483 |
|
|
@ If the internal opcode |data->i| is |max_pipe_op| or less, a special
|
2484 |
|
|
pipeline sequence like $1+1+1+1$ or $2+2$ or $15+10$, etc., has been
|
2485 |
|
|
configured. Otherwise we assume that the pipeline sequence is simply~1.
|
2486 |
|
|
|
2487 |
|
|
Suppose the pipeline sequence is $t_1+t_2+\cdots+t_k$. Each $t_j$ is
|
2488 |
|
|
positive and less than~256, so we represent the sequence as a
|
2489 |
|
|
string |pipe_seq[data->i]| of unsigned ``characters,'' terminated by~0.
|
2490 |
|
|
Given such a string, we want to do the following: Wait $(t_1-1)$ cycles
|
2491 |
|
|
and pass |data| to stage~2; wait $t_2$ cycles and pass |data| to stage~3;
|
2492 |
|
|
\dots; wait $t_{k-1}$ cycles and pass |data| to stage~$k$; wait $t_k$ cycles
|
2493 |
|
|
and make the results |known|.
|
2494 |
|
|
|
2495 |
|
|
The value of |denin| is added to $t_1$; the value of |denout| is
|
2496 |
|
|
added to~$t_k$.
|
2497 |
|
|
|
2498 |
|
|
@=
|
2499 |
|
|
data->state=3;
|
2500 |
|
|
if (data->i<=max_pipe_op) {@+register unsigned char *s=pipe_seq[data->i];
|
2501 |
|
|
j=s[0]+data->denin;
|
2502 |
|
|
if (s[1]) data->state=2; /* more than one stage */
|
2503 |
|
|
else j+=data->denout;
|
2504 |
|
|
if (j>1) wait(j-1);
|
2505 |
|
|
}
|
2506 |
|
|
goto switch1;
|
2507 |
|
|
|
2508 |
|
|
@ When we're in stage $j$, the coroutine for stage $j+1$ of the same functional
|
2509 |
|
|
unit is |self+1|.
|
2510 |
|
|
|
2511 |
|
|
@=
|
2512 |
|
|
pass_data:@+
|
2513 |
|
|
if ((self+1)->next) wait(1); /* stall if the next stage is occupied */
|
2514 |
|
|
{@+register unsigned char *s=pipe_seq[data->i];
|
2515 |
|
|
j=s[self->stage];
|
2516 |
|
|
if (s[self->stage+1]==0) j+=data->denout,data->state=3;
|
2517 |
|
|
/* the next stage is the last */
|
2518 |
|
|
pass_after(j);
|
2519 |
|
|
}
|
2520 |
|
|
passit: (self+1)->ctl=data;
|
2521 |
|
|
data->owner=self+1;
|
2522 |
|
|
goto done;
|
2523 |
|
|
|
2524 |
|
|
@ @=
|
2525 |
|
|
switch2:@+if (data->b.p && data->b.p->known)
|
2526 |
|
|
data->b.o=data->b.p->o, data->b.p=NULL;
|
2527 |
|
|
switch(data->state) {
|
2528 |
|
|
case 0: panic(confusion("switch2"));
|
2529 |
|
|
case 1: @;
|
2530 |
|
|
case 2: goto pass_data;
|
2531 |
|
|
case 3: goto fin_ex;
|
2532 |
|
|
@;
|
2533 |
|
|
}
|
2534 |
|
|
|
2535 |
|
|
@ The default pipeline times use only one stage; they
|
2536 |
|
|
can be overridden by |MMIX_config|. The total number of stages
|
2537 |
|
|
supported by this simulator is limited to 90, since
|
2538 |
|
|
it must never interfere with the |stage| numbers for special coroutines
|
2539 |
|
|
defined below. (The author doesn't feel guilty about making this restriction.)
|
2540 |
|
|
|
2541 |
|
|
@=
|
2542 |
|
|
#define pipe_limit 90
|
2543 |
|
|
Extern unsigned char pipe_seq[max_pipe_op+1][pipe_limit+1];
|
2544 |
|
|
|
2545 |
|
|
@ The simplest of all register-to-register operations is |set|,
|
2546 |
|
|
which occurs for commands like \.{SETH} as well as for commands
|
2547 |
|
|
like \.{GETA}. (We might as well start with the easy cases and work our
|
2548 |
|
|
way up.)
|
2549 |
|
|
|
2550 |
|
|
@=
|
2551 |
|
|
case set: data->x.o=data->z.o;@+break;
|
2552 |
|
|
|
2553 |
|
|
@ Here are the basic boolean operations, which account for 24 of \MMIX's
|
2554 |
|
|
256 opcodes.
|
2555 |
|
|
|
2556 |
|
|
@=
|
2557 |
|
|
case or: data->x.o.h=data->y.o.h | data->z.o.h;
|
2558 |
|
|
data->x.o.l=data->y.o.l | data->z.o.l; break;
|
2559 |
|
|
case orn: data->x.o.h=data->y.o.h |~data->z.o.h;
|
2560 |
|
|
data->x.o.l=data->y.o.l |~data->z.o.l; break;
|
2561 |
|
|
case nor: data->x.o.h=~(data->y.o.h | data->z.o.h);
|
2562 |
|
|
data->x.o.l=~(data->y.o.l | data->z.o.l); break;
|
2563 |
|
|
case and: data->x.o.h=data->y.o.h & data->z.o.h;
|
2564 |
|
|
data->x.o.l=data->y.o.l & data->z.o.l; break;
|
2565 |
|
|
case andn: data->x.o.h=data->y.o.h &~data->z.o.h;
|
2566 |
|
|
data->x.o.l=data->y.o.l &~data->z.o.l; break;
|
2567 |
|
|
case nand: data->x.o.h=~(data->y.o.h & data->z.o.h);
|
2568 |
|
|
data->x.o.l=~(data->y.o.l & data->z.o.l); break;
|
2569 |
|
|
case xor: data->x.o.h=data->y.o.h ^ data->z.o.h;
|
2570 |
|
|
data->x.o.l=data->y.o.l ^ data->z.o.l; break;
|
2571 |
|
|
case nxor: data->x.o.h=data->y.o.h ^~data->z.o.h;
|
2572 |
|
|
data->x.o.l=data->y.o.l ^~data->z.o.l; break;
|
2573 |
|
|
|
2574 |
|
|
@ The implementation of \.{ADDU} is only slightly more difficult.
|
2575 |
|
|
It would be trivial except for the fact that internal opcode
|
2576 |
|
|
|addu| is used not only for the \.{ADDU[I]} and \.{INC[M][H,L]} operations,
|
2577 |
|
|
in which we simply want to add |data->y.o| to |data->z.o|, but also for
|
2578 |
|
|
operations like \.{4ADDU}.
|
2579 |
|
|
|
2580 |
|
|
@=
|
2581 |
|
|
case addu: data->x.o=oplus((data->op&0xf8)==0x28?@|
|
2582 |
|
|
shift_left(data->y.o,1+((data->op>>1)&0x3)): data->y.o, data->z.o);
|
2583 |
|
|
break;
|
2584 |
|
|
case subu: data->x.o=ominus(data->y.o,data->z.o);@+ break;
|
2585 |
|
|
|
2586 |
|
|
@ Signed addition and subtraction produce the same results as their
|
2587 |
|
|
unsigned counterparts, but overflow must also be detected. Overflow
|
2588 |
|
|
occurs when adding |y| to~|z| if and only if |y| and~|z| have the
|
2589 |
|
|
same sign but their sum has a different sign. Overflow occurs in
|
2590 |
|
|
the calculation |x=y-z| if and only if it occurs in the calculation~|y=x+z|.
|
2591 |
|
|
|
2592 |
|
|
@=
|
2593 |
|
|
case add: data->x.o=oplus(data->y.o,data->z.o);
|
2594 |
|
|
if (((data->y.o.h ^ data->z.o.h)&sign_bit)==0 &&
|
2595 |
|
|
((data->y.o.h ^ data->x.o.h)&sign_bit)!=0) data->interrupt|=V_BIT;
|
2596 |
|
|
break;
|
2597 |
|
|
case sub: data->x.o=ominus(data->y.o,data->z.o);
|
2598 |
|
|
if (((data->x.o.h ^ data->z.o.h)&sign_bit)==0 &&
|
2599 |
|
|
((data->y.o.h ^ data->x.o.h)&sign_bit)!=0) data->interrupt|=V_BIT;
|
2600 |
|
|
break;
|
2601 |
|
|
|
2602 |
|
|
@ The shift commands might take more than one cycle, or they might even be
|
2603 |
|
|
pipelined, if the default value of |pipe_seq[sh]| is changed. But we compute
|
2604 |
|
|
shifts all at once here, because other parts of the simulator will take care
|
2605 |
|
|
of the pipeline timing. (Notice that |shlu| is changed to |sh|, for this
|
2606 |
|
|
reason. Similar changes to the internal op codes are made for other operators
|
2607 |
|
|
below.)
|
2608 |
|
|
|
2609 |
|
|
@d shift_amt (data->z.o.h || data->z.o.l>=64? 64: data->z.o.l)
|
2610 |
|
|
|
2611 |
|
|
@=
|
2612 |
|
|
case shlu: data->x.o=shift_left(data->y.o,shift_amt);@+data->i=sh;@+ break;
|
2613 |
|
|
case shl: data->x.o=shift_left(data->y.o,shift_amt);@+data->i=sh;
|
2614 |
|
|
{@+octa tmpo;
|
2615 |
|
|
tmpo=shift_right(data->x.o,shift_amt,0);
|
2616 |
|
|
if (tmpo.h!=data->y.o.h || tmpo.l!=data->y.o.l) data->interrupt|=V_BIT;
|
2617 |
|
|
}@+break;
|
2618 |
|
|
case shru: data->x.o=shift_right(data->y.o,shift_amt,1);@+data->i=sh;@+ break;
|
2619 |
|
|
case shr: data->x.o=shift_right(data->y.o,shift_amt,0);@+data->i=sh;@+ break;
|
2620 |
|
|
|
2621 |
|
|
@ The \.{MUX} operation has three operands, namely |data->y|, |data->z|,
|
2622 |
|
|
and |data->b|; the third operand is the current (speculative) value of~rM, the
|
2623 |
|
|
special mask register. Otherwise \.{MUX} is unexceptional.
|
2624 |
|
|
|
2625 |
|
|
@=
|
2626 |
|
|
case mux: data->x.o.h=(data->y.o.h&data->b.o.h)+(data->z.o.h&~data->b.o.h);
|
2627 |
|
|
data->x.o.l=(data->y.o.l&data->b.o.l)+(data->z.o.l&~data->b.o.l);
|
2628 |
|
|
break;
|
2629 |
|
|
|
2630 |
|
|
@ Comparisons are a breeze.
|
2631 |
|
|
|
2632 |
|
|
@=
|
2633 |
|
|
case cmp:@+if ((data->y.o.h&sign_bit)>(data->z.o.h&sign_bit)) goto cmp_neg;
|
2634 |
|
|
if ((data->y.o.h&sign_bit)<(data->z.o.h&sign_bit)) goto cmp_pos;
|
2635 |
|
|
case cmpu:@+if (data->y.o.h<data->z.o.h) goto cmp_neg;
|
2636 |
|
|
if (data->y.o.h>data->z.o.h) goto cmp_pos;
|
2637 |
|
|
if (data->y.o.l<data->z.o.l) goto cmp_neg;
|
2638 |
|
|
if (data->y.o.l>data->z.o.l) goto cmp_pos;
|
2639 |
|
|
cmp_zero: break; /* |data->x| is zero */
|
2640 |
|
|
cmp_pos: data->x.o.l=1;@+ break; /* |data->x.o.h| is zero */
|
2641 |
|
|
cmp_neg: data->x.o=neg_one;@+ break;
|
2642 |
|
|
|
2643 |
|
|
@ The other operations will be deferred until later, now that we understand
|
2644 |
|
|
the basic ideas. But one more piece of code ought to be
|
2645 |
|
|
written before we move on, because
|
2646 |
|
|
it completes the execution stage for the simple cases already considered.
|
2647 |
|
|
|
2648 |
|
|
The |ren_x| and |ren_a| fields tell us whether the |x| and/or |a|
|
2649 |
|
|
fields contain valid information that should become officially known.
|
2650 |
|
|
|
2651 |
|
|
@=
|
2652 |
|
|
fin_ex:@+if (data->ren_x) data->x.known=true;
|
2653 |
|
|
else if (data->mem_x) data->x.known=true, data->x.addr.l&=-8;
|
2654 |
|
|
if (data->ren_a) data->a.known=true;
|
2655 |
|
|
if (data->loc.h&sign_bit)
|
2656 |
|
|
data->ra.o.l=0; /* no trips enabled for the operating system */
|
2657 |
|
|
if (data->interrupt&0xffff) @;
|
2658 |
|
|
die: data->owner=NULL;@+goto terminate; /* this coroutine now fades away */
|
2659 |
|
|
|
2660 |
|
|
@* The commission/deissue stage. Control blocks leave the reorder buffer
|
2661 |
|
|
either at the hot end (when they're committed) or at the cool end
|
2662 |
|
|
(when they're deissued). We hope most of them are committed, but
|
2663 |
|
|
from time to time our speculation is incorrect and we must deissue
|
2664 |
|
|
a sequence of instructions that prove to be unwanted. Deissuing must
|
2665 |
|
|
take priority over committing, because the dispatcher cannot do anything
|
2666 |
|
|
until the machine's cool state has stabilized.
|
2667 |
|
|
|
2668 |
|
|
Deissuing changes the cool state by undoing the most recently issued
|
2669 |
|
|
instructions, in reverse order. Committing changes the hot state by
|
2670 |
|
|
doing the least recently issued instructions, in their original order.
|
2671 |
|
|
Both operations are similar, so we assume that they take the same time;
|
2672 |
|
|
at most |commit_max| instructions are deissued and/or committed on
|
2673 |
|
|
each clock cycle.
|
2674 |
|
|
|
2675 |
|
|
@=
|
2676 |
|
|
{
|
2677 |
|
|
cool=(cool==reorder_top? reorder_bot: cool+1);
|
2678 |
|
|
if (verbose&issue_bit) {
|
2679 |
|
|
printf("Deissuing ");@+print_control_block(cool);
|
2680 |
|
|
if (cool->owner) {@+printf(" ");@+print_coroutine_id(cool->owner);@+}
|
2681 |
|
|
printf("\n");
|
2682 |
|
|
}
|
2683 |
|
|
if (cool->ren_x) rename_regs++,spec_rem(&cool->x);
|
2684 |
|
|
if (cool->ren_a) rename_regs++,spec_rem(&cool->a);
|
2685 |
|
|
if (cool->mem_x) mem_slots++,spec_rem(&cool->x);
|
2686 |
|
|
if (cool->set_l) spec_rem(&cool->rl);
|
2687 |
|
|
if (cool->owner) {
|
2688 |
|
|
if (cool->owner->lockloc)
|
2689 |
|
|
*(cool->owner->lockloc)=NULL, cool->owner->lockloc=NULL;
|
2690 |
|
|
if (cool->owner->next) unschedule(cool->owner);
|
2691 |
|
|
}
|
2692 |
|
|
cool_O=cool->cur_O;@+ cool_S=cool->cur_S;
|
2693 |
|
|
deissues--;
|
2694 |
|
|
}
|
2695 |
|
|
|
2696 |
|
|
@ @=
|
2697 |
|
|
{
|
2698 |
|
|
if (nullifying) @@;
|
2699 |
|
|
else {
|
2700 |
|
|
if (hot->i==get && hot->zz==rQ)
|
2701 |
|
|
new_Q=oandn(g[rQ].o,hot->x.o);
|
2702 |
|
|
else if (hot->i==put && hot->xx==rQ)
|
2703 |
|
|
hot->x.o.h |= new_Q.h, hot->x.o.l |= new_Q.l;
|
2704 |
|
|
if (hot->mem_x) @;
|
2705 |
|
|
if (verbose&issue_bit) {
|
2706 |
|
|
printf("Committing ");@+print_control_block(hot);@+printf("\n");
|
2707 |
|
|
}
|
2708 |
|
|
if (hot->ren_x) rename_regs++,hot->x.up->o=hot->x.o,spec_rem(&(hot->x));
|
2709 |
|
|
if (hot->ren_a) rename_regs++,hot->a.up->o=hot->a.o,spec_rem(&(hot->a));
|
2710 |
|
|
if (hot->set_l) hot->rl.up->o=hot->rl.o,spec_rem(&(hot->rl));
|
2711 |
|
|
if (hot->arith_exc) g[rA].o.l |= hot->arith_exc;
|
2712 |
|
|
if (hot->usage) {
|
2713 |
|
|
g[rU].o.l++;@+ if (g[rU].o.l==0) {
|
2714 |
|
|
g[rU].o.h++;@+ if ((g[rU].o.h&0x7fff)==0) g[rU].o.h-=0x8000;
|
2715 |
|
|
}
|
2716 |
|
|
}
|
2717 |
|
|
}
|
2718 |
|
|
if (hot->interrupt>=H_BIT) @;
|
2719 |
|
|
}
|
2720 |
|
|
|
2721 |
|
|
@ A load or store instruction is ``nullified'' if it is about to be captured
|
2722 |
|
|
by a trap interrupt. In such cases it will be the only item in the reorder
|
2723 |
|
|
buffer; thus nullifying is sort of a cross between deissuing and
|
2724 |
|
|
committing. (It is important to have stopped dispatching when nullification
|
2725 |
|
|
is necessary, because instructions such as |incgamma| and
|
2726 |
|
|
|decgamma| change~rS, and we need to change it back when an unexpected
|
2727 |
|
|
interruption occurs.)
|
2728 |
|
|
|
2729 |
|
|
@=
|
2730 |
|
|
{
|
2731 |
|
|
if (verbose&issue_bit) {
|
2732 |
|
|
printf("Nullifying ");@+print_control_block(hot);@+printf("\n");
|
2733 |
|
|
}
|
2734 |
|
|
if (hot->ren_x) rename_regs++,spec_rem(&hot->x);
|
2735 |
|
|
if (hot->ren_a) rename_regs++,spec_rem(&hot->a);
|
2736 |
|
|
if (hot->mem_x) mem_slots++,spec_rem(&hot->x);
|
2737 |
|
|
if (hot->set_l) spec_rem(&hot->rl);
|
2738 |
|
|
cool_O=hot->cur_O, cool_S=hot->cur_S;
|
2739 |
|
|
nullifying=false;
|
2740 |
|
|
}
|
2741 |
|
|
|
2742 |
|
|
@ Interrupt bits in rQ might be lost if they are set between a \.{GET}
|
2743 |
|
|
and a~\.{PUT}. Therefore we don't allow \.{PUT} to zero out bits that
|
2744 |
|
|
have become~1 since the most recently committed \.{GET}.
|
2745 |
|
|
|
2746 |
|
|
@=
|
2747 |
|
|
octa new_Q; /* when rQ increases in any bit position, so should this */
|
2748 |
|
|
|
2749 |
|
|
@ An instruction will not be committed immediately if it violates the basic
|
2750 |
|
|
security rule of \MMIX: An instruction in a nonnegative location
|
2751 |
|
|
should not be performed unless all eight of the internal interrupts
|
2752 |
|
|
have been enabled in the interrupt mask register~rK.
|
2753 |
|
|
Conversely, an instruction in a negative location should not be performed
|
2754 |
|
|
if the |P_BIT| is enabled in~rK.
|
2755 |
|
|
|
2756 |
|
|
Such instructions take one extra cycle before they are committed.
|
2757 |
|
|
The nonnegative-location case turns on the |S_BIT| of both rK and~rQ\null,
|
2758 |
|
|
leading to an immediate interrupt (unless the current instruction
|
2759 |
|
|
is |trap|, |put|, or~|resume|).
|
2760 |
|
|
|
2761 |
|
|
@=
|
2762 |
|
|
{
|
2763 |
|
|
if (hot->loc.h&sign_bit) {
|
2764 |
|
|
if ((g[rK].o.h&P_BIT) && !(hot->interrupt&P_BIT)) {
|
2765 |
|
|
hot->interrupt |= P_BIT;
|
2766 |
|
|
g[rQ].o.h |= P_BIT;
|
2767 |
|
|
new_Q.h |= P_BIT;
|
2768 |
|
|
if (verbose&issue_bit) {
|
2769 |
|
|
printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
|
2770 |
|
|
}
|
2771 |
|
|
break;
|
2772 |
|
|
}
|
2773 |
|
|
}@+else if ((g[rK].o.h&0xff)!=0xff && !(hot->interrupt&S_BIT)) {
|
2774 |
|
|
hot->interrupt |= S_BIT;
|
2775 |
|
|
g[rQ].o.h |= S_BIT;
|
2776 |
|
|
new_Q.h |= S_BIT;
|
2777 |
|
|
g[rK].o.h |= S_BIT;
|
2778 |
|
|
if (verbose&issue_bit) {
|
2779 |
|
|
printf(" setting rQ=");@+print_octa(g[rQ].o);
|
2780 |
|
|
printf(", rK=");@+print_octa(g[rK].o);@+printf("\n");
|
2781 |
|
|
}
|
2782 |
|
|
break;
|
2783 |
|
|
}
|
2784 |
|
|
}
|
2785 |
|
|
|
2786 |
|
|
@* Branch prediction. An \MMIX\ programmer distinguishes statically between
|
2787 |
|
|
``branches'' and ``probable branches,'' but many modern computers attempt to
|
2788 |
|
|
do better by implementing dynamic branch prediction. (See, for example,
|
2789 |
|
|
section~4.3 of Hennessy and Patterson's {\sl Computer Architecture},
|
2790 |
|
|
second edition.) Experience has shown that dynamic branch prediction can
|
2791 |
|
|
@^Hennessy, John LeRoy@>
|
2792 |
|
|
@^Patterson, David Andrew@>
|
2793 |
|
|
significantly improve the performance of speculative execution, by
|
2794 |
|
|
reducing the number of instructions that need to be deissued.
|
2795 |
|
|
|
2796 |
|
|
This simulator has an optional |bp_table| containing $2^{\mkern1mua+b+c}$ entries of
|
2797 |
|
|
$n$~bits each, where $n$ is between 1 and~8. Usually $n$ is 1 or~2 in
|
2798 |
|
|
practice, but 8 bits are allocated per entry for convenience in this program.
|
2799 |
|
|
The |bp_table| is consulted and updated on every branch instruction
|
2800 |
|
|
(every \.{B}~or \.{PB} instruction, but not~\.{JMP}), for advice on
|
2801 |
|
|
past history of similar situations. It is indexed by the $a$ least
|
2802 |
|
|
significant bits of the address of the instruction, the $b$ most recent
|
2803 |
|
|
bits of global branch history, and the next $c$ bits of both address
|
2804 |
|
|
and history (exclusive-ored).
|
2805 |
|
|
|
2806 |
|
|
A |bp_table| entry begins at zero and is regarded as a signed $n$-bit number.
|
2807 |
|
|
If it is nonnegative, we will follow the prediction in the instruction,
|
2808 |
|
|
namely to predict a branch taken only in the \.{PB} case. If it is
|
2809 |
|
|
negative, we will predict the opposite of the instruction's recommendation.
|
2810 |
|
|
The $n$-bit number is increased (if possible) if the instruction's
|
2811 |
|
|
prediction was correct, decreased (if possible) if the instruction's
|
2812 |
|
|
prediction was incorrect.
|
2813 |
|
|
|
2814 |
|
|
(Incidentally, a large value of~$n$ is not necessarily a good idea.
|
2815 |
|
|
For example, if $n=8$ the machine might need 128 steps to
|
2816 |
|
|
recognize that a branch taken the first 150 times is not taken
|
2817 |
|
|
the next 150 times. And if we modify the update criteria to avoid this
|
2818 |
|
|
problem, we obtain a scheme that is rarely better than a simple scheme
|
2819 |
|
|
with smaller~$n$.)
|
2820 |
|
|
|
2821 |
|
|
The values $a$, $b$, $c$, and $n$ in this discussion are called
|
2822 |
|
|
|bp_a|, |bp_b|, |bp_c|, and |bp_n| in the program.
|
2823 |
|
|
|
2824 |
|
|
@=
|
2825 |
|
|
Extern int bp_a,bp_b,bp_c,bp_n; /* parameters for branch prediction */
|
2826 |
|
|
Extern char *bp_table; /* either |NULL| or an array of $2^{\mkern1mua+b+c}$ items */
|
2827 |
|
|
|
2828 |
|
|
@ Branch prediction is made when we are either about to issue an
|
2829 |
|
|
instruction or peeking ahead. We look at the |bp_table|, but we
|
2830 |
|
|
don't want to update it yet.
|
2831 |
|
|
|
2832 |
|
|
@=
|
2833 |
|
|
{
|
2834 |
|
|
predicted=op&0x10; /* start with the instruction's recommendation */
|
2835 |
|
|
if (bp_table) {@+register int h;
|
2836 |
|
|
m=((head->loc.l&bp_cmask)<<bp_b)+(head->loc.l&bp_amask);
|
2837 |
|
|
m=((cool_hist&bp_bcmask)<<bp_a)^(m>>2);
|
2838 |
|
|
h=bp_table[m];
|
2839 |
|
|
if (h&bp_npower) predicted^=0x10;
|
2840 |
|
|
}
|
2841 |
|
|
if (predicted) peek_hist=(peek_hist<<1)+1;
|
2842 |
|
|
else peek_hist<<=1;
|
2843 |
|
|
}
|
2844 |
|
|
|
2845 |
|
|
@ We update the |bp_table| when an instruction is issued.
|
2846 |
|
|
And we store the opposite table
|
2847 |
|
|
value in |cool->x.o.l|, just in case our prediction turns out to be wrong.
|
2848 |
|
|
|
2849 |
|
|
@=
|
2850 |
|
|
if (bp_table) {@+register int reversed,h,h_up,h_down;
|
2851 |
|
|
reversed=op&0x10;
|
2852 |
|
|
if (peek_hist&1) reversed^=0x10;
|
2853 |
|
|
m=((head->loc.l&bp_cmask)<<bp_b)+(head->loc.l&bp_amask);
|
2854 |
|
|
m=((cool_hist&bp_bcmask)<<bp_a)^(m>>2);
|
2855 |
|
|
h=bp_table[m];
|
2856 |
|
|
h_up=(h+1)&bp_nmask;@+ if (h_up==bp_npower) h_up=h;
|
2857 |
|
|
if (h==bp_npower) h_down=h;@+ else h_down=(h-1)&bp_nmask;
|
2858 |
|
|
if (reversed) {
|
2859 |
|
|
bp_table[m]=h_down, cool->x.o.l=h_up;
|
2860 |
|
|
cool->i=pbr+br-cool->i; /* reverse the sense */
|
2861 |
|
|
bp_rev_stat++;
|
2862 |
|
|
}@+else {
|
2863 |
|
|
bp_table[m]=h_up, cool->x.o.l=h_down; /* go with the flow */
|
2864 |
|
|
bp_ok_stat++;
|
2865 |
|
|
}
|
2866 |
|
|
if (verbose&show_pred_bit) {
|
2867 |
|
|
printf(" predicting ");@+print_octa(cool->loc);
|
2868 |
|
|
printf(" %s; bp[%x]=%d\n",reversed? "NG": "OK",m,
|
2869 |
|
|
bp_table[m]-((bp_table[m]&bp_npower)<<1));
|
2870 |
|
|
}
|
2871 |
|
|
cool->x.o.h=m;
|
2872 |
|
|
}
|
2873 |
|
|
|
2874 |
|
|
@ The calculations in the previous sections need several precomputed constants,
|
2875 |
|
|
depending on the parameters $a$, $b$, $c$, and~$n$.
|
2876 |
|
|
|
2877 |
|
|
@=
|
2878 |
|
|
bp_amask=((1<<bp_a)-1)<<2; /* least $a$ bits of instruction address */
|
2879 |
|
|
bp_cmask=((1<<bp_c)-1)<<(bp_a+2); /* the next $c$ address bits */
|
2880 |
|
|
bp_bcmask=(1<<(bp_b+bp_c))-1; /* least $b+c$ bits of history info */
|
2881 |
|
|
bp_nmask=(1<<bp_n)-1; /* least significant $n$ bits */
|
2882 |
|
|
bp_npower=1<<(bp_n-1); /* $2^{n-1}$, the sign bit of an $n$-bit number */
|
2883 |
|
|
|
2884 |
|
|
@ @=
|
2885 |
|
|
int bp_amask,bp_cmask,bp_bcmask,bp_nmask,bp_npower;
|
2886 |
|
|
int bp_rev_stat,bp_ok_stat; /* how often we overrode and agreed */
|
2887 |
|
|
int bp_bad_stat,bp_good_stat; /* how often we failed and succeeded */
|
2888 |
|
|
|
2889 |
|
|
@ After a branch or probable branch instruction has been issued and
|
2890 |
|
|
the value of the relevant register has been computed in the
|
2891 |
|
|
reorder buffer as |data->b.o|, we're ready to determine if the
|
2892 |
|
|
prediction was correct or not.
|
2893 |
|
|
|
2894 |
|
|
@=
|
2895 |
|
|
case br: case pbr: j=register_truth(data->b.o,data->op);
|
2896 |
|
|
if (j) data->go.o=data->z.o;@+ else data->go.o=data->y.o;
|
2897 |
|
|
if (j==(data->i==pbr)) bp_good_stat++;
|
2898 |
|
|
else { /* oops, misprediction */
|
2899 |
|
|
bp_bad_stat++;
|
2900 |
|
|
@;
|
2901 |
|
|
}
|
2902 |
|
|
goto fin_ex;
|
2903 |
|
|
|
2904 |
|
|
@ The |register_truth| subroutine is used by \.B, \.{PB}, \.{CS}, and
|
2905 |
|
|
\.{ZS} commands to decide whether an octabyte satisfies the
|
2906 |
|
|
conditions of the opcode, |data->op|.
|
2907 |
|
|
|
2908 |
|
|
@=
|
2909 |
|
|
static int register_truth @,@,@[ARGS((octa,mmix_opcode))@];
|
2910 |
|
|
|
2911 |
|
|
@ @=
|
2912 |
|
|
static int register_truth(o,op)
|
2913 |
|
|
octa o;
|
2914 |
|
|
mmix_opcode op;
|
2915 |
|
|
{@+register int b;
|
2916 |
|
|
switch ((op>>1) & 0x3) {
|
2917 |
|
|
case 0: b=o.h>>31;@+break; /* negative? */
|
2918 |
|
|
case 1: b=(o.h==0 && o.l==0);@+break; /* zero? */
|
2919 |
|
|
case 2: b=(o.h<sign_bit && (o.h||o.l));@+break; /* positive? */
|
2920 |
|
|
case 3: b=o.l&0x1;@+break; /* odd? */
|
2921 |
|
|
}
|
2922 |
|
|
if (op&0x8) return b^1;
|
2923 |
|
|
else return b;
|
2924 |
|
|
}
|
2925 |
|
|
|
2926 |
|
|
@ The |issued_between| subroutine determines how many speculative instructions
|
2927 |
|
|
were issued between a given control block in the reorder buffer and
|
2928 |
|
|
the current |cool| pointer, when |cc=cool|.
|
2929 |
|
|
|
2930 |
|
|
@=
|
2931 |
|
|
static int issued_between @,@,@[ARGS((control*,control*))@];
|
2932 |
|
|
|
2933 |
|
|
@ @=
|
2934 |
|
|
static int issued_between(c,cc)
|
2935 |
|
|
control *c,*cc;
|
2936 |
|
|
{
|
2937 |
|
|
if (c>cc) return c-1-cc;
|
2938 |
|
|
return (c-reorder_bot)+(reorder_top-cc);
|
2939 |
|
|
}
|
2940 |
|
|
|
2941 |
|
|
@ If more than one functional unit is able to process branch instructions and
|
2942 |
|
|
if two of them simultaneously discover misprediction, or if misprediction is
|
2943 |
|
|
detected by one unit just as another unit is generating an interrupt, we
|
2944 |
|
|
assume that an arbitration takes place so that only the hottest one actually
|
2945 |
|
|
deissues the cooler instructions.
|
2946 |
|
|
|
2947 |
|
|
Changes to the |bp_table| aren't undone when they were made on speculation in
|
2948 |
|
|
an instruction being deissued; nor do we worry about cases where the same
|
2949 |
|
|
|bp_table| entry is being updated by two or more active coroutines. After all,
|
2950 |
|
|
the |bp_table| is just a heuristic, not part of the real computation.
|
2951 |
|
|
We correct the |bp_table| only if we discover that a prediction was wrong, so
|
2952 |
|
|
that we will be less likely to make the same mistake later.
|
2953 |
|
|
|
2954 |
|
|
@=
|
2955 |
|
|
i=issued_between(data,cool);
|
2956 |
|
|
if (i<deissues) goto die;
|
2957 |
|
|
deissues=i;
|
2958 |
|
|
old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
|
2959 |
|
|
@;
|
2960 |
|
|
inst_ptr.o=data->go.o, inst_ptr.p=NULL;
|
2961 |
|
|
if (!(data->loc.h&sign_bit)) {
|
2962 |
|
|
if (inst_ptr.o.h&sign_bit) data->interrupt |= P_BIT;
|
2963 |
|
|
else data->interrupt &=~P_BIT;
|
2964 |
|
|
}
|
2965 |
|
|
if (bp_table) {
|
2966 |
|
|
bp_table[data->x.o.h]=data->x.o.l; /* this is what we should have stored */
|
2967 |
|
|
if (verbose&show_pred_bit) {
|
2968 |
|
|
printf(" mispredicted ");@+print_octa(data->loc);
|
2969 |
|
|
printf("; bp[%x]=%d\n",data->x.o.h,
|
2970 |
|
|
data->x.o.l-((data->x.o.l&bp_npower)<<1));
|
2971 |
|
|
}
|
2972 |
|
|
}
|
2973 |
|
|
cool_hist=(j? (data->hist<<1)+1: data->hist<<1);
|
2974 |
|
|
|
2975 |
|
|
@ @=
|
2976 |
|
|
Extern void print_stats @,@,@[ARGS((void))@];
|
2977 |
|
|
|
2978 |
|
|
@ @=
|
2979 |
|
|
void print_stats()
|
2980 |
|
|
{
|
2981 |
|
|
register int j;
|
2982 |
|
|
if (bp_table)
|
2983 |
|
|
printf("Predictions: %d in agreement, %d in opposition; %d good, %d bad\n",
|
2984 |
|
|
bp_ok_stat,bp_rev_stat,bp_good_stat,bp_bad_stat);
|
2985 |
|
|
else printf("Predictions: %d good, %d bad\n",bp_good_stat,bp_bad_stat);
|
2986 |
|
|
printf("Instructions issued per cycle:\n");
|
2987 |
|
|
for (j=0;j<=dispatch_max;j++)
|
2988 |
|
|
printf(" %d %d\n",j,dispatch_stat[j]);
|
2989 |
|
|
}
|
2990 |
|
|
|
2991 |
|
|
@* Cache memory. It's time now to consider \MMIX's MMU, the memory management
|
2992 |
|
|
unit. This part of the machine deals with the critical problem of getting data
|
2993 |
|
|
to and from the computational units. In a RISC architecture all interaction
|
2994 |
|
|
between main memory and the computer registers is specified by load and store
|
2995 |
|
|
instructions; thus memory accesses are much easier to deal with than they
|
2996 |
|
|
would be on a machine with more complex kinds of interaction. But memory
|
2997 |
|
|
management is still difficult, if we want to do it well, because main memory
|
2998 |
|
|
typically operates at a much slower speed than the registers do. High-speed
|
2999 |
|
|
implementations of \MMIX\ introduce intermediate ``caches'' of storage in
|
3000 |
|
|
order to keep the most important data accessible, and cache maintenance can be
|
3001 |
|
|
complicated when all the details are taken into account.
|
3002 |
|
|
(See, for example, Chapter 5 of Hennessy and Patterson's
|
3003 |
|
|
{\sl Computer Architecture}, second edition.)
|
3004 |
|
|
@^Hennessy, John LeRoy@>
|
3005 |
|
|
@^Patterson, David Andrew@>
|
3006 |
|
|
@^caches@>
|
3007 |
|
|
|
3008 |
|
|
This simulator can be configured to have up to three auxiliary caches between
|
3009 |
|
|
registers and memory: An I-cache for instructions, a D-cache for data, and an
|
3010 |
|
|
S-cache for both instructions and data. The S-cache, also called a {\it
|
3011 |
|
|
secondary cache}, is supported only if both I-cache and D-cache are present.
|
3012 |
|
|
Arbitrary access times for each cache can be specified independently;
|
3013 |
|
|
we might assume, for example, that data items in the I-cache or D-cache can
|
3014 |
|
|
be sent to a register in one or two clock cycles, but the access time for the
|
3015 |
|
|
S-cache might be say 5 cycles, and main memory might require 20 cycles or more.
|
3016 |
|
|
Our speculative pipeline can have many functional units handling load
|
3017 |
|
|
and store instructions, but only one load or store instruction can be
|
3018 |
|
|
updating the D-cache or S-cache or main memory at a time. (However, the
|
3019 |
|
|
D-cache can have several read ports; furthermore, data might
|
3020 |
|
|
be passing between the S-cache and memory while other data is passing
|
3021 |
|
|
between the reorder buffer and the D-cache.)
|
3022 |
|
|
|
3023 |
|
|
Besides the optional I-cache, D-cache, and S-cache, there are required caches
|
3024 |
|
|
called the IT-cache and DT-cache, for translation of virtual addresses to
|
3025 |
|
|
physical addresses. A translation cache is often called a ``translation
|
3026 |
|
|
@^TLB@>
|
3027 |
|
|
@^translation caches@>
|
3028 |
|
|
lookaside buffer'' or TLB; but we call it a cache since it is implemented in
|
3029 |
|
|
nearly the same way as an I-cache.
|
3030 |
|
|
|
3031 |
|
|
@ Consider a cache that has blocks of $2^b$~bytes each and
|
3032 |
|
|
associativity~$2^a$; here $b\ge3$ and $a\ge0$. The I-cache, D-cache, and
|
3033 |
|
|
S-cache are addressed by 48-bit physical addresses, as if they were part of
|
3034 |
|
|
main memory; but the IT and DT caches are addressed by 64-bit keys, obtained
|
3035 |
|
|
from a virtual address by blanking out the lower $s$ bits and inserting the
|
3036 |
|
|
value of~$n$, where the page size~$s$ and the process number~$n$ are found
|
3037 |
|
|
in~rV. We will consider all caches to be addressed by 64-bit keys, so that
|
3038 |
|
|
both cases are handled with the same basic methods.
|
3039 |
|
|
|
3040 |
|
|
Given a 64-bit key,
|
3041 |
|
|
we ignore the low-order $b$~bits and use the next $c$~bits
|
3042 |
|
|
to address the {\it cache set\/}; then the remaining $64-b-c$ bits should
|
3043 |
|
|
match one of $2^a$ {\it tags\/} in that set. The case $a=0$ corresponds to a
|
3044 |
|
|
so-called {\it direct-mapped\/} cache; the case $c=0$ corresponds to a
|
3045 |
|
|
so-called {\it fully associative\/} cache. With $2^c$ sets of $2^a$ blocks
|
3046 |
|
|
each, and $2^b$ bytes per block, the cache contains $2^{a+b+c}$ bytes of data,
|
3047 |
|
|
in addition to the space needed for tags. Translation caches have $b=3$ and
|
3048 |
|
|
they also usually have $c=0$.
|
3049 |
|
|
|
3050 |
|
|
If a tag matches the specified bits, we ``hit'' in the cache and can
|
3051 |
|
|
use and/or update the data found there. Otherwise we ``miss,'' and
|
3052 |
|
|
we probably want to replace one of the cache blocks by the block containing
|
3053 |
|
|
the item sought. The item chosen for replacement is called a {\it victim}.
|
3054 |
|
|
The choice of victim is forced when the cache is direct-mapped, but four
|
3055 |
|
|
strategies for victim selection are available when we must choose from
|
3056 |
|
|
among $2^a$ entries for $a>0$:
|
3057 |
|
|
|
3058 |
|
|
\smallskip\textindent{$\bullet$} ``Random'' selection chooses the victim
|
3059 |
|
|
by extracting the least significant $a$~bits of the clock.
|
3060 |
|
|
|
3061 |
|
|
\smallskip\textindent{$\bullet$} ``Serial'' selection chooses 0, 1, \dots,
|
3062 |
|
|
$2^a-1$, 0, 1, \dots, $2^a-1$, 0, \dots~on successive trials.
|
3063 |
|
|
|
3064 |
|
|
\smallskip\textindent{$\bullet$} ``LRU (Least Recently Used)'' selection
|
3065 |
|
|
chooses the victim that ranks last if items are ranked inversely to the time
|
3066 |
|
|
that has elapsed since their previous use.
|
3067 |
|
|
|
3068 |
|
|
\smallskip\textindent{$\bullet$} ``Pseudo-LRU'' selection chooses the
|
3069 |
|
|
victim by a rough approximation to LRU that is simpler to implement
|
3070 |
|
|
in hardware. It requires a bit table $r_1\ldots r_{2^a-1}$.
|
3071 |
|
|
Whenever we use an item
|
3072 |
|
|
with binary address $(i_1\ldots i_a)_2$ in the set, we adjust the
|
3073 |
|
|
bit table as follows:
|
3074 |
|
|
$$r_1\gets1-i_1,\quad r_{1i_1}\gets1-i_2,\quad\ldots,\quad
|
3075 |
|
|
r_{1i_1\ldots i_{a-1}}\gets1-i_a;$$
|
3076 |
|
|
here the subscripts on~$r$ are binary numbers. (For example, when $a=3$,
|
3077 |
|
|
the use of element $(010)_2$ sets $r_1\gets1$, $r_{10}\gets0$, $r_{101}\gets1$,
|
3078 |
|
|
where $r_{101}$ means the same as $r_5$.) To select a victim, we start with
|
3079 |
|
|
$l\gets1$ and then repeatedly set $l\gets2l+r_l$, $a$~times; then we
|
3080 |
|
|
choose element $l-2^a$. When $a=1$, this scheme is equivalent to LRU.
|
3081 |
|
|
When $a=2$, this scheme was implemented in the Intel 80486 chip.
|
3082 |
|
|
|
3083 |
|
|
@=
|
3084 |
|
|
typedef enum {@!random,@!serial,@!pseudo_lru,@!lru} replace_policy;
|
3085 |
|
|
|
3086 |
|
|
@ A cache might also include a ``victim'' area, which contains the
|
3087 |
|
|
last $2^v$ victim blocks removed from the main cache area. The victim
|
3088 |
|
|
area can be searched in parallel with the specified cache set, thereby
|
3089 |
|
|
increasing the chance of a hit without making the search go slower.
|
3090 |
|
|
Each of the four replacement policies can be used also in the victim cache.
|
3091 |
|
|
|
3092 |
|
|
@ A cache also has a {\it granularity\/} $2^g$, where $b\ge g\ge3$. This
|
3093 |
|
|
means that we maintain, for each cache block, a set of $2^{b-g}$ ``dirty
|
3094 |
|
|
bits,'' which identify the $2^g$-byte groups that have possibly changed since
|
3095 |
|
|
they were last read from memory. Thus if $g=b$, an entire cache block is
|
3096 |
|
|
either dirty or clean; if $g=3$, the dirtiness of each octabyte is maintained
|
3097 |
|
|
separately.
|
3098 |
|
|
|
3099 |
|
|
Two policies are available when new data is written into all or part
|
3100 |
|
|
of a cache block. We can {\it write-through}, meaning that we send all new data
|
3101 |
|
|
to memory immediately and never mark anything dirty; or we can {\it
|
3102 |
|
|
write-back}, meaning that we update the memory from the cache only when
|
3103 |
|
|
absolutely necessary. Furthermore we can {\it write-allocate},
|
3104 |
|
|
meaning that we keep the new data in the cache, even if the cache block being
|
3105 |
|
|
written has to be fetched first because of a miss; or we can {\it
|
3106 |
|
|
write-around}, meaning that we keep the new data only if it was part of an
|
3107 |
|
|
existing cache block.
|
3108 |
|
|
|
3109 |
|
|
(In this discussion, ``memory'' is shorthand for ``the next level
|
3110 |
|
|
of the memory hierarchy''; if there is an S-cache, the I-cache and
|
3111 |
|
|
D-cache write new data to the S-cache, not directly to memory. The I-cache,
|
3112 |
|
|
IT-cache, and DT-cache are read-only, so they do not need the facilities
|
3113 |
|
|
discussed in this section. Moreover, the D-cache and S-cache can be assumed to
|
3114 |
|
|
have the same granularity.)
|
3115 |
|
|
|
3116 |
|
|
@
|
3117 |
|
|
#define WRITE_BACK 1 /* use this if not write-through */
|
3118 |
|
|
#define WRITE_ALLOC 2 /* use this if not write-around */
|
3119 |
|
|
|
3120 |
|
|
@ We have seen that many flavors of cache can be simulated. They are
|
3121 |
|
|
represented by \&{cache} structures, containing arrays of \&{cacheset}
|
3122 |
|
|
structures that contain arrays of \&{cacheblock} structures
|
3123 |
|
|
for the individual blocks. We use a full byte to store each |dirty| bit,
|
3124 |
|
|
and we use full integer words to store |rank| fields for LRU processing, etc.;
|
3125 |
|
|
memory economy is less important than simplicity in this simulator.
|
3126 |
|
|
|
3127 |
|
|
@=
|
3128 |
|
|
typedef struct{
|
3129 |
|
|
octa tag; /* bits of key not included in the cache block address */
|
3130 |
|
|
char *dirty; /* array of $2^{b-g}$ dirty bits, one per granule */
|
3131 |
|
|
octa *data; /* array of $2^{b-3}$ octabytes, the data in a cache block */
|
3132 |
|
|
int rank; /* auxiliary information for non-|random| policies */
|
3133 |
|
|
} cacheblock;
|
3134 |
|
|
@#
|
3135 |
|
|
typedef cacheblock *cacheset; /* array of $2^a$ or $2^v$ blocks */
|
3136 |
|
|
@#
|
3137 |
|
|
typedef struct{
  int a,b,c,g,v; /* lg of associativity, blocksize, setsize, granularity,
                      and victimsize */
  int aa,bb,cc,gg,vv; /* associativity, blocksize, setsize, granularity,
                      and victimsize (all powers of~2) */
  int tagmask; /* $-2^{b+c}$ */
  replace_policy repl,vrepl; /* how to choose victims and victim-victims */
  int mode; /* optional |WRITE_BACK| and/or |WRITE_ALLOC| */
  int access_time; /* cycles to know if there's a hit */
  int copy_in_time; /* cycles to copy a new block into the cache */
  int copy_out_time; /* cycles to copy an old block from the cache */
  cacheset *set; /* array of $2^c$ sets of arrays of cache blocks */
  cacheset victim; /* the victim cache, if present */
  coroutine filler; /* a coroutine for copying new blocks into the cache */
  control filler_ctl; /* its control block */
  coroutine flusher; /* a coroutine for writing dirty old data
                      from the cache */
  control flusher_ctl; /* its control block */
  cacheblock inbuf; /* filling comes from here */
  cacheblock outbuf; /* flushing goes to here */
  lockvar lock; /* nonzero when the cache is being changed significantly */
  lockvar fill_lock; /* nonzero when filler should pass data back */
  int ports; /* how many coroutines can be reading the cache? */
  coroutine *reader; /* array of coroutines that might be reading
                      simultaneously */
  char *name; /* |"Icache"|, for example */
} cache;
|
3164 |
|
|
|
3165 |
|
|
@ @=
|
3166 |
|
|
Extern cache *Icache, *Dcache, *Scache, *ITcache, *DTcache;
|
3167 |
|
|
|
3168 |
|
|
@ Now we are ready to define some basic subroutines for cache maintenance.
|
3169 |
|
|
Let's begin with a trivial routine that tests if a given cache block is dirty.
|
3170 |
|
|
|
3171 |
|
|
@=
|
3172 |
|
|
static bool is_dirty @,@,@[ARGS((cache*,cacheblock*))@];
|
3173 |
|
|
|
3174 |
|
|
@ @=
|
3175 |
|
|
/* Report whether any granule of block |p| in cache |c| is marked dirty. */
static bool is_dirty(c,p)
  cache *c; /* the cache containing it */
  cacheblock *p; /* a cache block */
{
  register int j;
  register char *d=p->dirty;
  /* |j| counts bytes of the block; |d| advances one flag per granule */
  for (j=0;j<c->bb;d++,j+=c->gg) if (*d) return true;
  return false;
}
|
3184 |
|
|
|
3185 |
|
|
@ For diagnostic purposes we might want to display an entire cache block.
|
3186 |
|
|
|
3187 |
|
|
@=
|
3188 |
|
|
static void print_cache_block @,@,@[ARGS((cacheblock,cache*))@];
|
3189 |
|
|
|
3190 |
|
|
@ @=
|
3191 |
|
|
/* Print the tag of block |p|, then each of its octabytes followed by
   `\.*' if the granule containing it is dirty, then the rank. */
static void print_cache_block(p,c)
  cacheblock p;
  cache *c;
{@+register int i,j,b=c->bb>>3,g=c->gg>>3;
       /* |b| octabytes per block, |g| octabytes per granule */
  printf("%08x%08x: ",p.tag.h,p.tag.l);
  for (i=j=0; j<b; j++,i+=((j&(g-1))?0:1))
       /* |i| moves to the next dirty flag when |j| crosses a granule */
    printf("%08x%08x%c",p.data[j].h,p.data[j].l,p.dirty[i]?'*':' ');
  printf(" (%d)\n",p.rank);
}
|
3200 |
|
|
|
3201 |
|
|
@ @=
|
3202 |
|
|
static void print_cache_locks @,@,@[ARGS((cache*))@];
|
3203 |
|
|
|
3204 |
|
|
@ @=
|
3205 |
|
|
/* Report who holds |c->lock| and |c->fill_lock|, if anybody does;
   a null |c| is silently ignored. */
static void print_cache_locks(c)
  cache *c;
{
  if (c) {
    if (c->lock) printf("%s locked by %s:%d\n",
                     c->name,c->lock->name,c->lock->stage);
    if (c->fill_lock) printf("%sfill locked by %s:%d\n",
                     c->name,c->fill_lock->name,c->fill_lock->stage);
  }
}
|
3215 |
|
|
|
3216 |
|
|
@ The |print_cache| routine prints the entire contents of a cache. This can be
|
3217 |
|
|
a huge amount of data, but it can be very useful when debugging. Fortunately,
|
3218 |
|
|
the task of debugging favors the use of small caches, since interesting cases
|
3219 |
|
|
arise more often when a cache is fairly small.
|
3220 |
|
|
|
3221 |
|
|
@=
|
3222 |
|
|
Extern void print_cache @,@,@[ARGS((cache*,bool))@];
|
3223 |
|
|
|
3224 |
|
|
@ @=
|
3225 |
|
|
/* Print a heading for cache |c|, mention any fill or flush in progress,
   and then list the cache blocks; a null |c| prints nothing. */
void print_cache(c,dirty_only)
  cache *c;
  bool dirty_only;
{
  if (c) {@+register int i,j;
    printf("%s of %s:",dirty_only?"Dirty blocks":"Contents",c->name);
    if (c->filler.next) {
      printf(" (filling ");
      /* names with `\.T' in second position take the key from |y| */
      print_octa(c->name[1]=='T'? c->filler_ctl.y.o: c->filler_ctl.z.o);
      printf(")");
    }
    if (c->flusher.next) {
      printf(" (flushing ");
      print_octa(c->outbuf.tag);
      printf(")");
    }
    printf("\n");
    @<Print all of |c|'s cache blocks@>;
  }
}
|
3245 |
|
|
|
3246 |
|
|
@ We don't print the cache blocks that have an invalid tag, unless
|
3247 |
|
|
requested to be verbose.
|
3248 |
|
|
|
3249 |
|
|
@<Print all of |c|'s cache blocks@>=
for (i=0;i<c->cc;i++) for (j=0;j<c->aa;j++)
  /* a set sign bit in |tag.h| marks an invalid block */
  if ((!(c->set[i][j].tag.h&sign_bit)||(verbose&show_wholecache_bit))&&@|
      (!dirty_only || is_dirty(c,&c->set[i][j]))) {
    printf("[%d][%d] ",i,j);
    print_cache_block(c->set[i][j],c);
  }
for (j=0;j<c->vv;j++) /* now the victim area, by the same rules */
  if ((!(c->victim[j].tag.h&sign_bit)||(verbose&show_wholecache_bit))&&@|
      (!dirty_only || is_dirty(c,&c->victim[j]))) {
    printf("V[%d] ",j);
    print_cache_block(c->victim[j],c);
  }
|
3262 |
|
|
|
3263 |
|
|
@ The |clean_block| routine simply initializes a given cache block.
|
3264 |
|
|
|
3265 |
|
|
@=
|
3266 |
|
|
Extern void clean_block @,@,@[ARGS((cache*,cacheblock*))@];
|
3267 |
|
|
|
3268 |
|
|
@ @=
|
3269 |
|
|
/* Give block |p| of cache |c| an invalid tag, zero data, and no dirty
   flags. */
void clean_block(c,p)
  cache *c;
  cacheblock *p;
{
  register int j;
  p->tag.h=sign_bit, p->tag.l=0; /* the sign bit makes the tag invalid */
  for (j=0;j<c->bb>>3;j++) p->data[j]=zero_octa; /* one per octabyte */
  for (j=0;j<c->bb>>c->g;j++) p->dirty[j]=false; /* one per granule */
}
|
3278 |
|
|
|
3279 |
|
|
@ The |zap_cache| routine invalidates all tags of a given cache,
|
3280 |
|
|
effectively restoring it to its initial condition.
|
3281 |
|
|
|
3282 |
|
|
@=
|
3283 |
|
|
Extern void zap_cache @,@,@[ARGS((cache*))@];
|
3284 |
|
|
|
3285 |
|
|
@ We clear the |dirty| entries here, just to be tidy, although
|
3286 |
|
|
they could actually be left in arbitrary condition when the tags are invalid.
|
3287 |
|
|
|
3288 |
|
|
@=
|
3289 |
|
|
/* Apply |clean_block| to every block of cache |c|, in the main area
   and in the victim area. */
void zap_cache(c)
  cache *c;
{
  register int i,j;
  for (i=0;i<c->cc;i++) for (j=0;j<c->aa;j++) {
    clean_block(c,&(c->set[i][j]));
  }
  for (j=0;j<c->vv;j++) {
    clean_block(c,&(c->victim[j]));
  }
}
|
3300 |
|
|
|
3301 |
|
|
@ The |get_reader| subroutine finds the index of
|
3302 |
|
|
an available reader coroutine for a given cache, or returns a negative value
|
3303 |
|
|
if no readers are available.
|
3304 |
|
|
|
3305 |
|
|
@=
|
3306 |
|
|
static int get_reader @,@,@[ARGS((cache*))@];
|
3307 |
|
|
|
3308 |
|
|
@ @=
|
3309 |
|
|
/* Return the index of the first reader coroutine of |c| whose |next|
   field is null, or $-1$ if all |c->ports| readers are in use. */
static int get_reader(c)
  cache *c;
{@+ register int j;
  for (j=0;j<c->ports;j++)
    if (c->reader[j].next==NULL) return j;
  return -1;
}
|
3316 |
|
|
|
3317 |
|
|
@ The subroutine |copy_block(c,p,cc,pp)| copies the dirty
|
3318 |
|
|
items from block~|p| of cache~|c| into block~|pp| of cache~|cc|, assuming
|
3319 |
|
|
that the destination cache has a sufficiently large block size.
|
3320 |
|
|
(In other words, we assume that |cc->b>=c->b|.) We also assume that both
|
3321 |
|
|
blocks have compatible tags, and that both caches have the same granularity.
|
3322 |
|
|
|
3323 |
|
|
@=
|
3324 |
|
|
static void copy_block @,@,@[ARGS((cache*,cacheblock*,cache*,cacheblock*))@];
|
3325 |
|
|
|
3326 |
|
|
@ @=
|
3327 |
|
|
/* Copy the dirty granules of block |p| (cache |c|) into the matching
   positions of block |pp| (cache |cc|), marking them dirty there too.
   Panics if the granularities or tags are incompatible. */
static void copy_block(c,p,cc,pp)
  cache *c,*cc;
  cacheblock *p,*pp;
{
  register int j,jj,i,ii,lim; register int off=p->tag.l&(cc->bb-1);
       /* |off| is the byte offset of |p| within the larger block |pp| */
  if (c->g!=cc->g || p->tag.h!=pp->tag.h || p->tag.l-off!=pp->tag.l)
    panic(confusion("copy block"));
  for (j=0,jj=off>>c->g;j<c->bb>>c->g;j++,jj++) if (p->dirty[j]) {
       /* granule |j| of |p| corresponds to granule |jj| of |pp| */
    pp->dirty[jj]=true;
    for (i=j<<(c->g-3),ii=jj<<(c->g-3),lim=(j+1)<<(c->g-3);
      i<lim;i++,ii++) pp->data[ii]=p->data[i];
       /* move the $2^{g-3}$ octabytes of this granule */
  }
}
|
3340 |
|
|
|
3341 |
|
|
@ The |choose_victim| subroutine selects the victim to be replaced when we
|
3342 |
|
|
need to change a cache~set. We need only one bit of the |rank| fields to
|
3343 |
|
|
implement the $r$~table when |policy=pseudo_lru|,
|
3344 |
|
|
and we don't need |rank| at all when |policy=random|. Of course we use an
|
3345 |
|
|
$a$-bit counter to implement |policy=serial|. In the other case,
|
3346 |
|
|
|policy=lru|, we need an $a$-bit |rank| field; the least recently used entry
|
3347 |
|
|
has rank~0, and the most recently used entry has rank~$2^a-1=|aa|-1$.
|
3348 |
|
|
|
3349 |
|
|
@=
|
3350 |
|
|
static cacheblock* choose_victim @,@,@[ARGS((cacheset,int,replace_policy))@];
|
3351 |
|
|
|
3352 |
|
|
@ @=
|
3353 |
|
|
/* Select the block of set |s| (which has |aa| blocks) that should be
   replaced next, according to |policy|. */
static cacheblock* choose_victim(s,aa,policy)
  cacheset s;
  int aa; /* setsize */
  replace_policy policy;
{
  register cacheblock *p;
  register int l,m;
  switch (policy) {
 case random: return &s[ticks.l&(aa-1)];
 case serial: l=s[0].rank;@+ s[0].rank=(l+1)&(aa-1);@+ return &s[l];
       /* |s[0].rank| serves as the round-robin counter */
 case lru: for (p=s;p<s+aa;p++)
     if (p->rank==0) return p;
   panic(confusion("lru victim")); /* what happened? nobody has rank zero */
 case pseudo_lru: for (l=1,m=aa>>1; m; m>>=1) l=l+l+s[l].rank;
       /* descend from node~1, going to child |2l+rank| at each level */
   return &s[l-aa];
  }
}
|
3370 |
|
|
|
3371 |
|
|
@ The |note_usage| subroutine updates the |rank| entries to record the
|
3372 |
|
|
fact that a particular block in a cache set is now being used.
|
3373 |
|
|
|
3374 |
|
|
@=
|
3375 |
|
|
static void note_usage @,@,@[ARGS((cacheblock*,cacheset,int,replace_policy))@];
|
3376 |
|
|
|
3377 |
|
|
@ @=
|
3378 |
|
|
/* Record that block |l| of set |s| has just been used. Nothing is done
   for one-block sets or for policies not exceeding |serial|. */
static void note_usage(l,s,aa,policy)
  cacheblock *l; /* a cache block that's probably worth preserving */
  cacheset s; /* the set that contains $l$ */
  int aa; /* setsize */
  replace_policy policy;
{
  register cacheblock *p;
  register int j,m,r;
  if (aa==1 || policy<=serial) return;
  if (policy==lru) {
    r=l->rank;
    for (p=s;p<s+aa;p++) if (p->rank>r) p->rank--;
       /* everything ranked above |l| slips down one place */
    l->rank=aa-1;
  } else { /* |policy==pseudo_lru| */
    r=l-s;
    for (j=1,m=aa>>1;m;m>>=1)
       /* set each bit on the path to |l| to point away from |l| */
      if (r&m) s[j].rank=0,j=j+j+1;
      else s[j].rank=1, j=j+j;
  }
  return;
}
|
3399 |
|
|
|
3400 |
|
|
@ The |demote_usage| subroutine is sort of the opposite of |note_usage|;
|
3401 |
|
|
it changes the rank of a given block to {\it least\/} recently used.
|
3402 |
|
|
|
3403 |
|
|
@=
|
3404 |
|
|
static void demote_usage @,@,@[ARGS((cacheblock*,cacheset,int,replace_policy))@];
|
3405 |
|
|
|
3406 |
|
|
@ @=
|
3407 |
|
|
/* Make block |l| of set |s| the preferred victim. Nothing is done
   for one-block sets or for policies not exceeding |serial|. */
static void demote_usage(l,s,aa,policy)
  cacheblock *l; /* a cache block we probably don't need */
  cacheset s; /* the set that contains $l$ */
  int aa; /* setsize */
  replace_policy policy;
{
  register cacheblock *p;
  register int j,m,r;
  if (aa==1 || policy<=serial) return;
  if (policy==lru) {
    r=l->rank;
    for (p=s;p<s+aa;p++) if (p->rank<r) p->rank++;
       /* everything ranked below |l| moves up one place */
    l->rank=0;
  } else { /* |policy==pseudo_lru| */
    r=l-s;
    for (j=1,m=aa>>1;m;m>>=1)
       /* set each bit on the path to |l| to point toward |l| */
      if (r&m) s[j].rank=1,j=j+j+1;
      else s[j].rank=0, j=j+j;
  }
  return;
}
|
3428 |
|
|
|
3429 |
|
|
@ The |cache_search| routine looks for a given key $\alpha$
|
3430 |
|
|
in a given cache, and returns a cache block if there's a hit; otherwise
|
3431 |
|
|
it returns~|NULL|. If the search hits, the set in which the block was
|
3432 |
|
|
found is stored in global variable |hit_set|. Notice that we need to check
|
3433 |
|
|
more bits of the tag when we search in the victim area.
|
3434 |
|
|
|
3435 |
|
|
@d cache_addr(c,alf) c->set[(alf.l&~(c->tagmask))>>c->b]
|
3436 |
|
|
|
3437 |
|
|
@=
|
3438 |
|
|
static cacheblock* cache_search @,@,@[ARGS((cache*,octa))@];
|
3439 |
|
|
|
3440 |
|
|
@ @=
|
3441 |
|
|
/* Look for key |alf| in cache |c|: first in the set that |alf| maps to,
   then in the victim area if there is one. On a hit, |hit_set| is set
   to the set searched and the block is returned; otherwise |NULL|. */
static cacheblock* cache_search(c,alf)
  cache *c; /* the cache to be searched */
  octa alf; /* the key */
{
  register cacheset s;
  register cacheblock* p;
  s=cache_addr(c,alf); /* the set corresponding to |alf| */
  for (p=s;p<s+c->aa;p++)
    if (((p->tag.l ^ alf.l)&c->tagmask)==0 && p->tag.h==alf.h) goto hit;
  s=c->victim;
  if (!s) return NULL; /* cache miss, and no victim area */
  for (p=s;p<s+c->vv;p++)
       /* here the mask |-c->bb| keeps the set-index bits in the test */
    if (((p->tag.l^alf.l)&(-c->bb))==0 && p->tag.h==alf.h) goto hit;
  return NULL; /* double miss */
 hit: hit_set=s;@+ return p;
}
|
3457 |
|
|
|
3458 |
|
|
@ @=
|
3459 |
|
|
cacheset hit_set;
|
3460 |
|
|
|
3461 |
|
|
@ If |p=cache_search(c,alf)| hits and if we call |use_and_fix(c,p)|
|
3462 |
|
|
immediately afterwards, cache~|c| is updated to record the usage of
|
3463 |
|
|
key~|alf|. A hit in the victim area moves the cache block to the main area,
|
3464 |
|
|
unless the |filler| routine of cache~|c| is active.
|
3465 |
|
|
A pointer to the (possibly moved) cache block is returned.
|
3466 |
|
|
|
3467 |
|
|
@=
|
3468 |
|
|
static cacheblock* use_and_fix @,@,@[ARGS((cache*,cacheblock*))@];
|
3469 |
|
|
|
3470 |
|
|
@ @=
|
3471 |
|
|
/* Record the usage of block |p|, which was found in |hit_set|. If |p|
   lies in the victim area and the filler is idle, |p| trades places with
   a victim |q| of its main-area set, and |q| is returned instead. */
static cacheblock *use_and_fix(c,p)
  cache *c;
  cacheblock *p;
{
  if (hit_set!=c->victim) note_usage(p,hit_set,c->aa,c->repl);
  else { note_usage(p,hit_set,c->vv,c->vrepl); /* found in victim cache */
    if (!c->filler.next) {
      register cacheset s=cache_addr(c,p->tag);
      register cacheblock *q=choose_victim(s,c->aa,c->repl);
      note_usage(q,s,c->aa,c->repl);
      @<Swap cache blocks |p| and |q|@>;
      return q;
    }
  }
  return p;
}
|
3487 |
|
|
|
3488 |
|
|
@ We can simply permute the pointers inside the cacheblock structures of a
|
3489 |
|
|
cache, instead of copying the data, if we are careful not to let any of those
|
3490 |
|
|
pointers escape into other data structures.
|
3491 |
|
|
|
3492 |
|
|
@<Swap cache blocks |p| and |q|@>=
{
  octa t;
  register char *d=p->dirty;
  register octa *dd=p->data;
  t=p->tag;@+p->tag=q->tag;@+q->tag=t;
  p->dirty=q->dirty;@+q->dirty=d; /* exchange pointers, not contents */
  p->data=q->data;@+q->data=dd; /* the |rank| fields stay where they are */
}
|
3501 |
|
|
|
3502 |
|
|
@ The |demote_and_fix| routine is analogous to |use_and_fix|,
|
3503 |
|
|
except that we don't want to promote the data we found.
|
3504 |
|
|
|
3505 |
|
|
@=
|
3506 |
|
|
static cacheblock* demote_and_fix @,@,@[ARGS((cache*,cacheblock*))@];
|
3507 |
|
|
|
3508 |
|
|
@ @=
|
3509 |
|
|
/* Mark block |p|, which was found in |hit_set|, as the preferred victim
   of that set; the block is never moved, so |p| itself is returned. */
static cacheblock *demote_and_fix(c,p)
  cache *c;
  cacheblock *p;
{
  if (hit_set!=c->victim) demote_usage(p,hit_set,c->aa,c->repl);
  else demote_usage(p,hit_set,c->vv,c->vrepl);
  return p;
}
|
3517 |
|
|
|
3518 |
|
|
@ The subroutine |load_cache(c,p)| is called at a moment when
|
3519 |
|
|
|c->lock| has been set and |c->inbuf| has been filled with clean data
|
3520 |
|
|
to be placed in the cache block~|p|.
|
3521 |
|
|
|
3522 |
|
|
@=
|
3523 |
|
|
static void load_cache @,@,@[ARGS((cache*,cacheblock*))@];
|
3524 |
|
|
|
3525 |
|
|
@ @=
|
3526 |
|
|
/* Install the contents of |c->inbuf| in block |p|: clear its dirty
   flags, exchange data pointers with |inbuf|, copy the tag, and record
   the usage. */
static void load_cache(c,p)
  cache *c;
  cacheblock *p;
{
  register int i;
  register octa *d;
  for (i=0;i<c->bb>>c->g;i++) p->dirty[i]=false;
  d=p->data;@+ p->data=c->inbuf.data;@+ c->inbuf.data=d;
       /* |inbuf| inherits the old data array of |p| */
  p->tag=c->inbuf.tag;
  hit_set=cache_addr(c,p->tag);@+
  use_and_fix(c,p); /* |p| not moved */
}
|
3538 |
|
|
|
3539 |
|
|
@ The subroutine |flush_cache(c,p,keep)| is called at a ``quiet''
|
3540 |
|
|
moment when |c->flusher.next=NULL|.
|
3541 |
|
|
It puts cache block~|p| into |c->outbuf| and
|
3542 |
|
|
fires up the |c->flusher| coroutine, which will take care of
|
3543 |
|
|
sending the data to lower levels of the memory hierarchy.
|
3544 |
|
|
Cache block~|p| is also marked clean.
|
3545 |
|
|
|
3546 |
|
|
@=
|
3547 |
|
|
static void flush_cache @,@,@[ARGS((cache*,cacheblock*,bool))@];
|
3548 |
|
|
|
3549 |
|
|
@ @=
|
3550 |
|
|
/* Move block |p| into |c->outbuf|, mark |p| clean, and start the
   flusher coroutine of |c| after |c->copy_out_time| cycles. */
static void flush_cache(c,p,keep)
  cache *c;
  cacheblock *p; /* a block inside cache |c| */
  bool keep; /* should we preserve the data in |p|? */
{
  register octa *d;
  register char *dd;
  register int j;
  c->outbuf.tag=p->tag;
  if (keep)@+ for (j=0;j<c->bb>>3;j++) c->outbuf.data[j]=p->data[j];
  else d=c->outbuf.data, c->outbuf.data=p->data, p->data=d;
       /* copy the data if |p| must keep it; otherwise just swap arrays */
  dd=c->outbuf.dirty, c->outbuf.dirty=p->dirty, p->dirty=dd;
       /* the dirty flags always travel with |outbuf| */
  for (j=0;j<c->bb>>c->g;j++) p->dirty[j]=false;
  startup(&c->flusher,c->copy_out_time); /* will not be aborted */
}
|
3565 |
|
|
|
3566 |
|
|
@ The |alloc_slot| routine is called when we wish to put new information
|
3567 |
|
|
into a cache after a cache miss. It returns a pointer to a cache block
|
3568 |
|
|
in the main area where the new information should be put. The tag of
|
3569 |
|
|
that cache block is invalidated; the calling routine should take care
|
3570 |
|
|
of filling it and giving it a valid tag in due time. The cache's |filler|
|
3571 |
|
|
routine should not be active when |alloc_slot| is called.
|
3572 |
|
|
|
3573 |
|
|
Inserting new information might also require writing old information
|
3574 |
|
|
into the next level of the memory hierarchy, if the block being replaced
|
3575 |
|
|
is dirty. This routine returns |NULL| in such cases if the cache is
|
3576 |
|
|
flushing a previously discarded block.
|
3577 |
|
|
Otherwise it schedules the |flusher| coroutine.
|
3578 |
|
|
|
3579 |
|
|
This routine returns |NULL| also if the given key happens to be in the
|
3580 |
|
|
cache. Such cases are rare, but the following scenario shows that
|
3581 |
|
|
they aren't impossible: Suppose the DT-cache access time is 5, the D-cache
|
3582 |
|
|
access time is~1, and two processes simultaneously look for the
|
3583 |
|
|
same physical address. One process hits in DT-cache but misses in D-cache,
|
3584 |
|
|
waiting 5 cycles before trying |alloc_slot| in the D-cache; meanwhile
|
3585 |
|
|
the other process missed in D-cache but didn't need to use the DT-cache,
|
3586 |
|
|
so it might have updated the D-cache.
|
3587 |
|
|
|
3588 |
|
|
A key value is never negative. Therefore we can invalidate the tag in
|
3589 |
|
|
the chosen slot by forcing it to be negative.
|
3590 |
|
|
|
3591 |
|
|
@=
|
3592 |
|
|
static cacheblock* alloc_slot @,@,@[ARGS((cache*,octa))@];
|
3593 |
|
|
|
3594 |
|
|
@ @=
|
3595 |
|
|
/* Find a main-area block of cache |c| that can receive key |alf|, and
   invalidate its tag. Returns |NULL| if |alf| is already present, or if
   the chosen victim is dirty while the flusher is still busy. */
static cacheblock* alloc_slot(c,alf)
  cache *c;
  octa alf; /* key that probably isn't in the cache */
{
  register cacheset s;
  register cacheblock *p,*q;
  if (cache_search(c,alf)) return NULL;
  s=cache_addr(c,alf); /* the set corresponding to |alf| */
  if (c->victim) p=choose_victim(c->victim,c->vv,c->vrepl);
  else p=choose_victim(s,c->aa,c->repl);
       /* |p| is the block whose contents will leave the cache */
  if (is_dirty(c,p)) {
    if (c->flusher.next) return NULL;
    flush_cache(c,p,false);
  }
  if (c->victim) {
    q=choose_victim(s,c->aa,c->repl);
    @<Swap cache blocks |p| and |q|@>;
       /* the old contents of |q| now live in the victim area */
    q->tag.h |= sign_bit; /* invalidate the tag */
    return q;
  }
  p->tag.h |= sign_bit;@+ return p;
}
|
3617 |
|
|
|
3618 |
|
|
@* Simulated memory. How should we deal with the potentially gigantic
|
3619 |
|
|
memory of~\MMIX? We can't simply declare an array~$m$ that has
|
3620 |
|
|
$2^{48}$ bytes. (Indeed, up to $2^{63}$ bytes are needed, if we
|
3621 |
|
|
consider also the physical addresses $\ge2^{48}$ that are reserved for
|
3622 |
|
|
memory-mapped input/output.)
|
3623 |
|
|
|
3624 |
|
|
We could regard memory as a special kind of cache,
|
3625 |
|
|
in which every access is required to hit. For example, such an ``M-cache''
|
3626 |
|
|
could be fully associative, with $2^a$ blocks each
|
3627 |
|
|
having a different tag; simulation could proceed until more than~$2^a-1$ tags
|
3628 |
|
|
are required. But then the predefined value of~$a$ might well be so large that
|
3629 |
|
|
the sequential search of our |cache_search| routine would be too slow.
|
3630 |
|
|
|
3631 |
|
|
Instead, we will allocate memory in chunks of $2^{16}$ bytes at a time,
|
3632 |
|
|
as needed, and we will use hashing to search for the relevant chunk
|
3633 |
|
|
whenever a physical address is given. If the address is $2^{48}$ or greater,
|
3634 |
|
|
special routines called |spec_read| and |spec_write|, supplied by the
|
3635 |
|
|
user, will be called upon to do the reading or writing. Otherwise
|
3636 |
|
|
the 48-bit address consists of a 32-bit {\it chunk address\/} and a
|
3637 |
|
|
16-bit {\it chunk offset}.
|
3638 |
|
|
|
3639 |
|
|
Chunk addresses that are not used take no space in this simulator. But if,
|
3640 |
|
|
say, 1000 such patterns occur, the simulator will dynamically allocate
|
3641 |
|
|
approximately 65MB for the portions of main memory that are used.
|
3642 |
|
|
Parameter |mem_chunks_max| specifies the largest number of different chunk
|
3643 |
|
|
addresses that are supported. This parameter does not constrain the range of
|
3644 |
|
|
simulated physical addresses, which cover the entire 256 large-terabyte range
|
3645 |
|
|
permitted by~\MMIX.
|
3646 |
|
|
|
3647 |
|
|
@=
|
3648 |
|
|
typedef struct {
  tetra tag; /* 32-bit chunk address */
  octa *chunk; /* either |NULL| or an array of $2^{13}$ octabytes */
} chunknode;
|
3652 |
|
|
|
3653 |
|
|
@ The parameter |hash_prime| should be a prime number larger than the
|
3654 |
|
|
parameter
|
3655 |
|
|
|mem_chunks_max|, preferably more than twice as large but not much bigger
|
3656 |
|
|
than~that. The default values |mem_chunks_max=1000| and |hash_prime=2003| are
|
3657 |
|
|
set by |MMIX_config| unless the user specifies otherwise.
|
3658 |
|
|
|
3659 |
|
|
@=
|
3660 |
|
|
Extern int mem_chunks; /* this many chunks are allocated so far */
Extern int mem_chunks_max; /* up to this many different chunks per run */
Extern int hash_prime; /* larger than |mem_chunks_max|, but not enormous */
Extern chunknode *mem_hash; /* the simulated main memory */
|
3664 |
|
|
|
3665 |
|
|
@ The separately compiled procedures |spec_read()| and |spec_write()| have the
|
3666 |
|
|
same calling conventions as the general procedures
|
3667 |
|
|
|mem_read()| and |mem_write()|.
|
3668 |
|
|
|
3669 |
|
|
@=
|
3670 |
|
|
extern octa spec_read @,@,@[ARGS((octa addr))@]; /* for memory mapped I/O */
extern void spec_write @,@,@[ARGS((octa addr,octa val))@]; /* likewise */
|
3672 |
|
|
|
3673 |
|
|
@ If the program tries to read from a chunk that hasn't been allocated,
|
3674 |
|
|
the value zero is returned, optionally with a comment to the user.
|
3675 |
|
|
|
3676 |
|
|
Chunk address 0 is always allocated first. Then we can assume that
|
3677 |
|
|
a matching chunk tag implies a nonnull |chunk| pointer.
|
3678 |
|
|
|
3679 |
|
|
This routine sets |last_h| to the chunk found, so that we can rapidly read
|
3680 |
|
|
other words that we know must belong to the same chunk. For this purpose
|
3681 |
|
|
it is convenient to let |mem_hash[hash_prime]| be a chunk full of zeros,
|
3682 |
|
|
representing uninitialized memory.
|
3683 |
|
|
|
3684 |
|
|
@=
|
3685 |
|
|
Extern octa mem_read @,@,@[ARGS((octa addr))@];
|
3686 |
|
|
|
3687 |
|
|
@ @=
|
3688 |
|
|
/* Return the octabyte at physical address |addr|. Addresses with
   |addr.h>=(1<<16)| go to |spec_read|; a chunk that was never allocated
   reads as zero. The hash index used is left in |last_h|. */
octa mem_read(addr)
  octa addr;
{
  register tetra off,key;
  register int h;
  if (addr.h>=(1<<16)) return spec_read(addr);
  off=(addr.l&0xffff)>>3; /* octabyte index within the chunk */
  key=(addr.l&0xffff0000)+addr.h; /* the tag of the chunk */
  for (h=key%hash_prime;mem_hash[h].tag!=key;h--) {
    if (mem_hash[h].chunk==NULL) {
      if (verbose&uninit_mem_bit)
        errprint2("uninitialized memory read at %08x%08x",addr.h,addr.l);
@.uninitialized memory...@>
      h=hash_prime;@+ break; /* zero will be returned */
    }
    if (h==0) h=hash_prime; /* wrap around: |h--| comes next */
  }
  last_h=h;
  return mem_hash[h].chunk[off];
}
|
3708 |
|
|
|
3709 |
|
|
@ @=
|
3710 |
|
|
Extern int last_h; /* the hash index that was most recently correct */
|
3711 |
|
|
|
3712 |
|
|
@ @=
|
3713 |
|
|
Extern void mem_write @,@,@[ARGS((octa addr,octa val))@];
|
3714 |
|
|
|
3715 |
|
|
@ @=
|
3716 |
|
|
/* Store |val| at physical address |addr|. Addresses with
   |addr.h>=(1<<16)| go to |spec_write|; a chunk is allocated on first
   use, with a panic if |mem_chunks_max| would be exceeded or |calloc|
   fails. The hash index used is left in |last_h|. */
void mem_write(addr,val)
  octa addr,val;
{
  register tetra off,key;
  register int h;
  if (addr.h>=(1<<16)) {@+spec_write(addr,val);@+return;@+}
  off=(addr.l&0xffff)>>3; /* octabyte index within the chunk */
  key=(addr.l&0xffff0000)+addr.h; /* the tag of the chunk */
  for (h=key%hash_prime;mem_hash[h].tag!=key;h--) {
    if (mem_hash[h].chunk==NULL) { /* an empty slot: claim it */
      if (++mem_chunks>mem_chunks_max)
        panic(errprint1("More than %d memory chunks are needed",
@.More...chunks are needed@>
                 mem_chunks_max));
      mem_hash[h].chunk=(octa *)calloc(1<<13,sizeof(octa));
      if (mem_hash[h].chunk==NULL)
        panic(errprint1("I can't allocate memory chunk number %d",
@.I can't allocate...@>
                 mem_chunks));
      mem_hash[h].tag=key;
      break;
    }
    if (h==0) h=hash_prime; /* wrap around: |h--| comes next */
  }
  last_h=h;
  mem_hash[h].chunk[off]=val;
}
|
3743 |
|
|
|
3744 |
|
|
@ The memory is characterized by several parameters, depending on the
|
3745 |
|
|
characteristics of the memory bus being simulated. Let |bus_words|
|
3746 |
|
|
be the number of octabytes read or written simultaneously (usually
|
3747 |
|
|
|bus_words| is 1 or~2; it must be a power of~2). The number of clock
|
3748 |
|
|
cycles needed to read or write |c*bus_words| octabytes that all belong to the
|
3749 |
|
|
same cache block is assumed to be |mem_addr_time+c*mem_read_time| or
|
3750 |
|
|
|mem_addr_time+c*mem_write_time|, respectively.
|
3751 |
|
|
|
3752 |
|
|
@=
|
3753 |
|
|
Extern int mem_addr_time; /* cycles to transmit an address on memory bus */
Extern int bus_words; /* width of memory bus, in octabytes */
Extern int mem_read_time; /* cycles to read from main memory */
Extern int mem_write_time; /* cycles to write to main memory */
Extern lockvar mem_lock; /* is nonnull when the bus is busy */
|
3758 |
|
|
|
3759 |
|
|
@ One of the principal ways to write memory is to invoke
|
3760 |
|
|
a |flush_to_mem| coroutine,
|
3761 |
|
|
which is the |Scache->flusher| if there is an S-cache, or the
|
3762 |
|
|
|Dcache->flusher| if there is a D-cache but no S-cache.
|
3763 |
|
|
|
3764 |
|
|
When such a coroutine is started, its |data->ptr_a| will be |Scache|
|
3765 |
|
|
or~|Dcache|. The data to be written will just have been copied to the cache's
|
3766 |
|
|
|outbuf|.
|
3767 |
|
|
|
3768 |
|
|
@=
|
3769 |
|
|
case flush_to_mem: {@+register cache *c=(cache *)data->ptr_a;
  switch (data->state) {
 case 0:@+ if (mem_lock) wait(1); /* try again while the bus is busy */
   data->state=1;
 case 1: set_lock(self,mem_lock);
   data->state=2; /* the |wait| in the next step resumes at state 2 */
   @<Write the dirty data of |c->outbuf| and wait for the bus@>;
 case 2: goto terminate; /* this frees |mem_lock| and |c->outbuf| */
  }
}
|
3779 |
|
|
|
3780 |
|
|
@ @<Write the dirty data of |c->outbuf| and wait for the bus@>=
{
  register int off,last_off,count,first,ii;
  register int del=c->gg>>3; /* octabytes per granule */
  octa addr;
  addr=c->outbuf.tag;@+ off=(addr.l&0xffff)>>3;
  for (i=j=0,first=1,count=0;j<c->bb>>c->g;j++) {
    ii=i+del; /* octabytes |i| through |ii-1| form granule |j| */
    if (!c->outbuf.dirty[j]) i=ii,off+=del,addr.l+=del<<3;
    else@+ while (i<ii) {
      if (first) { /* |mem_write| finds the chunk and sets |last_h| */
        count++;@+ last_off=off;@+ first=0;
        mem_write(addr,c->outbuf.data[i]);
      }@+else {
        if ((off^last_off)&(-bus_words)) count++;
             /* count once more when a new bus-width group begins */
        last_off=off;
        mem_hash[last_h].chunk[off]=c->outbuf.data[i];
      }
      i++;@+ off++;@+ addr.l+=8;
    }
  }
  wait(mem_addr_time+count*mem_write_time);
}
|
3803 |
|
|
|
3804 |
|
|
@* Cache transfers. We have seen that the |Dcache->flusher| sends
|
3805 |
|
|
data directly to the main memory if there is no S-cache.
|
3806 |
|
|
But if both D-cache and S-cache exist, the |Dcache->flusher| is a
|
3807 |
|
|
more complicated coroutine of type |flush_to_S|. In this case we need
|
3808 |
|
|
to deal with the fact that the S-cache blocks might be larger than
|
3809 |
|
|
the D-cache blocks; furthermore, the S-cache might have a
|
3810 |
|
|
write-around and/or write-through policy, etc. But one simplifying
|
3811 |
|
|
fact does help us: We know that the flusher coroutine will not be
|
3812 |
|
|
aborted until it has run to completion.
|
3813 |
|
|
|
3814 |
|
|
Some machines, such as the Alpha 21164, have an additional cache between
|
3815 |
|
|
the S-cache and memory, called the B-cache (the ``backup cache''). A B-cache
|
3816 |
|
|
could be simulated by extending the logic used here; but such extensions
|
3817 |
|
|
of the present program are left to the interested reader.
|
3818 |
|
|
|
3819 |
|
|
@=
|
3820 |
|
|
case flush_to_S: {@+register cache *c=(cache *)data->ptr_a;
  register int block_diff=Scache->bb-c->bb;
       /* extra bytes in an S-cache block; zero if the sizes agree */
  p=(cacheblock*)data->ptr_b;
  switch (data->state) {
 case 0:@+ if (Scache->lock) wait(1);
   data->state=1;
 case 1: set_lock(self,Scache->lock);
   data->ptr_b=(void*)cache_search(Scache,c->outbuf.tag);
   if (data->ptr_b) data->state=4; /* hit: just merge the dirty data */
   else if (Scache->mode & WRITE_ALLOC) data->state=(block_diff? 2: 3);
   else data->state=6; /* miss with write-around */
   wait(Scache->access_time);
 case 2: @<Fill |Scache->inbuf| with clean memory data@>;
 case 3: @<Allocate a slot |p| in the S-cache@>;
   if (block_diff) @<Copy |Scache->inbuf| to slot |p|@>;
 case 4: copy_block(c,&(c->outbuf),Scache,p);
   hit_set=cache_addr(Scache,c->outbuf.tag);@+ use_and_fix(Scache,p);
       /* |p| not moved */
   data->state=5;@+ wait(Scache->copy_in_time);
 case 5:@+ if ((Scache->mode&WRITE_BACK)==0) { /* write-through */
     if (Scache->flusher.next) wait(1);
     flush_cache(Scache,p,true);
   }
   goto terminate;
 case 6:@<Handle write-around when flushing to the S-cache@>;
  }
}
|
3847 |
|
|
|
3848 |
|
|
@ @<Allocate a slot |p| in the S-cache@>=
if (Scache->filler.next) wait(1); /* perhaps an unnecessary precaution? */
p=alloc_slot(Scache,c->outbuf.tag);
if (!p) wait(1); /* no slot could be allocated; try again next cycle */
data->ptr_b=(void*)p;
p->tag=c->outbuf.tag;@+ p->tag.l=c->outbuf.tag.l&(-Scache->bb);
       /* the tag is aligned to the S-cache block size */
|
3854 |
|
|
|
3855 |
|
|
@ We only need to read |block_diff| bytes, but it's easier to
|
3856 |
|
|
read them all and to charge only for reading the ones we needed.
|
3857 |
|
|
|
3858 |
|
|
@inbuf| with clean memory data@>=
|
3859 |
|
|
{@+register int count=block_diff>>3;
|
3860 |
|
|
register int off,delay;
|
3861 |
|
|
octa addr;
|
3862 |
|
|
if (mem_lock) wait(1);
|
3863 |
|
|
addr.h=c->outbuf.tag.h;@+ addr.l=c->outbuf.tag.l&-Scache->bb;
|
3864 |
|
|
off=(addr.l&0xffff)>>3;
|
3865 |
|
|
for (j=0;jbb>>3;j++)
|
3866 |
|
|
if (j==0) Scache->inbuf.data[j]=mem_read(addr);
|
3867 |
|
|
else Scache->inbuf.data[j]=mem_hash[last_h].chunk[j+off];
|
3868 |
|
|
set_lock(&mem_locker,mem_lock);
|
3869 |
|
|
delay=mem_addr_time+(int)((count+bus_words-1)/(bus_words))*mem_read_time;
|
3870 |
|
|
startup(&mem_locker,delay);
|
3871 |
|
|
data->state=3;@+ wait(delay);
|
3872 |
|
|
}
|
3873 |
|
|
|
3874 |
|
|
@ @inbuf| to slot |p|@>=
|
3875 |
|
|
{
|
3876 |
|
|
register octa *d=p->data;
|
3877 |
|
|
p->data=Scache->inbuf.data;@+Scache->inbuf.data=d;
|
3878 |
|
|
}
|
3879 |
|
|
|
3880 |
|
|
@ Here we assume that the granularity is~8.
|
3881 |
|
|
|
3882 |
|
|
@=
|
3883 |
|
|
if (Scache->flusher.next) wait(1);
|
3884 |
|
|
Scache->outbuf.tag.h=c->outbuf.tag.h;
|
3885 |
|
|
Scache->outbuf.tag.l=c->outbuf.tag.l&(-Scache->bb);
|
3886 |
|
|
for (j=0;jbb>>Scache->g;j++) Scache->outbuf.dirty[j]=false;
|
3887 |
|
|
copy_block(c,&(c->outbuf),Scache,&(Scache->outbuf));
|
3888 |
|
|
startup(&Scache->flusher,Scache->copy_out_time);
|
3889 |
|
|
goto terminate;
|
3890 |
|
|
|
3891 |
|
|
@ The S-cache gets new data from memory by invoking a |fill_from_mem|
|
3892 |
|
|
coroutine; the I-cache or D-cache may also invoke a |fill_from_mem| coroutine,
|
3893 |
|
|
if there is no S-cache. When such a coroutine is invoked, it holds
|
3894 |
|
|
|mem_lock|, and its caller has gone to sleep.
|
3895 |
|
|
A physical memory address is given in |data->z.o|,
|
3896 |
|
|
and |data->ptr_a| specifies |Icache|, |Dcache|, or |Scache|.
|
3897 |
|
|
Furthermore, |data->ptr_b| specifies a block within that
|
3898 |
|
|
cache, determined by the |alloc_slot| routine. The coroutine
|
3899 |
|
|
simulates reading the contents of the specified memory location,
|
3900 |
|
|
places the result in the |x.o| field of its caller's control block,
|
3901 |
|
|
and wakes up the caller. It proceeds to fill the cache's |inbuf| and,
|
3902 |
|
|
ultimately, the specified cache block, before waking the caller again.
|
3903 |
|
|
|
3904 |
|
|
Let |c=data->ptr_a|. The caller is then |c->fill_lock|, if this variable is
|
3905 |
|
|
nonnull. However, the caller might not wish to be awoken or to receive
|
3906 |
|
|
the data (for example, if it has been aborted). In such cases |c->fill_lock|
|
3907 |
|
|
will be~|NULL|; the filling action continues without the wakeup calls.
|
3908 |
|
|
If |c=Scache|, the S-cache will be locked and the caller will not
|
3909 |
|
|
have been aborted.
|
3910 |
|
|
|
3911 |
|
|
@<Cases for control of special coroutines@>=
case fill_from_mem: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock; /* the caller, or |NULL| */
  switch (data->state) {
 case 0: data->x.o=mem_read(data->z.o);
   if (cc) {
     cc->ctl->x.o=data->x.o;
     awaken(cc,mem_read_time); /* the first wakeup call */
   }
   data->state=1;
   @<Read data into |c->inbuf| and wait for the bus@>;
 case 1: release_lock(self,mem_lock);
   data->state=2;
 case 2:@+if (c!=Scache) {
     if (c->lock) wait(1);
     set_lock(self,c->lock);
   }
   if (cc) awaken(cc,c->copy_in_time); /* the second wakeup call */
   load_cache(c,(cacheblock*)data->ptr_b);
   data->state=3;@+ wait(c->copy_in_time);
 case 3: goto terminate;
  }
}

@ If |c|'s block size is no larger than the memory bus, we wait an extra
cycle, so that there will be two wakeup calls.

@<Read data into |c->inbuf|...@>=
{
  register int count, off;
  c->inbuf.tag=data->z.o;@+ c->inbuf.tag.l &= -c->bb;
  count=c->bb>>3, off=(c->inbuf.tag.l&0xffff)>>3;
  for (i=0;i<count;i++,off++) c->inbuf.data[i]=mem_hash[last_h].chunk[off];
  if (count<=bus_words) wait(1+mem_read_time)@;
  else wait((int)(count/bus_words)*mem_read_time);
}
|
3947 |
|
|
|
3948 |
|
|
@ The |fill_from_S| coroutine has the same conventions as |fill_from_mem|,
|
3949 |
|
|
except that the data comes directly from the S-cache if it is present there.
|
3950 |
|
|
This is the |filler| coroutine for the I-cache and D-cache if an S-cache
|
3951 |
|
|
is present.
|
3952 |
|
|
|
3953 |
|
|
@<Cases for control of special coroutines@>=
case fill_from_S: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock; /* the caller, or |NULL| */
  p=(cacheblock*)data->ptr_c;
  switch (data->state) {
 case 0: p=cache_search(Scache,data->z.o);
   if (p) goto S_non_miss;
   data->state=1;
 case 1: @<Start the S-cache filler@>;
   data->state=2;@+sleep;
 case 2:@+if (cc) {
     cc->ctl->x.o=data->x.o;
      /* this data has been supplied by |Scache->filler| */
     awaken(cc,Scache->access_time); /* we propagate it back */
   }
   data->state=3;@+sleep; /* when we awake, the S-cache will have our data */
 S_non_miss:@+if (cc) {
     cc->ctl->x.o=p->data[(data->z.o.l&(Scache->bb-1))>>3];
     awaken(cc,Scache->access_time);
   }
 case 3: @<Copy data from |p| into |c->inbuf|@>;
   data->state=4;@+wait(Scache->access_time);
 case 4:@+ if (c->lock) wait(1);
   set_lock(self,c->lock);
   Scache->lock=NULL; /* we had been holding that lock */
   load_cache(c,(cacheblock*)data->ptr_b);
   data->state=5;@+ wait(c->copy_in_time);
 case 5:@+if (cc) awaken(cc,1); /* second wakeup call */
   goto terminate;
  }
}

@ We are already holding the |Scache->lock|, but we're about to take on the
|Scache->fill_lock| too (with the understanding that one is ``stronger''
than the other). For a short time the |Scache->lock| will point to us
but we will point to |Scache->fill_lock|; this will not cause difficulty,
because the present coroutine is not abortable.

@<Start the S-cache filler@>=
if (Scache->filler.next || mem_lock) wait(1);
p=alloc_slot(Scache,data->z.o);
if (!p) wait(1); /* no slot was obtained */
set_lock(&Scache->filler,mem_lock);
set_lock(self,Scache->fill_lock);
data->ptr_c=Scache->filler_ctl.ptr_b=(void *)p;
Scache->filler_ctl.z.o=data->z.o;
startup(&Scache->filler,mem_addr_time);

@ The S-cache blocks might be wider than the blocks of the I-cache or
D-cache, so the copying in this step isn't quite trivial.

@<Copy data from |p| into |c->inbuf|@>=
{@+register int off; /* octabyte offset within the S-cache block */
  c->inbuf.tag=data->z.o;@+c->inbuf.tag.l &=-c->bb;
  for (j=0,off=(c->inbuf.tag.l&(Scache->bb-1))>>3;j<c->bb>>3;j++,off++)
    c->inbuf.data[j]=p->data[off];
  release_lock(self,Scache->fill_lock);
  set_lock(self,Scache->lock);
}
|
4012 |
|
|
|
4013 |
|
|
@ The instruction \.{PRELD} \.{X,\$Y,\$Z} generates $\lfloor{\rm X}/2^b\rfloor$
|
4014 |
|
|
commands if there are $2^b$ bytes per block in the D-cache. These
|
4015 |
|
|
commands will try to preload blocks $\rm\$Y+\$Z$, ${\rm\$Y}+{\rm\$Z}+2^b$,
|
4016 |
|
|
\dots, into the cache if it is not too busy.
|
4017 |
|
|
|
4018 |
|
|
Similar considerations apply to the instructions \.{PREGO} \.{X,\$Y,\$Z}
|
4019 |
|
|
and \.{PREST} \.{X,\$Y,\$Z}.
|
4020 |
|
|
|
4021 |
|
|
@<Special cases of instruction dispatch@>=
case preld: case prest:@+ if (!Dcache) goto noop_inst;
  if (cool->xx>=Dcache->bb) cool->interim=true;
    /* more than one block is involved */
  cool->ptr_a=(void *)mem.up;@+ break;
case prego:@+ if (!Icache) goto noop_inst;
  if (cool->xx>=Icache->bb) cool->interim=true;
  cool->ptr_a=(void *)mem.up;@+ break;
|
4028 |
|
|
|
4029 |
|
|
@ If the block size is 64, a command like \.{PREST}~\.{200,\$Y,\$Z}
|
4030 |
|
|
is actually issued as four commands \.{PREST}~\.{200,\$Y,\$Z;}
|
4031 |
|
|
\.{PREST}~\.{191,\$Y,\$Z;} \.{PREST}~\.{127,\$Y,\$Z;}
|
4032 |
|
|
\.{PREST}~\.{63,\$Y,\$Z}. An interruption will then be able to resume
|
4033 |
|
|
properly. In the pipeline, the instruction \.{PREST}~\.{200,\$Y,\$Z}
|
4034 |
|
|
is considered to affect bytes $\rm\$Y+\$Z+192$ through $\rm\$Y+\$Z+200$,
|
4035 |
|
|
or fewer bytes if $\rm\$Y+\$Z$ is not a multiple of~64. (Remember that
|
4036 |
|
|
these instructions are only hints; we act on them only if it is
|
4037 |
|
|
reasonably convenient to do so.)
|
4038 |
|
|
|
4039 |
|
|
@<Get ready for the next step of \.{PRELD} or \.{PREST}@>=
head->inst = (head->inst&~((Dcache->bb-1)<<16))-0x10000;
  /* round the X field down to a block multiple, then subtract one */

@ @<Get ready for the next step of \.{PREGO}@>=
head->inst = (head->inst&~((Icache->bb-1)<<16))-0x10000;
|
4044 |
|
|
|
4045 |
|
|
@ Another coroutine, called |cleanup|, is occasionally called into
|
4046 |
|
|
action to remove dirty data from the D-cache and S-cache. If it is
|
4047 |
|
|
invoked by starting in state 0, with its |i| field set to |sync|, it
|
4048 |
|
|
will clean everything. It can also be
|
4049 |
|
|
invoked in state~4, with its |i| field set to |syncd| and with a physical
|
4050 |
|
|
address in its |z.o| field; then it simply makes sure that no D-cache
|
4051 |
|
|
or S-cache blocks associated with that address are dirty.
|
4052 |
|
|
|
4053 |
|
|
Field |x.o.h| should be set to zero if items are expected to remain
|
4054 |
|
|
in the cache after being cleaned; otherwise field |x.o.h| should be
|
4055 |
|
|
set to |sign_bit|.
|
4056 |
|
|
|
4057 |
|
|
The coroutine that invokes |cleanup| should hold |clean_lock|. If that
|
4058 |
|
|
coroutine dies, because of an interruption, the |cleanup| coroutine
|
4059 |
|
|
will terminate prematurely.
|
4060 |
|
|
|
4061 |
|
|
We assume that the D-cache and S-cache have some sort of way to
|
4062 |
|
|
identify their first dirty block, if any, in |access_time| cycles.
|
4063 |
|
|
|
4064 |
|
|
@<Global variables@>=
coroutine clean_co; /* the |cleanup| coroutine */
control clean_ctl; /* its control block */
lockvar clean_lock; /* held by whoever invokes |cleanup| */

@ @<Initialize everything@>=
clean_co.ctl=&clean_ctl;
clean_co.name="Clean";
clean_co.stage=cleanup;
clean_ctl.go.o.l=4;
|
4074 |
|
|
|
4075 |
|
|
@ @<Cases for control of special coroutines@>=
case cleanup: p=(cacheblock*)data->ptr_b;
  switch(data->state) {
    @<Cases 0 through 4, for the D-cache@>;
    @<Cases 5 through 9, for the S-cache@>;
 case 10: goto terminate;
  }

@ @<Cases 0 through 4, for the D-cache@>=
case 0:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
    /* |j| must receive the reader index itself, not the comparison result,
       because it selects |Dcache->reader[j]| on the next line */
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  i=j=0;
Dclean_loop: p=(i<Dcache->cc? &(Dcache->set[i][j]): &(Dcache->victim[j]));
  if (p->tag.h&sign_bit) goto Dclean_inc;
  if (!is_dirty(Dcache,p)) {
    p->tag.h|=data->x.o.h;@+goto Dclean_inc;
  }
  data->y.o.h=i, data->y.o.l=j; /* remember where to resume in state 3 */
Dclean: data->state=1;@+
  data->ptr_b=(void*)p;@+
  wait(Dcache->access_time);
case 1:@+if (Dcache->flusher.next) wait(1);
  flush_cache(Dcache,p,data->x.o.h==0);
  p->tag.h|=data->x.o.h;
  release_lock(self,Dcache->lock);
  data->state=2;@+
  wait(Dcache->copy_out_time);
case 2:@+ if (!clean_lock) goto done; /* premature termination */
  if (Dcache->flusher.next) wait(1);
  if (data->i!=sync) goto Sprep;
  data->state=3;
case 3:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  i=data->y.o.h, j=data->y.o.l;
Dclean_inc: j++;
  if (i<Dcache->cc && j==Dcache->aa) j=0, i++;
  if (i==Dcache->cc && j==Dcache->vv) {
    data->state=5;@+
    wait(Dcache->access_time);
  }
  goto Dclean_loop;
case 4:@+ if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
  startup(&Dcache->reader[j],Dcache->access_time);
  set_lock(self,Dcache->lock);
  p=cache_search(Dcache,data->z.o);
  if (p) {
    demote_and_fix(Dcache,p);
    if (is_dirty(Dcache,p)) goto Dclean;
  }
  data->state=9;@+
  wait(Dcache->access_time);

@ @<Cases 5 through 9, for the S-cache@>=
case 5:@+ if (self->lockloc) *(self->lockloc)=NULL, self->lockloc=NULL;
  if (!Scache) goto done;
  if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  i=j=0;
Sclean_loop: p=(i<Scache->cc? &(Scache->set[i][j]): &(Scache->victim[j]));
  if (p->tag.h&sign_bit) goto Sclean_inc;
  if (!is_dirty(Scache,p)) {
    p->tag.h|=data->x.o.h;@+goto Sclean_inc;
  }
  data->y.o.h=i, data->y.o.l=j; /* remember where to resume in state 8 */
Sclean: data->state=6;@+
  data->ptr_b=(void*)p;@+
  wait(Scache->access_time);
case 6:@+if (Scache->flusher.next) wait(1);
  flush_cache(Scache,p,data->x.o.h==0);
  p->tag.h|=data->x.o.h;
  release_lock(self,Scache->lock);
  data->state=7;@+
  wait(Scache->copy_out_time);
case 7:@+ if (!clean_lock) goto done; /* premature termination */
  if (Scache->flusher.next) wait(1);
  if (data->i!=sync) goto done;
  data->state=8;
case 8:@+ if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  i=data->y.o.h, j=data->y.o.l;
Sclean_inc: j++;
  if (i<Scache->cc && j==Scache->aa) j=0, i++;
  if (i==Scache->cc && j==Scache->vv) {
    data->state=10;@+
    wait(Scache->access_time);
  }
  goto Sclean_loop;
Sprep: data->state=9;
case 9:@+if (self->lockloc) release_lock(self,Dcache->lock);
  if (!Scache) goto done;
  if (Scache->lock) wait(1);
  set_lock(self,Scache->lock);
  p=cache_search(Scache,data->z.o);
  if (p) {
    demote_and_fix(Scache,p);
    if (is_dirty(Scache,p)) goto Sclean;
  }
  data->state=10;@+
  wait(Scache->access_time);
|
4176 |
|
|
|
4177 |
|
|
@* Virtual address translation. Special arrays of coroutines and control
|
4178 |
|
|
blocks come into play when we need to implement \MMIX's rather complicated
|
4179 |
|
|
page table mechanism for virtual address translation. In effect, we have up to
|
4180 |
|
|
ten control blocks {\it outside\/} of the reorder buffer that are capable of
|
4181 |
|
|
executing instructions just as if they were part of that buffer. The
|
4182 |
|
|
``opcodes'' of these non-abortable instructions are special internal
|
4183 |
|
|
operations called |ldptp| and |ldpte|, for loading page table pointers and
|
4184 |
|
|
page table entries.
|
4185 |
|
|
|
4186 |
|
|
Suppose, for example, that we need to translate a virtual address for the
|
4187 |
|
|
DT-cache in which the virtual page address $(a_4a_3a_2a_1a_0)_{1024}$ of
|
4188 |
|
|
segment~$i$ has $a_4=a_3=0$ and $a_2\ne0$. Then the rules say that we should
|
4189 |
|
|
first find a page table pointer $p_2$ in physical location
|
4190 |
|
|
$2^{13}(r+b_i+2)+8a_2$, then another page table pointer~$p_1$ in location
|
4191 |
|
|
$p_2+8a_1$, and finally the page table entry~$p_0$ in location $p_1+8a_0$. The
|
4192 |
|
|
simulator achieves this by setting up three coroutines $c_0$, $c_1$, $c_2$
|
4193 |
|
|
whose control blocks correspond to the pseudo-instructions
|
4194 |
|
|
$$\vbox{\halign{\tt#\hfil\cr
|
4195 |
|
|
LDPTP $x$,[$2^{63}+2^{13}(r+b_i+2)$],$8a_2$\cr
|
4196 |
|
|
LDPTP $x$,$x$,$8a_1$\cr
|
4197 |
|
|
LDPTE $x$,$x$,$8a_0$\cr}}$$
|
4198 |
|
|
where $x$ is a hidden internal register and the other quantities are immediate
|
4199 |
|
|
values. Slight changes to the normal functionality of \.{LDO} give us the
|
4200 |
|
|
actions needed to implement \.{LDPTP} and \.{LDPTE}. Coroutine~$c_j$
|
4201 |
|
|
corresponds to the instruction that involves $a_j$ and computes~$p_j$; when
|
4202 |
|
|
$c_0$ has computed its value~$p_0$, we know how to translate the original
|
4203 |
|
|
virtual address.
|
4204 |
|
|
|
4205 |
|
|
The \.{LDPTP} and \.{LDPTE} commands return zero
|
4206 |
|
|
if their $y$~operand is zero or if the page table does not properly match~rV.
|
4207 |
|
|
|
4208 |
|
|
@d LDPTP PREGO /* internally this won't cause confusion */
|
4209 |
|
|
@d LDPTE GO
|
4210 |
|
|
|
4211 |
|
|
@<Global variables@>=
control IPTctl[5], DPTctl[5]; /* control blocks for I and D page translation */
coroutine IPTco[10], DPTco[10]; /* each coroutine is a two-stage pipeline */
char *IPTname[5]={"IPT0","IPT1","IPT2","IPT3","IPT4"};
char *DPTname[5]={"DPT0","DPT1","DPT2","DPT3","DPT4"};

@ @<Initialize everything@>=
for (j=0;j<5;j++) {
  DPTco[2*j].ctl=&DPTctl[j];@+ IPTco[2*j].ctl=&IPTctl[j];
  if (j>0) DPTctl[j].op=IPTctl[j].op=LDPTP,DPTctl[j].i=IPTctl[j].i=ldptp;
  else DPTctl[0].op=IPTctl[0].op=LDPTE,DPTctl[0].i=IPTctl[0].i=ldpte;
    /* only block 0 loads a page table entry; the others load pointers */
  IPTctl[j].loc=DPTctl[j].loc=neg_one;
  IPTctl[j].go.o=DPTctl[j].go.o=incr(neg_one,4);
  IPTctl[j].ptr_a=DPTctl[j].ptr_a=(void*)&mem;
  IPTctl[j].ren_x=DPTctl[j].ren_x=true;
  IPTctl[j].x.addr.h=DPTctl[j].x.addr.h=-1;
  IPTco[2*j].stage=DPTco[2*j].stage=1;
  IPTco[2*j+1].stage=DPTco[2*j+1].stage=2;
  IPTco[2*j].name=IPTco[2*j+1].name=IPTname[j];
  DPTco[2*j].name=DPTco[2*j+1].name=DPTname[j];
}
ITcache->filler_ctl.ptr_c=(void*)&IPTco[0];@+
DTcache->filler_ctl.ptr_c=(void*)&DPTco[0];
|
4234 |
|
|
|
4235 |
|
|
@ Page table calculations are invoked by a coroutine of type |fill_from_virt|,
|
4236 |
|
|
which is used to fill the IT-cache or DT-cache. The calling conventions of
|
4237 |
|
|
|fill_from_virt| are analogous to those of |fill_from_mem| or |fill_from_S|:
|
4238 |
|
|
A virtual address is supplied in |data->y.o|, and |data->ptr_a| points
|
4239 |
|
|
to a cache (|ITcache| or |DTcache|), while |data->ptr_b| is a block in that
|
4240 |
|
|
cache. We wake up the caller, who holds the cache's |fill_lock|, as soon as
|
4241 |
|
|
the translation of the given address has been calculated, unless the caller
|
4242 |
|
|
has been aborted. (No second wakeup call is necessary.)
|
4243 |
|
|
|
4244 |
|
|
@<Cases for control of special coroutines@>=
case fill_from_virt: {@+register cache *c=(cache *)data->ptr_a;
  register coroutine *cc=c->fill_lock; /* the caller, or |NULL| */
  register coroutine *co=(coroutine*)data->ptr_c;
       /* |&IPTco[0]| or |&DPTco[0]| */
  octa aaaaa;
  switch (data->state) {
 case 0: @<Start up auxiliary coroutines to compute the page table entry@>;
   data->state=1;
 case 1:@+if (data->b.p) { /* the translation is still being computed */
     if (data->b.p->known) data->b.o=data->b.p->o, data->b.p=NULL;
     else wait(1);
   }
   @<Compute the new entry for |c->inbuf| and give the caller a sneak
     preview@>;
   data->state=2;
 case 2:@+if (c->lock) wait(1);
   set_lock(self,c->lock);
   load_cache(c,(cacheblock*)data->ptr_b);
   data->state=3;@+ wait(c->copy_in_time);
 case 3: data->b.o=zero_octa;@+goto terminate;
  }
}
|
4267 |
|
|
|
4268 |
|
|
@ The current contents of rV, the special virtual translation register, are
|
4269 |
|
|
kept unpacked in several global variables |page_r|, |page_s|, etc., for
|
4270 |
|
|
convenience. Whenever rV changes, we recompute all these variables.
|
4271 |
|
|
|
4272 |
|
|
@<Global variables@>=
int page_n; /* the 10-bit |n| field of rV, times 8 */
int page_r; /* the 27-bit |r| field of rV */
int page_s; /* the 8-bit |s| field of rV */
int page_b[5]; /* the 4-bit |b| fields of rV; |page_b[0]=0| */
octa page_mask; /* the least significant |s| bits */
bool page_bad=true; /* does rV violate the rules? */

@ @<Update the page variables@>=
{@+octa rv;
  rv=data->z.o; /* the new contents of rV */
  page_bad=(rv.l&7? true: false);
  page_n=rv.l&0x1ff8;
  rv=shift_right(rv,13,1);
  page_r=rv.l&0x7ffffff;
  rv=shift_right(rv,27,1);
  page_s=rv.l&0xff;
  if (page_s<13 || page_s>48) page_bad=true;
  else if (page_s<32) page_mask.h=0,page_mask.l=(1<<page_s)-1;
  else page_mask.h=(1<<(page_s-32))-1,page_mask.l=0xffffffff;
  page_b[4]=(rv.l>>8)&0xf;
  page_b[3]=(rv.l>>12)&0xf;
  page_b[2]=(rv.l>>16)&0xf;
  page_b[1]=(rv.l>>20)&0xf;
}
|
4297 |
|
|
|
4298 |
|
|
@ Here's how we compute a tag of the IT-cache or DT-cache
|
4299 |
|
|
from a virtual address, and how we compute a physical address
|
4300 |
|
|
from a translation found in the cache.
|
4301 |
|
|
|
4302 |
|
|
@d trans_key(addr) incr(oandn(addr,page_mask),page_n)
|
4303 |
|
|
|
4304 |
|
|
@<Internal prototypes@>=
static octa phys_addr @,@,@[ARGS((octa,octa))@];

@ @<Subroutines@>=
static octa phys_addr(virt,trans)
  octa virt,trans; /* a virtual address and its cached translation */
{@+octa t;
  t=trans;@+ t.l &= -8; /* zero out the protection bits */
  return oplus(t,oand(virt,page_mask)); /* add the offset within the page */
}
|
4314 |
|
|
|
4315 |
|
|
@ Cheap (and slow) versions of \MMIX\ leave the page table calculations
|
4316 |
|
|
to software. If the global variable |no_hardware_PT| is set true,
|
4317 |
|
|
|fill_from_virt| begins its actions in state~1, not state~0. (See the
|
4318 |
|
|
|RESUME_TRANS| operation.)
|
4319 |
|
|
|
4320 |
|
|
@<External variables@>=
Extern bool no_hardware_PT; /* are page tables left to software? */
|
4322 |
|
|
|
4323 |
|
|
@ Note: The operating system is supposed to ensure that changes to the page
|
4324 |
|
|
table entries do not appear in the pipeline when a translation cache is being
|
4325 |
|
|
updated. The internal \.{LDPTP} and \.{LDPTE} instructions use only the
|
4326 |
|
|
``hot state'' of the memory system.
|
4327 |
|
|
@^operating system@>
|
4328 |
|
|
|
4329 |
|
|
@<Start up auxiliary coroutines to compute the page table entry@>=
aaaaa=data->y.o;
i=aaaaa.h>>29; /* the segment number */
aaaaa.h&=0x1fffffff; /* the address within segment $i$ */
aaaaa=shift_right(aaaaa,page_s,1); /* the page address */
for (j=0;aaaaa.l!=0 || aaaaa.h!=0; j++) { /* one 10-bit digit per level */
  co[2*j].ctl->z.o.h=0, co[2*j].ctl->z.o.l=(aaaaa.l&0x3ff)<<3;
  aaaaa=shift_right(aaaaa,10,1);
}
if (page_b[i+1]<page_b[i]+j) /* too many levels for this segment */
  ; /* nothing needs to be done, since |data->b.o| is zero */
else {
  if (j==0) j=1,co[0].ctl->z.o=zero_octa;
  @<Issue $j$ pseudo-instructions to compute a page table entry@>;
}
|
4344 |
|
|
|
4345 |
|
|
@ The first stage of coroutine $c_j$ is |co[2*j]|. It will pass the $j$th
|
4346 |
|
|
control block to the second stage, |co[2*j+1]|, which will load page table
|
4347 |
|
|
information from memory (or hopefully from the D-cache).
|
4348 |
|
|
|
4349 |
|
|
@<Issue $j$ pseudo-instructions to compute a page table entry@>=
j--;
aaaaa.l=page_r+page_b[i]+j;
co[2*j].ctl->y.p=NULL; /* the topmost |y| operand is an immediate value */
co[2*j].ctl->y.o=shift_left(aaaaa,13);
co[2*j].ctl->y.o.h+=sign_bit;
for (;;j--) {
  co[2*j].ctl->x.o=zero_octa;@+ co[2*j].ctl->x.known=false;
  co[2*j].ctl->owner=&co[2*j];
  startup(&co[2*j],1);
  if (j==0) break;
  co[2*(j-1)].ctl->y.p=&co[2*j].ctl->x;
    /* the next level takes its |y| from this level's result */
}
data->b.p=&co[0].ctl->x;
|
4363 |
|
|
|
4364 |
|
|
@ At this point the translation of the given virtual address |data->y.o| is
|
4365 |
|
|
the octabyte |data->b.o|. Its least significant three bits are the
|
4366 |
|
|
protection code~$p=p_rp_wp_x$; its page address field is scaled by~$2^s$. It
|
4367 |
|
|
is entirely zero, including the protection bits, if there was a
|
4368 |
|
|
page table failure.
|
4369 |
|
|
|
4370 |
|
|
@<Compute the new entry for |c->inbuf| and give the caller a sneak preview@>=
c->inbuf.tag=trans_key(data->y.o);
c->inbuf.data[0]=data->b.o;
if (cc) { /* the caller still wants the answer */
  cc->ctl->z.o=data->b.o;
  awaken(cc,1);
}
|
4377 |
|
|
|
4378 |
|
|
@* The write buffer. The dispatcher has arranged things so that speculative
|
4379 |
|
|
stores into memory are recorded in a doubly linked list leading upward from
|
4380 |
|
|
|mem|. When such instructions finally are committed, they enter the ``write
|
4381 |
|
|
buffer,'' which holds octabytes that are ready to be written into designated
|
4382 |
|
|
physical memory addresses (or into the D-cache and/or S-cache). The ``hot
|
4383 |
|
|
state'' of the computation is reflected not only by the registers and caches
|
4384 |
|
|
but also by the instructions that are pending in the write buffer.
|
4385 |
|
|
|
4386 |
|
|
@<Type definitions@>=
typedef struct{
  octa o; /* data to be stored */
  octa addr; /* its physical address */
  tetra stamp; /* when last committed (mod $2^{32}$) */
  internal_opcode i; /* is this write special? */
} write_node;
|
4393 |
|
|
|
4394 |
|
|
@ We represent the buffer in the usual way as a circular list, with elements
|
4395 |
|
|
|write_tail+1|, |write_tail+2|, \dots,~|write_head|.
|
4396 |
|
|
|
4397 |
|
|
The data will sit at least |holding_time| cycles before it leaves
|
4398 |
|
|
the write buffer. This speeds things up when different fields of the same
|
4399 |
|
|
octabyte are being stored by different instructions.
|
4400 |
|
|
|
4401 |
|
|
@<External variables@>=
Extern write_node *wbuf_bot, *wbuf_top;
 /* least and greatest write buffer nodes */
Extern write_node *write_head, *write_tail;
 /* front and rear of the write buffer */
Extern lockvar wbuf_lock; /* is the data in |write_head| being written? */
Extern int holding_time; /* minimum holding time */
Extern lockvar speed_lock; /* should we ignore |holding_time|? */

@ @<Global variables@>=
coroutine write_co; /* coroutine that empties the write buffer */
control write_ctl; /* its control block */

@ @<Initialize everything@>=
write_co.ctl=&write_ctl;
write_co.name="Write";
write_co.stage=write_from_wbuf;
write_ctl.ptr_a=(void*)&mem;
write_ctl.go.o.l=4;
startup(&write_co,1);
write_head=write_tail=wbuf_top; /* the buffer starts out empty */
|
4422 |
|
|
|
4423 |
|
|
@ @<Internal prototypes@>=
static void print_write_buffer @,@,@[ARGS((void))@];

@ @<Subroutines@>=
static void print_write_buffer()
{
  printf("Write buffer");
  if (write_head==write_tail) printf(" (empty)\n");
  else {@+register write_node *p;
    printf(":\n");
    for (p=write_head;p!=write_tail; p=(p==wbuf_bot? wbuf_top: p-1)) {
        /* step downward, wrapping from |wbuf_bot| to |wbuf_top| */
      printf("m[");@+print_octa(p->addr);@+printf("]=");@+print_octa(p->o);
      if (p->i==stunc) printf(" unc");
      else if (p->i==sync) printf(" sync");
      printf(" (age %d)\n",ticks.l-p->stamp);
    }
  }
}
|
4441 |
|
|
|
4442 |
|
|
@ The entire present state of the pipeline computation can be visualized
|
4443 |
|
|
by printing first the write buffer, then the reorder buffer, then the
|
4444 |
|
|
fetch buffer. This shows the progression of results from oldest to youngest,
|
4445 |
|
|
from sizzling hot to ice cold.
|
4446 |
|
|
|
4447 |
|
|
@<External prototypes@>=
Extern void print_pipe @,@,@[ARGS((void))@];

@ @<External routines@>=
void print_pipe()
{
  print_write_buffer(); /* oldest results first */
  print_reorder_buffer();
  print_fetch_buffer();
}
|
4457 |
|
|
|
4458 |
|
|
@ The |write_search| routine looks to see if any instructions ahead of a given
|
4459 |
|
|
place in the |mem| list of the reorder buffer are storing into a given
|
4460 |
|
|
physical address, or if there's a pending instruction in the write buffer for
|
4461 |
|
|
that address. If so, it returns a pointer to the value to be written. If not,
|
4462 |
|
|
it returns~|NULL|. If the answer is currently unknown, because at least one
|
4463 |
|
|
possibly relevant physical address has not yet been computed, the subroutine
|
4464 |
|
|
returns the special code value~|DUNNO|.
|
4465 |
|
|
|
4466 |
|
|
The search starts at the |x.up| field of a control block for a store
|
4467 |
|
|
instruction, otherwise at the |ptr_a| field of the control block,
|
4468 |
|
|
unless |ptr_a| points to a committed instruction.
|
4469 |
|
|
|
4470 |
|
|
The |i| field in the write buffer is usually |st| or |pst|, inherited from
|
4471 |
|
|
a store or partial store command. It may also be |sync| (from \.{SYNC}~\.1
|
4472 |
|
|
or \.{SYNC}~\.3) or |stunc| (from \.{STUNC}).
|
4473 |
|
|
|
4474 |
|
|
@d DUNNO ((octa *)1) /* an impossible non-|NULL| pointer */
|
4475 |
|
|
|
4476 |
|
|
@<Internal prototypes@>=
static octa* write_search @,@,@[ARGS((control*,octa))@];

@ @<Subroutines@>=
static octa *write_search(ctl,addr)
  control *ctl; /* where the search begins */
  octa addr; /* the physical address sought */
{@+register specnode *p=(ctl->mem_x? ctl->x.up: (specnode*)ctl->ptr_a);
  register write_node *q=write_tail;
  addr.l &=-8; /* compare octabyte addresses */
  if (p==&mem) goto qloop;
  if (p > &hot->x && ctl <= hot) goto qloop; /* already committed */
  if (p < &ctl->x && (ctl <= hot || p > &hot->x)) goto qloop;
  for (; p!=&mem; p=p->up) { /* scan the |mem| list */
    if (p->addr.h==(tetra)-1) return DUNNO; /* address not yet computed */
    if ((p->addr.l&-8)==addr.l && p->addr.h==addr.h)
      return (p->known? &(p->o): DUNNO);
  }
 qloop:@+ for (;;) { /* scan the write buffer from |write_tail| upward */
    if (q==write_head) return NULL;
    if (q==wbuf_top) q=wbuf_bot;@+ else q++;
    if (q->addr.l==addr.l && q->addr.h==addr.h) return &(q->o);
  }
}
|
4500 |
|
|
|
4501 |
|
|
@ When we're committing new data to memory, we can update an existing item in
|
4502 |
|
|
the write buffer if it has the same physical address, unless that item is
|
4503 |
|
|
already in the process of being written out. Increasing the value of
|
4504 |
|
|
|holding_time| will increase the chance that this economy is possible, but
|
4505 |
|
|
it will also increase the number of buffered items when writes are to
|
4506 |
|
|
different locations.
|
4507 |
|
|
|
4508 |
|
|
A store instruction that sets any of the eight interrupt bits
|
4509 |
|
|
\.{rwxnkbsp} will not affect memory, even if it doesn't cause an interrupt.
|
4510 |
|
|
|
4511 |
|
|
When ``store'' is followed by ``store uncached'' at the same address,
|
4512 |
|
|
or vice versa, we believe the most recent hint.
|
4513 |
|
|
|
4514 |
|
|
@<Commit to memory if possible, otherwise |break|@>=
{@+register write_node *q=write_tail;
  if (hot->interrupt&(F_BIT+0xff)) goto done_with_write;
    /* such a store does not affect memory */
  if (hot->i!=sync) for (;;) { /* look for an item with the same address */
    if (q==write_head) break;
    if (q==wbuf_top) q=wbuf_bot;@+ else q++;
    if (q->i==sync) break;
    if (q->addr.l==hot->x.addr.l && q->addr.h==hot->x.addr.h
        && (q!=write_head || !wbuf_lock)) goto addr_found;
  }
  {@+ register write_node *p=(write_tail==wbuf_bot? wbuf_top: write_tail-1);
    if (p==write_head) break; /* the write buffer is full */
    q=write_tail;@+ write_tail=p;
    q->addr=hot->x.addr;
  }
 addr_found: q->o=hot->x.o;
  q->stamp=ticks.l;
  q->i=hot->i; /* we believe the most recent hint */
 done_with_write: spec_rem(&(hot->x));
  mem_slots++;
}
|
4535 |
|
|
|
4536 |
|
|
@ A special coroutine whose duty is to empty the write buffer is always
|
4537 |
|
|
active. It holds the |wbuf_lock| while it is writing the contents of
|
4538 |
|
|
|write_head|. It holds |Dcache->fill_lock| while waiting for the D-cache
|
4539 |
|
|
to fill a block.
|
4540 |
|
|
|
4541 |
|
|
@=
|
4542 |
|
|
case write_from_wbuf:
|
4543 |
|
|
p=(cacheblock*)data->ptr_b;
|
4544 |
|
|
switch(data->state) {
|
4545 |
|
|
case 4: @;
|
4546 |
|
|
data->state=5;
|
4547 |
|
|
case 5:@+if (write_head==wbuf_bot) write_head=wbuf_top;@+ else write_head--;
|
4548 |
|
|
write_restart: data->state=0;
|
4549 |
|
|
case 0:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
4550 |
|
|
if (write_head==write_tail) wait(1); /* write buffer is empty */
|
4551 |
|
|
if (write_head->i==sync) @;
|
4552 |
|
|
if (ticks.l-write_head->stamp
|
4553 |
|
|
wait(1); /* data too raw */
|
4554 |
|
|
if (!Dcache || (write_head->addr.h&0xffff0000)) goto mem_direct;
|
4555 |
|
|
/* not cached */
|
4556 |
|
|
if (Dcache->lock || (j=get_reader(Dcache)<0)) wait(1); /* D-cache busy */
|
4557 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
4558 |
|
|
@
|
4559 |
|
|
if there's a cache hit@>;
|
4560 |
|
|
data->state=((Dcache->mode&WRITE_ALLOC) && write_head->i!=stunc? 1: 3);
|
4561 |
|
|
wait(Dcache->access_time);
|
4562 |
|
|
case 1: @addr|
|
4563 |
|
|
into the D-cache@>;
|
4564 |
|
|
data->state=2;@+sleep;
|
4565 |
|
|
case 2: data->state=0;@+sleep; /* wake up when the D-cache has the block */
|
4566 |
|
|
case 3: @;
|
4567 |
|
|
mem_direct: @;
|
4568 |
|
|
}
|
4569 |
|
|
|
4570 |
|
|
@ @=
|
4571 |
|
|
register cacheblock *p,*q;
|
4572 |
|
|
|
4573 |
|
|
@ The granularity is guaranteed to be 8 in write-around mode
|
4574 |
|
|
(see |MMIX_config|). Although an uncached store will not be stored in the
|
4575 |
|
|
D-cache (unless it hits in the D-cache), it will go into a secondary cache.
|
4576 |
|
|
|
4577 |
|
|
@=
|
4578 |
|
|
if (Dcache->flusher.next) wait(1);
|
4579 |
|
|
Dcache->outbuf.tag.h=write_head->addr.h;
|
4580 |
|
|
Dcache->outbuf.tag.l=write_head->addr.l&(-Dcache->bb);
|
4581 |
|
|
for (j=0;jbb>>Dcache->g;j++) Dcache->outbuf.dirty[j]=false;
|
4582 |
|
|
Dcache->outbuf.data[(write_head->addr.l&(Dcache->bb-1))>>3]=write_head->o;
|
4583 |
|
|
Dcache->outbuf.dirty[(write_head->addr.l&(Dcache->bb-1))>>Dcache->g]=true;
|
4584 |
|
|
set_lock(self,wbuf_lock);
|
4585 |
|
|
startup(&Dcache->flusher,Dcache->copy_out_time);
|
4586 |
|
|
data->state=5;@+ wait(Dcache->copy_out_time);
|
4587 |
|
|
|
4588 |
|
|
@ @=
|
4589 |
|
|
if (mem_lock) wait(1);
|
4590 |
|
|
set_lock(self,wbuf_lock);
|
4591 |
|
|
set_lock(&mem_locker,mem_lock); /* a coroutine of type |vanish| */
|
4592 |
|
|
startup(&mem_locker,mem_addr_time+mem_write_time);
|
4593 |
|
|
mem_write(write_head->addr,write_head->o);
|
4594 |
|
|
data->state=5;@+ wait(mem_addr_time+mem_write_time);
|
4595 |
|
|
|
4596 |
|
|
@ A subtlety needs to be mentioned here: While we're trying to
|
4597 |
|
|
update the D-cache, another instruction might be filling the
|
4598 |
|
|
same cache block (although not because of the same physical address).
|
4599 |
|
|
Therefore we |goto write_restart| here instead of saying |wait(1)|.
|
4600 |
|
|
|
4601 |
|
|
@addr| into the D-cache@>=
|
4602 |
|
|
if (Dcache->filler.next) goto write_restart;
|
4603 |
|
|
if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto write_restart;
|
4604 |
|
|
p=alloc_slot(Dcache,write_head->addr);
|
4605 |
|
|
if (!p) goto write_restart;
|
4606 |
|
|
if (Scache) set_lock(&Dcache->filler,Scache->lock)@;
|
4607 |
|
|
else set_lock(&Dcache->filler,mem_lock);
|
4608 |
|
|
set_lock(self,Dcache->fill_lock);
|
4609 |
|
|
data->ptr_b=Dcache->filler_ctl.ptr_b=(void *)p;
|
4610 |
|
|
Dcache->filler_ctl.z.o=write_head->addr;
|
4611 |
|
|
startup(&Dcache->filler,Scache? Scache->access_time: mem_addr_time);
|
4612 |
|
|
|
4613 |
|
|
@ Here it is assumed that |Dcache->access_time| is enough to search
|
4614 |
|
|
the D-cache and update one octabyte in case of a hit. The D-cache is
|
4615 |
|
|
not locked, since other coroutines that might be simultaneously reading
|
4616 |
|
|
the D-cache are not going to use the octabyte that changes.
|
4617 |
|
|
Perhaps the simulator is being too lenient here.
|
4618 |
|
|
|
4619 |
|
|
@=
|
4620 |
|
|
p=cache_search(Dcache,write_head->addr);
|
4621 |
|
|
if (p) {
|
4622 |
|
|
p=use_and_fix(Dcache,p);
|
4623 |
|
|
set_lock(self,wbuf_lock);
|
4624 |
|
|
data->ptr_b=(void *)p;
|
4625 |
|
|
p->data[(write_head->addr.l&(Dcache->bb-1))>>3]=write_head->o;
|
4626 |
|
|
p->dirty[(write_head->addr.l&(Dcache->bb-1))>>Dcache->g]=true;
|
4627 |
|
|
data->state=4;@+ wait(Dcache->access_time);
|
4628 |
|
|
}
|
4629 |
|
|
|
4630 |
|
|
@ @=
|
4631 |
|
|
if ((Dcache->mode&WRITE_BACK)==0) { /* write-through */
|
4632 |
|
|
if (Dcache->flusher.next) wait(1);
|
4633 |
|
|
flush_cache(Dcache,p,true);
|
4634 |
|
|
}
|
4635 |
|
|
|
4636 |
|
|
@ @=
|
4637 |
|
|
{
|
4638 |
|
|
set_lock(self,wbuf_lock);
|
4639 |
|
|
data->state=5;
|
4640 |
|
|
wait(1);
|
4641 |
|
|
}
|
4642 |
|
|
|
4643 |
|
|
@* Loading and storing. A RISC machine is often said to have a ``load/store
|
4644 |
|
|
architecture,'' perhaps because loading and storing are among the most
|
4645 |
|
|
difficult things a RISC machine is called upon to do.
|
4646 |
|
|
|
4647 |
|
|
We want memory accesses
|
4648 |
|
|
to be efficient, so we try to access the D-cache at the same time as we are
|
4649 |
|
|
translating a virtual address via the DT-cache. Usually we hit in both
|
4650 |
|
|
caches, but numerous cases must be dealt with when we miss. Is there
|
4651 |
|
|
an elegant way to handle all the contingencies? Alas, the author of this
|
4652 |
|
|
program was unable to think of anything better than to throw lots
|
4653 |
|
|
of code at the problem --- knowing full well that such a spaghetti-like
|
4654 |
|
|
approach is fraught with possibilities for error.
|
4655 |
|
|
|
4656 |
|
|
Instructions like \.{LDO} $x,y,z$ operate in two pipeline stages. The first
|
4657 |
|
|
stage computes the virtual address $y+z$, waiting if necessary until $y$
|
4658 |
|
|
and~$z$ are both known; then it starts to access the necessary caches.
|
4659 |
|
|
In the second stage we ascertain the corresponding physical address and
|
4660 |
|
|
hopefully find the data in the cache (or in the speculative |mem| list or the
|
4661 |
|
|
write buffer).
|
4662 |
|
|
|
4663 |
|
|
An instruction like \.{STB} $x,y,z$ shares some of the computation of
|
4664 |
|
|
\.{LDO}~$x,y,z$, because only one byte is being stored but the other seven
|
4665 |
|
|
bytes must be found in the cache. In this case, however, $x$~is treated as an
|
4666 |
|
|
input, and |mem| is the output. The second stage of a store command can begin
|
4667 |
|
|
even though $x$ is not known during the first stage.
|
4668 |
|
|
|
4669 |
|
|
Here's what we do at the beginning of stage~1.
|
4670 |
|
|
|
4671 |
|
|
@d ld_st_launch 7 /* state when load/store command has its memory address */
|
4672 |
|
|
|
4673 |
|
|
@=
|
4674 |
|
|
case preld: case prest: case prego:
|
4675 |
|
|
data->z.o=incr(data->z.o,data->xx&-(data->i==prego? Icache: Dcache)->bb);
|
4676 |
|
|
/* (I hope the adder is fast enough) */
|
4677 |
|
|
case ld: case ldunc: case ldvts:
|
4678 |
|
|
case st: case pst: case syncd: case syncid:
|
4679 |
|
|
start_ld_st: data->y.o=oplus(data->y.o,data->z.o);
|
4680 |
|
|
data->state=ld_st_launch;@+ goto switch1;
|
4681 |
|
|
case ldptp: case ldpte:@+if (data->y.o.h) goto start_ld_st;
|
4682 |
|
|
data->x.o=zero_octa;@+ data->x.known=true;@+ goto die; /* page table fault */
|
4683 |
|
|
|
4684 |
|
|
@ @d PRW_BITS (data->ii==pst? PR_BIT+PW_BIT:
|
4685 |
|
|
(data->i==syncid && (data->loc.h&sign_bit))? 0: PW_BIT)
|
4686 |
|
|
|
4687 |
|
|
@=
|
4688 |
|
|
case ld_st_launch:@+if ((self+1)->next)
|
4689 |
|
|
wait(1); /* second stage must be clear */
|
4690 |
|
|
@;
|
4691 |
|
|
if (data->y.o.h&sign_bit)
|
4692 |
|
|
@;
|
4693 |
|
|
if (page_bad) {
|
4694 |
|
|
if (data->i==st || (data->ii>syncid))
|
4695 |
|
|
data->interrupt|=PRW_BITS;
|
4696 |
|
|
goto fin_ex;
|
4697 |
|
|
}
|
4698 |
|
|
if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
|
4699 |
|
|
startup(&DTcache->reader[j],DTcache->access_time);
|
4700 |
|
|
@;
|
4701 |
|
|
pass_after(DTcache->access_time);@+ goto passit;
|
4702 |
|
|
|
4703 |
|
|
@ When stage 2 of a load/store command begins, the state will depend
|
4704 |
|
|
on what transpired in stage~1.
|
4705 |
|
|
For example, |data->state| will be |DT_miss| if the virtual address key
|
4706 |
|
|
can't be found in the DT-cache; then stage~2 will have to compute the
|
4707 |
|
|
physical address the hard way.
|
4708 |
|
|
|
4709 |
|
|
The |data->state| will be |DT_hit| if
|
4710 |
|
|
the physical address is known via the DT-cache, but the data may or may not
|
4711 |
|
|
be in the D-cache. The |data->state| will be |hit_and_miss| if the DT-cache
|
4712 |
|
|
hits and the D-cache doesn't. And |data->state| will be |ld_ready| if
|
4713 |
|
|
|data->x.o| is the desired octabyte (for example, if both caches hit).
|
4714 |
|
|
|
4715 |
|
|
@d DT_miss 10 /* second stage |state| when DT-cache doesn't hold the key */
|
4716 |
|
|
@d DT_hit 11 /* second stage |state| when physical address is known */
|
4717 |
|
|
@d hit_and_miss 12 /* second stage |state| when D-cache misses */
|
4718 |
|
|
@d ld_ready 13 /* second stage |state| when data has been read */
|
4719 |
|
|
@d st_ready 14 /* second stage |state| when data needn't be read */
|
4720 |
|
|
@d prest_win 15 /* second stage |state| when we can fill a block with zeroes */
|
4721 |
|
|
|
4722 |
|
|
@=
|
4723 |
|
|
p=cache_search(DTcache,trans_key(data->y.o));
|
4724 |
|
|
if (!Dcache || Dcache->lock || (j=get_reader(Dcache))<0 ||
|
4725 |
|
|
(data->i>=st && data->i<=syncid))
|
4726 |
|
|
@;
|
4727 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
4728 |
|
|
if (p) @@;
|
4729 |
|
|
else data->state=DT_miss;
|
4730 |
|
|
|
4731 |
|
|
@ We assume that it is possible to look up a virtual address in the DT-cache
|
4732 |
|
|
at the same time as we look for a corresponding physical address in the
|
4733 |
|
|
D-cache, provided that the lower $b+c$ bits of the two addresses are the same.
|
4734 |
|
|
(They will always be the same if |b+c<=page_s|; otherwise the operating system
|
4735 |
|
|
can try to make them the same by ``page coloring'' whenever possible.) If both
|
4736 |
|
|
caches hit, the physical address is known in
|
4737 |
|
|
@^page coloring@>
|
4738 |
|
|
max(|DTcache->access_time,Dcache->access_time|) cycles.
|
4739 |
|
|
|
4740 |
|
|
If the lower $b+c$ bits of the virtual and physical addresses differ,
|
4741 |
|
|
the machine will not know this until the DT-cache has hit.
|
4742 |
|
|
Therefore we simulate the operation of accessing the D-cache, but we go to
|
4743 |
|
|
|DT_hit| instead of to |hit_and_miss| because the D-cache will
|
4744 |
|
|
experience a spurious miss.
|
4745 |
|
|
|
4746 |
|
|
@d max(x,y) ((x)<(y)? (y):(x))
|
4747 |
|
|
|
4748 |
|
|
@=
|
4749 |
|
|
/* The DT-cache has hit (|p| is nonnull): translate the address and, unless
   the write buffer already answers the question, look in the D-cache too. */
{@+octa *m;
  @;
      /* NOTE(review): the section reference that originally stood on the
         line above appears to have been lost in this copy */
  data->z.o=phys_addr(data->y.o,p->data[0]);
  m=write_search(data,data->z.o);
  if (m==DUNNO) data->state=DT_hit;
  else if (m) data->x.o=*m, data->state=ld_ready;
  else if (Dcache->b+Dcache->c>page_s &&@|
      ((data->y.o.l^data->z.o.l)&((Dcache->bb<<Dcache->c)-(1<<page_s))))
    data->state=DT_hit; /* spurious D-cache lookup */
        /* the mask selects the address bits from position |page_s| up to
           $b+c$; if virtual and physical addresses differ there, the
           D-cache was probed with the wrong index */
  else {
    q=cache_search(Dcache,data->z.o);
    if (q) {
      if (data->i==ldunc) q=demote_and_fix(Dcache,q);
      else q=use_and_fix(Dcache,q);
      data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3];
      data->state=ld_ready;
    }@+else data->state=hit_and_miss;
  }
  pass_after(max(DTcache->access_time,Dcache->access_time));
  goto passit;
}
|
4770 |
|
|
|
4771 |
|
|
@ The protection bits $p_rp_wp_x$ in a translation cache are shifted
|
4772 |
|
|
four positions right from the interrupt codes |PR_BIT|, |PW_BIT|, |PX_BIT|.
|
4773 |
|
|
If the data is protected, we abort the load/store operation immediately;
|
4774 |
|
|
this protects the privacy of other users.
|
4775 |
|
|
|
4776 |
|
|
@=
|
4777 |
|
|
p=use_and_fix(DTcache,p);
|
4778 |
|
|
j=PRW_BITS;
|
4779 |
|
|
if (((p->data[0].l<
|
4780 |
|
|
if (data->i==syncd || data->i==syncid) goto sync_check;
|
4781 |
|
|
if (data->i!=preld && data->i!=prest)
|
4782 |
|
|
data->interrupt|=j&~(p->data[0].l<
|
4783 |
|
|
goto fin_ex;
|
4784 |
|
|
}
|
4785 |
|
|
|
4786 |
|
|
@ @=
|
4787 |
|
|
{@+octa *m;
|
4788 |
|
|
if (p) {
|
4789 |
|
|
@;
|
4790 |
|
|
data->z.o=phys_addr(data->y.o,p->data[0]);
|
4791 |
|
|
if (data->i>=st && data->i<=syncid) data->state=st_ready;
|
4792 |
|
|
else {
|
4793 |
|
|
m=write_search(data,data->z.o);
|
4794 |
|
|
if (m && m!= DUNNO) data->x.o=*m, data->state=ld_ready;
|
4795 |
|
|
else data->state=DT_hit;
|
4796 |
|
|
}
|
4797 |
|
|
}@+ else data->state=DT_miss;
|
4798 |
|
|
pass_after(DTcache->access_time);@+ goto passit;
|
4799 |
|
|
}
|
4800 |
|
|
|
4801 |
|
|
@ @=
|
4802 |
|
|
{@+octa *m;
|
4803 |
|
|
if (!(data->loc.h&sign_bit)) {
|
4804 |
|
|
if (data->i==syncd || data->i==syncid) goto sync_check;
|
4805 |
|
|
if (data->i!=preld && data->i!=prest) data->interrupt |= N_BIT;
|
4806 |
|
|
goto fin_ex;
|
4807 |
|
|
}
|
4808 |
|
|
data->z.o=data->y.o;@+ data->z.o.h -= sign_bit;
|
4809 |
|
|
if (data->i>=st && data->i<=syncid) {
|
4810 |
|
|
data->state=st_ready;@+pass_after(1);@+goto passit;
|
4811 |
|
|
}
|
4812 |
|
|
m=write_search(data,data->z.o);
|
4813 |
|
|
if (m) {
|
4814 |
|
|
if (m==DUNNO) data->state=DT_hit;
|
4815 |
|
|
else data->x.o=*m, data->state=ld_ready;
|
4816 |
|
|
}@+ else if ((data->z.o.h&0xffff0000) || !Dcache) {
|
4817 |
|
|
if (mem_lock) wait(1);
|
4818 |
|
|
set_lock(&mem_locker,mem_lock);
|
4819 |
|
|
data->x.o=mem_read(data->z.o);
|
4820 |
|
|
data->state=ld_ready;
|
4821 |
|
|
startup(&mem_locker,mem_addr_time+mem_read_time);
|
4822 |
|
|
pass_after(mem_addr_time+mem_read_time);@+ goto passit;
|
4823 |
|
|
}
|
4824 |
|
|
if (Dcache->lock || (j=get_reader(Dcache))<0) {
|
4825 |
|
|
data->state=DT_hit;@+pass_after(1);@+ goto passit;
|
4826 |
|
|
}
|
4827 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
4828 |
|
|
q=cache_search(Dcache,data->z.o);
|
4829 |
|
|
if (q) {
|
4830 |
|
|
if (data->i==ldunc) q=demote_and_fix(Dcache,q);
|
4831 |
|
|
else q=use_and_fix(Dcache,q);
|
4832 |
|
|
data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3];
|
4833 |
|
|
data->state=ld_ready;
|
4834 |
|
|
}@+else data->state=hit_and_miss;
|
4835 |
|
|
pass_after(Dcache->access_time);@+ goto passit;
|
4836 |
|
|
}
|
4837 |
|
|
|
4838 |
|
|
@ The program for the second stage is, likewise, rather long-winded, yet quite
|
4839 |
|
|
similar to the cache manipulations we have already seen several times.
|
4840 |
|
|
|
4841 |
|
|
Several instructions might be trying to fill the DT-cache for the same page.
|
4842 |
|
|
(A similar situation faced us in the |write_from_wbuf| coroutine.)
|
4843 |
|
|
The second stage therefore needs to do some
|
4844 |
|
|
translation cache searching just as the first stage did. In this
|
4845 |
|
|
stage, however, we don't go all out for speed, because DT-cache misses
|
4846 |
|
|
are rare.
|
4847 |
|
|
|
4848 |
|
|
@d DT_retry 8 /* second stage |state| when DT-cache should be searched again */
|
4849 |
|
|
@d got_DT 9 /* second stage |state| when DT-cache entry has been computed */
|
4850 |
|
|
|
4851 |
|
|
@=
|
4852 |
|
|
square_one: data->state=DT_retry;
|
4853 |
|
|
case DT_retry:@+if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
|
4854 |
|
|
startup(&DTcache->reader[j],DTcache->access_time);
|
4855 |
|
|
p=cache_search(DTcache,trans_key(data->y.o));
|
4856 |
|
|
if (p) {
|
4857 |
|
|
@;
|
4858 |
|
|
data->z.o=phys_addr(data->y.o,p->data[0]);
|
4859 |
|
|
if (data->i>=st && data->i<=syncid) data->state=st_ready;
|
4860 |
|
|
else data->state=DT_hit;
|
4861 |
|
|
}@+ else data->state=DT_miss;
|
4862 |
|
|
wait(DTcache->access_time);
|
4863 |
|
|
case DT_miss:@+if (DTcache->filler.next)
|
4864 |
|
|
if (data->i==preld || data->i==prest) goto fin_ex;@+ else goto square_one;
|
4865 |
|
|
if (no_hardware_PT)
|
4866 |
|
|
if (data->i==preld || data->i==prest) goto fin_ex;@+else goto emulate_virt;
|
4867 |
|
|
p=alloc_slot(DTcache,trans_key(data->y.o));
|
4868 |
|
|
if (!p) goto square_one;
|
4869 |
|
|
data->ptr_b=DTcache->filler_ctl.ptr_b=(void *)p;
|
4870 |
|
|
DTcache->filler_ctl.y.o=data->y.o;
|
4871 |
|
|
set_lock(self,DTcache->fill_lock);
|
4872 |
|
|
startup(&DTcache->filler,1);
|
4873 |
|
|
data->state=got_DT;
|
4874 |
|
|
if (data->i==preld || data->i==prest) goto fin_ex;@+else sleep;
|
4875 |
|
|
case got_DT: release_lock(self,DTcache->fill_lock);
|
4876 |
|
|
j=PRW_BITS;
|
4877 |
|
|
if (((data->z.o.l<
|
4878 |
|
|
if (data->i==syncd || data->i==syncid) goto sync_check;
|
4879 |
|
|
data->interrupt |= j&~(data->z.o.l<
|
4880 |
|
|
goto fin_ex;
|
4881 |
|
|
}
|
4882 |
|
|
data->z.o=phys_addr(data->y.o,data->z.o);
|
4883 |
|
|
if (data->i>=st && data->i<=syncid) goto finish_store;
|
4884 |
|
|
/* otherwise we fall through to |ld_retry| below */
|
4885 |
|
|
|
4886 |
|
|
@ The second stage might also want to fill the D-cache (and perhaps
|
4887 |
|
|
the S-cache) as we get the data.
|
4888 |
|
|
|
4889 |
|
|
Several load instructions might be trying to fill the same cache block.
|
4890 |
|
|
So we should go back and look in the D-cache again if we miss and
|
4891 |
|
|
cannot allocate a slot immediately.
|
4892 |
|
|
|
4893 |
|
|
A \.{PRELD} or \.{PREST} instruction, which is just a ``hint,'' doesn't do
|
4894 |
|
|
anything more if the caches are already busy.
|
4895 |
|
|
|
4896 |
|
|
@=
|
4897 |
|
|
ld_retry: data->state=DT_hit;
|
4898 |
|
|
case DT_hit:@+ if (data->i==preld || data->i==prest) goto fin_ex;
|
4899 |
|
|
@;
|
4900 |
|
|
if ((data->z.o.h&0xffff0000) || !Dcache)
|
4901 |
|
|
@;
|
4902 |
|
|
if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
|
4903 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
4904 |
|
|
q=cache_search(Dcache,data->z.o);
|
4905 |
|
|
if (q) {
|
4906 |
|
|
if (data->i==ldunc) q=demote_and_fix(Dcache,q);
|
4907 |
|
|
else q=use_and_fix(Dcache,q);
|
4908 |
|
|
data->x.o=q->data[(data->z.o.l&(Dcache->bb-1))>>3];
|
4909 |
|
|
data->state=ld_ready;
|
4910 |
|
|
}@+else data->state=hit_and_miss;
|
4911 |
|
|
wait(Dcache->access_time);
|
4912 |
|
|
case hit_and_miss:@+if (data->i==ldunc) goto avoid_D;
|
4913 |
|
|
@z.o| in the D-cache@>;
|
4914 |
|
|
|
4915 |
|
|
@ @z.o| in the D-cache@>=
|
4916 |
|
|
@;
|
4917 |
|
|
if (Dcache->filler.next) goto ld_retry;
|
4918 |
|
|
if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto ld_retry;
|
4919 |
|
|
q=alloc_slot(Dcache,data->z.o);
|
4920 |
|
|
if (!q) goto ld_retry;
|
4921 |
|
|
if (Scache) set_lock(&Dcache->filler,Scache->lock)@;
|
4922 |
|
|
else set_lock(&Dcache->filler,mem_lock);
|
4923 |
|
|
set_lock(self,Dcache->fill_lock);
|
4924 |
|
|
data->ptr_b=Dcache->filler_ctl.ptr_b=(void *)q;
|
4925 |
|
|
Dcache->filler_ctl.z.o=data->z.o;
|
4926 |
|
|
startup(&Dcache->filler,Scache? Scache->access_time: mem_addr_time);
|
4927 |
|
|
data->state=ld_ready;
|
4928 |
|
|
if (data->i==preld || data->i==prest) goto fin_ex;@+else sleep;
|
4929 |
|
|
|
4930 |
|
|
@ If a |prest| instruction makes it to the hot seat,
|
4931 |
|
|
we have been assured by the user of |PREST| that the current
|
4932 |
|
|
values of bytes in virtual addresses |data->y.o-(data->xx&-Dcache->bb)| through
|
4933 |
|
|
|data->y.o+(data->xx&(Dcache->bb-1))|
|
4934 |
|
|
are irrelevant. Hence we can pretend that we know they are zero. This
|
4935 |
|
|
is advantageous if it saves us from filling a cache block from
|
4936 |
|
|
the S-cache or from memory.
|
4937 |
|
|
|
4938 |
|
|
@=
|
4939 |
|
|
if (data->i==prest &&@|
|
4940 |
|
|
(data->xx>=Dcache->bb || ((data->y.o.l&(Dcache->bb-1))==0)) &&@|
|
4941 |
|
|
((data->y.o.l+(data->xx&(Dcache->bb-1))+1)^data->y.o.l)>=Dcache->bb)
|
4942 |
|
|
goto prest_span;
|
4943 |
|
|
|
4944 |
|
|
@ @=
|
4945 |
|
|
prest_span: data->state=prest_win;
|
4946 |
|
|
case prest_win:@+ if (data!=old_hot || Dlocker.next) wait(1);
|
4947 |
|
|
if (Dcache->lock) goto fin_ex;
|
4948 |
|
|
q=alloc_slot(Dcache,data->z.o); /* OK if |Dcache->filler| is busy */
|
4949 |
|
|
if (q) {
|
4950 |
|
|
clean_block(Dcache,q);
|
4951 |
|
|
q->tag=data->z.o;@+q->tag.l &=-Dcache->bb;
|
4952 |
|
|
set_lock(&Dlocker,Dcache->lock);
|
4953 |
|
|
startup(&Dlocker,Dcache->copy_in_time);
|
4954 |
|
|
}
|
4955 |
|
|
goto fin_ex;
|
4956 |
|
|
|
4957 |
|
|
@ @=
|
4958 |
|
|
{
|
4959 |
|
|
avoid_D:@+ if (mem_lock) wait(1);
|
4960 |
|
|
set_lock(&mem_locker,mem_lock);
|
4961 |
|
|
startup(&mem_locker, mem_addr_time+mem_read_time);
|
4962 |
|
|
data->x.o=mem_read(data->z.o);
|
4963 |
|
|
data->state=ld_ready;@+ wait(mem_addr_time+mem_read_time);
|
4964 |
|
|
}
|
4965 |
|
|
|
4966 |
|
|
@ @=
|
4967 |
|
|
{
|
4968 |
|
|
octa *m=write_search(data,data->z.o);
|
4969 |
|
|
if (m==DUNNO) wait(1);
|
4970 |
|
|
if (m) {
|
4971 |
|
|
data->x.o=*m;
|
4972 |
|
|
data->state=ld_ready;
|
4973 |
|
|
wait(1);
|
4974 |
|
|
}
|
4975 |
|
|
}
|
4976 |
|
|
|
4977 |
|
|
@ The requested octabyte will arrive sooner or later in |data->x.o|.
|
4978 |
|
|
Then a load instruction is almost done, except that we might need
|
4979 |
|
|
to massage the input a little bit.
|
4980 |
|
|
|
4981 |
|
|
@=
|
4982 |
|
|
case ld_ready:@+if (self->lockloc)
|
4983 |
|
|
*(self->lockloc)=NULL, self->lockloc=NULL;
|
4984 |
|
|
if (data->i>=st) goto finish_store;
|
4985 |
|
|
switch(data->op>>1) {
|
4986 |
|
|
case LDB>>1: case LDBU>>1: j=(data->z.o.l&0x7)<<3;@+i=56;@+goto fin_ld;
|
4987 |
|
|
case LDW>>1: case LDWU>>1: j=(data->z.o.l&0x6)<<3;@+i=48;@+goto fin_ld;
|
4988 |
|
|
case LDT>>1: case LDTU>>1: j=(data->z.o.l&0x4)<<3;@+i=32;
|
4989 |
|
|
fin_ld: data->x.o=shift_right(shift_left(data->x.o,j),i,data->op&0x2);
|
4990 |
|
|
default: goto fin_ex;
|
4991 |
|
|
case LDHT>>1:@+if (data->z.o.l&4) data->x.o.h=data->x.o.l;
|
4992 |
|
|
data->x.o.l=0;@+ goto fin_ex;
|
4993 |
|
|
case LDSF>>1:@+if (data->z.o.l&4) data->x.o.h=data->x.o.l;
|
4994 |
|
|
if ((data->x.o.h&0x7f800000)==0 && (data->x.o.h&0x7fffff)) {
|
4995 |
|
|
data->x.o=load_sf(data->x.o.h);
|
4996 |
|
|
data->state=3;@+wait(denin_penalty);
|
4997 |
|
|
}
|
4998 |
|
|
else data->x.o=load_sf(data->x.o.h);@+goto fin_ex;
|
4999 |
|
|
case LDPTP>>1:@+
|
5000 |
|
|
if ((data->x.o.h&sign_bit)==0 || (data->x.o.l&0x1ff8)!=page_n)
|
5001 |
|
|
data->x.o=zero_octa;
|
5002 |
|
|
else data->x.o.l &= -(1<<13);
|
5003 |
|
|
goto fin_ex;
|
5004 |
|
|
case LDPTE>>1:@+if ((data->x.o.l&0x1ff8)!=page_n) data->x.o=zero_octa;
|
5005 |
|
|
else data->x.o=incr(oandn(data->x.o,page_mask),data->x.o.l&0x7);
|
5006 |
|
|
data->x.o.h &= 0xffff;@+ goto fin_ex;
|
5007 |
|
|
case UNSAVE>>1: @;
|
5008 |
|
|
}
|
5009 |
|
|
|
5010 |
|
|
@ @=
|
5011 |
|
|
finish_store: data->state=st_ready;
|
5012 |
|
|
case st_ready:@+ switch (data->i) {
|
5013 |
|
|
case st: case pst: @;
|
5014 |
|
|
case syncd: data->b.o.l=(Dcache? Dcache->bb: 8192);@+goto do_syncd;
|
5015 |
|
|
case syncid: data->b.o.l=(Icache? Icache->bb: 8192);
|
5016 |
|
|
if (Dcache && Dcache->bbb.o.l) data->b.o.l=Dcache->bb;
|
5017 |
|
|
goto do_syncid;
|
5018 |
|
|
}
|
5019 |
|
|
|
5020 |
|
|
@ Store instructions have an extra complication, because some of them need
|
5021 |
|
|
to check for overflow.
|
5022 |
|
|
|
5023 |
|
|
@=
|
5024 |
|
|
data->x.addr=data->z.o;
|
5025 |
|
|
if (data->b.p) wait(1);
|
5026 |
|
|
switch(data->op>>1) {
|
5027 |
|
|
case STUNC>>1: data->i=stunc;
|
5028 |
|
|
default: data->x.o=data->b.o;@+goto fin_ex;
|
5029 |
|
|
case STSF>>1: set_round;@+ data->b.o.h=store_sf(data->b.o);
|
5030 |
|
|
data->interrupt |= exceptions;
|
5031 |
|
|
if ((data->b.o.h&0x7f800000)==0 && (data->b.o.h&0x7fffff)) {
|
5032 |
|
|
if (data->z.o.l&4) data->x.o.l=data->b.o.h;
|
5033 |
|
|
else data->x.o.h=data->b.o.h;
|
5034 |
|
|
data->state=3;@+wait(denout_penalty);
|
5035 |
|
|
}
|
5036 |
|
|
case STHT>>1:@+if (data->z.o.l&4) data->x.o.l=data->b.o.h;
|
5037 |
|
|
else data->x.o.h=data->b.o.h;
|
5038 |
|
|
goto fin_ex;
|
5039 |
|
|
case STB>>1: case STBU>>1: j=(data->z.o.l&0x7)<<3;@+i=56;@+goto fin_st;
|
5040 |
|
|
case STW>>1: case STWU>>1: j=(data->z.o.l&0x6)<<3;@+i=48;@+goto fin_st;
|
5041 |
|
|
case STT>>1: case STTU>>1: j=(data->z.o.l&0x4)<<3;@+i=32;
|
5042 |
|
|
fin_st: @b.o| into the proper field of |data->x.o|,
|
5043 |
|
|
checking for arithmetic exceptions if signed@>;
|
5044 |
|
|
goto fin_ex;
|
5045 |
|
|
case CSWAP>>1: @;
|
5046 |
|
|
case SAVE>>1: @;
|
5047 |
|
|
}
|
5048 |
|
|
|
5049 |
|
|
@ @b.o| into the proper field...@>=
|
5050 |
|
|
{@+octa field_mask, positioned;
  /* |i| is the number of bits to discard on the left and |j| the bit offset
     of the field; both are set by the caller before |fin_st| */
  if ((data->op&2)==0) { /* signed variant: the value must fit in the field */
    octa round_trip;
    round_trip=shift_right(shift_left(data->b.o,i),i,0);
    if (round_trip.h!=data->b.o.h || round_trip.l!=data->b.o.l)
      data->interrupt|=V_BIT;
  }
  field_mask=shift_right(shift_left(neg_one,i),j,1); /* ones exactly in the field */
  positioned=shift_right(shift_left(data->b.o,i),j,1); /* the value moved into place */
  data->b.o=positioned;
  /* keep |data->x.o| outside the field and take |positioned| inside it */
  data->x.o.h=(data->x.o.h&~field_mask.h)|(positioned.h&field_mask.h);
  data->x.o.l=(data->x.o.l&~field_mask.l)|(positioned.l&field_mask.l);
}
|
5061 |
|
|
|
5062 |
|
|
@ The \.{CSWAP} operation has four inputs $\rm(\$X, \$Y, \$Z, rP)$ as well as
|
5063 |
|
|
three outputs $\rm(\$X,M_8[A],rP)$. To keep from exceeding the capacity
|
5064 |
|
|
of the control blocks in our pipeline, we wait until this instruction reaches
|
5065 |
|
|
the hot seat, thereby allowing us non-speculative access to~rP.
|
5066 |
|
|
|
5067 |
|
|
@=
|
5068 |
|
|
if (data!=old_hot) wait(1);
|
5069 |
|
|
if (data->x.o.h==g[rP].o.h && data->x.o.l==g[rP].o.l) {
|
5070 |
|
|
data->a.o.l=1; /* |data->a.o.h| is zero */
|
5071 |
|
|
data->x.o=data->b.o;
|
5072 |
|
|
}@+else {
|
5073 |
|
|
g[rP].o=data->x.o; /* |data->a.o| is zero */
|
5074 |
|
|
if (verbose&issue_bit) {
|
5075 |
|
|
printf(" setting rP=");@+print_octa(g[rP].o);@+printf("\n");
|
5076 |
|
|
}
|
5077 |
|
|
}
|
5078 |
|
|
data->i=cswap; /* cosmetic change, affects the trace output only */
|
5079 |
|
|
goto fin_ex;
|
5080 |
|
|
|
5081 |
|
|
@* The fetch stage. Now that we've mastered the most difficult memory
|
5082 |
|
|
operations, we can relax and apply our knowledge to the slightly simpler task
|
5083 |
|
|
of filling the fetch buffer. Fetching is like loading/storing, except that we
|
5084 |
|
|
use the I-cache instead of the D-cache. It's slightly simpler because the
|
5085 |
|
|
I-cache is read-only. Further simplifications would be possible if there
|
5086 |
|
|
were no \.{PREGO} instruction, because there is only one fetch unit.
|
5087 |
|
|
However, we want to implement \.{PREGO} with reasonable efficiency, in order
|
5088 |
|
|
to see if that instruction is worthwhile; so we include the complications of
|
5089 |
|
|
simultaneous I-cache and IT-cache readers, which we
|
5090 |
|
|
have already implemented for the D-cache and DT-cache.
|
5091 |
|
|
|
5092 |
|
|
The fetch coroutine is always present, as the one and only coroutine with
|
5093 |
|
|
|stage| number~zero.
|
5094 |
|
|
|
5095 |
|
|
In normal circumstances, the fetch coroutine accesses a cache block containing
|
5096 |
|
|
the instruction whose virtual address is given by |inst_ptr| (the instruction
|
5097 |
|
|
pointer), and transfers up to |fetch_max| instructions from that block to the
|
5098 |
|
|
fetch buffer. Complications arise if the instruction isn't in the cache, or if
|
5099 |
|
|
we can't translate the virtual address because of a miss in the IT-cache.
|
5100 |
|
|
Moreover, |inst_ptr| is a \&{spec} variable whose value might not even be
|
5101 |
|
|
known; if |inst_ptr.p| is nonnull, we don't know what to fetch.
|
5102 |
|
|
@^program counter@>
|
5103 |
|
|
|
5104 |
|
|
@=
|
5105 |
|
|
Extern spec inst_ptr; /* the instruction pointer (aka program counter) */
|
5106 |
|
|
Extern octa *fetched; /* buffer for incoming instructions */
|
5107 |
|
|
|
5108 |
|
|
@ The fetch coroutine usually begins a cycle in state |fetch_ready|, with
|
5109 |
|
|
the most recently fetched octabytes in positions |fetch_lo|, |fetch_lo+1|,
|
5110 |
|
|
\dots, |fetch_hi-1| of a buffer called |fetched|. Once that buffer has been
|
5111 |
|
|
exhausted, the coroutine reverts to state~0; with luck, the buffer might have
|
5112 |
|
|
more data by the time the next cycle rolls around.
|
5113 |
|
|
|
5114 |
|
|
@=
|
5115 |
|
|
int fetch_lo, fetch_hi; /* the active region of that buffer */
|
5116 |
|
|
coroutine fetch_co;
|
5117 |
|
|
control fetch_ctl;
|
5118 |
|
|
|
5119 |
|
|
@ @=
|
5120 |
|
|
fetch_co.ctl=&fetch_ctl;
|
5121 |
|
|
fetch_co.name="Fetch";
|
5122 |
|
|
fetch_ctl.go.o.l=4;
|
5123 |
|
|
startup(&fetch_co,1);
|
5124 |
|
|
|
5125 |
|
|
@ @=
|
5126 |
|
|
if (fetch_co.lockloc) *(fetch_co.lockloc)=NULL,fetch_co.lockloc=NULL;
|
5127 |
|
|
unschedule(&fetch_co);
|
5128 |
|
|
startup(&fetch_co,1);
|
5129 |
|
|
|
5130 |
|
|
@ Some of the actions here are done not only by the fetcher but also by the
|
5131 |
|
|
first and second stages of a |prego| operation.
|
5132 |
|
|
|
5133 |
|
|
@d wait_or_pass(t) if (data->i==prego) {@+pass_after(t);@+goto passit;@+}
|
5134 |
|
|
else wait(t)
|
5135 |
|
|
|
5136 |
|
|
@=
|
5137 |
|
|
switch0:@+ switch(data->state) {
|
5138 |
|
|
new_fetch: data->state=0;
|
5139 |
|
|
case 0: @;
|
5140 |
|
|
data->y.o=inst_ptr.o;
|
5141 |
|
|
data->state=1;@+ data->interrupt=0;@+ data->x.o=data->z.o=zero_octa;
|
5142 |
|
|
case 1: start_fetch:@+ if (data->y.o.h&sign_bit)
|
5143 |
|
|
@;
|
5144 |
|
|
if (page_bad) goto bad_fetch;
|
5145 |
|
|
if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
|
5146 |
|
|
startup(&ITcache->reader[j],ITcache->access_time);
|
5147 |
|
|
@;
|
5148 |
|
|
wait_or_pass(ITcache->access_time);
|
5149 |
|
|
@@;
|
5150 |
|
|
}
|
5151 |
|
|
|
5152 |
|
|
@ @=
|
5153 |
|
|
if (data->i==prego) goto start_fetch;
|
5154 |
|
|
|
5155 |
|
|
@ @=
|
5156 |
|
|
if (inst_ptr.p) {
|
5157 |
|
|
if (inst_ptr.p!=UNKNOWN_SPEC && inst_ptr.p->known)
|
5158 |
|
|
inst_ptr.o=inst_ptr.p->o, inst_ptr.p=NULL;
|
5159 |
|
|
wait(1);
|
5160 |
|
|
}
|
5161 |
|
|
|
5162 |
|
|
@ @d got_IT 19 /* |state| when IT-cache entry has been computed */
|
5163 |
|
|
@d IT_miss 20 /* |state| when IT-cache doesn't hold the key */
|
5164 |
|
|
@d IT_hit 21 /* |state| when physical instruction address is known */
|
5165 |
|
|
@d Ihit_and_miss 22 /* |state| when I-cache misses */
|
5166 |
|
|
@d fetch_ready 23 /* |state| when instructions have been read */
|
5167 |
|
|
@d got_one 24 /* |state| when a ``preview'' octabyte is ready */
|
5168 |
|
|
|
5169 |
|
|
@=
|
5170 |
|
|
p=cache_search(ITcache,trans_key(data->y.o));
|
5171 |
|
|
if (!Icache || Icache->lock || (j=get_reader(Icache))<0)
|
5172 |
|
|
@;
|
5173 |
|
|
startup(&Icache->reader[j],Icache->access_time);
|
5174 |
|
|
if (p) @@;
|
5175 |
|
|
else data->state=IT_miss;
|
5176 |
|
|
|
5177 |
|
|
@ We assume that it is possible to look up a virtual address in the IT-cache
|
5178 |
|
|
at the same time as we look for a corresponding physical address in the
|
5179 |
|
|
I-cache, provided that the lower $b+c$ bits of the two addresses are the same.
|
5180 |
|
|
(See the remarks about ``page coloring,'' when we made similar assumptions
|
5181 |
|
|
about the DT-cache and D-cache.)
|
5182 |
|
|
@^page coloring@>
|
5183 |
|
|
|
5184 |
|
|
@=
|
5185 |
|
|
/* The IT-cache has hit (|p| is nonnull): translate the address and look in
   the I-cache as well. */
{
  @;
      /* NOTE(review): the section reference that originally stood on the
         line above appears to have been lost in this copy */
  data->z.o=phys_addr(data->y.o,p->data[0]);
  if (Icache->b+Icache->c>page_s &&@|
      ((data->y.o.l^data->z.o.l)&((Icache->bb<<Icache->c)-(1<<page_s))))
    data->state=IT_hit; /* spurious I-cache lookup */
        /* the mask selects the address bits from position |page_s| up to
           $b+c$; if virtual and physical addresses differ there, the
           I-cache was probed with the wrong index */
  else {
    q=cache_search(Icache,data->z.o);
    if (q) {
      q=use_and_fix(Icache,q);
      @;
          /* NOTE(review): a section reference appears to have been lost
             on the line above as well */
      data->state=fetch_ready;
    }@+else data->state=Ihit_and_miss;
  }
  wait_or_pass(max(ITcache->access_time,Icache->access_time));
}
|
5201 |
|
|
|
5202 |
|
|
@ @=
|
5203 |
|
|
p=use_and_fix(ITcache,p);
|
5204 |
|
|
if (!(p->data[0].l&(PX_BIT>>PROT_OFFSET))) goto bad_fetch;
|
5205 |
|
|
|
5206 |
|
|
@ At this point |inst_ptr.o| equals |data->y.o|.
|
5207 |
|
|
|
5208 |
|
|
@=
|
5209 |
|
|
if (data->i!=prego) {
|
5210 |
|
|
for (j=0;j<Icache->bb;j++) fetched[j]=q->data[j];
|
5211 |
|
|
fetch_lo=(inst_ptr.o.l&(Icache->bb-1))>>3;
|
5212 |
|
|
fetch_hi=Icache->bb>>3;
|
5213 |
|
|
}
|
5214 |
|
|
|
5215 |
|
|
@ @=
|
5216 |
|
|
{
|
5217 |
|
|
if (p) {
|
5218 |
|
|
@;
|
5219 |
|
|
data->z.o=phys_addr(data->y.o,p->data[0]);
|
5220 |
|
|
data->state=IT_hit;
|
5221 |
|
|
}@+ else data->state=IT_miss;
|
5222 |
|
|
wait_or_pass(ITcache->access_time);
|
5223 |
|
|
}
|
5224 |
|
|
|
5225 |
|
|
@ @=
|
5226 |
|
|
{
|
5227 |
|
|
if (data->i==prego && !(data->loc.h&sign_bit)) goto fin_ex;
|
5228 |
|
|
data->z.o=data->y.o;@+ data->z.o.h -= sign_bit;
|
5229 |
|
|
known_phys:@+ if (data->z.o.h&0xffff0000) goto bad_fetch;
|
5230 |
|
|
if (!Icache) @;
|
5231 |
|
|
if (Icache->lock || (j=get_reader(Icache))<0) {
|
5232 |
|
|
data->state=IT_hit;@+ wait_or_pass(1);
|
5233 |
|
|
}
|
5234 |
|
|
startup(&Icache->reader[j],Icache->access_time);
|
5235 |
|
|
q=cache_search(Icache,data->z.o);
|
5236 |
|
|
if (q) {
|
5237 |
|
|
q=use_and_fix(Icache,q);
|
5238 |
|
|
@;
|
5239 |
|
|
data->state=fetch_ready;
|
5240 |
|
|
}@+else data->state=Ihit_and_miss;
|
5241 |
|
|
wait_or_pass(Icache->access_time);
|
5242 |
|
|
}
|
5243 |
|
|
|
5244 |
|
|
@ @=
|
5245 |
|
|
{@+octa addr;
|
5246 |
|
|
addr=data->z.o;
|
5247 |
|
|
if (mem_lock) wait(1);
|
5248 |
|
|
set_lock(&mem_locker,mem_lock);
|
5249 |
|
|
startup(&mem_locker,mem_addr_time+mem_read_time);
|
5250 |
|
|
addr.l&=-(bus_words<<3);
|
5251 |
|
|
fetched[0]=mem_read(addr);
|
5252 |
|
|
for (j=1;j<bus_words;j++)
|
5253 |
|
|
fetched[j]=mem_hash[last_h].chunk[((addr.l&0xffff)>>3)+j];
|
5254 |
|
|
fetch_lo=(data->z.o.l>>3)&(bus_words-1);@+ fetch_hi=bus_words;
|
5255 |
|
|
data->state=fetch_ready;
|
5256 |
|
|
wait(mem_addr_time+mem_read_time);
|
5257 |
|
|
}
|
5258 |
|
|
|
5259 |
|
|
@ @=
|
5260 |
|
|
case IT_miss:@+if (ITcache->filler.next)
|
5261 |
|
|
if (data->i==prego) goto fin_ex;@+else wait(1);
|
5262 |
|
|
if (no_hardware_PT) @;
|
5263 |
|
|
p=alloc_slot(ITcache,trans_key(data->y.o));
|
5264 |
|
|
if (!p) /* hey, it was present after all */
|
5265 |
|
|
if (data->i==prego) goto fin_ex;@+else goto new_fetch;
|
5266 |
|
|
data->ptr_b=ITcache->filler_ctl.ptr_b=(void *)p;
|
5267 |
|
|
ITcache->filler_ctl.y.o=data->y.o;
|
5268 |
|
|
set_lock(self,ITcache->fill_lock);
|
5269 |
|
|
startup(&ITcache->filler,1);
|
5270 |
|
|
data->state=got_IT;
|
5271 |
|
|
if (data->i==prego) goto fin_ex;@+else sleep;
|
5272 |
|
|
case got_IT: release_lock(self,ITcache->fill_lock);
|
5273 |
|
|
if (!(data->z.o.l&(PX_BIT>>PROT_OFFSET))) goto bad_fetch;
|
5274 |
|
|
data->z.o=phys_addr(data->y.o,data->z.o);
|
5275 |
|
|
fetch_retry: data->state=IT_hit;
|
5276 |
|
|
case IT_hit:@+if (data->i==prego) goto fin_ex;@+else goto known_phys;
|
5277 |
|
|
case Ihit_and_miss:
|
5278 |
|
|
@<Try to get the contents of location |data->z.o| in the I-cache@>;
|
5279 |
|
|
|
5280 |
|
|
@ @=
|
5281 |
|
|
case IT_miss: case Ihit_and_miss: case IT_hit: case fetch_ready: goto switch0;
|
5282 |
|
|
|
5283 |
|
|
@ @<Try to get the contents of location |data->z.o| in the I-cache@>=
|
5284 |
|
|
if (Icache->filler.next) goto fetch_retry;
|
5285 |
|
|
if ((Scache&&Scache->lock) || (!Scache&&mem_lock)) goto fetch_retry;
|
5286 |
|
|
q=alloc_slot(Icache,data->z.o);
|
5287 |
|
|
if (!q) goto fetch_retry;
|
5288 |
|
|
if (Scache) set_lock(&Icache->filler,Scache->lock)@;
|
5289 |
|
|
else set_lock(&Icache->filler,mem_lock);
|
5290 |
|
|
set_lock(self,Icache->fill_lock);
|
5291 |
|
|
data->ptr_b=Icache->filler_ctl.ptr_b=(void *)q;
|
5292 |
|
|
Icache->filler_ctl.z.o=data->z.o;
|
5293 |
|
|
startup(&Icache->filler,Scache? Scache->access_time: mem_addr_time);
|
5294 |
|
|
data->state=got_one;
|
5295 |
|
|
if (data->i==prego) goto fin_ex;@+else sleep;
|
5296 |
|
|
|
5297 |
|
|
@ The I-cache filler will wake us up with the octabyte we want, before
|
5298 |
|
|
it has filled the entire cache block. In that case we can fetch one
|
5299 |
|
|
or two instructions before the rest of the block has been loaded.
|
5300 |
|
|
|
5301 |
|
|
@=
|
5302 |
|
|
bad_fetch:@+ if (data->i==prego) goto fin_ex;
|
5303 |
|
|
data->interrupt |= PX_BIT;
|
5304 |
|
|
swym_one: fetched[0].h=fetched[0].l=SWYM<<24;
|
5305 |
|
|
goto fetch_one;
|
5306 |
|
|
case got_one: fetched[0]=data->x.o; /* a ``preview'' of the new cache data */
|
5307 |
|
|
fetch_one: fetch_lo=0;@+fetch_hi=1;
|
5308 |
|
|
data->state=fetch_ready;
|
5309 |
|
|
case fetch_ready:@+if (self->lockloc)
|
5310 |
|
|
*(self->lockloc)=NULL, self->lockloc=NULL;
|
5311 |
|
|
if (data->i==prego) goto fin_ex;
|
5312 |
|
|
for (j=0;j<fetch_max;j++) {
|
5313 |
|
|
register fetch *new_tail;
|
5314 |
|
|
if (tail==fetch_bot) new_tail=fetch_top;
|
5315 |
|
|
else new_tail=tail-1;
|
5316 |
|
|
if (new_tail==head) break; /* fetch buffer is full */
|
5317 |
|
|
@;
|
5318 |
|
|
tail=new_tail;
|
5319 |
|
|
if (sleepy) {
|
5320 |
|
|
sleepy=false;@+ sleep;
|
5321 |
|
|
}
|
5322 |
|
|
inst_ptr.o=incr(inst_ptr.o,4);
|
5323 |
|
|
if (fetch_lo==fetch_hi) goto new_fetch;
|
5324 |
|
|
}
|
5325 |
|
|
wait(1);
|
5326 |
|
|
|
5327 |
|
|
@ @=
|
5328 |
|
|
{
|
5329 |
|
|
if (cache_search(ITcache,trans_key(inst_ptr.o))) goto new_fetch;
|
5330 |
|
|
data->interrupt|=F_BIT;
|
5331 |
|
|
sleepy=true;
|
5332 |
|
|
goto swym_one;
|
5333 |
|
|
}
|
5334 |
|
|
|
5335 |
|
|
@ @=
|
5336 |
|
|
bool sleepy; /* have we just emitted the page table emulation call? */
|
5337 |
|
|
|
5338 |
|
|
@ At this point we check for egregiously invalid instructions. (Sometimes
|
5339 |
|
|
the dispatcher will actually allow such instructions to occupy
|
5340 |
|
|
the fetch buffer, for internally generated commands.)
|
5341 |
|
|
|
5342 |
|
|
@=
|
5343 |
|
|
tail->loc=inst_ptr.o;
|
5344 |
|
|
if (inst_ptr.o.l&4) tail->inst=fetched[fetch_lo++].l;
|
5345 |
|
|
else tail->inst=fetched[fetch_lo].h;
|
5346 |
|
|
@^big-endian versus little-endian@>
|
5347 |
|
|
@^little-endian versus big-endian@>
|
5348 |
|
|
tail->interrupt=data->interrupt;
|
5349 |
|
|
i=tail->inst>>24;
|
5350 |
|
|
if (i>=RESUME && i<=SYNC && (tail->inst&bad_inst_mask[i-RESUME]))
|
5351 |
|
|
tail->interrupt |= B_BIT;
|
5352 |
|
|
tail->noted=false;
|
5353 |
|
|
if (inst_ptr.o.l==breakpoint.l && inst_ptr.o.h==breakpoint.h)
|
5354 |
|
|
breakpoint_hit=true;
|
5355 |
|
|
|
5356 |
|
|
@ The commands |RESUME|, |SAVE|, |UNSAVE|, and |SYNC| should not have
|
5357 |
|
|
nonzero bits in the positions defined here.
|
5358 |
|
|
|
5359 |
|
|
@=
|
5360 |
|
|
int bad_inst_mask[4]={
  0xfffffe, /* bits that must be zero in |RESUME| */
  0xffff, /* bits that must be zero in |SAVE| */
  0xffff00, /* bits that must be zero in |UNSAVE| */
  0xfffff8}; /* bits that must be zero in |SYNC| */
|
5361 |
|
|
|
5362 |
|
|
@* Interrupts. The scariest thing about the design of a pipelined machine is
|
5363 |
|
|
the existence of interrupts, which disrupt the smooth flow of a computation in
|
5364 |
|
|
ways that are difficult to anticipate. Fortunately, however, the discipline of
|
5365 |
|
|
a reorder buffer, which forces instructions to be committed in order,
|
5366 |
|
|
allows us to deal with interrupts in a fairly natural way. Our solution to the
|
5367 |
|
|
problems of dynamic scheduling and speculative execution therefore solves the
|
5368 |
|
|
interrupt problem as well.
|
5369 |
|
|
@^interrupts@>
|
5370 |
|
|
|
5371 |
|
|
\MMIX\ has three kinds of interrupts, which show up as bit codes in the
|
5372 |
|
|
|interrupt| field when an instruction is ready to be committed:
|
5373 |
|
|
|H_BIT| invokes a trip handler, for \.{TRIP} instructions and
|
5374 |
|
|
arithmetic exceptions; |F_BIT| invokes a forced-trap handler, for \.{TRAP}
|
5375 |
|
|
instructions and unimplemented instructions that need to be emulated
|
5376 |
|
|
in software; |E_BIT| invokes a dynamic-trap handler, for external
|
5377 |
|
|
interrupts like I/O signals or for internal interrupts caused by
|
5378 |
|
|
improper instructions.
|
5379 |
|
|
In all three cases, the pipeline control has already been redirected to fetch
|
5380 |
|
|
new instructions starting at the correct handler address by the time an
|
5381 |
|
|
interrupted instruction is ready to be committed.
|
5382 |
|
|
|
5383 |
|
|
@ Most instructions come to the following part of the program, if they
|
5384 |
|
|
have finished execution with any~1s among the eight trip bits or the
|
5385 |
|
|
eight trap bits.
|
5386 |
|
|
|
5387 |
|
|
If the trip bits aren't all zero, we want to update the event bits
|
5388 |
|
|
of~rA, or perform an enabled trip handler, or both. If the trap bits
|
5389 |
|
|
are nonzero, we need to hold onto them until we get to the hot seat,
|
5390 |
|
|
when they will be joined with the bits of~rQ and probably cause an interrupt.
|
5391 |
|
|
A load or store instruction with nonzero trap bits will be nullified,
|
5392 |
|
|
not committed.
|
5393 |
|
|
|
5394 |
|
|
Underflow that is exact and not enabled is ignored, in accordance with
|
5395 |
|
|
the IEEE standard conventions. (This applies also to underflow
|
5396 |
|
|
triggered by |RESUME_SET|.)
|
5397 |
|
|
|
5398 |
|
|
@d is_load_store(i) (i>=ld && i<=cswap)
|
5399 |
|
|
|
5400 |
|
|
@=
|
5401 |
|
|
{
|
5402 |
|
|
if ((data->interrupt&0xff) && is_load_store(data->i)) goto state_5;
|
5403 |
|
|
j=data->interrupt&0xff00;
|
5404 |
|
|
data->interrupt -= j;
|
5405 |
|
|
if ((j&(U_BIT+X_BIT))==U_BIT && !(data->ra.o.l & U_BIT)) j&=~U_BIT;
|
5406 |
|
|
data->arith_exc=(j&~data->ra.o.l)>>8;
|
5407 |
|
|
if (j&data->ra.o.l) @;
|
5408 |
|
|
if (data->interrupt&0xff) goto state_5;
|
5409 |
|
|
}
|
5410 |
|
|
|
5411 |
|
|
@ Since execution is speculative, an exceptional condition might not
|
5412 |
|
|
be part of the ``real'' computation. Indeed, the present coroutine
|
5413 |
|
|
might have already been deissued.
|
5414 |
|
|
|
5415 |
|
|
@=
|
5416 |
|
|
{
|
5417 |
|
|
i=issued_between(data,cool);
|
5418 |
|
|
if (i<deissues) goto die;
|
5419 |
|
|
deissues=i;
|
5420 |
|
|
old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
|
5421 |
|
|
@;
|
5422 |
|
|
cool_hist=data->hist;
|
5423 |
|
|
for (i=j&data->ra.o.l,m=16;!(i&D_BIT);i<<=1,m+=16);
|
5424 |
|
|
data->go.o.h=0, data->go.o.l=m;
|
5425 |
|
|
inst_ptr.o=data->go.o, inst_ptr.p=NULL;
|
5426 |
|
|
data->interrupt |= H_BIT;
|
5427 |
|
|
goto state_4;
|
5428 |
|
|
}
|
5429 |
|
|
|
5430 |
|
|
@ @=
|
5431 |
|
|
i=issued_between(data,cool);
|
5432 |
|
|
if (i<deissues) goto die;
|
5433 |
|
|
deissues=i;
|
5434 |
|
|
old_tail=tail=head;@+resuming=0; /* clear the fetch buffer */
|
5435 |
|
|
@;
|
5436 |
|
|
cool_hist=data->hist;
|
5437 |
|
|
inst_ptr.p=UNKNOWN_SPEC;
|
5438 |
|
|
data->interrupt |= F_BIT;
|
5439 |
|
|
|
5440 |
|
|
@ We need to stop dispatching when calling a trip handler from within
|
5441 |
|
|
the reorder buffer,
|
5442 |
|
|
lest we issue an instruction that uses
|
5443 |
|
|
|g[255]| or |rB| as an operand.
|
5444 |
|
|
|
5445 |
|
|
@=
|
5446 |
|
|
emulate_virt: @;
|
5447 |
|
|
state_4: data->state=4;
|
5448 |
|
|
case 4:@+if (dispatch_lock) wait(1);
|
5449 |
|
|
set_lock(self,dispatch_lock);
|
5450 |
|
|
state_5: data->state=5;
|
5451 |
|
|
case 5:@+if (data!=old_hot) wait(1);
|
5452 |
|
|
if ((data->interrupt&F_BIT) && data->i!=trap) {
|
5453 |
|
|
inst_ptr.o=g[rT].o, inst_ptr.p=NULL;
|
5454 |
|
|
if (is_load_store(data->i)) nullifying=true;
|
5455 |
|
|
}
|
5456 |
|
|
if (data->interrupt&0xff) {
|
5457 |
|
|
g[rQ].o.h |= data->interrupt&0xff;
|
5458 |
|
|
new_Q.h |= data->interrupt&0xff;
|
5459 |
|
|
if (verbose&issue_bit) {
|
5460 |
|
|
printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
|
5461 |
|
|
}
|
5462 |
|
|
}
|
5463 |
|
|
goto die;
|
5464 |
|
|
|
5465 |
|
|
@ The instructions of the previous section appear in the switch for
|
5466 |
|
|
coroutine stage~1 only. We need to use them also in later stages.
|
5467 |
|
|
|
5468 |
|
|
@=
|
5469 |
|
|
case 4: goto state_4;
|
5470 |
|
|
case 5: goto state_5;
|
5471 |
|
|
|
5472 |
|
|
@ @=
|
5473 |
|
|
case trap:@+ if ((flags[op]&X_is_dest_bit) &&
|
5474 |
|
|
cool->xx<cool_G && cool->xx>=cool_L)
|
5475 |
|
|
goto increase_L;
|
5476 |
|
|
if (!g[rT].up->known || !g[rJ].up->known) goto stall;
|
5477 |
|
|
inst_ptr=specval(&g[rT]); /* traps and emulated ops */
|
5478 |
|
|
cool->need_b=true, cool->b=specval(&g[255]);
|
5479 |
|
|
case trip: if (!g[rJ].up->known) goto stall;
|
5480 |
|
|
cool->ren_x=true, spec_install(&g[255],&cool->x);
|
5481 |
|
|
cool->x.known=true, cool->x.o=g[rJ].up->o;
|
5482 |
|
|
if (i==trip) cool->go.o=zero_octa;
|
5483 |
|
|
cool->ren_a=true, spec_install(&g[i==trap? rBB: rB],&cool->a);@+break;
|
5484 |
|
|
|
5485 |
|
|
@ @=
|
5486 |
|
|
case trap: data->interrupt |= F_BIT;@+ data->a.o=data->b.o;@+ goto fin_ex;
|
5487 |
|
|
case trip: data->interrupt |= H_BIT;@+ data->a.o=data->b.o;@+ goto fin_ex;
|
5488 |
|
|
|
5489 |
|
|
@ The following check is performed at the beginning of every cycle.
|
5490 |
|
|
An instruction in the hot seat can be externally interrupted only if
|
5491 |
|
|
it is ready to be committed and not already marked for tripping
|
5492 |
|
|
or trapping.
|
5493 |
|
|
|
5494 |
|
|
@=
|
5495 |
|
|
g[rI].o=incr(g[rI].o,-1);
|
5496 |
|
|
if (g[rI].o.l==0 && g[rI].o.h==0) {
|
5497 |
|
|
g[rQ].o.l |= INTERVAL_TIMEOUT, new_Q.l |= INTERVAL_TIMEOUT;
|
5498 |
|
|
if (verbose&issue_bit) {
|
5499 |
|
|
printf(" setting rQ=");@+print_octa(g[rQ].o);@+printf("\n");
|
5500 |
|
|
}
|
5501 |
|
|
}
|
5502 |
|
|
trying_to_interrupt=false;
|
5503 |
|
|
if (((g[rQ].o.h&g[rK].o.h)||(g[rQ].o.l&g[rK].o.l)) && cool!=hot &&@|
|
5504 |
|
|
!(hot->interrupt&(E_BIT+F_BIT+H_BIT)) && !doing_interrupt &&@|
|
5505 |
|
|
!(hot->i==resum)) {
|
5506 |
|
|
if (hot->owner) trying_to_interrupt=true;
|
5507 |
|
|
else {
|
5508 |
|
|
hot->interrupt |= E_BIT;
|
5509 |
|
|
@;
|
5510 |
|
|
inst_ptr.o=g[rTT].o;@+inst_ptr.p=NULL;
|
5511 |
|
|
}
|
5512 |
|
|
}
|
5513 |
|
|
|
5514 |
|
|
@ @=
|
5515 |
|
|
bool trying_to_interrupt; /* encouraging interruptible operations to pause */
|
5516 |
|
|
bool nullifying; /* stopping dispatch to nullify a load/store command */
|
5517 |
|
|
|
5518 |
|
|
@ It's possible that the command in the hot seat has been deissued,
|
5519 |
|
|
but only if the simulator has done so at the user's request. Otherwise
|
5520 |
|
|
the test `|i>=deissues|' here will always succeed.
|
5521 |
|
|
|
5522 |
|
|
The value of |cool_hist| becomes flaky here. We could try to keep it
|
5523 |
|
|
strictly up to date, but the unpredictable nature of external interrupts
|
5524 |
|
|
suggests that we are better off leaving it alone. (It's only a heuristic
|
5525 |
|
|
for branch prediction, and a sufficiently strong prediction will survive
|
5526 |
|
|
one-time glitches due to interrupts.)
|
5527 |
|
|
|
5528 |
|
|
@=
|
5529 |
|
|
i=issued_between(hot,cool);
|
5530 |
|
|
if (i>=deissues) {
|
5531 |
|
|
deissues=i;
|
5532 |
|
|
tail=head;@+resuming=0; /* clear the fetch buffer */
|
5533 |
|
|
@;
|
5534 |
|
|
if (is_load_store(hot->i)) nullifying=true;
|
5535 |
|
|
}
|
5536 |
|
|
|
5537 |
|
|
@ Even though an interrupted instruction has officially been either
|
5538 |
|
|
``committed'' or ``nullified,'' it stays in the hot seat for
|
5539 |
|
|
two or three extra cycles,
|
5540 |
|
|
while we save enough of the machine state to resume the computation later.
|
5541 |
|
|
|
5542 |
|
|
%Notice, incidentally, that |H_BIT| and |E_BIT| might both be present
|
5543 |
|
|
%simultaneously. In such cases we first prepare for a trip handler, but
|
5544 |
|
|
%interrupt that for a dynamic trap handler. (Ah, the joys of computer
|
5545 |
|
|
%architecture.)
|
5546 |
|
|
|
5547 |
|
|
@=
|
5548 |
|
|
{
|
5549 |
|
|
if (!(hot->interrupt&H_BIT)) g[rK].o=zero_octa; /* trap */
|
5550 |
|
|
if (((hot->interrupt&H_BIT)&&hot->i!=trip) ||@|
|
5551 |
|
|
((hot->interrupt&F_BIT)&&hot->i!=trap) ||@|
|
5552 |
|
|
(hot->interrupt&E_BIT)) doing_interrupt=3, suppress_dispatch=true;
|
5553 |
|
|
else doing_interrupt=2; /* trip or trap started by dispatcher */
|
5554 |
|
|
break;
|
5555 |
|
|
}
|
5556 |
|
|
|
5557 |
|
|
@ If a memory failure occurs, we should set rF here, either in
|
5558 |
|
|
case~2 or case~1. The simulator doesn't do anything with~rF at present.
|
5559 |
|
|
|
5560 |
|
|
@=
|
5561 |
|
|
switch (doing_interrupt--) {
|
5562 |
|
|
case 3: @;
|
5563 |
|
|
@+break;
|
5564 |
|
|
case 2: @;@+break;
|
5565 |
|
|
case 1: @;
|
5566 |
|
|
if (hot==reorder_bot) hot=reorder_top;@+ else hot--;
|
5567 |
|
|
break;
|
5568 |
|
|
}
|
5569 |
|
|
|
5570 |
|
|
@ @=
|
5571 |
|
|
j=hot->interrupt&H_BIT;
|
5572 |
|
|
g[j?rB:rBB].o=g[255].o;
|
5573 |
|
|
g[255].o=g[rJ].o;
|
5574 |
|
|
if (verbose&issue_bit) {
|
5575 |
|
|
if (j) {
|
5576 |
|
|
printf(" setting rB=");@+print_octa(g[rB].o);
|
5577 |
|
|
}@+else {
|
5578 |
|
|
printf(" setting rBB=");@+print_octa(g[rBB].o);
|
5579 |
|
|
}
|
5580 |
|
|
printf(", $255=");@+print_octa(g[255].o);@+printf("\n");
|
5581 |
|
|
}
|
5582 |
|
|
|
5583 |
|
|
@ Here's where we manufacture the ``ropcodes'' for resumption.
|
5584 |
|
|
|
5585 |
|
|
@d RESUME_AGAIN 0 /* repeat the command in rX as if in location $\rm rW-4$ */
|
5586 |
|
|
@d RESUME_CONT 1 /* same, but substitute rY and rZ for operands */
|
5587 |
|
|
@d RESUME_SET 2 /* set r[X] to rZ */
|
5588 |
|
|
@d RESUME_TRANS 3 /* install $\rm(rY,rZ)$ into IT-cache or DT-cache,
|
5589 |
|
|
then |RESUME_AGAIN| */
|
5590 |
|
|
@d pack_bytes(a,b,c,d) ((((((unsigned)(a)<<8)+(b))<<8)+(c))<<8)+(d)
|
5591 |
|
|
|
5592 |
|
|
@=
|
5593 |
|
|
j=pack_bytes(hot->op,hot->xx,hot->yy,hot->zz);
|
5594 |
|
|
if (hot->interrupt&H_BIT) { /* trip */
|
5595 |
|
|
g[rW].o=incr(hot->loc,4);
|
5596 |
|
|
g[rX].o.h=sign_bit, g[rX].o.l=j;
|
5597 |
|
|
if (verbose&issue_bit) {
|
5598 |
|
|
printf(" setting rW=");@+print_octa(g[rW].o);
|
5599 |
|
|
printf(", rX=");@+print_octa(g[rX].o);@+printf("\n");
|
5600 |
|
|
}
|
5601 |
|
|
}@+else { /* trap */
|
5602 |
|
|
g[rWW].o=hot->go.o;
|
5603 |
|
|
g[rXX].o.l=j;
|
5604 |
|
|
if (hot->interrupt&F_BIT) { /* forced */
|
5605 |
|
|
if (hot->i!=trap) j=RESUME_TRANS; /* emulate page translation */
|
5606 |
|
|
else if (hot->op==TRAP) j=0x80; /* |TRAP| */
|
5607 |
|
|
else if (flags[internal_op[hot->op]]&X_is_dest_bit)
|
5608 |
|
|
j=RESUME_SET; /* emulation */
|
5609 |
|
|
else j=0x80; /* emulation when r[X] is not a destination */
|
5610 |
|
|
}@+else { /* dynamic */
|
5611 |
|
|
if (hot->interim)
|
5612 |
|
|
j=(hot->i==frem || hot->i==syncd || hot->i==syncid? RESUME_CONT:
|
5613 |
|
|
RESUME_AGAIN);
|
5614 |
|
|
else if (is_load_store(hot->i)) j=RESUME_AGAIN;
|
5615 |
|
|
else j=0x80; /* normal external interruption */
|
5616 |
|
|
}
|
5617 |
|
|
g[rXX].o.h=(j<<24)+(hot->interrupt&0xff);
|
5618 |
|
|
if (verbose&issue_bit) {
|
5619 |
|
|
printf(" setting rWW=");@+print_octa(g[rWW].o);
|
5620 |
|
|
printf(", rXX=");@+print_octa(g[rXX].o);@+printf("\n");
|
5621 |
|
|
}
|
5622 |
|
|
}
|
5623 |
|
|
|
5624 |
|
|
@ @=
|
5625 |
|
|
j=hot->interrupt&H_BIT;
|
5626 |
|
|
if ((hot->interrupt&F_BIT) && hot->op==SWYM) g[rYY].o=hot->go.o;
|
5627 |
|
|
else g[j?rY:rYY].o=hot->y.o;
|
5628 |
|
|
if (hot->i==st || hot->i==pst) g[j?rZ:rZZ].o=hot->x.o;
|
5629 |
|
|
else g[j?rZ:rZZ].o=hot->z.o;
|
5630 |
|
|
if (verbose&issue_bit) {
|
5631 |
|
|
if (j) {
|
5632 |
|
|
printf(" setting rY=");@+print_octa(g[rY].o);
|
5633 |
|
|
printf(", rZ=");@+print_octa(g[rZ].o);@+printf("\n");
|
5634 |
|
|
}@+else {
|
5635 |
|
|
printf(" setting rYY=");@+print_octa(g[rYY].o);
|
5636 |
|
|
printf(", rZZ=");@+print_octa(g[rZZ].o);@+printf("\n");
|
5637 |
|
|
}
|
5638 |
|
|
}
|
5639 |
|
|
|
5640 |
|
|
@ Whew; we've successfully interrupted the computation. The remaining
|
5641 |
|
|
task is to restart it again, as transparently as possible.
|
5642 |
|
|
|
5643 |
|
|
The \.{RESUME} instruction waits for the pipeline to drain, because
|
5644 |
|
|
it has to do such drastic things. For example, an interrupt may be
|
5645 |
|
|
occurring at this very moment, changing the registers needed for resumption.
|
5646 |
|
|
|
5647 |
|
|
@=
|
5648 |
|
|
case resume:@+ if (cool!=old_hot) goto stall;
|
5649 |
|
|
inst_ptr=specval(&g[cool->zz? rWW:rW]);
|
5650 |
|
|
if (!(cool->loc.h&sign_bit)) {
|
5651 |
|
|
if (cool->zz) cool->interrupt |= K_BIT;
|
5652 |
|
|
else if (inst_ptr.o.h&sign_bit) cool->interrupt |= P_BIT;
|
5653 |
|
|
}
|
5654 |
|
|
if (cool->interrupt) {
|
5655 |
|
|
inst_ptr.o=incr(cool->loc,4);@+cool->i=noop;
|
5656 |
|
|
}@+ else {
|
5657 |
|
|
cool->go.o=inst_ptr.o;
|
5658 |
|
|
if (cool->zz) {
|
5659 |
|
|
@<Magically do an I/O operation, if |cool->loc| is rT@>;
|
5660 |
|
|
cool->ren_a=true, spec_install(&g[rK],&cool->a);
|
5661 |
|
|
cool->a.known=true, cool->a.o=g[255].o;
|
5662 |
|
|
cool->ren_x=true, spec_install(&g[255],&cool->x);
|
5663 |
|
|
cool->x.known=true, cool->x.o=g[rBB].o;
|
5664 |
|
|
}
|
5665 |
|
|
cool->b= specval(&g[cool->zz? rXX:rX]);
|
5666 |
|
|
if (!(cool->b.o.h&sign_bit)) @;
|
5667 |
|
|
}@+break;
|
5668 |
|
|
|
5669 |
|
|
@ Here we set |cool->i=resum|, since we want to issue another instruction
|
5670 |
|
|
after the \.{RESUME} itself.
|
5671 |
|
|
|
5672 |
|
|
The restrictions on inserted instructions are designed to ensure that
|
5673 |
|
|
those instructions will be the very next ones issued. (If, for example,
|
5674 |
|
|
an |incgamma| instruction were necessary, it might cause a page fault
|
5675 |
|
|
and we'd lose the operand values for |RESUME_SET| or |RESUME_CONT|.)
|
5676 |
|
|
|
5677 |
|
|
A subtle point arises here: If |RESUME_TRANS| is being used to compute
|
5678 |
|
|
the page translation of virtual address zero, we don't want to execute
|
5679 |
|
|
the dummy \.{SWYM} instruction from virtual address $-4$! So we avoid
|
5680 |
|
|
the \.{SWYM} altogether.
|
5681 |
|
|
|
5682 |
|
|
@=
|
5683 |
|
|
{
|
5684 |
|
|
cool->xx=cool->b.o.h>>24, cool->i=resum;
|
5685 |
|
|
head->loc=incr(inst_ptr.o,-4);
|
5686 |
|
|
switch(cool->xx) {
|
5687 |
|
|
case RESUME_SET: cool->b.o.l=(SETH<<24)+(cool->b.o.l&0xff0000);
|
5688 |
|
|
head->interrupt|=cool->b.o.h&0xff00;
|
5689 |
|
|
resuming=2;
|
5690 |
|
|
case RESUME_CONT: resuming+=1+cool->zz;
|
5691 |
|
|
if (((cool->b.o.l>>24)&0xfa)!=0xb8) { /* not |syncd| or |syncid| */
|
5692 |
|
|
m=cool->b.o.l>>28;
|
5693 |
|
|
if ((1<<m)&0x8f30) goto bad_resume;
|
5694 |
|
|
m=(cool->b.o.l>>16)&0xff;
|
5695 |
|
|
if (m>=cool_L && m<cool_G) goto bad_resume;
|
5696 |
|
|
}
|
5697 |
|
|
case RESUME_AGAIN: resume_again: head->inst=cool->b.o.l;
|
5698 |
|
|
m=head->inst>>24;
|
5699 |
|
|
if (m==RESUME) goto bad_resume; /* avoid uninterruptible loop */
|
5700 |
|
|
if (!cool->zz &&
|
5701 |
|
|
m>RESUME && m<=SYNC && (head->inst&bad_inst_mask[m-RESUME]))
|
5702 |
|
|
head->interrupt|=B_BIT;
|
5703 |
|
|
head->noted=false;@+break;
|
5704 |
|
|
case RESUME_TRANS:@+if (cool->zz) {
|
5705 |
|
|
cool->y=specval(&g[rYY]), cool->z=specval(&g[rZZ]);
|
5706 |
|
|
if ((cool->b.o.l>>24)!=SWYM) goto resume_again;
|
5707 |
|
|
cool->i=resume;@+break; /* see ``subtle point'' above */
|
5708 |
|
|
}
|
5709 |
|
|
default: bad_resume: cool->interrupt |= B_BIT, cool->i=noop;
|
5710 |
|
|
resuming=0;@+break;
|
5711 |
|
|
}
|
5712 |
|
|
}
|
5713 |
|
|
|
5714 |
|
|
@ @=
|
5715 |
|
|
{
|
5716 |
|
|
if (resuming&1) {
|
5717 |
|
|
cool->y=specval(&g[rY]);
|
5718 |
|
|
cool->z=specval(&g[rZ]);
|
5719 |
|
|
}@+else {
|
5720 |
|
|
cool->y=specval(&g[rYY]);
|
5721 |
|
|
cool->z=specval(&g[rZZ]);
|
5722 |
|
|
}
|
5723 |
|
|
if (resuming>=3) { /* |RESUME_SET| */
|
5724 |
|
|
cool->need_ra=true, cool->ra=specval(&g[rA]);
|
5725 |
|
|
}
|
5726 |
|
|
cool->usage=false;
|
5727 |
|
|
}
|
5728 |
|
|
|
5729 |
|
|
@ @d do_resume_trans 17 /* |state| for performing |RESUME_TRANS| actions */
|
5730 |
|
|
|
5731 |
|
|
@=
|
5732 |
|
|
case resume: case resum:@+if (data->xx!=RESUME_TRANS) goto fin_ex;
|
5733 |
|
|
data->ptr_a=(void*)((data->b.o.l>>24)==SWYM? ITcache: DTcache);
|
5734 |
|
|
data->state=do_resume_trans;
|
5735 |
|
|
data->z.o=incr(oandn(data->z.o,page_mask),data->z.o.l&7);
|
5736 |
|
|
data->z.o.h &= 0xffff;
|
5737 |
|
|
goto resume_trans;
|
5738 |
|
|
|
5739 |
|
|
@ @=
|
5740 |
|
|
case do_resume_trans: resume_trans: {@+register cache*c=(cache*)data->ptr_a;
|
5741 |
|
|
if (c->lock) wait(1);
|
5742 |
|
|
if (c->filler.next) wait(1);
|
5743 |
|
|
p=alloc_slot(c,trans_key(data->y.o));
|
5744 |
|
|
if (p) {
|
5745 |
|
|
c->filler_ctl.ptr_b=(void*)p;
|
5746 |
|
|
c->filler_ctl.y.o=data->y.o;
|
5747 |
|
|
c->filler_ctl.b.o=data->z.o;
|
5748 |
|
|
c->filler_ctl.state=1;
|
5749 |
|
|
schedule(&c->filler,c->access_time,1);
|
5750 |
|
|
}
|
5751 |
|
|
goto fin_ex;
|
5752 |
|
|
}
|
5753 |
|
|
|
5754 |
|
|
|
5755 |
|
|
@* Administrative operations.
|
5756 |
|
|
The internal instructions that handle the register stack simply reduce
|
5757 |
|
|
to things we already know how to do. (Well, the internal instructions
|
5758 |
|
|
for saving and unsaving do sometimes lead to special cases, based on
|
5759 |
|
|
|data->op|; for the most part, though, the necessary mechanisms are
|
5760 |
|
|
already present.)
|
5761 |
|
|
|
5762 |
|
|
@=
|
5763 |
|
|
case noop:@+if (data->interrupt&F_BIT) goto emulate_virt;
|
5764 |
|
|
case jmp: case pushj: case incrl: case unsave: goto fin_ex;
|
5765 |
|
|
case sav:@+if (!(data->mem_x)) goto fin_ex;
|
5766 |
|
|
case incgamma: case save: data->i=st; goto switch1;
|
5767 |
|
|
case decgamma: case unsav: data->i=ld; goto switch1;
|
5768 |
|
|
|
5769 |
|
|
@ We can \.{GET} special registers $\ge21$ (that is, rA, rF, rP, rW--rZ,
|
5770 |
|
|
or rWW--rZZ) only in the hot seat, because those registers are
|
5771 |
|
|
implicit outputs of many instructions.
|
5772 |
|
|
|
5773 |
|
|
The same applies to rK, since it is changed by \.{TRAP} and
|
5774 |
|
|
by emulated instructions.
|
5775 |
|
|
|
5776 |
|
|
@=
|
5777 |
|
|
case get:@+ if (data->zz>=21 || data->zz==rK) {
|
5778 |
|
|
if (data!=old_hot) wait(1);
|
5779 |
|
|
data->z.o=g[data->zz].o;
|
5780 |
|
|
}
|
5781 |
|
|
data->x.o=data->z.o;@+goto fin_ex;
|
5782 |
|
|
|
5783 |
|
|
@ A \.{PUT} is, similarly, delayed in the cases that hold |dispatch_lock|.
|
5784 |
|
|
This program does not restrict the 1~bits that might be
|
5785 |
|
|
\.{PUT} into~rQ, although the contents of that register can have
|
5786 |
|
|
drastic implications.
|
5787 |
|
|
|
5788 |
|
|
@=
|
5789 |
|
|
case put:@+if (data->xx>=15 && data->xx<=20) {
|
5790 |
|
|
if (data!=old_hot) wait(1);
|
5791 |
|
|
switch (data->xx) {
|
5792 |
|
|
case rV: @;@+break;
|
5793 |
|
|
case rQ: new_Q.h |= data->z.o.h &~ g[rQ].o.h;@+
|
5794 |
|
|
new_Q.l |= data->z.o.l &~ g[rQ].o.l;
|
5795 |
|
|
data->z.o.l |= new_Q.l;@+
|
5796 |
|
|
data->z.o.h |= new_Q.h;@+break;
|
5797 |
|
|
case rL:@+ if (data->z.o.h!=0) data->z.o.h=0, data->z.o.l=g[rL].o.l;
|
5798 |
|
|
else if (data->z.o.l>g[rL].o.l) data->z.o.l=g[rL].o.l;
|
5799 |
|
|
default: break;
|
5800 |
|
|
case rG: @;@+break;
|
5801 |
|
|
}
|
5802 |
|
|
}@+else if (data->xx==rA && (data->z.o.h!=0 || data->z.o.l>=0x40000))
|
5803 |
|
|
data->interrupt |= B_BIT;
|
5804 |
|
|
data->x.o=data->z.o;@+goto fin_ex;
|
5805 |
|
|
|
5806 |
|
|
@ When rG decreases, we assume that up to |commit_max| marginal registers can
|
5807 |
|
|
be zeroed during each clock cycle. (Remember that we're currently in the hot
|
5808 |
|
|
seat, and holding |dispatch_lock|.)
|
5809 |
|
|
|
5810 |
|
|
@=
|
5811 |
|
|
if (data->z.o.h!=0 || data->z.o.l>=256 ||
|
5812 |
|
|
data->z.o.l<g[rL].o.l || data->z.o.l<32)
|
5813 |
|
|
data->interrupt |= B_BIT;
|
5814 |
|
|
else if (data->z.o.l<g[rG].o.l) {
|
5815 |
|
|
data->interim=true; /* potentially interruptible */
|
5816 |
|
|
for (j=0;j<commit_max;j++) {
|
5817 |
|
|
g[rG].o.l--;
|
5818 |
|
|
g[g[rG].o.l].o=zero_octa;
|
5819 |
|
|
if (data->z.o.l==g[rG].o.l) break;
|
5820 |
|
|
}
|
5821 |
|
|
if (j==commit_max) {
|
5822 |
|
|
if (!trying_to_interrupt) wait(1);
|
5823 |
|
|
}@+else data->interim=false;
|
5824 |
|
|
}
|
5825 |
|
|
|
5826 |
|
|
@ Computed jumps put the desired destination address into the |go| field.
|
5827 |
|
|
|
5828 |
|
|
@=
|
5829 |
|
|
case go: data->x.o=data->go.o;@+ goto add_go;
|
5830 |
|
|
case pop: data->x.o=data->y.o; data->y.o=data->b.o; /* move rJ to |y| field */
|
5831 |
|
|
case pushgo: add_go: data->go.o=oplus(data->y.o,data->z.o);
|
5832 |
|
|
if ((data->go.o.h&sign_bit) && !(data->loc.h&sign_bit))
|
5833 |
|
|
data->interrupt |= P_BIT;
|
5834 |
|
|
data->go.known=true;@+goto fin_ex;
|
5835 |
|
|
|
5836 |
|
|
@ The instruction \.{UNSAVE} $z$ generates a sequence of internal instructions
|
5837 |
|
|
that accomplish the actual unsaving. This sequence is controlled by the
|
5838 |
|
|
instruction currently in the fetch buffer, which changes its X and~Y fields
|
5839 |
|
|
until all global registers have been loaded. The first instructions of the
|
5840 |
|
|
sequence are \.{UNSAVE}~$0,0,z$; \.{UNSAVE}~$1,rZ,z-8$;
|
5841 |
|
|
\.{UNSAVE}~$1,rY,z-16$; \dots;
|
5842 |
|
|
\.{UNSAVE}~$1,rB,z-96$; \.{UNSAVE}~$2,255,z-104$; \.{UNSAVE}~$2,254,z-112$;
|
5843 |
|
|
etc. If an interrupt occurs before these instructions have all been committed,
|
5844 |
|
|
the execution register will contain enough information to restart the process.
|
5845 |
|
|
|
5846 |
|
|
After the global registers have all been loaded, \.{UNSAVE} continues by
|
5847 |
|
|
acting rather like~\.{POP}. An interrupt occurring during this last stage
|
5848 |
|
|
will find $\rm rS<rO$; a context switch might then take us back to
|
5849 |
|
|
restoring the local registers again. But no information will be lost,
|
5850 |
|
|
even though the register from which we began unsaving has long since
|
5851 |
|
|
been replaced.
|
5852 |
|
|
|
5853 |
|
|
@=
|
5854 |
|
|
case unsave:@+if (cool->interrupt&B_BIT) cool->i=noop;
|
5855 |
|
|
else {
|
5856 |
|
|
cool->interim=true;
|
5857 |
|
|
op=LDOU; /* this instruction needs to be handled by load/store unit */
|
5858 |
|
|
cool->i=unsav;
|
5859 |
|
|
switch(cool->xx) {
|
5860 |
|
|
case 0:@+ if (cool->z.p) goto stall;
|
5861 |
|
|
@;@+break;
|
5862 |
|
|
case 1: case 2: @;@+break;
|
5863 |
|
|
case 3: cool->i=unsave, cool->interim=false, op=UNSAVE;
|
5864 |
|
|
goto pop_unsave;
|
5865 |
|
|
default: cool->interim=false,cool->i=noop,cool->interrupt|=B_BIT;@+break;
|
5866 |
|
|
}
|
5867 |
|
|
}
|
5868 |
|
|
break; /* this takes us to |dispatch_done| */
|
5869 |
|
|
|
5870 |
|
|
@ @=
|
5871 |
|
|
cool->ren_x=true, spec_install(&g[cool->yy],&cool->x);
|
5872 |
|
|
new_O=new_S=incr(cool_O,-1);
|
5873 |
|
|
cool->z.o=shift_left(new_O,3);
|
5874 |
|
|
cool->ptr_a=(void*)mem.up;
|
5875 |
|
|
|
5876 |
|
|
@ @=
|
5877 |
|
|
cool->ren_x=true, spec_install(&g[rG],&cool->x);
|
5878 |
|
|
cool->ren_a=true, spec_install(&g[rA],&cool->a);
|
5879 |
|
|
new_O=new_S=shift_right(cool->z.o,3,1);
|
5880 |
|
|
cool->set_l=true, spec_install(&g[rL],&cool->rl);
|
5881 |
|
|
cool->ptr_a=(void*)mem.up;
|
5882 |
|
|
|
5883 |
|
|
@ @=
|
5884 |
|
|
switch (cool->xx) {
|
5885 |
|
|
case 0: head->inst=pack_bytes(UNSAVE,1,rZ,0);@+ break;
|
5886 |
|
|
case 1:@+ if (cool->yy==rP) head->inst=pack_bytes(UNSAVE,1,rR,0);
|
5887 |
|
|
else if (cool->yy==0) head->inst=pack_bytes(UNSAVE,2,255,0);
|
5888 |
|
|
else head->inst=pack_bytes(UNSAVE,1,cool->yy-1,0);@+ break;
|
5889 |
|
|
case 2:@+ if (cool->yy==cool_G) head->inst=pack_bytes(UNSAVE,3,0,0);
|
5890 |
|
|
else head->inst=pack_bytes(UNSAVE,2,cool->yy-1,0);@+ break;
|
5891 |
|
|
}
|
5892 |
|
|
|
5893 |
|
|
@ @=
|
5894 |
|
|
if (data->xx==0) {
|
5895 |
|
|
data->a.o=data->x.o;@+data->a.o.h &=0xffffff; /* unsaved rA */
|
5896 |
|
|
data->x.o.l=data->x.o.h>>24;@+data->x.o.h=0; /* unsaved rG */
|
5897 |
|
|
if (data->a.o.h || (data->a.o.l&0xfffc0000)) {
|
5898 |
|
|
data->a.o.h=0, data->a.o.l&=0x3ffff;@+ data->interrupt |= B_BIT;
|
5899 |
|
|
}
|
5900 |
|
|
if (data->x.o.l<32) {
|
5901 |
|
|
data->x.o.l=32;@+ data->interrupt |= B_BIT;
|
5902 |
|
|
}
|
5903 |
|
|
}
|
5904 |
|
|
goto fin_ex;
|
5905 |
|
|
|
5906 |
|
|
@ Of course \.{SAVE} is handled essentially like \.{UNSAVE}, but backwards.
|
5907 |
|
|
|
5908 |
|
|
@=
|
5909 |
|
|
case save:@+if (cool->xx<cool_G) cool->interrupt|=B_BIT;
|
5910 |
|
|
if (cool->interrupt&B_BIT) cool->i=noop;
|
5911 |
|
|
else if (((cool_S.l-cool_O.l-cool_L-1)&lring_mask)==0)
|
5912 |
|
|
@@;
|
5913 |
|
|
else {
|
5914 |
|
|
cool->interim=true;
|
5915 |
|
|
cool->i=sav;
|
5916 |
|
|
switch(cool->zz) {
|
5917 |
|
|
case 0: @;@+break;
|
5918 |
|
|
case 1:@+if (cool_O.l!=cool_S.l) @@;
|
5919 |
|
|
cool->zz=2;@+ cool->yy=cool_G;
|
5920 |
|
|
case 2: case 3: @;@+break;
|
5921 |
|
|
default: cool->interim=false,cool->i=noop,cool->interrupt|=B_BIT;@+break;
|
5922 |
|
|
}
|
5923 |
|
|
}
|
5924 |
|
|
break;
|
5925 |
|
|
|
5926 |
|
|
@ If an interrupt occurs during the first phase, say between two |incgamma|
|
5927 |
|
|
instructions, the value |cool->zz=1| will get things restarted properly.
|
5928 |
|
|
(Indeed, if context is saved and unsaved during the interrupt, many
|
5929 |
|
|
|incgamma| instructions may no longer be necessary.)
|
5930 |
|
|
|
5931 |
|
|
@=
|
5932 |
|
|
cool->zz=1;
|
5933 |
|
|
cool->ren_x=true, spec_install(&l[(cool_O.l+cool_L)&lring_mask],&cool->x);
|
5934 |
|
|
cool->x.known=true, cool->x.o.h=0, cool->x.o.l=cool_L;
|
5935 |
|
|
cool->set_l=true, spec_install(&g[rL],&cool->rl);
|
5936 |
|
|
new_O=incr(cool_O,cool_L+1);
|
5937 |
|
|
|
5938 |
|
|
@ @=
|
5939 |
|
|
op=STOU; /* this instruction needs to be handled by load/store unit */
|
5940 |
|
|
cool->mem_x=true, spec_install(&mem,&cool->x);
|
5941 |
|
|
cool->z.o=shift_left(cool_O,3);
|
5942 |
|
|
new_O=new_S=incr(cool_O,1);
|
5943 |
|
|
if (cool->zz==3 && cool->yy>rZ) @@;
|
5944 |
|
|
else cool->b=specval(&g[cool->yy]);
|
5945 |
|
|
|
5946 |
|
|
@ The final \.{SAVE} instruction not only stores rG and rA, it also
|
5947 |
|
|
places the final address in global register~X.
|
5948 |
|
|
|
5949 |
|
|
@=
|
5950 |
|
|
{
|
5951 |
|
|
cool->i=save;
|
5952 |
|
|
cool->interim=false;
|
5953 |
|
|
cool->ren_a=true, spec_install(&g[cool->xx],&cool->a);
|
5954 |
|
|
}
|
5955 |
|
|
|
5956 |
|
|
@ @=
|
5957 |
|
|
switch (cool->zz) { /* choose the follow-up \.{SAVE} to place in |head->inst| */
 case 1:
  head->inst=pack_bytes(SAVE,cool->xx,0,1);
  break;
 case 2:
  if (cool->yy!=255) {
    head->inst=pack_bytes(SAVE,cool->xx,cool->yy+1,2); /* step up one register */
  } else {
    head->inst=pack_bytes(SAVE,cool->xx,0,3);
  }
  break;
 case 3:
  if (cool->yy!=rR) {
    head->inst=pack_bytes(SAVE,cool->xx,cool->yy+1,3); /* step up one register */
  } else {
    head->inst=pack_bytes(SAVE,cool->xx,rP,3);
  }
  break;
}
|
5964 |
|
|
|
5965 |
|
|
@ @=
|
5966 |
|
|
{
  if (!data->interim) {
    if (data!=old_hot) wait(1); /* we need the hottest value of rA */
    data->x.o.h=g[rG].o.l<<24; /* rG goes into the top byte */
    data->x.o.l=g[rA].o.l; /* rA fills the low tetrabyte */
    data->a.o=data->y.o;
  } else {
    data->x.o=data->b.o; /* an interim step just passes |b| along */
  }
  goto fin_ex;
}
|
5976 |
|
|
|
5977 |
|
|
@* More register-to-register ops.
|
5978 |
|
|
Now that we've finished most of the hard stuff,
|
5979 |
|
|
we can relax and fill in the holes that we left in the
|
5980 |
|
|
all-register parts of the execution stages.
|
5981 |
|
|
|
5982 |
|
|
First let's complete the fixed point arithmetic operations,
|
5983 |
|
|
by dispensing with multiplication and division.
|
5984 |
|
|
|
5985 |
|
|
@=
|
5986 |
|
|
case mulu: data->x.o=omult(data->y.o,data->z.o);
|
5987 |
|
|
data->a.o=aux;
|
5988 |
|
|
goto quantify_mul;
|
5989 |
|
|
case mul: data->x.o=signed_omult(data->y.o,data->z.o);
|
5990 |
|
|
if (overflow) data->interrupt |= V_BIT;
|
5991 |
|
|
quantify_mul: aux=data->z.o;
|
5992 |
|
|
for (j=mul0;aux.l||aux.h;j++) aux=shift_right(aux,8,1);
|
5993 |
|
|
data->i=j;@+break; /* |j| is |mul0| or |mul1| or \dots~or |mul8| */
|
5994 |
|
|
case divu: data->x.o=odiv(data->b.o,data->y.o,data->z.o);
|
5995 |
|
|
data->a.o=aux;@+data->i=div;@+break;
|
5996 |
|
|
case div:@+ if (data->z.o.l==0 && data->z.o.h==0) {
|
5997 |
|
|
data->interrupt |= D_BIT;@+ data->a.o=data->y.o;
|
5998 |
|
|
data->i=set; /* divide by zero needn't wait in the pipeline */
|
5999 |
|
|
}@+else {
|
6000 |
|
|
data->x.o=signed_odiv(data->y.o,data->z.o);
|
6001 |
|
|
if (overflow) data->interrupt |= V_BIT;
|
6002 |
|
|
data->a.o=aux;
|
6003 |
|
|
}@+break;
|
6004 |
|
|
|
6005 |
|
|
@ Next let's polish off the bitwise and bytewise operations.
|
6006 |
|
|
|
6007 |
|
|
@=
|
6008 |
|
|
case sadd: data->x.o.l=count_bits(data->y.o.h&~data->z.o.h)
|
6009 |
|
|
+count_bits(data->y.o.l&~data->z.o.l);@+ break;
|
6010 |
|
|
case mor: data->x.o=bool_mult(data->y.o,data->z.o,data->op&0x2);@+ break;
|
6011 |
|
|
case bdif: data->x.o.h=byte_diff(data->y.o.h,data->z.o.h);
|
6012 |
|
|
data->x.o.l=byte_diff(data->y.o.l,data->z.o.l);@+ break;
|
6013 |
|
|
case wdif: data->x.o.h=wyde_diff(data->y.o.h,data->z.o.h);
|
6014 |
|
|
data->x.o.l=wyde_diff(data->y.o.l,data->z.o.l);@+ break;
|
6015 |
|
|
case tdif:@+ if (data->y.o.h>data->z.o.h)
|
6016 |
|
|
data->x.o.h=data->y.o.h-data->z.o.h;
|
6017 |
|
|
tdif_l:@+ if (data->y.o.l>data->z.o.l)
|
6018 |
|
|
data->x.o.l=data->y.o.l-data->z.o.l;@+ break;
|
6019 |
|
|
case odif:@+ if (data->y.o.h>data->z.o.h)
|
6020 |
|
|
data->x.o=ominus(data->y.o,data->z.o);
|
6021 |
|
|
else if (data->y.o.h==data->z.o.h) goto tdif_l;
|
6022 |
|
|
break;
|
6023 |
|
|
|
6024 |
|
|
|
6025 |
|
|
@ The conditional set (\.{CS}) instructions are, rather surprisingly,
|
6026 |
|
|
more difficult to implement than the zero~set (\.{ZS}) instructions,
|
6027 |
|
|
although the \.{ZS} instructions do more. The reason is that dynamic
|
6028 |
|
|
instruction dependencies are more complicated with \.{CS}. Consider, for
|
6029 |
|
|
example, the instructions
|
6030 |
|
|
$$\advance\abovedisplayskip-.5\baselineskip
|
6031 |
|
|
\advance\belowdisplayskip-.5\baselineskip
|
6032 |
|
|
\hbox{\tt LDO x,a,b; \ FDIV y,c,d; \ CSZ y,x,0; \ INCL y,1.}$$
|
6033 |
|
|
If the value of \.x is zero, the \.{INCL} instruction need not wait for the
|
6034 |
|
|
division to be completed. (We do not, however, abort the division in such a
|
6035 |
|
|
case; it might invoke a trip handler, or change the inexact bit, etc. Our
|
6036 |
|
|
policy is to treat common cases efficiently and to treat all cases correctly,
|
6037 |
|
|
but not to treat all cases with maximum efficiency.)
|
6038 |
|
|
|
6039 |
|
|
@=
|
6040 |
|
|
case zset:@+if (register_truth(data->y.o,data->op)) data->x.o=data->z.o;
|
6041 |
|
|
/* otherwise |data->x.o| is already zero */
|
6042 |
|
|
goto fin_ex;
|
6043 |
|
|
case cset:@+if (register_truth(data->y.o,data->op))
|
6044 |
|
|
data->x.o=data->z.o, data->b.p=NULL;
|
6045 |
|
|
else if (data->b.p==NULL) data->x.o=data->b.o;
|
6046 |
|
|
else {
|
6047 |
|
|
data->state=0;@+data->need_b=true;@+goto switch1;
|
6048 |
|
|
}@+break;
|
6049 |
|
|
|
6050 |
|
|
@ Floating point computations are mostly handled by the routines in
|
6051 |
|
|
{\mc MMIX-ARITH}, which record anomalous events in the global
|
6052 |
|
|
variable |exceptions|. But we consider the operation trivial if an
|
6053 |
|
|
input is infinite or NaN; and we may need to increase the execution
|
6054 |
|
|
time when subnormals are present.
|
6055 |
|
|
|
6056 |
|
|
@d ROUND_OFF 1
|
6057 |
|
|
@d ROUND_UP 2
|
6058 |
|
|
@d ROUND_DOWN 3
|
6059 |
|
|
@d ROUND_NEAR 4
|
6060 |
|
|
@d is_subnormal(x) (((x).h&0x7ff00000)==0 && (((x).h&0xfffff) || (x).l))
  /* exponent field zero, fraction nonzero */
@d is_trivial(x) (((x).h&0x7ff00000)==0x7ff00000)
  /* exponent field all ones: infinity or NaN */
|
6062 |
|
|
@d set_round cur_round=(data->ra.o.l<0x10000? ROUND_NEAR: data->ra.o.l>>16)
|
6063 |
|
|
|
6064 |
|
|
@=
|
6065 |
|
|
case fadd: set_round;@+data->x.o=fplus(data->y.o,data->z.o);
|
6066 |
|
|
fin_bflot:@+ if (is_subnormal(data->y.o)) data->denin=denin_penalty;
|
6067 |
|
|
fin_uflot:@+ if (is_subnormal(data->x.o)) data->denout=denout_penalty;
|
6068 |
|
|
fin_flot:@+ if (is_subnormal(data->z.o)) data->denin=denin_penalty;
|
6069 |
|
|
data->interrupt|=exceptions;
|
6070 |
|
|
if (is_trivial(data->y.o) || is_trivial(data->z.o)) goto fin_ex;
|
6071 |
|
|
if (data->i==fsqrt && (data->z.o.h&sign_bit)) goto fin_ex;
|
6072 |
|
|
break;
|
6073 |
|
|
case fsub: data->a.o=data->z.o;
|
6074 |
|
|
if (fcomp(data->z.o,zero_octa)!=2) data->a.o.h ^= sign_bit;
|
6075 |
|
|
set_round;@+data->x.o=fplus(data->y.o,data->a.o);
|
6076 |
|
|
data->i=fadd; /* use pipeline times for addition */
|
6077 |
|
|
goto fin_bflot;
|
6078 |
|
|
case fmul: set_round;@+ data->x.o=fmult(data->y.o,data->z.o);@+ goto fin_bflot;
|
6079 |
|
|
case fdiv: set_round;@+ data->x.o=fdivide(data->y.o,data->z.o);@+
|
6080 |
|
|
goto fin_bflot;
|
6081 |
|
|
case fsqrt: set_round;@+ data->x.o=froot(data->z.o,data->y.o.l);@+
|
6082 |
|
|
goto fin_uflot;
|
6083 |
|
|
case fint: set_round;@+ data->x.o=fintegerize(data->z.o,data->y.o.l);@+
|
6084 |
|
|
goto fin_uflot;
|
6085 |
|
|
case fix: set_round;@+ data->x.o=fixit(data->z.o,data->y.o.l);
|
6086 |
|
|
if (data->op&0x2) exceptions&=~W_BIT; /* unsigned case doesn't overflow */
|
6087 |
|
|
goto fin_flot;
|
6088 |
|
|
case flot: set_round;@+
|
6089 |
|
|
data->x.o=floatit(data->z.o,data->y.o.l,data->op&0x2, data->op&0x4);
|
6090 |
|
|
data->interrupt|=exceptions;@+break;
|
6091 |
|
|
|
6092 |
|
|
@ @=
|
6093 |
|
|
case fsqrt: case fint: case fix: case flot:@+ if (cool->y.o.l>4)
|
6094 |
|
|
goto illegal_inst;
|
6095 |
|
|
break;
|
6096 |
|
|
|
6097 |
|
|
@ @=
|
6098 |
|
|
case feps: j=fepscomp(data->y.o,data->z.o,data->b.o,data->op!=FEQLE);
|
6099 |
|
|
if (j==2) data->i=fcmp;
|
6100 |
|
|
else if (is_subnormal(data->y.o) || is_subnormal(data->z.o))
|
6101 |
|
|
data->denin=denin_penalty;
|
6102 |
|
|
switch (data->op) {
|
6103 |
|
|
case FUNE:@+ if (j==2) goto cmp_pos;@+ else goto cmp_zero;
|
6104 |
|
|
case FEQLE: goto cmp_fin;
|
6105 |
|
|
case FCMPE:@+ if (j) goto cmp_zero_or_invalid;
|
6106 |
|
|
}
|
6107 |
|
|
case fcmp: j=fcomp(data->y.o,data->z.o);
|
6108 |
|
|
if (j<0) goto cmp_neg;
|
6109 |
|
|
cmp_fin:@+ if (j==1) goto cmp_pos;
|
6110 |
|
|
cmp_zero_or_invalid:@+ if (j==2) data->interrupt |= I_BIT;
|
6111 |
|
|
goto cmp_zero;
|
6112 |
|
|
case funeq:@+ if (fcomp(data->y.o,data->z.o)==(data->op==FUN? 2:0))
|
6113 |
|
|
goto cmp_pos;
|
6114 |
|
|
else goto cmp_zero;
|
6115 |
|
|
|
6116 |
|
|
@ @=
|
6117 |
|
|
Extern int frem_max;
|
6118 |
|
|
Extern int denin_penalty, denout_penalty;
|
6119 |
|
|
|
6120 |
|
|
@ The floating point remainder operation is especially interesting
|
6121 |
|
|
because it can be interrupted when it's in the hot seat.
|
6122 |
|
|
|
6123 |
|
|
@=
|
6124 |
|
|
case frem:@+if(is_trivial(data->y.o) || is_trivial(data->z.o))
|
6125 |
|
|
{
|
6126 |
|
|
data->x.o=fremstep(data->y.o,data->z.o,2500);@+ goto fin_ex;
|
6127 |
|
|
}
|
6128 |
|
|
if ((self+1)->next) wait(1);
|
6129 |
|
|
data->interim=true;
|
6130 |
|
|
j=1;
|
6131 |
|
|
if (is_subnormal(data->y.o)||is_subnormal(data->z.o)) j+=denin_penalty;
|
6132 |
|
|
pass_after(j);
|
6133 |
|
|
goto passit;
|
6134 |
|
|
|
6135 |
|
|
|
6136 |
|
|
@ @=
|
6137 |
|
|
j=1;
|
6138 |
|
|
if (data->i==frem) {
|
6139 |
|
|
data->x.o=fremstep(data->y.o,data->z.o,frem_max);
|
6140 |
|
|
if (exceptions&E_BIT) {
|
6141 |
|
|
data->y.o=data->x.o;
|
6142 |
|
|
if (trying_to_interrupt && data==old_hot) goto fin_ex;
|
6143 |
|
|
}@+else {
|
6144 |
|
|
data->state=3;
|
6145 |
|
|
data->interim=false;
|
6146 |
|
|
data->interrupt |= exceptions;
|
6147 |
|
|
if (is_subnormal(data->x.o)) j+=denout_penalty;
|
6148 |
|
|
}
|
6149 |
|
|
wait(j);
|
6150 |
|
|
}
|
6151 |
|
|
|
6152 |
|
|
@* System operations. Finally we need to implement some operations for the
|
6153 |
|
|
operating system; then the hardware simulation will be done!
|
6154 |
|
|
|
6155 |
|
|
A \.{LDVTS} instruction is delayed until it reaches the hot seat, because
|
6156 |
|
|
it changes the IT and DT caches. The operating system should use \.{SYNC}
|
6157 |
|
|
after \.{LDVTS} if the effects are needed immediately; the system is also
|
6158 |
|
|
responsible for ensuring that the page table permission bits agree with
|
6159 |
|
|
the \.{LDVTS} permission bits when the latter are nonzero. (Also, if
|
6160 |
|
|
write permission is taken away from a page, the operating system must
|
6161 |
|
|
have previously used \.{SYNCD} to write out any dirty bytes that might
|
6162 |
|
|
have been cached from that page; \.{SYNCD} will be inoperative after write
|
6163 |
|
|
permission goes away.)
|
6164 |
|
|
|
6165 |
|
|
@=
|
6166 |
|
|
if (data->i==ldvts) @;
|
6167 |
|
|
|
6168 |
|
|
@ @=
|
6169 |
|
|
{
|
6170 |
|
|
if (data!=old_hot) wait(1);
|
6171 |
|
|
if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
|
6172 |
|
|
startup(&DTcache->reader[j],DTcache->access_time);
|
6173 |
|
|
data->z.o.h=0, data->z.o.l=data->y.o.l&0x7;
|
6174 |
|
|
p=cache_search(DTcache,data->y.o); /* N.B.: Not |trans_key(data->y.o)| */
|
6175 |
|
|
if (p) {
|
6176 |
|
|
data->x.o.l=2;
|
6177 |
|
|
if (data->z.o.l) {
|
6178 |
|
|
p=use_and_fix(DTcache,p);
|
6179 |
|
|
p->data[0].l=(p->data[0].l&-8)+data->z.o.l;
|
6180 |
|
|
}@+else {
|
6181 |
|
|
p=demote_and_fix(DTcache,p);
|
6182 |
|
|
p->tag.h|=sign_bit; /* invalidate the tag */
|
6183 |
|
|
}
|
6184 |
|
|
}
|
6185 |
|
|
pass_after(DTcache->access_time);@+goto passit;
|
6186 |
|
|
}
|
6187 |
|
|
|
6188 |
|
|
@ @=
|
6189 |
|
|
case ld_st_launch:@+ if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
|
6190 |
|
|
startup(&ITcache->reader[j],ITcache->access_time);
|
6191 |
|
|
p=cache_search(ITcache,data->y.o); /* N.B.: Not |trans_key(data->y.o)| */
|
6192 |
|
|
if (p) {
|
6193 |
|
|
data->x.o.l|=1;
|
6194 |
|
|
if (data->z.o.l) {
|
6195 |
|
|
p=use_and_fix(ITcache,p);
|
6196 |
|
|
p->data[0].l=(p->data[0].l&-8)+data->z.o.l;
|
6197 |
|
|
}@+else {
|
6198 |
|
|
p=demote_and_fix(ITcache,p);
|
6199 |
|
|
p->tag.h|=sign_bit; /* invalidate the tag */
|
6200 |
|
|
}
|
6201 |
|
|
}
|
6202 |
|
|
data->state=3;@+wait(ITcache->access_time);
|
6203 |
|
|
|
6204 |
|
|
@ The \.{SYNC} operation interacts with the pipeline in interesting ways.
|
6205 |
|
|
\.{SYNC}~\.0 and \.{SYNC}~\.4 are the simplest; they just lock the
|
6206 |
|
|
dispatch and wait until they get to the hot seat, after which the
|
6207 |
|
|
pipeline has drained. \.{SYNC}~\.1 and \.{SYNC}~\.3 put a ``barrier''
|
6208 |
|
|
into the write buffer so that subsequent store instructions will not merge with
|
6209 |
|
|
previous stores. \.{SYNC}~\.2 and \.{SYNC}~\.3 lock the dispatch until
|
6210 |
|
|
all previous load instructions have left the pipeline. \.{SYNC}~\.5,
|
6211 |
|
|
\.{SYNC}~\.6, and \.{SYNC}~\.7 remove things from caches once they
|
6212 |
|
|
get to the hot seat.
|
6213 |
|
|
|
6214 |
|
|
@=
|
6215 |
|
|
case sync:@+ if (cool->zz>3) {
|
6216 |
|
|
if (!(cool->loc.h&sign_bit)) goto privileged_inst;
|
6217 |
|
|
if (cool->zz==4) freeze_dispatch=true;
|
6218 |
|
|
}@+else {
|
6219 |
|
|
if (cool->zz!=1) freeze_dispatch=true;
|
6220 |
|
|
if (cool->zz&1) cool->mem_x=true, spec_install(&mem,&cool->x);
|
6221 |
|
|
}@+break;
|
6222 |
|
|
|
6223 |
|
|
@ @=
|
6224 |
|
|
case sync:@+ switch (data->zz) {
|
6225 |
|
|
case 0: case 4:@+ if (data!=old_hot) wait(1);
|
6226 |
|
|
halted=(data->zz!=0);@+goto fin_ex;
|
6227 |
|
|
case 2: case 3: @;
|
6228 |
|
|
release_lock(self,dispatch_lock);
|
6229 |
|
|
case 1: data->x.addr=zero_octa;@+goto fin_ex;
|
6230 |
|
|
case 5:@+ if (data!=old_hot) wait(1);
|
6231 |
|
|
@;
|
6232 |
|
|
case 6:@+ if (data!=old_hot) wait(1);
|
6233 |
|
|
@;
|
6234 |
|
|
case 7:@+ if (data!=old_hot) wait(1);
|
6235 |
|
|
@;
|
6236 |
|
|
}
|
6237 |
|
|
|
6238 |
|
|
@ @=
|
6239 |
|
|
{
  register control *cc; /* runs through the reorder buffer from |data| to |hot| */
  cc=data;
  while (cc!=hot) {
    if (cc==reorder_top) cc=reorder_bot; /* wrap around cyclically */
    else cc++;
    if (!cc->owner) continue;
    if (cc->i==ld || cc->i==ldunc || cc->i==pst) wait(1);
      /* an owned load-type entry is still pending */
  }
}
|
6246 |
|
|
|
6247 |
|
|
@ Perhaps the delay should be longer here.
|
6248 |
|
|
|
6249 |
|
|
@=
|
6250 |
|
|
if (DTcache->lock || (j=get_reader(DTcache))<0) wait(1);
|
6251 |
|
|
startup(&DTcache->reader[j],DTcache->access_time);
|
6252 |
|
|
set_lock(self,DTcache->lock);
|
6253 |
|
|
zap_cache(DTcache);
|
6254 |
|
|
data->state=10;@+wait(DTcache->access_time);
|
6255 |
|
|
|
6256 |
|
|
@ @=
|
6257 |
|
|
if (!Icache) {
|
6258 |
|
|
data->state=11;@+goto switch1;
|
6259 |
|
|
}
|
6260 |
|
|
if (Icache->lock || (j=get_reader(Icache))<0) wait(1);
|
6261 |
|
|
startup(&Icache->reader[j],Icache->access_time);
|
6262 |
|
|
set_lock(self,Icache->lock);
|
6263 |
|
|
zap_cache(Icache);
|
6264 |
|
|
data->state=11;@+wait(Icache->access_time);
|
6265 |
|
|
|
6266 |
|
|
@ @=
|
6267 |
|
|
case 10:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6268 |
|
|
if (ITcache->lock || (j=get_reader(ITcache))<0) wait(1);
|
6269 |
|
|
startup(&ITcache->reader[j],ITcache->access_time);
|
6270 |
|
|
set_lock(self,ITcache->lock);
|
6271 |
|
|
zap_cache(ITcache);
|
6272 |
|
|
data->state=3;@+wait(ITcache->access_time);
|
6273 |
|
|
case 11:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6274 |
|
|
if (wbuf_lock) wait(1);
|
6275 |
|
|
write_head=write_tail, write_ctl.state=0; /* zap the write buffer */
|
6276 |
|
|
if (!Dcache) {
|
6277 |
|
|
data->state=12;@+ goto switch1;
|
6278 |
|
|
}
|
6279 |
|
|
if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
|
6280 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
6281 |
|
|
set_lock(self,Dcache->lock);
|
6282 |
|
|
zap_cache(Dcache);
|
6283 |
|
|
data->state=12;@+wait(Dcache->access_time);
|
6284 |
|
|
case 12:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6285 |
|
|
if (!Scache) goto fin_ex;
|
6286 |
|
|
if (Scache->lock) wait(1);
|
6287 |
|
|
set_lock(self,Scache->lock);
|
6288 |
|
|
zap_cache(Scache);
|
6289 |
|
|
data->state=3;@+wait(Scache->access_time);
|
6290 |
|
|
|
6291 |
|
|
@ @=
|
6292 |
|
|
if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6293 |
|
|
@;
|
6294 |
|
|
if (clean_co.next || clean_lock) wait(1);
|
6295 |
|
|
set_lock(self,clean_lock);
|
6296 |
|
|
clean_ctl.i=sync;@+
|
6297 |
|
|
clean_ctl.state=0;@+
|
6298 |
|
|
clean_ctl.x.o.h=0;
|
6299 |
|
|
startup(&clean_co,1);
|
6300 |
|
|
data->state=13;
|
6301 |
|
|
data->interim=true;
|
6302 |
|
|
wait(1);
|
6303 |
|
|
|
6304 |
|
|
@ @=
|
6305 |
|
|
if (write_head!=write_tail) {
|
6306 |
|
|
if (!speed_lock) set_lock(self,speed_lock);
|
6307 |
|
|
wait(1);
|
6308 |
|
|
}
|
6309 |
|
|
|
6310 |
|
|
@ The cleanup process might take a huge amount of time, so we must allow
|
6311 |
|
|
it to be interrupted. (Servicing the interruption might, of course,
|
6312 |
|
|
put more stuff into the cache.)
|
6313 |
|
|
|
6314 |
|
|
@=
|
6315 |
|
|
case 13:@+ if (!clean_co.next) {
|
6316 |
|
|
data->interim=false;@+ goto fin_ex; /* it's done! */
|
6317 |
|
|
}
|
6318 |
|
|
if (trying_to_interrupt) goto fin_ex; /* accept an interruption */
|
6319 |
|
|
wait(1);
|
6320 |
|
|
|
6321 |
|
|
@ Now we consider \.{SYNCD} and \.{SYNCID}. When control comes to this
|
6322 |
|
|
part of the program, |data->y.o| is a virtual address and |data->z.o|
|
6323 |
|
|
is the corresponding physical address; |data->xx+1| is the number of
|
6324 |
|
|
bytes we are supposed to be syncing; |data->b.o.l| is the number of
|
6325 |
|
|
bytes we can handle at once (either |Icache->bb| or |Dcache->bb| or 8192).
|
6326 |
|
|
|
6327 |
|
|
We need a more elaborate scheme to implement \.{SYNCD} and \.{SYNCID}
|
6328 |
|
|
than we have used for the ``hint'' instructions \.{PRELD}, \.{PREGO},
|
6329 |
|
|
and \.{PREST}, because \.{SYNCD} and \.{SYNCID} are not merely hints.
|
6330 |
|
|
They cannot be converted into a sequence of cache-block-size commands at
|
6331 |
|
|
dispatch time, because we cannot be sure that the starting virtual address
|
6332 |
|
|
will be aligned with the beginning of a cache block. We need to realize
|
6333 |
|
|
that the bytes specified by \.{SYNCD} or \.{SYNCID} might cross a
|
6334 |
|
|
virtual page boundary---possibly with different protection bits
|
6335 |
|
|
on each page. We need to allow for interrupts. And we also need to
|
6336 |
|
|
keep the fetch buffer empty until a user's \.{SYNCID} has completely
|
6337 |
|
|
brought the memory up to date.
|
6338 |
|
|
|
6339 |
|
|
@=
|
6340 |
|
|
do_syncid: data->state=30;
|
6341 |
|
|
case 30:@+ if (data!=old_hot) wait(1);
|
6342 |
|
|
if (!Icache) {
|
6343 |
|
|
data->state=(data->loc.h&sign_bit? 31:33);@+goto switch2;
|
6344 |
|
|
}
|
6345 |
|
|
@z.o|, if any@>;
|
6346 |
|
|
data->state=(data->loc.h&sign_bit? 31: 33);@+wait(Icache->access_time);
|
6347 |
|
|
case 31:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6348 |
|
|
@;
|
6349 |
|
|
if (((data->b.o.l-1)&~data->y.o.l)<data->xx) data->interim=true;
  /* more bytes remain beyond the current block, so another round is needed */
|
6350 |
|
|
if (!Dcache) goto next_sync;
|
6351 |
|
|
@z.o|, if any@>;
|
6352 |
|
|
data->state=32;@+wait(Dcache->access_time);
|
6353 |
|
|
case 32:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6354 |
|
|
if (!Scache) goto next_sync;
|
6355 |
|
|
@z.o|, if any@>;
|
6356 |
|
|
data->state=35;@+wait(Scache->access_time);
|
6357 |
|
|
do_syncd: data->state=33;
|
6358 |
|
|
case 33:@+ if (data!=old_hot) wait(1);
|
6359 |
|
|
if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6360 |
|
|
@;
|
6361 |
|
|
if (((data->b.o.l-1)&~data->y.o.l)<data->xx) data->interim=true;
  /* more bytes remain beyond the current block, so another round is needed */
|
6362 |
|
|
if (!Dcache)
|
6363 |
|
|
if (data->i==syncd) goto fin_ex;@+ else goto next_sync;
|
6364 |
|
|
@
|
6365 |
|
|
data->state=34;
|
6366 |
|
|
case 34:@+if (!clean_co.next) goto next_sync;
|
6367 |
|
|
if (trying_to_interrupt && data->interim && data==old_hot) {
|
6368 |
|
|
data->z.o=zero_octa; /* anticipate |RESUME_CONT| */
|
6369 |
|
|
goto fin_ex; /* accept an interruption */
|
6370 |
|
|
}
|
6371 |
|
|
wait(1);
|
6372 |
|
|
next_sync: data->state=35;
|
6373 |
|
|
case 35:@+ if (self->lockloc) *(self->lockloc)=NULL,self->lockloc=NULL;
|
6374 |
|
|
if (data->interim) @;
|
6375 |
|
|
data->go.known=true;
|
6376 |
|
|
goto fin_ex;
|
6377 |
|
|
|
6378 |
|
|
@ @z.o|, if any@>=
|
6379 |
|
|
if (Icache->lock || (j=get_reader(Icache))<0) wait(1);
|
6380 |
|
|
startup(&Icache->reader[j],Icache->access_time);
|
6381 |
|
|
set_lock(self,Icache->lock);
|
6382 |
|
|
p=cache_search(Icache,data->z.o);
|
6383 |
|
|
if (p) {
|
6384 |
|
|
demote_and_fix(Icache,p);
|
6385 |
|
|
clean_block(Icache,p);
|
6386 |
|
|
}
|
6387 |
|
|
|
6388 |
|
|
@ @z.o|, if any@>=
|
6389 |
|
|
if (Dcache->lock || (j=get_reader(Dcache))<0) wait(1);
|
6390 |
|
|
startup(&Dcache->reader[j],Dcache->access_time);
|
6391 |
|
|
set_lock(self,Dcache->lock);
|
6392 |
|
|
p=cache_search(Dcache,data->z.o);
|
6393 |
|
|
if (p) {
|
6394 |
|
|
demote_and_fix(Dcache,p);
|
6395 |
|
|
clean_block(Dcache,p);
|
6396 |
|
|
}
|
6397 |
|
|
|
6398 |
|
|
@ @z.o|, if any@>=
|
6399 |
|
|
if (Scache->lock) wait(1);
|
6400 |
|
|
set_lock(self,Scache->lock);
|
6401 |
|
|
p=cache_search(Scache,data->z.o);
|
6402 |
|
|
if (p) {
|
6403 |
|
|
demote_and_fix(Scache,p);
|
6404 |
|
|
clean_block(Scache,p);
|
6405 |
|
|
}
|
6406 |
|
|
|
6407 |
|
|
@ @
|
6408 |
|
|
if (clean_co.next || clean_lock) wait(1);
|
6409 |
|
|
set_lock(self,clean_lock);
|
6410 |
|
|
clean_ctl.i=syncd;
|
6411 |
|
|
clean_ctl.state=4;
|
6412 |
|
|
clean_ctl.x.o.h=data->loc.h&sign_bit;
|
6413 |
|
|
clean_ctl.z.o=data->z.o;
|
6414 |
|
|
schedule(&clean_co,1,4);
|
6415 |
|
|
|
6416 |
|
|
@ We use the fact that cache block sizes are divisors of 8192.
|
6417 |
|
|
|
6418 |
|
|
@=
|
6419 |
|
|
{
  data->interim=false;
  data->xx-=((data->b.o.l-1)&~data->y.o.l)+1; /* bytes just handled in this block */
  data->y.o=incr(data->y.o,data->b.o.l);
  data->y.o.l&=-data->b.o.l; /* round down to the start of the next block */
  data->z.o.l=(data->z.o.l&-8192)+(data->y.o.l&8191);
    /* same physical page, new offset within it */
  if ((data->y.o.l&8191)==0) goto square_one; /* maybe crossed a page boundary */
  if (data->i!=syncd) goto do_syncid;
  goto do_syncd;
}
|
6429 |
|
|
|
6430 |
|
|
@ If the first page lacks proper protection, we still must try the
|
6431 |
|
|
second, in the rare case that a page boundary is spanned.
|
6432 |
|
|
|
6433 |
|
|
@=
|
6434 |
|
|
sync_check:@+ if (((data->y.o.l+data->xx)^data->y.o.l)<8192) goto fin_ex;
  /* the remaining bytes all lie in the current 8192-byte page */
data->xx-=(8191&~data->y.o.l)+1; /* skip the rest of this page */
data->y.o=incr(data->y.o,8192);
data->y.o.l&=-8192; /* start of the following page */
goto square_one;
|
6441 |
|
|
|
6442 |
|
|
@* Input and output. We're done implementing the hardware, but there's
|
6443 |
|
|
still a small matter of software remaining, because we sometimes
|
6444 |
|
|
want to pretend that a real operating
|
6445 |
|
|
system is present without actually having one loaded. This simulator
|
6446 |
|
|
therefore implements a special feature: If \.{RESUME}~\.1 is issued in
|
6447 |
|
|
location~rT, the ten special I/O traps of {\mc MMIX-SIM} are performed
|
6448 |
|
|
instantaneously behind the scenes.
|
6449 |
|
|
|
6450 |
|
|
Of course all claims of accurate simulation go out the door when this
|
6451 |
|
|
feature is used.
|
6452 |
|
|
|
6453 |
|
|
@d max_sys_call Ftell
|
6454 |
|
|
|
6455 |
|
|
@=
|
6456 |
|
|
typedef enum{
|
6457 |
|
|
@!Halt,@!Fopen,@!Fclose,@!Fread,@!Fgets,@!Fgetws,
|
6458 |
|
|
@!Fwrite,@!Fputs,@!Fputws,@!Fseek,@!Ftell} @!sys_call;
|
6459 |
|
|
|
6460 |
|
|
@ @loc| is rT@>=
|
6461 |
|
|
if (cool->loc.l==g[rT].o.l && cool->loc.h==g[rT].o.h) {
|
6462 |
|
|
register unsigned char yy,zz; octa ma,mb;
|
6463 |
|
|
if (g[rXX].o.l&0xffff0000) goto magic_done;
|
6464 |
|
|
yy=g[rXX].o.l>>8, zz=g[rXX].o.l&0xff;
|
6465 |
|
|
if (yy>max_sys_call) goto magic_done;
|
6466 |
|
|
@
|
6467 |
|
|
if needed@>;
|
6468 |
|
|
switch (yy) {
|
6469 |
|
|
case Halt: @;@+break;
|
6470 |
|
|
case Fopen: g[rBB].o=mmix_fopen(zz,mb,ma);@+break;
|
6471 |
|
|
case Fclose: g[rBB].o=mmix_fclose(zz);@+break;
|
6472 |
|
|
case Fread: g[rBB].o=mmix_fread(zz,mb,ma);@+break;
|
6473 |
|
|
case Fgets: g[rBB].o=mmix_fgets(zz,mb,ma);@+break;
|
6474 |
|
|
case Fgetws: g[rBB].o=mmix_fgetws(zz,mb,ma);@+break;
|
6475 |
|
|
case Fwrite: g[rBB].o=mmix_fwrite(zz,mb,ma);@+break;
|
6476 |
|
|
case Fputs: g[rBB].o=mmix_fputs(zz,g[rBB].o);@+break;
|
6477 |
|
|
case Fputws: g[rBB].o=mmix_fputws(zz,g[rBB].o);@+break;
|
6478 |
|
|
case Fseek: g[rBB].o=mmix_fseek(zz,g[rBB].o);@+break;
|
6479 |
|
|
case Ftell: g[rBB].o=mmix_ftell(zz);@+break;
|
6480 |
|
|
}
|
6481 |
|
|
magic_done: g[255].o=neg_one; /* this will enable interrupts */
|
6482 |
|
|
}
|
6483 |
|
|
|
6484 |
|
|
@ @=
|
6485 |
|
|
if (zz==0) halted=true;
else if (zz==1) {
  octa trap_loc; /* rWW minus 4 */
  trap_loc=incr(g[rWW].o,-4);
  if (trap_loc.h==0 && trap_loc.l<0x90) /* only low locations below |0x90| qualify */
    print_trip_warning(trap_loc.l>>4,incr(g[rW].o,-4));
}
|
6492 |
|
|
|
6493 |
|
|
@ @=
|
6494 |
|
|
char arg_count[]={1,3,1,3,3,3,3,2,2,2,1};
|
6495 |
|
|
|
6496 |
|
|
@ The input/output operations invoked by \.{TRAP}s are
|
6497 |
|
|
done by subroutines in an auxiliary program module called {\mc MMIX-IO}.
|
6498 |
|
|
Here we need only declare those subroutines, and write three primitive
|
6499 |
|
|
interfaces on which they depend.
|
6500 |
|
|
|
6501 |
|
|
@ @=
|
6502 |
|
|
extern octa mmix_fopen @,@,@[ARGS((unsigned char,octa,octa))@];
|
6503 |
|
|
extern octa mmix_fclose @,@,@[ARGS((unsigned char))@];
|
6504 |
|
|
extern octa mmix_fread @,@,@[ARGS((unsigned char,octa,octa))@];
|
6505 |
|
|
extern octa mmix_fgets @,@,@[ARGS((unsigned char,octa,octa))@];
|
6506 |
|
|
extern octa mmix_fgetws @,@,@[ARGS((unsigned char,octa,octa))@];
|
6507 |
|
|
extern octa mmix_fwrite @,@,@[ARGS((unsigned char,octa,octa))@];
|
6508 |
|
|
extern octa mmix_fputs @,@,@[ARGS((unsigned char,octa))@];
|
6509 |
|
|
extern octa mmix_fputws @,@,@[ARGS((unsigned char,octa))@];
|
6510 |
|
|
extern octa mmix_fseek @,@,@[ARGS((unsigned char,octa))@];
|
6511 |
|
|
extern octa mmix_ftell @,@,@[ARGS((unsigned char))@];
|
6512 |
|
|
extern void print_trip_warning @,@,@[ARGS((int,octa))@];
|
6513 |
|
|
|
6514 |
|
|
@ @=
|
6515 |
|
|
int mmgetchars @,@,@[ARGS((char*,int,octa,int))@];
|
6516 |
|
|
void mmputchars @,@,@[ARGS((unsigned char*,int,octa))@];
|
6517 |
|
|
char stdin_chr @,@,@[ARGS((void))@];
|
6518 |
|
|
octa magic_read @,@,@[ARGS((octa))@];
|
6519 |
|
|
void magic_write @,@,@[ARGS((octa,octa))@];
|
6520 |
|
|
|
6521 |
|
|
@ We need to cut through all the complications of buffers and
|
6522 |
|
|
caches in order to do magical I/O. The |magic_read| routine finds
|
6523 |
|
|
the current octabyte in a given physical address by looking at the
|
6524 |
|
|
write buffer, D-cache, S-cache, and memory until finding it.
|
6525 |
|
|
|
6526 |
|
|
@=
|
6527 |
|
|
octa magic_read(addr)
  octa addr; /* physical address of the desired octabyte */
{
  register write_node *w; /* runs through the write buffer */
  register cacheblock *blk; /* a cache block containing |addr|, if any */
  w=write_tail;
  while (w!=write_head) { /* first try the write buffer */
    w=(w==wbuf_top? wbuf_bot: w+1); /* advance cyclically */
    if (w->addr.h==addr.h && (w->addr.l&-8)==(addr.l&-8)) return w->o;
  }
  if (!Dcache) return mem_read(addr); /* without a D-cache, go straight to memory */
  blk=cache_search(Dcache,addr);
  if (blk) return blk->data[(addr.l&(Dcache->bb-1))>>3];
  if (Dcache->outbuf.tag.h==addr.h &&
      ((Dcache->outbuf.tag.l^addr.l)&-Dcache->bb)==0)
    return Dcache->outbuf.data[(addr.l&(Dcache->bb-1))>>3];
  if (Scache) { /* the S-cache is consulted only after the D-cache misses */
    blk=cache_search(Scache,addr);
    if (blk) return blk->data[(addr.l&(Scache->bb-1))>>3];
    if (Scache->outbuf.tag.h==addr.h &&
        ((Scache->outbuf.tag.l^addr.l)&-Scache->bb)==0)
      return Scache->outbuf.data[(addr.l&(Scache->bb-1))>>3];
  }
  return mem_read(addr); /* not buffered or cached anywhere */
}
|
6553 |
|
|
|
6554 |
|
|
@ The |magic_write| routine changes the octabyte in a given physical
|
6555 |
|
|
address by changing it wherever it appears in a buffer or cache.
|
6556 |
|
|
Any ``dirty'' or ``least recently used'' status remains unchanged.
|
6557 |
|
|
(Yes, this {\it is\/} magic.)
|
6558 |
|
|
|
6559 |
|
|
@=
|
6560 |
|
|
void magic_write(addr,val)
  octa addr,val; /* physical address and the new octabyte to put there */
{
  register write_node *w; /* runs through the write buffer */
  register cacheblock *blk; /* a cache block containing |addr|, if any */
  w=write_tail;
  while (w!=write_head) { /* patch every matching write-buffer entry */
    w=(w==wbuf_top? wbuf_bot: w+1); /* advance cyclically */
    if (w->addr.h==addr.h && (w->addr.l&-8)==(addr.l&-8)) w->o=val;
  }
  if (Dcache) {
    blk=cache_search(Dcache,addr);
    if (blk) blk->data[(addr.l&(Dcache->bb-1))>>3]=val;
    if (Dcache->inbuf.tag.h==addr.h &&
        ((Dcache->inbuf.tag.l^addr.l)&-Dcache->bb)==0)
      Dcache->inbuf.data[(addr.l&(Dcache->bb-1))>>3]=val;
    if (Dcache->outbuf.tag.h==addr.h &&
        ((Dcache->outbuf.tag.l^addr.l)&-Dcache->bb)==0)
      Dcache->outbuf.data[(addr.l&(Dcache->bb-1))>>3]=val;
    if (Scache) { /* the S-cache is examined only when a D-cache exists */
      blk=cache_search(Scache,addr);
      if (blk) blk->data[(addr.l&(Scache->bb-1))>>3]=val;
      if (Scache->inbuf.tag.h==addr.h &&
          ((Scache->inbuf.tag.l^addr.l)&-Scache->bb)==0)
        Scache->inbuf.data[(addr.l&(Scache->bb-1))>>3]=val;
      if (Scache->outbuf.tag.h==addr.h &&
          ((Scache->outbuf.tag.l^addr.l)&-Scache->bb)==0)
        Scache->outbuf.data[(addr.l&(Scache->bb-1))>>3]=val;
    }
  }
  mem_write(addr,val); /* memory itself is always updated */
}
|
6592 |
|
|
|
6593 |
|
|
@ The conventions of our imaginary operating system require us to
|
6594 |
|
|
apply the trivial memory mapping in which segment~$i$ appears in
|
6595 |
|
|
a $2^{32}$-byte page of physical addresses starting at $2^{32}i$.
|
6596 |
|
|
|
6597 |
|
|
@=
|
6598 |
|
|
if (arg_count[yy]==3) {
  octa arg_loc; /* address of the argument octabyte being fetched */
  arg_loc=g[rBB].o;
  if ((arg_loc.h&0x9fffffff)==0) {
    arg_loc.h>>=29; /* segment number becomes the physical page */
    mb=magic_read(arg_loc);
  } else mb=zero_octa; /* address is off the page */
  arg_loc=incr(g[rBB].o,8);
  if ((arg_loc.h&0x9fffffff)==0) {
    arg_loc.h>>=29; /* segment number becomes the physical page */
    ma=magic_read(arg_loc);
  } else ma=zero_octa; /* address is off the page */
}
|
6607 |
|
|
|
6608 |
|
|
@ The subroutine |mmgetchars(buf,size,addr,stop)| reads characters
|
6609 |
|
|
starting at address |addr| in the simulated memory and stores them
|
6610 |
|
|
in |buf|, continuing until |size| characters have been read or
|
6611 |
|
|
some other stopping criterion has been met. If |stop<0| there is
|
6612 |
|
|
no other criterion; if |stop=0| a null character will also terminate
|
6613 |
|
|
the process; otherwise |addr| is even, and two consecutive null bytes
|
6614 |
|
|
starting at an even address will terminate the process. The number
|
6615 |
|
|
of bytes read and stored, exclusive of terminating nulls, is returned.
|
6616 |
|
|
|
6617 |
|
|
@=
|
6618 |
|
|
int mmgetchars(buf,size,addr,stop)
|
6619 |
|
|
char *buf;
|
6620 |
|
|
int size;
|
6621 |
|
|
octa addr;
|
6622 |
|
|
int stop;
|
6623 |
|
|
{
|
6624 |
|
|
register char *p;
|
6625 |
|
|
register int m;
|
6626 |
|
|
octa a,x;
|
6627 |
|
|
if (((addr.h&0x9fffffff)||(incr(addr,size-1).h&0x9fffffff))&&size) {
|
6628 |
|
|
fprintf(stderr,"Attempt to get characters from off the page!\n");
|
6629 |
|
|
@.Attempt to get characters...@>
|
6630 |
|
|
return 0;
|
6631 |
|
|
}
|
6632 |
|
|
for (p=buf,m=0,a=addr,a.h>>=29; m
|
6633 |
|
|
x=magic_read(a);
|
6634 |
|
|
if ((a.l&0x7) || m>size-8) @@;
|
6635 |
|
|
else @@;
|
6636 |
|
|
}
|
6637 |
|
|
return size;
|
6638 |
|
|
}
|
6639 |
|
|
|
6640 |
|
|
@ @=
|
6641 |
|
|
{
|
6642 |
|
|
if (a.l&0x4) *p=(x.l>>(8*((~a.l)&0x3)))&0xff;
|
6643 |
|
|
else *p=(x.h>>(8*((~a.l)&0x3)))&0xff;
|
6644 |
|
|
if (!*p && stop>=0) {
|
6645 |
|
|
if (stop==0) return m;
|
6646 |
|
|
if ((a.l&0x1) && *(p-1)=='\0') return m-1;
|
6647 |
|
|
}
|
6648 |
|
|
p++,m++,a=incr(a,1);
|
6649 |
|
|
}
|
6650 |
|
|
|
6651 |
|
|
@ @=
|
6652 |
|
|
{
|
6653 |
|
|
*p=x.h>>24;
|
6654 |
|
|
if (!*p && (stop==0 || (stop>0 && x.h<0x10000))) return m;
|
6655 |
|
|
*(p+1)=(x.h>>16)&0xff;
|
6656 |
|
|
if (!*(p+1) && stop==0) return m+1;
|
6657 |
|
|
*(p+2)=(x.h>>8)&0xff;
|
6658 |
|
|
if (!*(p+2) && (stop==0 || (stop>0 && (x.h&0xffff)==0))) return m+2;
|
6659 |
|
|
*(p+3)=x.h&0xff;
|
6660 |
|
|
if (!*(p+3) && stop==0) return m+3;
|
6661 |
|
|
*(p+4)=x.l>>24;
|
6662 |
|
|
if (!*(p+4) && (stop==0 || (stop>0 && x.l<0x10000))) return m+4;
|
6663 |
|
|
*(p+5)=(x.l>>16)&0xff;
|
6664 |
|
|
if (!*(p+5) && stop==0) return m+5;
|
6665 |
|
|
*(p+6)=(x.l>>8)&0xff;
|
6666 |
|
|
if (!*(p+6) && (stop==0 || (stop>0 && (x.l&0xffff)==0))) return m+6;
|
6667 |
|
|
*(p+7)=x.l&0xff;
|
6668 |
|
|
if (!*(p+7) && stop==0) return m+7;
|
6669 |
|
|
p+=8,m+=8,a=incr(a,8);
|
6670 |
|
|
}
|
6671 |
|
|
|
6672 |
|
|
@ The subroutine |mmputchars(buf,size,addr)| puts |size| characters
|
6673 |
|
|
into the simulated memory starting at address |addr|.
|
6674 |
|
|
|
6675 |
|
|
@=
|
6676 |
|
|
void mmputchars(buf,size,addr)
|
6677 |
|
|
unsigned char *buf;
|
6678 |
|
|
int size;
|
6679 |
|
|
octa addr;
|
6680 |
|
|
{
|
6681 |
|
|
register unsigned char *p;
|
6682 |
|
|
register int m;
|
6683 |
|
|
octa a,x;
|
6684 |
|
|
if (((addr.h&0x9fffffff)||(incr(addr,size-1).h&0x9fffffff))&&size) {
|
6685 |
|
|
fprintf(stderr,"Attempt to put characters off the page!\n");
|
6686 |
|
|
@.Attempt to put characters...@>
|
6687 |
|
|
return;
|
6688 |
|
|
}
|
6689 |
|
|
for (p=buf,m=0,a=addr,a.h>>=29; m
|
6690 |
|
|
if ((a.l&0x7) || m>size-8) @@;
|
6691 |
|
|
else @;
|
6692 |
|
|
}
|
6693 |
|
|
}
|
6694 |
|
|
|
6695 |
|
|
@ @=
|
6696 |
|
|
{
|
6697 |
|
|
register int s=8*((~a.l)&0x3);
|
6698 |
|
|
x=magic_read(a);
|
6699 |
|
|
if (a.l&0x4) x.l^=(((x.l>>s)^*p)&0xff)<
|
6700 |
|
|
else x.h^=(((x.h>>s)^*p)&0xff)<
|
6701 |
|
|
magic_write(a,x);
|
6702 |
|
|
p++,m++,a=incr(a,1);
|
6703 |
|
|
}
|
6704 |
|
|
|
6705 |
|
|
@ @=
|
6706 |
|
|
{
|
6707 |
|
|
x.h=(*p<<24)+(*(p+1)<<16)+(*(p+2)<<8)+*(p+3);
|
6708 |
|
|
x.l=(*(p+4)<<24)+(*(p+5)<<16)+(*(p+6)<<8)+*(p+7);
|
6709 |
|
|
magic_write(a,x);
|
6710 |
|
|
p+=8,m+=8,a=incr(a,8);
|
6711 |
|
|
}
|
6712 |
|
|
|
6713 |
|
|
@ When standard input is being read by the simulated program at the same time
|
6714 |
|
|
as it is being used for interaction, we try to keep the two uses separate
|
6715 |
|
|
by maintaining a private buffer for the simulated program's \.{StdIn}.
|
6716 |
|
|
Online input is usually transmitted from the keyboard to a \CEE/ program
|
6717 |
|
|
a line at a time; therefore an
|
6718 |
|
|
|fgets| operation works much better than |fread| when we prompt
|
6719 |
|
|
for new input. But there is a slight complication, because |fgets|
|
6720 |
|
|
might read a null character before coming to a newline character.
|
6721 |
|
|
We cannot deduce the number of characters read by |fgets| simply
|
6722 |
|
|
by looking at |strlen(stdin_buf)|.
|
6723 |
|
|
|
6724 |
|
|
@=
|
6725 |
|
|
char stdin_chr()
|
6726 |
|
|
{
|
6727 |
|
|
register char* p;
|
6728 |
|
|
while (stdin_buf_start==stdin_buf_end) {
|
6729 |
|
|
printf("StdIn> ");@+fflush(stdout);
|
6730 |
|
|
@.StdIn>@>
|
6731 |
|
|
fgets(stdin_buf,256,stdin);
|
6732 |
|
|
stdin_buf_start=stdin_buf;
|
6733 |
|
|
for (p=stdin_buf;p
|
6734 |
|
|
stdin_buf_end=p+1;
|
6735 |
|
|
}
|
6736 |
|
|
return *stdin_buf_start++;
|
6737 |
|
|
}
|
6738 |
|
|
|
6739 |
|
|
@ @=
|
6740 |
|
|
char stdin_buf[256]; /* private line buffer for the simulated program's \.{StdIn}; |stdin_chr| fills it with |fgets| */
|
6741 |
|
|
char *stdin_buf_start; /* the next character that |stdin_chr| will return */
|
6742 |
|
|
char *stdin_buf_end; /* just past the last buffered character; the buffer is used up when |stdin_buf_start| gets here */
|
6743 |
|
|
|
6744 |
|
|
@* Index.
|