1 |
15 |
hellwig |
% This file is part of the MMIXware package (c) Donald E Knuth 1999
|
2 |
|
|
@i boilerplate.w %<< legal stuff: PLEASE READ IT BEFORE MAKING ANY CHANGES!
|
3 |
|
|
|
4 |
|
|
\def\title{MMIX-CONFIG}
|
5 |
|
|
\def\MMIX{\.{MMIX}}
|
6 |
|
|
\def\Hex#1{\hbox{$^{\scriptscriptstyle\#}$\tt#1}} % experimental hex constant
|
7 |
|
|
@s bool int
|
8 |
|
|
@s cache int
|
9 |
|
|
@s func int
|
10 |
|
|
@s coroutine int
|
11 |
|
|
@s octa int
|
12 |
|
|
@s cacheset int
|
13 |
|
|
@s cacheblock int
|
14 |
|
|
@s fetch int
|
15 |
|
|
@s control int
|
16 |
|
|
@s write_node int
|
17 |
|
|
@s internal_opcode int
|
18 |
|
|
@s replace_policy int
|
19 |
|
|
@s PV TeX
|
20 |
|
|
@s mmix_opcode int
|
21 |
|
|
@s specnode int
|
22 |
|
|
\def\PV{\\{PV}} % use italics, not \tt
|
23 |
|
|
@s CPV TeX
|
24 |
|
|
\def\CPV{\\{CPV}}
|
25 |
|
|
@s OP TeX
|
26 |
|
|
\def\OP{\\{OP}}
|
27 |
|
|
@s and normal @q unreserve a C++ keyword @>
|
28 |
|
|
@s or normal @q unreserve a C++ keyword @>
|
29 |
|
|
@s xor normal @q unreserve a C++ keyword @>
|
30 |
|
|
|
31 |
|
|
@*Input format. Configuration files allow this simulator to adapt itself to
|
32 |
|
|
infinitely many possible combinations of hardware features. The purpose of the
|
33 |
|
|
present module is to read a configuration file, check it for validity, and
|
34 |
|
|
set up the relevant data structures.
|
35 |
|
|
|
36 |
|
|
All data in a configuration file consists simply of {\it tokens\/} separated
|
37 |
|
|
by one or more units of white space, where a ``token'' is any sequence of
|
38 |
|
|
nonspace characters that doesn't contain a percent sign. Percent signs
|
39 |
|
|
and anything following them on a line are ignored; this convention allows
|
40 |
|
|
a user to include comments in the file. Here's a simple (but weird) example:
|
41 |
|
|
$$\vbox{\halign{\tt#\hfil\cr
|
42 |
|
|
\% Silly configuration\cr
|
43 |
|
|
writebuffer 200\cr
|
44 |
|
|
memaddresstime 100\cr
|
45 |
|
|
Dcache associativity 4 lru\cr
|
46 |
|
|
Dcache blocksize 1024\cr
|
47 |
|
|
unit ODD 5555555555555555555555555555555555555555555555555555555555555555\cr
|
48 |
|
|
unit EVEN aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\cr
|
49 |
|
|
div 40 30 20\ \ \% three-stage divide\cr
|
50 |
|
|
}}$$
|
51 |
|
|
It means that (1) the write buffer has capacity for 200 octabytes;
|
52 |
|
|
(2)~the memory bus takes 100 cycles to process an address;
|
53 |
|
|
(3)~there's a D-cache, in which each set has 4 blocks and the replacement
|
54 |
|
|
policy is least-recently-used;
|
55 |
|
|
(4)~each block in the D-cache has 1024 bytes;
|
56 |
|
|
(5)~there are two functional units, one for all the odd-numbered opcodes
|
57 |
|
|
and one for all the rest;
|
58 |
|
|
(6)~the division instructions take three pipeline stages, spending 40 cycles
|
59 |
|
|
in the first stage, 30~in the second, and 20 in the last;
|
60 |
|
|
(7)~all other parameters have default values.
|
61 |
|
|
|
62 |
|
|
@ Four kinds of specifications can appear in a configuration file,
|
63 |
|
|
according to the following syntax:
|
64 |
|
|
\def\<#1>{\hbox{$\langle\,$#1$\,\rangle$}}\let\is=\longrightarrow
|
65 |
|
|
$$\vbox{\halign{$#$\hfil\cr
|
66 |
|
|
\<specification>\is\<PV spec>\mid\<cache spec>\mid\<pipe spec>\mid
|
67 |
|
|
\<functional spec>\cr
|
68 |
|
|
\<PV spec>\is\<parameter>\<decimal value>\cr
|
69 |
|
|
\<cache spec>\is\<cache name>\<cache parameter>\<decimal value>\<policy>\cr
|
70 |
|
|
\<pipe spec>\is\<operation>\<pipeline times>\cr
|
71 |
|
|
\<functional spec>\is\.{unit}\ \<name>\<64 hexadecimal digits>\cr}}$$
|
72 |
|
|
|
73 |
|
|
@ A \<PV spec> simply assigns a given value to a given parameter. The
|
74 |
|
|
possibilities for \<parameter> are as follows:
|
75 |
|
|
|
76 |
|
|
\def\bull#1 {\smallskip\hang\textindent{$\bullet$}\.{#1}\enspace}
|
77 |
|
|
\bull fetchbuffer (default 4), maximum instructions in the fetch buffer;
|
78 |
|
|
must be $\ge1$.
|
79 |
|
|
|
80 |
|
|
\bull writebuffer (default 2), maximum octabytes in the write buffer;
|
81 |
|
|
must be $\ge1$.
|
82 |
|
|
|
83 |
|
|
\bull reorderbuffer (default 5), maximum instructions issued but not
|
84 |
|
|
committed; must be $\ge1$.
|
85 |
|
|
|
86 |
|
|
\bull renameregs (default 5), maximum partial results in the reorder
|
87 |
|
|
buffer; must be $\ge1$.
|
88 |
|
|
|
89 |
|
|
\bull memslots (default 2), maximum store instructions in the reorder
|
90 |
|
|
buffer; must be $\ge1$.
|
91 |
|
|
|
92 |
|
|
\bull localregs (default 256), number of local registers in ring;
|
93 |
|
|
must be 256, 512, or 1024.
|
94 |
|
|
|
95 |
|
|
\bull fetchmax (default 2), maximum instructions fetched per cycle;
|
96 |
|
|
must be $\ge1$.
|
97 |
|
|
|
98 |
|
|
\bull dispatchmax (default 1), maximum instructions issued per cycle;
|
99 |
|
|
must be $\ge1$.
|
100 |
|
|
|
101 |
|
|
\bull peekahead (default 1), maximum lookahead for jumps per cycle.
|
102 |
|
|
|
103 |
|
|
\bull commitmax (default 1), maximum instructions committed per cycle;
|
104 |
|
|
must be $\ge1$.
|
105 |
|
|
|
106 |
|
|
\bull fremmax (default 1), maximum reductions in \.{FREM} computation per
|
107 |
|
|
cycle; must be $\ge1$.
|
108 |
|
|
|
109 |
|
|
\bull denin (default 1), extra cycles taken if a floating point input
|
110 |
|
|
is subnormal.
|
111 |
|
|
|
112 |
|
|
\bull denout (default 1), extra cycles taken if a floating point result
|
113 |
|
|
is subnormal.
|
114 |
|
|
|
115 |
|
|
\bull writeholdingtime (default 0), minimum number of cycles for data to
|
116 |
|
|
remain in the write buffer.
|
117 |
|
|
|
118 |
|
|
\bull memaddresstime (default 20), cycles to process memory address;
|
119 |
|
|
must be $\ge1$.
|
120 |
|
|
|
121 |
|
|
\bull memreadtime (default 20), cycles to read one memory busload;
|
122 |
|
|
must be $\ge1$.
|
123 |
|
|
|
124 |
|
|
\bull memwritetime (default 20), cycles to write one memory busload;
|
125 |
|
|
must be $\ge1$.
|
126 |
|
|
|
127 |
|
|
\bull membusbytes (default 8), number of bytes per memory busload; must be a
|
128 |
|
|
power of~2 that is 8~or~more.
|
129 |
|
|
|
130 |
|
|
\bull branchpredictbits (default 0), number of bits in each branch prediction
|
131 |
|
|
table entry; must be $\le8$.
|
132 |
|
|
|
133 |
|
|
\bull branchaddressbits (default 0), number of bits in instruction address
|
134 |
|
|
used to index the branch prediction table.
|
135 |
|
|
|
136 |
|
|
\bull branchhistorybits (default 0), number of bits in branch history used to
|
137 |
|
|
index the branch prediction table.
|
138 |
|
|
|
139 |
|
|
\bull branchdualbits (default 0), number of bits of
|
140 |
|
|
instruction-address-xor-branch-history used to index the branch prediction
|
141 |
|
|
table.
|
142 |
|
|
|
143 |
|
|
\bull hardwarepagetable (default 1), is zero if page table calculations
|
144 |
|
|
must be emulated by the operating system.
|
145 |
|
|
|
146 |
|
|
\bull disablesecurity (default 0), is 1 if the hot-seat security checks
|
147 |
|
|
are turned off. This option is used only for testing purposes; it means
|
148 |
|
|
that the `\.s' interrupt will not occur, and the `\.p' interrupt will
|
149 |
|
|
be signaled only when going from a nonnegative location to a negative one.
|
150 |
|
|
|
151 |
|
|
\bull memchunksmax (default 1000), maximum number of $2^{16}$-byte chunks of
|
152 |
|
|
simulated memory; must be $\ge1$.
|
153 |
|
|
|
154 |
|
|
\bull hashprime (default 2003), prime number used to address simulated memory;
|
155 |
|
|
must exceed \.{memchunksmax}, preferably by a factor of about~2.
|
156 |
|
|
|
157 |
|
|
\smallskip\noindent
|
158 |
|
|
The values of \.{memchunksmax} and \.{hashprime} affect only the speed of the
|
159 |
|
|
simulator, not its results---unless a very huge program is being simulated.
|
160 |
|
|
The stated defaults for \.{memchunksmax} and \.{hashprime}
|
161 |
|
|
should be adequate for almost all applications.
|
162 |
|
|
|
163 |
|
|
@ A \<cache spec> assigns a given value to a parameter affecting one of five
|
164 |
|
|
possible caches:
|
165 |
|
|
$$\vbox{\halign{$#$\hfil\cr
|
166 |
|
|
\<cache spec>\is\<cache name>\<cache parameter>\<decimal value>\<policy>\cr
|
167 |
|
|
\\is\.{ITcache}\mid\.{DTcache}\mid\.{Icache}\mid\.{Dcache}
|
168 |
|
|
\mid\.{Scache}\cr
|
169 |
|
|
\<policy>\is\<empty>\mid\.{random}\mid\.{serial}
|
170 |
|
|
\mid\.{pseudolru}\mid\.{lru}\cr}}$$
|
171 |
|
|
The possibilities for \<cache parameter> are as follows:
|
172 |
|
|
|
173 |
|
|
\bull associativity (default 1), number of cache blocks per cache set;
|
174 |
|
|
must be a power of~2. (A cache with associativity~1 is said to be
|
175 |
|
|
``direct-mapped.'')
|
176 |
|
|
|
177 |
|
|
\bull blocksize (default 8), number of bytes per cache block; must be a power
|
178 |
|
|
of~2, at least equal to the granularity, and at most equal to~8192.
|
179 |
|
|
The blocksize of \.{ITcache} and \.{DTcache} must be~8.
|
180 |
|
|
|
181 |
|
|
\bull setsize (default 1), number of sets of cache blocks; must be a power
|
182 |
|
|
of~2. (A cache with set size~1 is said to be ``fully associative.'')
|
183 |
|
|
|
184 |
|
|
\bull granularity (default 8), number of bytes per ``dirty bit,'' used to
|
185 |
|
|
remember which items of data have changed since they were read from memory;
|
186 |
|
|
must be a power of~2 and at least~8. The granularity must be~8 if
|
187 |
|
|
\.{writeallocate} is~0.
|
188 |
|
|
|
189 |
|
|
\bull victimsize (default 0), number of cache blocks in the victim buffer,
|
190 |
|
|
which holds blocks removed from the main cache sets; must be zero or a power
|
191 |
|
|
of~2.
|
192 |
|
|
|
193 |
|
|
\bull writeback (default 0), is 1 in a ``write-back'' cache, which holds dirty
|
194 |
|
|
data as long as possible; is 0 in a ``write-through'' cache, which cleans
|
195 |
|
|
all data as soon as possible.
|
196 |
|
|
|
197 |
|
|
\bull writeallocate (default 0), is 1 in a ``write-allocate'' cache,
|
198 |
|
|
which remembers all recently written data;
|
199 |
|
|
is 0 in a ``write-around'' cache, which doesn't make space for newly written
|
200 |
|
|
data that fails to hit an existing cache block.
|
201 |
|
|
|
202 |
|
|
\bull accesstime (default 1), number of cycles to query the cache;
|
203 |
|
|
must be $\ge1$. (Hits in the S-cache actually require {\it twice}
|
204 |
|
|
the accesstime, once to query the tag and once to transmit the data.)
|
205 |
|
|
|
206 |
|
|
\bull copyintime (default 1), number of cycles to move a cache block from
|
207 |
|
|
its input buffer into the cache proper; must be $\ge1$.
|
208 |
|
|
|
209 |
|
|
\bull copyouttime (default 1), number of cycles to move a cache block
|
210 |
|
|
from the cache proper to its output buffer; must be $\ge1$.
|
211 |
|
|
|
212 |
|
|
\bull ports (default 1), number of processes that can simultaneously
|
213 |
|
|
query the cache; must be $\ge1$.
|
214 |
|
|
|
215 |
|
|
\smallskip
|
216 |
|
|
The \<policy> parameter should be nonempty only on cache specifications
|
217 |
|
|
for parameters
|
218 |
|
|
\.{associativity} and \.{victimsize}. If no replacement policy is specified,
|
219 |
|
|
\.{random} is the default. All four policies are equivalent when the
|
220 |
|
|
\.{associativity} or \.{victimsize} is~1; \.{pseudolru} is equivalent
|
221 |
|
|
to \.{lru} when the \.{associativity} or \.{victimsize} is~2.
|
222 |
|
|
|
223 |
|
|
The \.{granularity}, \.{writeback}, \.{writeallocate}, and \.{copyouttime}
|
224 |
|
|
parameters affect the performance only of the D-cache and S-cache; the other
|
225 |
|
|
three caches are read-only, so they never need to write their data.
|
226 |
|
|
|
227 |
|
|
The \.{ports} parameter affects the performance of the D-cache and
|
228 |
|
|
DT-cache, and (if the \.{PREGO} command is used) the performance of the
|
229 |
|
|
I-cache and IT-cache. The S-cache accommodates only one process at a time,
|
230 |
|
|
regardless of the number of specified ports.
|
231 |
|
|
|
232 |
|
|
Only the translation caches (the IT-cache and DT-cache) are present by
|
233 |
|
|
default. But if any specifications are given for, say, an I-cache,
|
234 |
|
|
all of the unspecified I-cache parameters take their default values.
|
235 |
|
|
|
236 |
|
|
The existence of an S-cache (secondary cache) implies the existence of both
|
237 |
|
|
I-cache and D-cache (primary caches for instructions and data).
|
238 |
|
|
The block size of the secondary cache must not be less than the block
|
239 |
|
|
size of the primary caches. The secondary cache must have the
|
240 |
|
|
same granularity as the D-cache.
|
241 |
|
|
|
242 |
|
|
@ A \<pipe spec> governs the execution time of potentially slow operations.
|
243 |
|
|
$$\vbox{\halign{$#$\hfil\cr
|
244 |
|
|
\<pipe spec>\is\<operation>\<pipeline times>\cr
|
245 |
|
|
\<pipeline times>\is\<decimal value>\mid\<pipeline times>\<decimal value>\cr}}$$
|
246 |
|
|
Here the \<operation> is one of the following:
|
247 |
|
|
|
248 |
|
|
\bull mul0 through \.{mul8} (default 10); the values for \.{mul}$j$ refer
|
249 |
|
|
to products in which the second operand is less than $2^{8j}$, where $j$
|
250 |
|
|
is as small as possible. Thus, for example, \.{mul1} applies to
|
251 |
|
|
nonzero one-byte multipliers.
|
252 |
|
|
|
253 |
|
|
\bull div (default 60); this applies to integer division, signed and unsigned.
|
254 |
|
|
|
255 |
|
|
\bull sh (default 1); this applies to left and right shifts, signed and
|
256 |
|
|
unsigned.
|
257 |
|
|
|
258 |
|
|
\bull mux (default 1); the multiplex operator.
|
259 |
|
|
|
260 |
|
|
\bull sadd (default 1); the sideways addition operator.
|
261 |
|
|
|
262 |
|
|
\bull mor (default 1); the boolean matrix multiplication operators \.{MOR} and
|
263 |
|
|
\.{MXOR}.
|
264 |
|
|
|
265 |
|
|
\bull fadd (default 4); floating point addition and subtraction.
|
266 |
|
|
|
267 |
|
|
\bull fmul (default 4); floating point multiplication.
|
268 |
|
|
|
269 |
|
|
\bull fdiv (default 40); floating point division.
|
270 |
|
|
|
271 |
|
|
\bull fsqrt (default 40); floating point square root.
|
272 |
|
|
|
273 |
|
|
\bull fint (default 4); floating point integerization.
|
274 |
|
|
|
275 |
|
|
\bull fix (default 2); conversion from floating to fixed, signed and unsigned.
|
276 |
|
|
|
277 |
|
|
\bull flot (default 2); conversion from fixed to floating, signed and unsigned.
|
278 |
|
|
|
279 |
|
|
\bull feps (default 4); floating comparison with respect to epsilon.
|
280 |
|
|
|
281 |
|
|
\smallskip\noindent
|
282 |
|
|
In each case one can specify a sequence of pipeline stages, with a positive
|
283 |
|
|
number of cycles to be spent in each stage. For example, a specification like
|
284 |
|
|
`\.{fmul}~\.{3}~\.{1}' would say that a functional unit that supports
|
285 |
|
|
\.{FMUL} takes a total of four cycles to compute the floating point product
|
286 |
|
|
in two stages; it can start working on a second product after three cycles
|
287 |
|
|
have gone by.
|
288 |
|
|
|
289 |
|
|
If a floating point operation has a subnormal input, \.{denin} is added to
|
290 |
|
|
the time for the first stage. If a floating point operation has a subnormal
|
291 |
|
|
result, \.{denout} is added to the time for the last stage.
|
292 |
|
|
|
293 |
|
|
@ The fourth and final kind of specification defines a functional unit:
|
294 |
|
|
$$\<functional spec>\is\.{unit}\ \<name>\<64 hexadecimal digits>$$
|
295 |
|
|
The symbolic name should be at most fifteen characters long.
|
296 |
|
|
The 64 hexadecimal digits contain 256 bits, with `1' for each supported
|
297 |
|
|
opcode; the most significant (leftmost) bit is for opcode 0 (\.{TRAP}),
|
298 |
|
|
and the least significant bit is for opcode 255 (\.{TRIP}).
|
299 |
|
|
|
300 |
|
|
For example, we can define a load/store unit (which handles register/memory
|
301 |
|
|
operations), a multiplication unit (which handles fixed and floating point
|
302 |
|
|
multiplication), a boolean unit (which handles only bitwise operations),
|
303 |
|
|
and a more general arithmetic-logical unit, as follows:
|
304 |
|
|
$$\vbox{\halign{\tt#\hfil\cr
|
305 |
|
|
unit LSU 00000000000000000000000000000000fffffffcfffffffc0000000000000000\cr
|
306 |
|
|
unit MUL 000080f000000000000000000000000000000000000000000000000000000000\cr
|
307 |
|
|
unit BIT 000000000000000000000000000000000000000000000000ffff00ff00ff0000\cr
|
308 |
|
|
unit ALU f0000000ffffffffffffffffffffffff0000000300000003ffffffffffffffff\cr
|
309 |
|
|
}}$$
|
310 |
|
|
|
311 |
|
|
The order in which units are specified is important, because \MMIX's dispatcher
|
312 |
|
|
will try to match each instruction with the first functional unit that
|
313 |
|
|
supports its opcode. Therefore it is best to list more specialized
|
314 |
|
|
units (like the \.{BIT} unit in this example) before more general ones;
|
315 |
|
|
this lets the specialized units have first chance at the instructions
|
316 |
|
|
they can handle.
|
317 |
|
|
|
318 |
|
|
There can be any number of functional units, having possibly identical
|
319 |
|
|
specifications. One should, however, give each unit a unique name
|
320 |
|
|
(e.g., \.{ALU1} and \.{ALU2} if there are two arithmetic-logical units),
|
321 |
|
|
since these names are used in diagnostic messages.
|
322 |
|
|
|
323 |
|
|
Opcodes that aren't supported by any specified unit will cause an
|
324 |
|
|
emulation trap.
|
325 |
|
|
@^emulation@>
|
326 |
|
|
|
327 |
|
|
@ Full details about the significance of all these parameters can be found
|
328 |
|
|
in the \.{mmix-pipe} module, which defines and discusses the data structures
|
329 |
|
|
that need to be configured and initialized.
|
330 |
|
|
|
331 |
|
|
Of course the specifications in a configuration file needn't make any sense,
|
332 |
|
|
nor need they be practically achievable. We could, for example, specify
|
333 |
|
|
a unit that handles only the two opcodes \.{NXOR} and \.{DIVUI};
|
334 |
|
|
we could specify 1-cycle division but pipelined 100-cycle shifts, or
|
335 |
|
|
1-cycle memory access but 100-cycle cache access. We could create
|
336 |
|
|
a thousand rename registers and issue a hundred instructions per cycle,
|
337 |
|
|
etc. Some combinations of parameters are clearly ridiculous.
|
338 |
|
|
|
339 |
|
|
But there remain a huge number of possibilities of interest, especially
|
340 |
|
|
as technology continues to evolve. By experimenting with configurations that
|
341 |
|
|
are extreme by present-day standards, we can see how much might be gained
|
342 |
|
|
if the corresponding hardware could be built economically.
|
343 |
|
|
|
344 |
|
|
@* Basic input/output. Let's get ready to program the |MMIX_config| subroutine
|
345 |
|
|
by building some simple infrastructure. First we need some macros to
|
346 |
|
|
print error messages.
|
347 |
|
|
|
348 |
|
|
@d errprint0(f) fprintf(stderr,f)
|
349 |
|
|
@d errprint1(f,a) fprintf(stderr,f,a)
|
350 |
|
|
@d errprint2(f,a,b) fprintf(stderr,f,a,b)
|
351 |
|
|
@d errprint3(f,a,b,c) fprintf(stderr,f,a,b,c)
|
352 |
|
|
@d panic(x)@+ {@+x;@+errprint0("!\n");@+exit(-1);@+}
|
353 |
|
|
|
354 |
|
|
@ And we need a place to look at the input.
|
355 |
|
|
|
356 |
|
|
@d BUF_SIZE 100 /* we don't need long lines */
|
357 |
|
|
|
358 |
|
|
@=
|
359 |
|
|
FILE *config_file; /* input comes from here */
|
360 |
|
|
char buffer[BUF_SIZE]; /* input lines go here */
|
361 |
|
|
char token[BUF_SIZE]; /* and tokens are copied to here */
|
362 |
|
|
char *buf_pointer=buffer; /* this is our current position */
|
363 |
|
|
bool token_prescanned; /* does |token| contain the next token already? */
|
364 |
|
|
|
365 |
|
|
@ The |get_token| routine copies the next token of input into the |token|
|
366 |
|
|
buffer. After the input has ended, a final `\.{end}' is appended.
|
367 |
|
|
|
368 |
|
|
@=
|
369 |
|
|
static void get_token @,@,@[ARGS((void))@];@+@t}\6{@>
static void get_token() /* set |token| to the next token of the configuration file */
{
  register char *p,*q;
  if (token_prescanned) { /* the current |token| was pushed back; deliver it again */
    token_prescanned=false;@+ return;
  }
  while(1) { /* scan past white space */
    if (*buf_pointer=='\0' || *buf_pointer=='\n' || *buf_pointer=='%') {
      /* nothing but a comment remains on this line, so read another one */
      if (!fgets(buffer,BUF_SIZE,config_file)) {
        strcpy(token,"end");@+return; /* the input has ended */
      }
      if (strlen(buffer)==BUF_SIZE-1 && buffer[BUF_SIZE-2]!='\n')
        panic(errprint1("config file line too long: `%s...'",buffer));
@.config file line...@>
      buf_pointer=buffer;
    }@+else if (!isspace((unsigned char)*buf_pointer)) break;
    else buf_pointer++;
  }
  /* copy the token; we must also stop at the terminating null character,
     because the last line of a file need not end with a newline */
  for (p=buf_pointer,q=token;
       *p!='\0' && !isspace((unsigned char)*p) && *p!='%';p++,q++) *q=*p;
  buf_pointer=p;@+ *q='\0';
  return;
}
|
392 |
|
|
|
393 |
|
|
@ The |get_int| routine is called when we wish to input a decimal value.
|
394 |
|
|
It returns $-1$ if the next token isn't a string of decimal digits.
|
395 |
|
|
|
396 |
|
|
@=
|
397 |
|
|
static int get_int @,@,@[ARGS((void))@];@+@t}\6{@>
static int get_int() /* read a decimal value; return $-1$ if the token isn't all digits */
{@+ int v; /* the value accumulated so far */
  int d; /* the current digit */
  char *p;
  get_token();
  for (p=token,v=0; *p>='0' && *p<='9'; p++) {
    d=*p-'0';
    if (v>(INT_MAX-d)/10) v=INT_MAX; /* saturate, since signed overflow is undefined */
    else v=10*v+d;
  }
  if (*p) return -1; /* a nondigit stopped the scan */
  return v;
}
|
406 |
|
|
|
407 |
|
|
@ A simple data structure makes it fairly easy to deal with
|
408 |
|
|
parameter/value specifications.
|
409 |
|
|
|
410 |
|
|
@=
|
411 |
|
|
typedef struct {
|
412 |
|
|
char name[20]; /* symbolic name */
|
413 |
|
|
int *v; /* internal name */
|
414 |
|
|
int defval; /* default value */
|
415 |
|
|
int minval, maxval; /* minimum and maximum legal values */
|
416 |
|
|
bool power_of_two; /* must it be a power of two? */
|
417 |
|
|
} pv_spec;
|
418 |
|
|
|
419 |
|
|
@ Cache parameters are a bit more difficult, but still not bad.
|
420 |
|
|
|
421 |
|
|
@=
|
422 |
|
|
typedef enum {@!assoc,@!blksz,@!setsz,@!gran,@!vctsz,
|
423 |
|
|
@!wrb,@!wra,@!acctm,@!citm,@!cotm,@!prts} c_param;
|
424 |
|
|
@#
|
425 |
|
|
typedef struct {
|
426 |
|
|
char name[20]; /* symbolic name */
|
427 |
|
|
c_param v; /* internal code */
|
428 |
|
|
int defval; /* default value */
|
429 |
|
|
int minval, maxval; /* minimum and maximum legal values */
|
430 |
|
|
bool power_of_two; /* must it be a power of two? */
|
431 |
|
|
} cpv_spec;
|
432 |
|
|
|
433 |
|
|
@ Operation codes are the easiest of all.
|
434 |
|
|
|
435 |
|
|
@=
|
436 |
|
|
typedef struct {
|
437 |
|
|
char name[8]; /* symbolic name */
|
438 |
|
|
internal_opcode v; /* internal code */
|
439 |
|
|
int defval; /* default value */
|
440 |
|
|
} op_spec;
|
441 |
|
|
|
442 |
|
|
@ Most of the parameters are external variables declared in the header
|
443 |
|
|
file \.{mmix-pipe.h}; but some are private to this module. Here we
|
444 |
|
|
define the main tables used below.
|
445 |
|
|
|
446 |
|
|
@=
|
447 |
|
|
int fetch_buf_size,write_buf_size,reorder_buf_size,mem_bus_bytes,hardware_PT;
|
448 |
|
|
int max_cycs=60; /* largest cycle count seen in a cache time or pipeline stage; initially the default |div| time */
|
449 |
|
|
pv_spec PV[]={@/
|
450 |
|
|
{"fetchbuffer", &fetch_buf_size, 4, 1, INT_MAX, false},@/
|
451 |
|
|
{"writebuffer", &write_buf_size, 2, 1, INT_MAX, false},@/
|
452 |
|
|
{"reorderbuffer", &reorder_buf_size, 5, 1, INT_MAX, false},@/
|
453 |
|
|
{"renameregs", &max_rename_regs, 5, 1, INT_MAX, false},@/
|
454 |
|
|
{"memslots", &max_mem_slots, 2, 1, INT_MAX, false},@/
|
455 |
|
|
{"localregs", &lring_size, 256, 256, 1024, true},@/
|
456 |
|
|
{"fetchmax", &fetch_max, 2, 1, INT_MAX, false},@/
|
457 |
|
|
{"dispatchmax", &dispatch_max, 1, 1, INT_MAX, false},@/
|
458 |
|
|
{"peekahead", &peekahead, 1, 0, INT_MAX, false},@/
|
459 |
|
|
{"commitmax", &commit_max, 1, 1, INT_MAX, false},@/
|
460 |
|
|
{"fremmax", &frem_max, 1, 1, INT_MAX, false},@/
|
461 |
|
|
{"denin",&denin_penalty, 1, 0, INT_MAX, false},@/
|
462 |
|
|
{"denout",&denout_penalty, 1, 0, INT_MAX, false},@/
|
463 |
|
|
{"writeholdingtime", &holding_time, 0, 0, INT_MAX, false},@/
|
464 |
|
|
{"memaddresstime", &mem_addr_time, 20, 1, INT_MAX, false},@/
|
465 |
|
|
{"memreadtime", &mem_read_time, 20, 1, INT_MAX, false},@/
|
466 |
|
|
{"memwritetime", &mem_write_time, 20, 1, INT_MAX, false},@/
|
467 |
|
|
{"membusbytes", &mem_bus_bytes, 8, 8, INT_MAX, true},@/
|
468 |
|
|
{"branchpredictbits", &bp_n, 0, 0, 8, false},@/
|
469 |
|
|
{"branchaddressbits", &bp_a, 0, 0, 32, false},@/
|
470 |
|
|
{"branchhistorybits", &bp_b, 0, 0, 32, false},@/
|
471 |
|
|
{"branchdualbits", &bp_c, 0, 0, 32, false},@/
|
472 |
|
|
{"hardwarepagetable", &hardware_PT, 1, 0, 1, false},@/
|
473 |
|
|
{"disablesecurity", (int*)&security_disabled, 0, 0, 1, false},@/
|
474 |
|
|
{"memchunksmax", &mem_chunks_max, 1000, 1, INT_MAX, false},@/
|
475 |
|
|
{"hashprime", &hash_prime, 2003, 2, INT_MAX, false}};
|
476 |
|
|
@#
|
477 |
|
|
cpv_spec CPV[]={
|
478 |
|
|
{"associativity", assoc, 1, 1, INT_MAX, true},@/
|
479 |
|
|
{"blocksize", blksz, 8, 8, 8192, true},@/
|
480 |
|
|
{"setsize", setsz, 1, 1, INT_MAX, true},@/
|
481 |
|
|
{"granularity", gran, 8, 8, 8192, true},@/
|
482 |
|
|
{"victimsize", vctsz, 0, 0, INT_MAX, true},@/
|
483 |
|
|
{"writeback", wrb, 0, 0, 1,false},@/
|
484 |
|
|
{"writeallocate", wra, 0, 0, 1,false},@/
|
485 |
|
|
{"accesstime", acctm, 1, 1, INT_MAX, false},@/
|
486 |
|
|
{"copyintime", citm, 1, 1, INT_MAX, false},@/
|
487 |
|
|
{"copyouttime", cotm, 1, 1, INT_MAX, false},@/
|
488 |
|
|
{"ports", prts, 1, 1, INT_MAX,false}};
|
489 |
|
|
@#
|
490 |
|
|
op_spec OP[]={
|
491 |
|
|
{"mul0", mul0, 10},
|
492 |
|
|
{"mul1", mul1, 10},
|
493 |
|
|
{"mul2", mul2, 10},
|
494 |
|
|
{"mul3", mul3, 10},
|
495 |
|
|
{"mul4", mul4, 10},
|
496 |
|
|
{"mul5", mul5, 10},
|
497 |
|
|
{"mul6", mul6, 10},
|
498 |
|
|
{"mul7", mul7, 10},
|
499 |
|
|
{"mul8", mul8, 10},@|
|
500 |
|
|
{"div", div, 60},
|
501 |
|
|
{"sh", sh, 1},
|
502 |
|
|
{"mux", mux, 1},
|
503 |
|
|
{"sadd", sadd, 1},
|
504 |
|
|
{"mor", mor, 1},@|
|
505 |
|
|
{"fadd", fadd, 4},
|
506 |
|
|
{"fmul", fmul, 4},
|
507 |
|
|
{"fdiv", fdiv, 40},
|
508 |
|
|
{"fsqrt", fsqrt, 40},
|
509 |
|
|
{"fint", fint, 4},@|
|
510 |
|
|
{"fix", fix, 2},
|
511 |
|
|
{"flot", flot, 2},
|
512 |
|
|
{"feps", feps, 4}};
|
513 |
|
|
int PV_size,CPV_size,OP_size; /* the number of entries in |PV|, |CPV|, |OP| */
|
514 |
|
|
|
515 |
|
|
@ The |new_cache| routine creates a \&{cache} structure with default values.
|
516 |
|
|
(These default values are ``hard-wired'' into the program, not actually
|
517 |
|
|
read from the |CPV| table.)
|
518 |
|
|
|
519 |
|
|
@=
|
520 |
|
|
static cache* new_cache @,@,@[ARGS((char*))@];@+@t}\6{@>
static cache* new_cache(name) /* make a zeroed \&{cache} and fill in the defaults */
  char *name; /* the name recorded in the new cache; also used in the error message */
{@+register cache *fresh;
  fresh=(cache*)calloc(1,sizeof(cache));
  if (fresh==NULL) panic(errprint1("Can't allocate %s",name));
@.Can't allocate...@>
  fresh->name=name;
  fresh->ports=1;
  /* sizes and shapes */
  fresh->aa=1; /* default associativity, should equal |CPV[0].defval| */
  fresh->bb=8; /* default blocksize */
  fresh->cc=1; /* default setsize */
  fresh->gg=8; /* default granularity */
  fresh->vv=0; /* default victimsize */
  /* policies */
  fresh->repl=random; /* default replacement policy */
  fresh->vrepl=random; /* default victim replacement policy */
  fresh->mode=0; /* default mode is write-through and write-around */
  /* timing */
  fresh->access_time=1;
  fresh->copy_in_time=1;
  fresh->copy_out_time=1;
  /* the filler and flusher each get a control block that points back to the cache */
  fresh->filler.ctl=&(fresh->filler_ctl);
  fresh->filler_ctl.ptr_a=(void*)fresh;
  fresh->filler_ctl.go.o.l=4;
  fresh->flusher.ctl=&(fresh->flusher_ctl);
  fresh->flusher_ctl.ptr_a=(void*)fresh;
  fresh->flusher_ctl.go.o.l=4;
  return fresh;
}
|
545 |
|
|
|
546 |
|
|
@ @=
|
547 |
|
|
PV_size=(sizeof PV)/sizeof(pv_spec);
|
548 |
|
|
CPV_size=(sizeof CPV)/sizeof(cpv_spec);
|
549 |
|
|
OP_size=(sizeof OP)/sizeof(op_spec);
|
550 |
|
|
ITcache=new_cache("ITcache");
|
551 |
|
|
DTcache=new_cache("DTcache");
|
552 |
|
|
Icache=Dcache=Scache=NULL;
|
553 |
|
|
for (j=0;j
|
554 |
|
|
for (j=0;j
|
555 |
|
|
pipe_seq[OP[j].v][0]=OP[j].defval;
|
556 |
|
|
pipe_seq[OP[j].v][1]=0; /* one stage */
|
557 |
|
|
}
|
558 |
|
|
|
559 |
|
|
@* Reading the specs. Before we're ready to process the configuration file,
|
560 |
|
|
we need to count the number of functional units, so that we know
|
561 |
|
|
how much space to allocate for them.
|
562 |
|
|
|
563 |
|
|
A special background unit is always provided, just to make sure that
|
564 |
|
|
\.{TRAP} and \.{TRIP} instructions are handled by somebody.
|
565 |
|
|
|
566 |
|
|
@=
|
567 |
|
|
funit_count=0;
|
568 |
|
|
while (strcmp(token,"end")!=0) {
|
569 |
|
|
get_token();
|
570 |
|
|
if (strcmp(token,"unit")==0) {
|
571 |
|
|
funit_count++;
|
572 |
|
|
get_token();@+get_token(); /* a unit might be named \.{unit} or \.{end} */
|
573 |
|
|
}
|
574 |
|
|
}
|
575 |
|
|
funit=(func*)calloc(funit_count+1,sizeof(func));
|
576 |
|
|
if (!funit) panic(errprint0("Can't allocate the functional units"));
|
577 |
|
|
@.Can't allocate...@>
|
578 |
|
|
strcpy(funit[funit_count].name,"%%");
|
579 |
|
|
@.\%\%@>
|
580 |
|
|
funit[funit_count].ops[0]=0x80000000; /* \.{TRAP} */
|
581 |
|
|
funit[funit_count].ops[7]=0x1; /* \.{TRIP} */
|
582 |
|
|
|
583 |
|
|
@ Now we can read the specifications and obey them. This program doesn't
|
584 |
|
|
bother to be very tolerant of errors, nor does it try to be very efficient.
|
585 |
|
|
|
586 |
|
|
Incidentally, the specifications don't have to be broken into individual lines
|
587 |
|
|
in any meaningful way. We simply read them token by token.
|
588 |
|
|
|
589 |
|
|
@=
|
590 |
|
|
rewind(config_file);
|
591 |
|
|
funit_count=0;
|
592 |
|
|
token[0]='\0';
|
593 |
|
|
while (strcmp(token,"end")!=0) {
|
594 |
|
|
get_token();
|
595 |
|
|
if (strcmp(token,"end")==0) break;
|
596 |
|
|
@;
|
597 |
|
|
@;
|
598 |
|
|
@;
|
599 |
|
|
if (strcmp(token,"unit")==0) @;
|
600 |
|
|
panic(errprint1(
|
601 |
|
|
"Configuration syntax error: Specification can't start with `%s'",token));
|
602 |
|
|
@.Configuration syntax error...@>
|
603 |
|
|
}
|
604 |
|
|
|
605 |
|
|
@ @=
|
606 |
|
|
for (j=0;j
|
607 |
|
|
n=get_int();
|
608 |
|
|
if (n
|
609 |
|
|
@.Configuration error...@>
|
610 |
|
|
"Configuration error: %s must be >= %d",PV[j].name,PV[j].minval));
|
611 |
|
|
if (n>PV[j].maxval) panic(errprint2(
|
612 |
|
|
"Configuration error: %s must be <= %d",PV[j].name,PV[j].maxval));
|
613 |
|
|
if (PV[j].power_of_two && (n&(n-1))) panic(errprint1(
|
614 |
|
|
"Configuration error: %s must be a power of 2",PV[j].name));
|
615 |
|
|
*(PV[j].v)=n;
|
616 |
|
|
break;
|
617 |
|
|
}
|
618 |
|
|
if (j
|
619 |
|
|
|
620 |
|
|
@ @=
|
621 |
|
|
/* each case hands the rest of the spec to |pcs| and then moves on to the next spec */
if (strcmp(token,"ITcache")==0) {
  pcs(ITcache);@+continue;
}
if (strcmp(token,"DTcache")==0) {
  pcs(DTcache);@+continue;
}
if (strcmp(token,"Icache")==0) {
  if (Icache==NULL) Icache=new_cache("Icache");
  pcs(Icache);@+continue;
}
if (strcmp(token,"Dcache")==0) {
  if (Dcache==NULL) Dcache=new_cache("Dcache");
  pcs(Dcache);@+continue;
}
if (strcmp(token,"Scache")==0) { /* a secondary cache implies both primary caches */
  if (Icache==NULL) Icache=new_cache("Icache");
  if (Dcache==NULL) Dcache=new_cache("Dcache");
  if (Scache==NULL) Scache=new_cache("Scache");
  pcs(Scache);@+continue;
}
|
637 |
|
|
|
638 |
|
|
@ @=
|
639 |
|
|
static void ppol @,@,@[ARGS((replace_policy*))@];@+@t}\6{@>
static void ppol(rr) /* subroutine to scan for a replacement policy */
  replace_policy *rr; /* where to store the policy, if one is present */
{
  get_token();
  if (!strcmp(token,"random")) {@+*rr=random;@+return;@+}
  if (!strcmp(token,"serial")) {@+*rr=serial;@+return;@+}
  if (!strcmp(token,"pseudolru")) {@+*rr=pseudo_lru;@+return;@+}
  if (!strcmp(token,"lru")) {@+*rr=lru;@+return;@+}
  token_prescanned=true; /* no policy was given, so that token must be rescanned */
}
|
650 |
|
|
|
651 |
|
|
@ @=
|
652 |
|
|
static void pcs @,@,@[ARGS((cache*))@];@+@t}\6{@>
static void pcs(c) /* subroutine to process a cache spec */
  cache *c; /* the cache whose parameter is being set */
{
  register int j,n;
  get_token(); /* this should be the name of a cache parameter */
  for (j=0;j<CPV_size;j++) if (strcmp(token,CPV[j].name)==0) break;
  if (j==CPV_size) panic(errprint1(
     "Configuration syntax error: `%s' isn't a cache parameter name",token));
@.Configuration syntax error...@>
  n=get_int(); /* this is $-1$ if the value isn't a decimal number */
  if (n<CPV[j].minval) panic(errprint2(
     "Configuration error: %s must be >= %d",CPV[j].name,CPV[j].minval));
@.Configuration error...@>
  if (n>CPV[j].maxval) panic(errprint2(
     "Configuration error: %s must be <= %d",CPV[j].name,CPV[j].maxval));
  if (CPV[j].power_of_two && (n&(n-1))) panic(errprint1(
     "Configuration error: %s must be a power of 2",CPV[j].name));
  switch (CPV[j].v) {
 case assoc: c->aa=n;@+ppol(&(c->repl));@+break; /* a policy may follow */
 case blksz: c->bb=n;@+break;
 case setsz: c->cc=n;@+break;
 case gran: c->gg=n;@+break;
 case vctsz: c->vv=n;@+ppol(&(c->vrepl));@+break; /* a policy may follow */
 case wrb: c->mode=(c->mode&~WRITE_BACK)+n*WRITE_BACK;@+break;
 case wra: c->mode=(c->mode&~WRITE_ALLOC)+n*WRITE_ALLOC;@+break;
 case acctm:@+ if (n>max_cycs) max_cycs=n;
  c->access_time=n;@+break;
 case citm:@+ if (n>max_cycs) max_cycs=n;
  c->copy_in_time=n;@+break;
 case cotm:@+ if (n>max_cycs) max_cycs=n;
  c->copy_out_time=n;@+break;
 case prts: c->ports=n;@+break;
 default: break; /* every code in the |CPV| table is handled above */
  }
}
|
687 |
|
|
|
688 |
|
|
@ @=
|
689 |
|
|
for (j=0;j
|
690 |
|
|
for (i=0;;i++) {
|
691 |
|
|
n=get_int();
|
692 |
|
|
if (n<0) break;
|
693 |
|
|
if (n==0) panic(errprint0(
|
694 |
|
|
"Configuration error: Pipeline cycles must be positive"));
|
695 |
|
|
@.Configuration error...@>
|
696 |
|
|
if (n>255) panic(errprint0(
|
697 |
|
|
"Configuration error: Pipeline cycles must be <= 255"));
|
698 |
|
|
if (n>max_cycs) max_cycs=n;
|
699 |
|
|
if (i>=pipe_limit) panic(errprint1(
|
700 |
|
|
"Configuration error: More than %d pipeline stages",pipe_limit));
|
701 |
|
|
pipe_seq[OP[j].v][i]=n;
|
702 |
|
|
}
|
703 |
|
|
token_prescanned=true;
|
704 |
|
|
break;
|
705 |
|
|
}
|
706 |
|
|
if (j
|
707 |
|
|
|
708 |
|
|
@ @=
|
709 |
|
|
{@+register unsigned int acc; /* hex digits gathered for the current word of |ops| */
  get_token(); /* the name of the functional unit */
  if (strlen(token)>15) panic(errprint1(
    "Configuration error: `%s' is more than 15 characters long",token));
@.Configuration error...@>
  strcpy(funit[funit_count].name,token);
  get_token(); /* 64 hex digits: one bit for each of the 256 opcodes */
  if (strlen(token)!=64) panic(errprint1(
    "Configuration error: unit %s doesn't have 64 hex digit specs",
    funit[funit_count].name));
  for (i=j=0,acc=0;j<64;j++) {
    /* the accumulator is unsigned, because shifting a signed |int| left
       into its sign bit would be undefined */
    if (token[j]>='0' && token[j]<='9') acc=(acc<<4)+(token[j]-'0');
    else if (token[j]>='a' && token[j]<='f') acc=(acc<<4)+(token[j]-'a'+10);
    else if (token[j]>='A' && token[j]<='F') acc=(acc<<4)+(token[j]-'A'+10);
    else panic(errprint1(
      "Configuration error: `%c' is not a hex digit",token[j]));
    if ((j&0x7)==0x7) funit[funit_count].ops[i++]=acc, acc=0;
      /* every eighth digit completes one word of |ops| */
  }
  funit_count++;
  continue;
}
|
730 |
|
|
|
731 |
|
|
@* Checking and allocating. The battle is only half over when we've
|
732 |
|
|
absorbed all the data of the configuration file. We still must check for
|
733 |
|
|
interactions between different quantities, and we must allocate
|
734 |
|
|
space for cache blocks, coroutines, etc.
|
735 |
|
|
|
736 |
|
|
One of the most difficult tasks facing us is to determine the maximum number
|
737 |
|
|
of pipeline stages needed by each functional unit. Let's tackle that first.
|
738 |
|
|
|
739 |
|
|
@=
|
740 |
|
|
@;
|
741 |
|
|
for (j=0;j<=funit_count;j++) {
|
742 |
|
|
@;
|
743 |
|
|
funit[j].k=n;
|
744 |
|
|
funit[j].co=(coroutine*)calloc(n,sizeof(coroutine));
|
745 |
|
|
for (i=0;i
|
746 |
|
|
funit[j].co[i].name=funit[j].name;
|
747 |
|
|
funit[j].co[i].stage=i+1;
|
748 |
|
|
}
|
749 |
|
|
}
|
750 |
|
|
|
751 |
|
|
@ @=
|
752 |
|
|
for (j=div;j<=max_pipe_op;j++) int_stages[j]=strlen(pipe_seq[j]);
|
753 |
|
|
for (;j<=max_real_command;j++) int_stages[j]=1;
|
754 |
|
|
for (j=mul0,n=0;j<=mul8;j++)
|
755 |
|
|
if (strlen(pipe_seq[j])>n) n=strlen(pipe_seq[j]);
|
756 |
|
|
int_stages[mul]=n;
|
757 |
|
|
int_stages[ld]=int_stages[st]=int_stages[frem]=2;
|
758 |
|
|
for (j=0;j<256;j++) stages[j]=int_stages[int_op[j]];
|
759 |
|
|
|
760 |
|
|
@ The |int_op| conversion table is similar to the |internal_op| array of
|
761 |
|
|
the \\{MMIX\_pipe} routine, but it replaces |divu| by |div|,
|
762 |
|
|
|fsub| by |fadd|, etc.
|
763 |
|
|
|
764 |
|
|
@=
|
765 |
|
|
internal_opcode int_op[256]={@/
|
766 |
|
|
trap,fcmp,funeq,funeq,fadd,fix,fadd,fix,@/
|
767 |
|
|
flot,flot,flot,flot,flot,flot,flot,flot,@/
|
768 |
|
|
fmul,feps,feps,feps,fdiv,fsqrt,frem,fint,@/
|
769 |
|
|
mul,mul,mul,mul,div,div,div,div,@/
|
770 |
|
|
add,add,addu,addu,sub,sub,subu,subu,@/
|
771 |
|
|
addu,addu,addu,addu,addu,addu,addu,addu,@/
|
772 |
|
|
cmp,cmp,cmpu,cmpu,sub,sub,subu,subu,@/
|
773 |
|
|
sh,sh,sh,sh,sh,sh,sh,sh,@/
|
774 |
|
|
br,br,br,br,br,br,br,br,@/
|
775 |
|
|
br,br,br,br,br,br,br,br,@/
|
776 |
|
|
pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@/
|
777 |
|
|
pbr,pbr,pbr,pbr,pbr,pbr,pbr,pbr,@/
|
778 |
|
|
cset,cset,cset,cset,cset,cset,cset,cset,@/
|
779 |
|
|
cset,cset,cset,cset,cset,cset,cset,cset,@/
|
780 |
|
|
zset,zset,zset,zset,zset,zset,zset,zset,@/
|
781 |
|
|
zset,zset,zset,zset,zset,zset,zset,zset,@/
|
782 |
|
|
ld,ld,ld,ld,ld,ld,ld,ld,@/
|
783 |
|
|
ld,ld,ld,ld,ld,ld,ld,ld,@/
|
784 |
|
|
ld,ld,ld,ld,ld,ld,ld,ld,@/
|
785 |
|
|
ld,ld,ld,ld,prego,prego,go,go,@/
|
786 |
|
|
st,st,st,st,st,st,st,st,@/
|
787 |
|
|
st,st,st,st,st,st,st,st,@/
|
788 |
|
|
st,st,st,st,st,st,st,st,@/
|
789 |
|
|
st,st,st,st,st,st,pushgo,pushgo,@/
|
790 |
|
|
or,or,orn,orn,nor,nor,xor,xor,@/
|
791 |
|
|
and,and,andn,andn,nand,nand,nxor,nxor,@/
|
792 |
|
|
bdif,bdif,wdif,wdif,tdif,tdif,odif,odif,@/
|
793 |
|
|
mux,mux,sadd,sadd,mor,mor,mor,mor,@/
|
794 |
|
|
set,set,set,set,addu,addu,addu,addu,@/
|
795 |
|
|
or,or,or,or,andn,andn,andn,andn,@/
|
796 |
|
|
noop,noop,pushj,pushj,set,set,put,put,@/
|
797 |
|
|
pop,resume,save,unsave,sync,noop,get,trip};
|
798 |
|
|
int int_stages[max_real_command+1];
|
799 |
|
|
/* stages as function of |internal_opcode| */
|
800 |
|
|
int stages[256]; /* stages as function of |mmix_opcode| */
|
801 |
|
|
|
802 |
|
|
@ @=
|
803 |
|
|
for (i=n=0;i<256;i++)
|
804 |
|
|
if (((funit[j].ops[i>>5]<<(i&0x1f))&0x80000000) && stages[i]>n)
|
805 |
|
|
n=stages[i];
|
806 |
|
|
if (n==0) panic(errprint1(
|
807 |
|
|
"Configuration error: unit %s doesn't do anything",funit[j].name));
|
808 |
|
|
@.Configuration error...@>
|
809 |
|
|
|
810 |
|
|
@ The next hardest thing on our agenda is to set up the cache structure
|
811 |
|
|
fields that depend on the parameters. For example, although we have defined
|
812 |
|
|
the parameter in the |bb| field (the block size), we also need to compute the
|
813 |
|
|
|b|~field (log of the block size), and we must create the cache blocks
|
814 |
|
|
themselves.
|
815 |
|
|
|
816 |
|
|
@=
|
817 |
|
|
static int lg @,@,@[ARGS((int))@];@+@t}\6{@>
|
818 |
|
|
static int lg(n) /* binary logarithm, rounded down */
  int n;
{
  register int bits=-1; /* so that |lg(0)| comes out as $-1$ */
  while (n) { /* one step for each significant bit of |n| */
    n>>=1;
    bits++;
  }
  return bits;
}
|
824 |
|
|
|
825 |
|
|
@ @=
|
826 |
|
|
static void alloc_cache @,@,@[ARGS((cache*,char*))@];@+@t}\6{@>
|
827 |
|
|
static void alloc_cache(c,name)
|
828 |
|
|
cache *c;
|
829 |
|
|
char *name;
|
830 |
|
|
{@+register int j,k;
|
831 |
|
|
if (c->bbgg) panic(errprint1(
|
832 |
|
|
"Configuration error: blocksize of %s is less than granularity",name));
|
833 |
|
|
@.Configuration error...@>
|
834 |
|
|
if (name[1]=='T' && c->bb!=8) panic(errprint1(
|
835 |
|
|
"Configuration error: blocksize of %s must be 8",name));
|
836 |
|
|
c->a=lg(c->aa);
|
837 |
|
|
c->b=lg(c->bb);
|
838 |
|
|
c->c=lg(c->cc);
|
839 |
|
|
c->g=lg(c->gg);
|
840 |
|
|
c->v=lg(c->vv);
|
841 |
|
|
c->tagmask=-(1<<(c->b+c->c));
|
842 |
|
|
if (c->a+c->b+c->c>=32) panic(errprint1(
|
843 |
|
|
"Configuration error: %s has >= 4 gigabytes of data",name));
|
844 |
|
|
if (c->gg!=8 && !(c->mode&WRITE_ALLOC)) panic(errprint2(
|
845 |
|
|
"Configuration error: %s does write-around with granularity %d",
|
846 |
|
|
name,c->gg));
|
847 |
|
|
@;
|
848 |
|
|
if (c->vv) @;
|
849 |
|
|
c->inbuf.dirty=(char*)calloc(c->bb>>c->g,sizeof(char));
|
850 |
|
|
if (!c->inbuf.dirty) panic(errprint1(
|
851 |
|
|
"Can't allocate dirty bits for inbuffer of %s",name));
|
852 |
|
|
@.Can't allocate...@>
|
853 |
|
|
c->inbuf.data=(octa *)calloc(c->bb>>3,sizeof(octa));
|
854 |
|
|
if (!c->inbuf.data) panic(errprint1(
|
855 |
|
|
"Can't allocate data for inbuffer of %s",name));
|
856 |
|
|
c->outbuf.dirty=(char*)calloc(c->bb>>c->g,sizeof(char));
|
857 |
|
|
if (!c->outbuf.dirty) panic(errprint1(
|
858 |
|
|
"Can't allocate dirty bits for outbuffer of %s",name));
|
859 |
|
|
c->outbuf.data=(octa *)calloc(c->bb>>3,sizeof(octa));
|
860 |
|
|
if (!c->outbuf.data) panic(errprint1(
|
861 |
|
|
"Can't allocate data for outbuffer of %s",name));
|
862 |
|
|
if (name[0]!='S') @;
|
863 |
|
|
}
|
864 |
|
|
|
865 |
|
|
@ @d sign_bit 0x80000000
|
866 |
|
|
|
867 |
|
|
@=
|
868 |
|
|
c->set=(cacheset *)calloc(c->cc,sizeof(cacheset));
|
869 |
|
|
if (!c->set) panic(errprint1(
|
870 |
|
|
"Can't allocate cache sets for %s",name));
|
871 |
|
|
@.Can't allocate...@>
|
872 |
|
|
for (j=0;jcc;j++) {
|
873 |
|
|
c->set[j]=(cacheblock *)calloc(c->aa,sizeof(cacheblock));
|
874 |
|
|
if (!c->set[j]) panic(errprint2(
|
875 |
|
|
"Can't allocate cache blocks for set %d of %s",j,name));
|
876 |
|
|
for (k=0;kaa;k++) {
|
877 |
|
|
c->set[j][k].tag.h=sign_bit; /* invalid tag */
|
878 |
|
|
c->set[j][k].dirty=(char *)calloc(c->bb>>c->g,sizeof(char));
|
879 |
|
|
if (!c->set[j][k].dirty) panic(errprint3(
|
880 |
|
|
"Can't allocate dirty bits for block %d of set %d of %s",k,j,name));
|
881 |
|
|
c->set[j][k].data=(octa *)calloc(c->bb>>3,sizeof(octa));
|
882 |
|
|
if (!c->set[j][k].data) panic(errprint3(
|
883 |
|
|
"Can't allocate data for block %d of set %d of %s",k,j,name));
|
884 |
|
|
}
|
885 |
|
|
}
|
886 |
|
|
|
887 |
|
|
@ @=
|
888 |
|
|
{
  c->victim=(cacheblock*)calloc(c->vv,sizeof(cacheblock));
    /* the victim cache consists of |vv| blocks */
  if (!c->victim) panic(errprint1(
    "Can't allocate blocks for victim cache of %s",name));
  for (k=0;k<c->vv;k++) {
    c->victim[k].tag.h=sign_bit; /* invalid tag */
    c->victim[k].dirty=(char *)calloc(c->bb>>c->g,sizeof(char));
    if (!c->victim[k].dirty) panic(errprint2(
      "Can't allocate dirty bits for block %d of victim cache of %s",
      k,name));
@.Can't allocate...@>
    c->victim[k].data=(octa *)calloc(c->bb>>3,sizeof(octa));
    if (!c->victim[k].data) panic(errprint2(
      "Can't allocate data for block %d of victim cache of %s",k,name));
  }
}
|
904 |
|
|
|
905 |
|
|
@ @=
|
906 |
|
|
{
  c->reader=(coroutine*)calloc(c->ports,sizeof(coroutine));
    /* one reader coroutine per port */
  if (!c->reader) panic(errprint1(
    "Can't allocate readers for %s",name));
@.Can't allocate...@>
  for (j=0;j<c->ports;j++) {
    c->reader[j].stage=vanish;
    /* the reader's name is derived from the first two letters of the cache name */
    c->reader[j].name=(name[0]=='D'? (name[1]=='T'? "DTreader": "Dreader"):
                             (name[1]=='T'? "ITreader": "Ireader"));
  }
}
|
917 |
|
|
|
918 |
|
|
@ @=
|
919 |
|
|
alloc_cache(ITcache,"ITcache");
|
920 |
|
|
ITcache->filler.name="ITfiller";@+ ITcache->filler.stage=fill_from_virt;
|
921 |
|
|
alloc_cache(DTcache,"DTcache");
|
922 |
|
|
DTcache->filler.name="DTfiller";@+ DTcache->filler.stage=fill_from_virt;
|
923 |
|
|
if (Icache) {
|
924 |
|
|
alloc_cache(Icache,"Icache");
|
925 |
|
|
Icache->filler.name="Ifiller";@+ Icache->filler.stage=fill_from_mem;
|
926 |
|
|
}
|
927 |
|
|
if (Dcache) {
|
928 |
|
|
alloc_cache(Dcache,"Dcache");
|
929 |
|
|
Dcache->filler.name="Dfiller";@+ Dcache->filler.stage=fill_from_mem;
|
930 |
|
|
Dcache->flusher.name="Dflusher";@+ Dcache->flusher.stage=flush_to_mem;
|
931 |
|
|
}
|
932 |
|
|
if (Scache) {
|
933 |
|
|
alloc_cache(Scache,"Scache");
|
934 |
|
|
if (Scache->bbbb) panic(errprint0(
|
935 |
|
|
"Configuration error: Scache blocks smaller than Icache blocks"));
|
936 |
|
|
@.Configuration error...@>
|
937 |
|
|
if (Scache->bbbb) panic(errprint0(
|
938 |
|
|
"Configuration error: Scache blocks smaller than Dcache blocks"));
|
939 |
|
|
if (Scache->gg!=Dcache->gg) panic(errprint0(
|
940 |
|
|
"Configuration error: Scache granularity differs from the Dcache"));
|
941 |
|
|
Icache->filler.stage=fill_from_S;
|
942 |
|
|
Dcache->filler.stage=fill_from_S;@+ Dcache->flusher.stage=flush_to_S;
|
943 |
|
|
Scache->filler.name="Sfiller";@+ Scache->filler.stage=fill_from_mem;
|
944 |
|
|
Scache->flusher.name="Sflusher";@+ Scache->flusher.stage=flush_to_mem;
|
945 |
|
|
}
|
946 |
|
|
|
947 |
|
|
@ Now we are nearly done. The only nontrivial task remaining is
|
948 |
|
|
to allocate the ring of queues for coroutine scheduling; for this we
|
949 |
|
|
need to determine the maximum waiting time that will occur between
|
950 |
|
|
scheduler and schedulee.
|
951 |
|
|
|
952 |
|
|
@=
|
953 |
|
|
bus_words=mem_bus_bytes>>3;
|
954 |
|
|
j=(mem_read_time
|
955 |
|
|
n=1;
|
956 |
|
|
if (Scache && Scache->bb>n) n=Scache->bb;
|
957 |
|
|
if (Icache && Icache->bb>n) n=Icache->bb;
|
958 |
|
|
if (Dcache && Dcache->bb>n) n=Dcache->bb;
|
959 |
|
|
n=mem_addr_time+((int)(n+bus_words-1)/bus_words)*j;
|
960 |
|
|
if (n>max_cycs) max_cycs=n; /* now |max_cycs| bounds the waiting time */
|
961 |
|
|
ring_size=max_cycs+1;
|
962 |
|
|
ring=(coroutine *)calloc(ring_size,sizeof(coroutine));
|
963 |
|
|
if (!ring) panic(errprint0("Can't allocate the scheduling ring"));
|
964 |
|
|
@.Can't allocate...@>
|
965 |
|
|
{@+register coroutine *p;
|
966 |
|
|
for (p=ring;p
|
967 |
|
|
p->name=""; /* header nodes are nameless */
|
968 |
|
|
p->stage=max_stage;
|
969 |
|
|
}
|
970 |
|
|
}
|
971 |
|
|
|
972 |
|
|
@ @s chunknode int
|
973 |
|
|
|
974 |
|
|
@=
|
975 |
|
|
if (hash_prime<=mem_chunks_max) panic(errprint0(
|
976 |
|
|
"Configuration error: hashprime must exceed memchunksmax"));
|
977 |
|
|
@.Configuration error...@>
|
978 |
|
|
mem_hash=(chunknode *)calloc(hash_prime+1,sizeof(chunknode));
|
979 |
|
|
if (!mem_hash) panic(errprint0("Can't allocate the hash table"));
|
980 |
|
|
@.Can't allocate...@>
|
981 |
|
|
mem_hash[0].chunk=(octa*)calloc(1<<13,sizeof(octa));
|
982 |
|
|
if (!mem_hash[0].chunk) panic(errprint0("Can't allocate chunk 0"));
|
983 |
|
|
mem_hash[hash_prime].chunk=(octa*)calloc(1<<13,sizeof(octa));
|
984 |
|
|
if (!mem_hash[hash_prime].chunk) panic(errprint0("Can't allocate 0 chunk"));
|
985 |
|
|
mem_chunks=1;
|
986 |
|
|
fetch_bot=(fetch*)calloc(fetch_buf_size+1,sizeof(fetch));
|
987 |
|
|
if (!fetch_bot) panic(errprint0("Can't allocate the fetch buffer"));
|
988 |
|
|
fetch_top=fetch_bot+fetch_buf_size;
|
989 |
|
|
reorder_bot=(control*)calloc(reorder_buf_size+1,sizeof(control));
|
990 |
|
|
if (!reorder_bot) panic(errprint0("Can't allocate the reorder buffer"));
|
991 |
|
|
reorder_top=reorder_bot+reorder_buf_size;
|
992 |
|
|
wbuf_bot=(write_node*)calloc(write_buf_size+1,sizeof(write_node));
|
993 |
|
|
if (!wbuf_bot) panic(errprint0("Can't allocate the write buffer"));
|
994 |
|
|
wbuf_top=wbuf_bot+write_buf_size;
|
995 |
|
|
if (bp_n==0) bp_table=NULL;
|
996 |
|
|
else { /* a branch prediction table is desired */
|
997 |
|
|
if (bp_a+bp_b+bp_c>=32) panic(errprint0(
|
998 |
|
|
"Configuration error: Branch table has >= 4 gigabytes of data"));
|
999 |
|
|
bp_table=(char*)calloc(1<<(bp_a+bp_b+bp_c),sizeof(char));
|
1000 |
|
|
if (!bp_table) panic(errprint0("Can't allocate the branch table"));
|
1001 |
|
|
}
|
1002 |
|
|
l=(specnode*)calloc(lring_size,sizeof(specnode));
|
1003 |
|
|
if (!l) panic(errprint0("Can't allocate local registers"));
|
1004 |
|
|
j=bus_words;
|
1005 |
|
|
if (Icache && Icache->bb>j) j=Icache->bb;
|
1006 |
|
|
fetched=(octa*)calloc(j,sizeof(octa));
|
1007 |
|
|
if (!fetched) panic(errprint0("Can't allocate prefetch buffer"));
|
1008 |
|
|
dispatch_stat=(int*)calloc(dispatch_max+1,sizeof(int));
|
1009 |
|
|
if (!dispatch_stat) panic(errprint0("Can't allocate dispatch counts"));
|
1010 |
|
|
no_hardware_PT=1-hardware_PT;
|
1011 |
|
|
|
1012 |
|
|
@* Putting it all together. Here then is the desired configuration
|
1013 |
|
|
subroutine.
|
1014 |
|
|
|
1015 |
|
|
@c
|
1016 |
|
|
#include <stdio.h> /* |fopen|, |fgets|, |sscanf|, |rewind| */
#include <stdlib.h> /* |calloc|, |exit| */
#include <ctype.h> /* |isspace| */
#include <string.h> /* |strcpy|, |strlen|, |strcmp| */
#include <limits.h> /* |INT_MAX| */
|
1021 |
|
|
#include "mmix-pipe.h"
|
1022 |
|
|
@@;
|
1023 |
|
|
@@;
|
1024 |
|
|
@@;
|
1025 |
|
|
void MMIX_config(filename)
|
1026 |
|
|
char *filename;
|
1027 |
|
|
{@+register int i,j,n;
|
1028 |
|
|
config_file=fopen(filename,"r");
|
1029 |
|
|
if (!config_file)
|
1030 |
|
|
panic(errprint1("Can't open configuration file %s",filename));
|
1031 |
|
|
@.Can't open...@>
|
1032 |
|
|
@;
|
1033 |
|
|
@;
|
1034 |
|
|
@;
|
1035 |
|
|
@;
|
1036 |
|
|
@;
|
1037 |
|
|
@;
|
1038 |
|
|
@;
|
1039 |
|
|
}
|
1040 |
|
|
|
1041 |
|
|
@*Index.
|