OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-old/] [gcc-4.2.2/] [gcc/] [config/] [i386/] [ppro.md] - Blame information for rev 816

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 38 julius
;; Scheduling for the Intel P6 family of processors
2
;; Copyright (C) 2004, 2005, 2007 Free Software Foundation, Inc.
3
;;
4
;; This file is part of GCC.
5
;;
6
;; GCC is free software; you can redistribute it and/or modify
7
;; it under the terms of the GNU General Public License as published by
8
;; the Free Software Foundation; either version 3, or (at your option)
9
;; any later version.
10
;;
11
;; GCC is distributed in the hope that it will be useful,
12
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
;; GNU General Public License for more details.
15
;;
16
;; You should have received a copy of the GNU General Public License
17
;; along with GCC; see the file COPYING3.  If not see
18
;; <http://www.gnu.org/licenses/>.  */
19
 
20
;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
21
;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
22
;; based on information that can be found in the following three documents:
23
;;
24
;;    "P6 Family of Processors Hardware Developer's Manual",
25
;;    Intel, September 1999.
26
;;
27
;;    "Intel Architecture Optimization Manual",
28
;;    Intel, 1999 (Order Number: 245127-001).
29
;;
30
;;    "How to optimize for the Pentium family of microprocessors",
31
;;    by Agner Fog, PhD.
32
;;
33
;; The P6 pipeline has three major components:
34
;;   1) the FETCH/DECODE unit, an in-order issue front-end
35
;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
36
;;   3) the RETIRE unit, an in-order retirement unit
37
;;
38
;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
39
;; retirement unit are naturally in-order.
40
;;
41
;;                       BUS INTERFACE UNIT
42
;;                     /                   \
43
;;                L1 ICACHE             L1 DCACHE
44
;;              /     |     \              |     \
45
;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
46
;;              \     |     /              |        |
47
;;            INSTRUCTION POOL   __________|_______/
48
;;          (inc. reorder buffer)
49
;;
50
;; Since the P6 CPUs execute instructions out-of-order, the most important
51
;; consideration in performance tuning is making sure enough micro-ops are
52
;; ready for execution in the out-of-order core, while not stalling the
53
;; decoder.
54
;;
55
;; TODO:
56
;; - Find a less crude way to model complex instructions, in
57
;;   particular how many cycles they take to be decoded.
58
;; - Include decoder latencies in the total reservation latencies.
59
;;   This isn't necessary right now because we assume for every
60
;;   instruction that it never blocks a decoder.
61
;; - Figure out where the p0 and p1 reservations come from.  These
62
;;   appear not to be in the manual (e.g. why is cld "(p0+p1)*2"
63
;;   better than "(p0|p1)*4" ???)
64
;; - Lots more because I'm sure this is still far from optimal :-)
65
 
66
;; The ppro_idiv and ppro_fdiv automata are used to model issue
67
;; latencies of idiv and fdiv type insns.
68
(define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
69
 
70
;; Simple instructions of the register-register form have only one uop.
71
;; Load instructions are also only one uop.  Store instructions decode to
72
;; two uops, and simple read-modify instructions also take two uops.
73
;; Simple instructions of the register-memory form have two to three uops.
74
;; Simple read-modify-write instructions have four uops.  The rules for
75
;; the decoder are simple:
76
;;  - an instruction with 1 uop can be decoded by any of the three
77
;;    decoders in one cycle.
78
;;  - an instruction with 2 to 4 uops can be decoded only by decoder 0
79
;;    but still in only one cycle.
80
;;  - a complex (microcode) instruction can also only be decoded by
81
;;    decoder 0, and this takes an unspecified number of cycles.
82
;;
83
;; The goal is to schedule such that we have a few-one-one uops sequence
84
;; in each cycle, to decode as many instructions per cycle as possible.
85
(define_cpu_unit "decoder0" "ppro_decoder")
86
(define_cpu_unit "decoder1" "ppro_decoder")
87
(define_cpu_unit "decoder2" "ppro_decoder")
88
 
89
;; We first wish to find an instruction for decoder0, so exclude
90
;; decoder1 and decoder2 from being reserved until decoder 0 is
91
;; reserved.
92
(presence_set "decoder1" "decoder0")
93
(presence_set "decoder2" "decoder0")
94
 
95
;; Most instructions can be decoded on any of the three decoders.
96
(define_reservation "decodern" "(decoder0|decoder1|decoder2)")
97
 
98
;; The out-of-order core has five pipelines.  During each cycle, the core
99
;; may dispatch zero or one uop on the port of any of the five pipelines
100
;; so the maximum number of dispatched uops per cycle is 5.  In practice,
101
;; 3 uops per cycle is more realistic.
102
;;
103
;; Two of the five pipelines contain several execution units:
104
;;
105
;; Port 0       Port 1          Port 2          Port 3          Port 4
106
;; ALU          ALU             LOAD            SAC             SDA
107
;; FPU          JUE
108
;; AGU          MMX
109
;; MMX          P3FPU
110
;; P3FPU
111
;;
112
;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
113
;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
114
;;
115
(define_cpu_unit "p0,p1" "ppro_core")
116
(define_cpu_unit "p2" "ppro_load")
117
(define_cpu_unit "p3,p4" "ppro_store")
118
(define_cpu_unit "idiv" "ppro_idiv")
119
(define_cpu_unit "fdiv" "ppro_fdiv")
120
 
121
;; Only the irregular instructions have to be modeled here.  A load
122
;; increases the latency by 2 or 3, or by nothing if the manual gives
123
;; a latency already.  Store latencies are not accounted for.
124
;;
125
;; The simple instructions follow a very regular pattern of 1 uop per
126
;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
127
;; on port 4 and port 3.  These instructions are modelled at the bottom
128
;; of this file.
129
;;
130
;; For microcoded instructions we don't know how many uops are produced.
131
;; These instructions are the "complex" ones in the Intel manuals.  All
132
;; we _do_ know is that they typically produce four or more uops, so
133
;; they can only be decoded on decoder0.  Modelling their latencies
134
;; doesn't make sense because we don't know how these instructions are
135
;; executed in the core.  So we just model that they can only be decoded
136
;; on decoder 0, and say that it takes a little while before the result
137
;; is available.
138
(define_insn_reservation "ppro_complex_insn" 6
139
                         (and (eq_attr "cpu" "pentiumpro,generic32")
140
                              (eq_attr "type" "other,multi,call,callv,str"))
141
                         "decoder0")
142
 
143
;; imov with memory operands does not use the integer units.
144
(define_insn_reservation "ppro_imov" 1
145
                         (and (eq_attr "cpu" "pentiumpro,generic32")
146
                              (and (eq_attr "memory" "none")
147
                                   (eq_attr "type" "imov")))
148
                         "decodern,(p0|p1)")
149
 
150
(define_insn_reservation "ppro_imov_load" 4
151
                         (and (eq_attr "cpu" "pentiumpro,generic32")
152
                              (and (eq_attr "memory" "load")
153
                                   (eq_attr "type" "imov")))
154
                         "decodern,p2")
155
 
156
(define_insn_reservation "ppro_imov_store" 1
157
                         (and (eq_attr "cpu" "pentiumpro,generic32")
158
                              (and (eq_attr "memory" "store")
159
                                   (eq_attr "type" "imov")))
160
                         "decoder0,p4+p3")
161
 
162
;; imovx always decodes to one uop, and also doesn't use the integer
163
;; units if it has memory operands.
164
(define_insn_reservation "ppro_imovx" 1
165
                         (and (eq_attr "cpu" "pentiumpro,generic32")
166
                              (and (eq_attr "memory" "none")
167
                                   (eq_attr "type" "imovx")))
168
                         "decodern,(p0|p1)")
169
 
170
(define_insn_reservation "ppro_imovx_load" 4
171
                         (and (eq_attr "cpu" "pentiumpro,generic32")
172
                              (and (eq_attr "memory" "load")
173
                                   (eq_attr "type" "imovx")))
174
                         "decodern,p2")
175
 
176
;; lea executes on port 0 with latency one and throughput 1.
177
(define_insn_reservation "ppro_lea" 1
178
                         (and (eq_attr "cpu" "pentiumpro,generic32")
179
                              (and (eq_attr "memory" "none")
180
                                   (eq_attr "type" "lea")))
181
                         "decodern,p0")
182
 
183
;; Shift and rotate execute on port 0 with latency and throughput 1.
184
;; The load and store units need to be reserved when memory operands
185
;; are involved.
186
(define_insn_reservation "ppro_shift_rotate" 1
187
                         (and (eq_attr "cpu" "pentiumpro,generic32")
188
                              (and (eq_attr "memory" "none")
189
                                   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
190
                         "decodern,p0")
191
 
192
(define_insn_reservation "ppro_shift_rotate_mem" 4
193
                         (and (eq_attr "cpu" "pentiumpro,generic32")
194
                              (and (eq_attr "memory" "!none")
195
                                   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
196
                         "decoder0,p2+p0,p4+p3")
197
 
198
(define_insn_reservation "ppro_cld" 2
199
                         (and (eq_attr "cpu" "pentiumpro,generic32")
200
                              (eq_attr "type" "cld"))
201
                         "decoder0,(p0+p1)*2")
202
 
203
;; The P6 has a sophisticated branch prediction mechanism to minimize
204
;; latencies due to branching.  In particular, it has a fast way to
205
;; execute branches that are taken multiple times (such as in loops).
206
;; Branches not taken suffer no penalty, and correctly predicted
207
;; branches cost only one fetch cycle.  Mispredicted branches are very
208
;; costly: typically 15 cycles and possibly as many as 26 cycles.
209
;;
210
;; Unfortunately all this makes it quite difficult to properly model
211
;; the latencies for the compiler.  Here I've made the choice to be
212
;; optimistic and assume branches are often predicted correctly, so
213
;; they have latency 1, and the decoders are not blocked.
214
;;
215
;; In addition, the model assumes a branch always decodes to only 1 uop,
216
;; which is not exactly true because there are a few instructions that
217
;; decode to 2 uops or microcode.  But this probably gives the best
218
;; results because we can assume these instructions can decode on all
219
;; decoders.
220
(define_insn_reservation "ppro_branch" 1
221
                         (and (eq_attr "cpu" "pentiumpro,generic32")
222
                              (and (eq_attr "memory" "none")
223
                                   (eq_attr "type" "ibr")))
224
                         "decodern,p1")
225
 
226
;; ??? Indirect branches probably have worse latency than this.
227
(define_insn_reservation "ppro_indirect_branch" 6
228
                         (and (eq_attr "cpu" "pentiumpro,generic32")
229
                              (and (eq_attr "memory" "!none")
230
                                   (eq_attr "type" "ibr")))
231
                         "decoder0,p2+p1")
232
 
233
(define_insn_reservation "ppro_leave" 4
234
                         (and (eq_attr "cpu" "pentiumpro,generic32")
235
                              (eq_attr "type" "leave"))
236
                         "decoder0,p2+(p0|p1),(p0|p1)")
237
 
238
;; imul has throughput one, but latency 4, and can only execute on port 0.
239
(define_insn_reservation "ppro_imul" 4
240
                         (and (eq_attr "cpu" "pentiumpro,generic32")
241
                              (and (eq_attr "memory" "none")
242
                                   (eq_attr "type" "imul")))
243
                         "decodern,p0")
244
 
245
(define_insn_reservation "ppro_imul_mem" 4
246
                         (and (eq_attr "cpu" "pentiumpro,generic32")
247
                              (and (eq_attr "memory" "!none")
248
                                   (eq_attr "type" "imul")))
249
                         "decoder0,p2+p0")
250
 
251
;; div and idiv are very similar, so we model them the same.
252
;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
253
;; These issue latencies are modelled via the ppro_idiv automaton.
254
(define_insn_reservation "ppro_idiv_QI" 19
255
                         (and (eq_attr "cpu" "pentiumpro,generic32")
256
                              (and (eq_attr "memory" "none")
257
                                   (and (eq_attr "mode" "QI")
258
                                        (eq_attr "type" "idiv"))))
259
                         "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
260
 
261
(define_insn_reservation "ppro_idiv_QI_load" 19
262
                         (and (eq_attr "cpu" "pentiumpro,generic32")
263
                              (and (eq_attr "memory" "load")
264
                                   (and (eq_attr "mode" "QI")
265
                                        (eq_attr "type" "idiv"))))
266
                         "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
267
 
268
(define_insn_reservation "ppro_idiv_HI" 23
269
                         (and (eq_attr "cpu" "pentiumpro,generic32")
270
                              (and (eq_attr "memory" "none")
271
                                   (and (eq_attr "mode" "HI")
272
                                        (eq_attr "type" "idiv"))))
273
                         "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
274
 
275
(define_insn_reservation "ppro_idiv_HI_load" 23
276
                         (and (eq_attr "cpu" "pentiumpro,generic32")
277
                              (and (eq_attr "memory" "load")
278
                                   (and (eq_attr "mode" "HI")
279
                                        (eq_attr "type" "idiv"))))
280
                         "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
281
 
282
(define_insn_reservation "ppro_idiv_SI" 39
283
                         (and (eq_attr "cpu" "pentiumpro,generic32")
284
                              (and (eq_attr "memory" "none")
285
                                   (and (eq_attr "mode" "SI")
286
                                        (eq_attr "type" "idiv"))))
287
                         "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
288
 
289
(define_insn_reservation "ppro_idiv_SI_load" 39
290
                         (and (eq_attr "cpu" "pentiumpro,generic32")
291
                              (and (eq_attr "memory" "load")
292
                                   (and (eq_attr "mode" "SI")
293
                                        (eq_attr "type" "idiv"))))
294
                         "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
295
 
296
;; Floating point operations always execute on port 0.
297
;; ??? where do these latencies come from? fadd has latency 3 and
298
;;     has throughput "1/cycle (align with FADD)".  What do they
299
;;     mean and how can we model that?
300
(define_insn_reservation "ppro_fop" 3
301
                         (and (eq_attr "cpu" "pentiumpro,generic32")
302
                              (and (eq_attr "memory" "none,unknown")
303
                                   (eq_attr "type" "fop")))
304
                         "decodern,p0")
305
 
306
(define_insn_reservation "ppro_fop_load" 5
307
                         (and (eq_attr "cpu" "pentiumpro,generic32")
308
                              (and (eq_attr "memory" "load")
309
                                   (eq_attr "type" "fop")))
310
                         "decoder0,p2+p0,p0")
311
 
312
(define_insn_reservation "ppro_fop_store" 3
313
                         (and (eq_attr "cpu" "pentiumpro,generic32")
314
                              (and (eq_attr "memory" "store")
315
                                   (eq_attr "type" "fop")))
316
                         "decoder0,p0,p0,p0+p4+p3")
317
 
318
(define_insn_reservation "ppro_fop_both" 5
319
                         (and (eq_attr "cpu" "pentiumpro,generic32")
320
                              (and (eq_attr "memory" "both")
321
                                   (eq_attr "type" "fop")))
322
                         "decoder0,p2+p0,p0+p4+p3")
323
 
324
(define_insn_reservation "ppro_fsgn" 1
325
                         (and (eq_attr "cpu" "pentiumpro,generic32")
326
                              (eq_attr "type" "fsgn"))
327
                         "decodern,p0")
328
 
329
(define_insn_reservation "ppro_fistp" 5
330
                         (and (eq_attr "cpu" "pentiumpro,generic32")
331
                              (eq_attr "type" "fistp"))
332
                         "decoder0,p0*2,p4+p3")
333
 
334
(define_insn_reservation "ppro_fcmov" 2
335
                         (and (eq_attr "cpu" "pentiumpro,generic32")
336
                              (eq_attr "type" "fcmov"))
337
                         "decoder0,p0*2")
338
 
339
(define_insn_reservation "ppro_fcmp" 1
340
                         (and (eq_attr "cpu" "pentiumpro,generic32")
341
                              (and (eq_attr "memory" "none")
342
                                   (eq_attr "type" "fcmp")))
343
                         "decodern,p0")
344
 
345
(define_insn_reservation "ppro_fcmp_load" 4
346
                         (and (eq_attr "cpu" "pentiumpro,generic32")
347
                              (and (eq_attr "memory" "load")
348
                                   (eq_attr "type" "fcmp")))
349
                         "decoder0,p2+p0")
350
 
351
(define_insn_reservation "ppro_fmov" 1
352
                         (and (eq_attr "cpu" "pentiumpro,generic32")
353
                              (and (eq_attr "memory" "none")
354
                                   (eq_attr "type" "fmov")))
355
                         "decodern,p0")
356
 
357
(define_insn_reservation "ppro_fmov_load" 1
358
                         (and (eq_attr "cpu" "pentiumpro,generic32")
359
                              (and (eq_attr "memory" "load")
360
                                   (and (eq_attr "mode" "!XF")
361
                                        (eq_attr "type" "fmov"))))
362
                         "decodern,p2")
363
 
364
(define_insn_reservation "ppro_fmov_XF_load" 3
365
                         (and (eq_attr "cpu" "pentiumpro,generic32")
366
                              (and (eq_attr "memory" "load")
367
                                   (and (eq_attr "mode" "XF")
368
                                        (eq_attr "type" "fmov"))))
369
                         "decoder0,(p2+p0)*2")
370
 
371
(define_insn_reservation "ppro_fmov_store" 1
372
                         (and (eq_attr "cpu" "pentiumpro,generic32")
373
                              (and (eq_attr "memory" "store")
374
                                   (and (eq_attr "mode" "!XF")
375
                                        (eq_attr "type" "fmov"))))
376
                         "decodern,p0")
377
 
378
(define_insn_reservation "ppro_fmov_XF_store" 3
379
                         (and (eq_attr "cpu" "pentiumpro,generic32")
380
                              (and (eq_attr "memory" "store")
381
                                   (and (eq_attr "mode" "XF")
382
                                        (eq_attr "type" "fmov"))))
383
                         "decoder0,(p0+p4),(p0+p3)")
384
 
385
;; fmul executes on port 0 with latency 5.  It has issue latency 2,
386
;; but we don't model this.
387
(define_insn_reservation "ppro_fmul" 5
388
                         (and (eq_attr "cpu" "pentiumpro,generic32")
389
                              (and (eq_attr "memory" "none")
390
                                   (eq_attr "type" "fmul")))
391
                         "decoder0,p0*2")
392
 
393
(define_insn_reservation "ppro_fmul_load" 6
394
                         (and (eq_attr "cpu" "pentiumpro,generic32")
395
                              (and (eq_attr "memory" "load")
396
                                   (eq_attr "type" "fmul")))
397
                         "decoder0,p2+p0,p0")
398
 
399
;; fdiv latencies depend on the mode of the operands.  XFmode gives
400
;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
401
;; Division by a power of 2 takes only 9 cycles, but we cannot model
402
;; that.  Throughput is equal to latency - 1, which we model using the
403
;; ppro_fdiv automaton.
404
(define_insn_reservation "ppro_fdiv_SF" 18
405
                         (and (eq_attr "cpu" "pentiumpro,generic32")
406
                              (and (eq_attr "memory" "none")
407
                                   (and (eq_attr "mode" "SF")
408
                                        (eq_attr "type" "fdiv,fpspc"))))
409
                         "decodern,p0+fdiv,fdiv*16")
410
 
411
(define_insn_reservation "ppro_fdiv_SF_load" 19
412
                         (and (eq_attr "cpu" "pentiumpro,generic32")
413
                              (and (eq_attr "memory" "load")
414
                                   (and (eq_attr "mode" "SF")
415
                                        (eq_attr "type" "fdiv,fpspc"))))
416
                         "decoder0,p2+p0+fdiv,fdiv*16")
417
 
418
(define_insn_reservation "ppro_fdiv_DF" 32
419
                         (and (eq_attr "cpu" "pentiumpro,generic32")
420
                              (and (eq_attr "memory" "none")
421
                                   (and (eq_attr "mode" "DF")
422
                                        (eq_attr "type" "fdiv,fpspc"))))
423
                         "decodern,p0+fdiv,fdiv*30")
424
 
425
(define_insn_reservation "ppro_fdiv_DF_load" 33
426
                         (and (eq_attr "cpu" "pentiumpro,generic32")
427
                              (and (eq_attr "memory" "load")
428
                                   (and (eq_attr "mode" "DF")
429
                                        (eq_attr "type" "fdiv,fpspc"))))
430
                         "decoder0,p2+p0+fdiv,fdiv*30")
431
 
432
(define_insn_reservation "ppro_fdiv_XF" 38
433
                         (and (eq_attr "cpu" "pentiumpro,generic32")
434
                              (and (eq_attr "memory" "none")
435
                                   (and (eq_attr "mode" "XF")
436
                                        (eq_attr "type" "fdiv,fpspc"))))
437
                         "decodern,p0+fdiv,fdiv*36")
438
 
439
(define_insn_reservation "ppro_fdiv_XF_load" 39
440
                         (and (eq_attr "cpu" "pentiumpro,generic32")
441
                              (and (eq_attr "memory" "load")
442
                                   (and (eq_attr "mode" "XF")
443
                                        (eq_attr "type" "fdiv,fpspc"))))
444
                         "decoder0,p2+p0+fdiv,fdiv*36")
445
 
446
;; MMX instructions can execute on either port 0 or port 1 with a
447
;; throughput of 1/cycle.
448
;;   on port 0: - ALU (latency 1)
449
;;              - Multiplier Unit (latency 3)
450
;;   on port 1: - ALU (latency 1)
451
;;              - Shift Unit (latency 1)
452
;;
453
;; MMX instructions are either of the type reg-reg, or read-modify, and
454
;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
455
;; so they behave as "simple" instructions that need no special modelling.
456
;; We only have to model mmxshft and mmxmul.
457
(define_insn_reservation "ppro_mmx_shft" 1
458
                         (and (eq_attr "cpu" "pentiumpro,generic32")
459
                              (and (eq_attr "memory" "none")
460
                                   (eq_attr "type" "mmxshft")))
461
                         "decodern,p1")
462
 
463
;; MMX shift with a memory source operand.  The condition must test
;; for "load": with "none" it duplicated the register-only mmxshft
;; reservation, so this entry could never be selected.  Decodes on
;; decoder0 only and reserves the load port (p2) together with p1.
(define_insn_reservation "ppro_mmx_shft_load" 2
                         (and (eq_attr "cpu" "pentiumpro,generic32")
                              (and (eq_attr "memory" "load")
                                   (eq_attr "type" "mmxshft")))
                         "decoder0,p2+p1")
468
 
469
(define_insn_reservation "ppro_mmx_mul" 3
470
                         (and (eq_attr "cpu" "pentiumpro,generic32")
471
                              (and (eq_attr "memory" "none")
472
                                   (eq_attr "type" "mmxmul")))
473
                         "decodern,p0")
474
 
475
;; MMX multiply with a memory source operand.  The condition must test
;; for "load": with "none" it duplicated the register-only mmxmul
;; reservation, so this entry could never be selected.  Decodes on
;; decoder0 only and reserves the load port (p2) together with p0.
(define_insn_reservation "ppro_mmx_mul_load" 3
                         (and (eq_attr "cpu" "pentiumpro,generic32")
                              (and (eq_attr "memory" "load")
                                   (eq_attr "type" "mmxmul")))
                         "decoder0,p2+p0")
480
 
481
(define_insn_reservation "ppro_sse_mmxcvt" 4
482
                         (and (eq_attr "cpu" "pentiumpro,generic32")
483
                              (and (eq_attr "mode" "DI")
484
                                   (eq_attr "type" "mmxcvt")))
485
                         "decodern,p1")
486
 
487
;; FIXME: These are Pentium III only, but we cannot tell here if
488
;; we're generating code for PentiumPro/Pentium II or Pentium III
489
;; (define_insn_reservation "ppro_sse_mmxshft" 2
490
;;                       (and (eq_attr "cpu" "pentiumpro,generic32")
491
;;                            (and (eq_attr "mode" "DI")
492
;;                                 (eq_attr "type" "mmxshft")))
493
;;                       "decodern,p0")
494
 
495
;; SSE is very complicated, and takes a bit more effort.
496
;; ??? I assumed that all SSE instructions decode on decoder0,
497
;;     but is this correct?
498
 
499
;; The sfence instruction.
500
(define_insn_reservation "ppro_sse_sfence" 3
501
                         (and (eq_attr "cpu" "pentiumpro,generic32")
502
                              (and (eq_attr "memory" "unknown")
503
                                   (eq_attr "type" "sse")))
504
                         "decoder0,p4+p3")
505
 
506
;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
507
(define_insn_reservation "ppro_sse_SF" 3
508
                         (and (eq_attr "cpu" "pentiumpro,generic32")
509
                              (and (eq_attr "mode" "SF")
510
                                   (eq_attr "type" "sse")))
511
                         "decodern,p0")
512
 
513
(define_insn_reservation "ppro_sse_add_SF" 3
514
                         (and (eq_attr "cpu" "pentiumpro,generic32")
515
                              (and (eq_attr "memory" "none")
516
                                   (and (eq_attr "mode" "SF")
517
                                        (eq_attr "type" "sseadd"))))
518
                         "decodern,p1")
519
 
520
(define_insn_reservation "ppro_sse_add_SF_load" 3
521
                         (and (eq_attr "cpu" "pentiumpro,generic32")
522
                              (and (eq_attr "memory" "load")
523
                                   (and (eq_attr "mode" "SF")
524
                                        (eq_attr "type" "sseadd"))))
525
                         "decoder0,p2+p1")
526
 
527
(define_insn_reservation "ppro_sse_cmp_SF" 3
528
                         (and (eq_attr "cpu" "pentiumpro,generic32")
529
                              (and (eq_attr "memory" "none")
530
                                   (and (eq_attr "mode" "SF")
531
                                        (eq_attr "type" "ssecmp"))))
532
                         "decoder0,p1")
533
 
534
(define_insn_reservation "ppro_sse_cmp_SF_load" 3
535
                         (and (eq_attr "cpu" "pentiumpro,generic32")
536
                              (and (eq_attr "memory" "load")
537
                                   (and (eq_attr "mode" "SF")
538
                                        (eq_attr "type" "ssecmp"))))
539
                         "decoder0,p2+p1")
540
 
541
(define_insn_reservation "ppro_sse_comi_SF" 1
542
                         (and (eq_attr "cpu" "pentiumpro,generic32")
543
                              (and (eq_attr "memory" "none")
544
                                   (and (eq_attr "mode" "SF")
545
                                        (eq_attr "type" "ssecomi"))))
546
                         "decodern,p0")
547
 
548
(define_insn_reservation "ppro_sse_comi_SF_load" 1
549
                         (and (eq_attr "cpu" "pentiumpro,generic32")
550
                              (and (eq_attr "memory" "load")
551
                                   (and (eq_attr "mode" "SF")
552
                                        (eq_attr "type" "ssecomi"))))
553
                         "decoder0,p2+p0")
554
 
555
(define_insn_reservation "ppro_sse_mul_SF" 4
556
                         (and (eq_attr "cpu" "pentiumpro,generic32")
557
                              (and (eq_attr "memory" "none")
558
                                   (and (eq_attr "mode" "SF")
559
                                        (eq_attr "type" "ssemul"))))
560
                        "decodern,p0")
561
 
562
(define_insn_reservation "ppro_sse_mul_SF_load" 4
563
                         (and (eq_attr "cpu" "pentiumpro,generic32")
564
                              (and (eq_attr "memory" "load")
565
                                   (and (eq_attr "mode" "SF")
566
                                        (eq_attr "type" "ssemul"))))
567
                        "decoder0,p2+p0")
568
 
569
;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
570
(define_insn_reservation "ppro_sse_div_SF" 18
571
                         (and (eq_attr "cpu" "pentiumpro,generic32")
572
                              (and (eq_attr "memory" "none")
573
                                   (and (eq_attr "mode" "SF")
574
                                        (eq_attr "type" "ssediv"))))
575
                         "decoder0,p0*17")
576
 
577
;; SFmode SSE divide with a memory source operand.  The condition must
;; test for "load": with "none" it duplicated the register-only ssediv
;; reservation, so this entry could never be selected.  The first
;; cycle reserves the load port (p2) together with p0; p0 then stays
;; reserved for 16 more cycles (17 in total).
(define_insn_reservation "ppro_sse_div_SF_load" 18
                         (and (eq_attr "cpu" "pentiumpro,generic32")
                              (and (eq_attr "memory" "load")
                                   (and (eq_attr "mode" "SF")
                                        (eq_attr "type" "ssediv"))))
                         "decoder0,(p2+p0),p0*16")
583
 
584
(define_insn_reservation "ppro_sse_icvt_SF" 4
585
                         (and (eq_attr "cpu" "pentiumpro,generic32")
586
                              (and (eq_attr "mode" "SF")
587
                                   (eq_attr "type" "sseicvt")))
588
                         "decoder0,(p2+p1)*2")
589
 
590
(define_insn_reservation "ppro_sse_icvt_SI" 3
591
                         (and (eq_attr "cpu" "pentiumpro,generic32")
592
                              (and (eq_attr "mode" "SI")
593
                                   (eq_attr "type" "sseicvt")))
594
                         "decoder0,(p2+p1)")
595
 
596
(define_insn_reservation "ppro_sse_mov_SF" 3
597
                         (and (eq_attr "cpu" "pentiumpro,generic32")
598
                              (and (eq_attr "memory" "none")
599
                                   (and (eq_attr "mode" "SF")
600
                                        (eq_attr "type" "ssemov"))))
601
                         "decoder0,(p0|p1)")
602
 
603
;; SF-mode "ssemov" insns whose memory attribute is "load".  Default
;; latency 3.  Decoded on decoder0, then p2 is reserved in the same
;; cycle as one of p0 or p1.
(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2+(p0|p1)")
609
 
610
;; SF-mode "ssemov" insns whose memory attribute is "store".  Default
;; latency 3.  Decoded on decoder0, then p4 and p3 are reserved
;; together for one cycle.
(define_insn_reservation "ppro_sse_mov_SF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p4+p3")
616
 
617
;; V4SF-mode insns of the generic "sse" type, regardless of the memory
;; attribute.  Default latency 4.  Decoded on decoder0, then p1 is
;; reserved for two cycles.
(define_insn_reservation "ppro_sse_V4SF" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "V4SF")
            (eq_attr "type" "sse")))
  "decoder0,p1*2")
622
 
623
;; V4SF-mode "sseadd" insns with no memory operand.  Default latency 3.
;; Decoded on decoder0, then p1 is reserved for two cycles.
(define_insn_reservation "ppro_sse_add_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,p1*2")
629
 
630
;; V4SF-mode "sseadd" insns whose memory attribute is "load".  Default
;; latency 3.  Decoded on decoder0, then p2 and p1 are reserved
;; together for two cycles.
(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,(p2+p1)*2")
636
 
637
;; V4SF-mode "ssecmp" insns with no memory operand.  Default latency 3.
;; Decoded on decoder0, then p1 is reserved for two cycles.
(define_insn_reservation "ppro_sse_cmp_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1*2")
643
 
644
;; V4SF-mode "ssecmp" insns whose memory attribute is "load".  Default
;; latency 3.  Decoded on decoder0, then p2 and p1 are reserved
;; together for two cycles.
(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,(p2+p1)*2")
650
 
651
;; V4SF-mode "ssecvt" insns whose memory attribute is "none" or
;; "unknown".  Default latency 3.  Decoded on decoder0, then p1 is
;; reserved for two cycles.
(define_insn_reservation "ppro_sse_cvt_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1*2")
657
 
658
;; V4SF-mode "ssecvt" insns whose memory attribute is anything other
;; than "none" or "unknown" (the leading '!' negates the whole list).
;; This is the complement of ppro_sse_cvt_V4SF.  Default latency 4.
;; Decoded on decoder0, then p1 for one cycle, then p4 and p3 together.
;;
;; The type test must be "ssecvt", not "ssecmp": V4SF "ssecmp" insns
;; are already described by ppro_sse_cmp_V4SF and
;; ppro_sse_cmp_V4SF_load, whereas "ssecvt" insns with a memory operand
;; would otherwise have no reservation.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1,p4+p3")
664
 
665
;; V4SF-mode "ssemul" insns with no memory operand.  Default latency 5.
;; Decoded on decoder0, then p0 is reserved for two cycles.
(define_insn_reservation "ppro_sse_mul_V4SF" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p0*2")
671
 
672
;; V4SF-mode "ssemul" insns whose memory attribute is "load".  Default
;; latency 5.  Decoded on decoder0, then p2 and p0 are reserved
;; together for two cycles.
(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,(p2+p0)*2")
678
 
679
;; FIXME: p0 really closed this long???
680
;; V4SF-mode "ssediv" insns with no memory operand.  Default latency 48.
;; Decoded on decoder0, then p0 is reserved for 34 consecutive cycles.
(define_insn_reservation "ppro_sse_div_V4SF" 48
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*34")
686
 
687
;; V4SF-mode "ssediv" insns whose memory attribute is "load".  Default
;; latency 48.  Decoded on decoder0; p2 and p0 are then reserved
;; together for two cycles, followed by p0 alone for 32 more cycles
;; (34 cycles of p0 in total, as in ppro_sse_div_V4SF).
(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0)*2,p0*32")
693
 
694
;; V4SF-mode "sselog" and "sselog1" insns with no memory operand.
;; Default latency 2.  Uses the "decodern" reservation for decode (not
;; decoder0 as the neighbouring V4SF reservations do), then p1 for one
;; cycle.
(define_insn_reservation "ppro_sse_log_V4SF" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decodern,p1")
700
 
701
;; V4SF-mode "sselog" and "sselog1" insns whose memory attribute is
;; "load".  Default latency 2.  Decoded on decoder0, then p2 and p1 are
;; reserved together for one cycle.
(define_insn_reservation "ppro_sse_log_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decoder0,(p2+p1)")
707
 
708
;; V4SF-mode "ssemov" insns with no memory operand.  Default latency 1.
;; Decoded on decoder0, then for each of two cycles either p0 or p1 is
;; reserved.
(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)*2")
714
 
715
;; V4SF-mode "ssemov" insns whose memory attribute is "load".  Default
;; latency 2.  Decoded on decoder0, then p2 is reserved for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2*2")
721
 
722
;; V4SF-mode "ssemov" insns whose memory attribute is "store".  Default
;; latency 3.  Decoded on decoder0, then p4 and p3 are reserved
;; together for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p4+p3)*2")
728
 
729
;; All other instructions are modelled as simple instructions.
730
;; We have already modelled all i387 floating point instructions, so all
731
;; other instructions execute on either port 0 or port 1.  This includes
732
;; the ALU units, and the MMX units.
733
;;
734
;; reg-reg instructions produce 1 uop so they can be decoded on any of
735
;; the three decoders.
736
;; Simple insns of the listed types whose memory attribute is "none" or
;; "unknown".  Default latency 1.  Uses the "decodern" reservation for
;; decode, then executes on either p0 or p1.
(define_insn_reservation "ppro_insn" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decodern,(p0|p1)")
741
 
742
;; read-modify and register-memory instructions have two or three uops,
743
;; so they have to be decoded on decoder0.
744
;; Simple insns of the listed types whose memory attribute is "load".
;; Default latency 3.  Decoded on decoder0, then p2 is reserved in the
;; same cycle as one of p0 or p1.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1)")
749
 
750
;; Simple insns of the listed types whose memory attribute is "store".
;; Default latency 1.  Decoded on decoder0, then one of p0 or p1 for a
;; cycle, then p4 and p3 together on the cycle after that.
(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,(p0|p1),p4+p3")
755
 
756
;; read-modify-store instructions produce 4 uops so they have to be
757
;; decoded on decoder0 as well.
758
;; Simple insns of the listed types whose memory attribute is "both".
;; Default latency 4.  Decoded on decoder0, then p2 together with one
;; of p0 or p1, then p4 and p3 together on the following cycle.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1),p4+p3")
763
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.