OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [rs6000/] [cell.md] - Blame information for rev 801

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 709 jeremybenn
;; Scheduling description for cell processor.
2
;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
3
;; Free Software Foundation, Inc.
4
;; Contributed by Sony Computer Entertainment, Inc.,
5
 
6
 
7
;; This file is free software; you can redistribute it and/or modify it under
8
;; the terms of the GNU General Public License as published by the Free
9
;; Software Foundation; either version 3 of the License, or (at your option)
10
;; any later version.
11
 
12
;; This file is distributed in the hope that it will be useful, but WITHOUT
13
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15
;; for more details.
16
 
17
;; You should have received a copy of the GNU General Public License
18
;; along with GCC; see the file COPYING3.  If not see
19
;; <http://www.gnu.org/licenses/>.
20
 
21
;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
22
 
23
;; BE Architecture *DD3.0 and DD3.1*
24
;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
25
;; manual P27, stall and flush points
26
;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
27
;;  order, the grouped address are aligned by 8
28
;; This file only simulates the single-thread situation
29
;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
30
;;   and load/store unit)
31
;; VSU executes all scalar floating points insn(a float unit),
32
;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
33
 
34
;; Dual issue combination
35
 
36
;;      FXU     LSU     BR              VMX                    VMX
37
;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
38
;;FXU   X
39
;;LSU           X                       X                       X
40
;;BR                    X
41
;;VMX(sx,cx,vsu_fp,fp_arith)            X
42
;;VMX(perm,vsu_ls, fp_ls)                                       X
43
;;    X marks an illegal combination.
44
 
45
;; Dual issue exceptions:
46
;;(1) non-pipelined FXU instr in slot 0
47
;;(2) non-pipelined FPU inst in slot 0
48
;; CSI instr(context-synchronizing insn)
49
;; Microcode insn
50
 
51
;; BRU unit: bru(none register stall), bru_cr(cr register stall)
52
;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
53
;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
54
;;  nonpipelined simulation
55
;; microcoded insns will stall at least 7 cycles to get the first instr from ROM,
56
;;  micro instructions are not dual issued.
57
 
58
;; slot0 is older than slot1
59
;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
60
 
61
;; There are different stall points:
62
;; IB2, only stall one thread if stall here, so try to stall here as much as
63
;; we can
64
;; condition(1) insert nop, OR and ORI instruction form
65
;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
66
;;   CR0-access while stdcx, or stwcx
67
;; IS2 stall ;; Page91 for details
68
;; VQ8 stall
69
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
70
;;  the vsu issue queue
71
 
72
;;(define_automaton "cellxu")
73
 
74
;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
75
 
76
;; ndfa
77
;; Declare four separate automata.  cellxu, cellvsu and cellbru hold the
;; execution units declared below; cell_mis holds the two issue slots.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")
78
 
79
;; Execution units and the automaton each one belongs to:
;;   fxu_cell, lsu_cell    -> cellxu
;;   bru_cell              -> cellbru
;;   vsu1_cell, vsu2_cell  -> cellvsu
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
80
(define_cpu_unit "bru_cell" "cellbru")
81
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
82
 
83
;; The two issue slots of a dispatch group, kept in their own automaton.
;; As noted in the header, slot0 is older than slot1.
(define_cpu_unit "slot0,slot1" "cell_mis")
84
 
85
;; slot0 may only be reserved while slot1 is not reserved, so within one
;; cycle slot0 has to be filled before slot1.
(absence_set "slot0" "slot1")
86
 
87
;; "nonpipeline" reserves the FXU, the LSU and both VSU units in the same
;; cycle (bru_cell is not part of it); the long-latency reservations below
;; repeat it to keep those units busy.
;; "slot01" means either issue slot.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
88
(define_reservation "slot01" "slot0|slot1")
89
 
90
 
91
;; Load/store
92
;; lmw, lswi, lswx are only generated for optimize for space, MC,
93
;;   these instr are not simulated
94
;; Type "load": latency 2.  Cycle 1 takes either issue slot, cycle 2 the LSU.
(define_insn_reservation "cell-load" 2
95
  (and (eq_attr "type" "load")
96
       (eq_attr "cpu" "cell"))
97
  "slot01,lsu_cell")
98
 
99
;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
100
;;  if with 32bytes alignment, CMC
101
;; Types "load_ux" and "load_u": latency 2.  Cycle 1 takes either issue
;; slot; cycle 2 reserves the FXU and the LSU together.
(define_insn_reservation "cell-load-ux" 2
102
  (and (eq_attr "type" "load_ux,load_u")
103
       (eq_attr "cpu" "cell"))
104
  "slot01,fxu_cell+lsu_cell")
105
 
106
;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
107
;;   11/7, 11/8, 11/12
108
;; Types "load_ext", "load_ext_u" and "load_ext_ux": modelled with latency 2
;; (the note above records the real latency as unknown).  Cycle 1 takes
;; either issue slot; cycle 2 reserves the FXU and the LSU together.
(define_insn_reservation "cell-load-ext" 2
109
  (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
110
       (eq_attr "cpu" "cell"))
111
  "slot01,fxu_cell+lsu_cell")
112
 
113
;;lfs,lfsx,lfd,lfdx, 1 cycle
114
;; Type "fpload": latency 1.  vsu2_cell, the LSU and either issue slot are
;; all reserved in the same single cycle.
(define_insn_reservation "cell-fpload" 1
115
  (and (eq_attr "type" "fpload")
116
       (eq_attr "cpu" "cell"))
117
  "vsu2_cell+lsu_cell+slot01")
118
 
119
;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)
120
;; Only the update forms ("fpload_u", "fpload_ux") belong here.  Plain
;; "fpload" is already claimed by "cell-fpload"; listing it here as well made
;; two reservations match the same insn, in which case the reservation that
;; gets used is not defined.
;; Latency 1.  The FXU, vsu2_cell, the LSU and either issue slot are all
;; reserved in the same single cycle.
(define_insn_reservation "cell-fpload-update" 1
121
  (and (eq_attr "type" "fpload_u,fpload_ux")
122
       (eq_attr "cpu" "cell"))
123
  "fxu_cell+vsu2_cell+lsu_cell+slot01")
124
 
125
;; Type "vecload": latency 2.  Cycle 1 takes either issue slot; cycle 2
;; reserves vsu2_cell and the LSU together.
(define_insn_reservation "cell-vecload" 2
126
  (and (eq_attr "type" "vecload")
127
       (eq_attr "cpu" "cell"))
128
  "slot01,vsu2_cell+lsu_cell")
129
 
130
;;st? stw(MC)
131
;; Type "store": latency 1.  The LSU and either issue slot are reserved in
;; the same single cycle.
(define_insn_reservation "cell-store" 1
132
  (and (eq_attr "type" "store")
133
       (eq_attr "cpu" "cell"))
134
  "lsu_cell+slot01")
135
 
136
;;stdux, stdu, (hardware breaks into store and add) 2 for update reg
137
;; Types "store_ux" and "store_u": latency 1.  The FXU, the LSU and either
;; issue slot are reserved in the same single cycle (the FXU presumably
;; models the "add" half mentioned in the note above).
(define_insn_reservation "cell-store-update" 1
138
  (and (eq_attr "type" "store_ux,store_u")
139
       (eq_attr "cpu" "cell"))
140
  "fxu_cell+lsu_cell+slot01")
141
 
142
;; Type "fpstore": latency 1.  vsu2_cell, the LSU and either issue slot are
;; reserved in the same single cycle.
(define_insn_reservation "cell-fpstore" 1
143
  (and (eq_attr "type" "fpstore")
144
       (eq_attr "cpu" "cell"))
145
  "vsu2_cell+lsu_cell+slot01")
146
 
147
;; Types "fpstore_ux" and "fpstore_u": latency 1.  vsu2_cell, the FXU, the
;; LSU and either issue slot are reserved in the same single cycle.
(define_insn_reservation "cell-fpstore-update" 1
148
  (and (eq_attr "type" "fpstore_ux,fpstore_u")
149
       (eq_attr "cpu" "cell"))
150
  "vsu2_cell+fxu_cell+lsu_cell+slot01")
151
 
152
;; Type "vecstore": latency 1.  vsu2_cell, the LSU and either issue slot are
;; reserved in the same single cycle.
(define_insn_reservation "cell-vecstore" 1
153
  (and (eq_attr "type" "vecstore")
154
       (eq_attr "cpu" "cell"))
155
  "vsu2_cell+lsu_cell+slot01")
156
 
157
;; Integer latency is 2 cycles
158
;; Simple fixed-point types (the list is wrapped with a backslash-newline):
;; latency 2.  Cycle 1 takes either issue slot, cycle 2 the FXU.
(define_insn_reservation "cell-integer" 2
159
  (and (eq_attr "type" "integer,insert_dword,shift,trap,\
160
                        var_shift_rotate,cntlz,exts,isel")
161
       (eq_attr "cpu" "cell"))
162
  "slot01,fxu_cell")
163
 
164
;; Two integer latency is 4 cycles
165
;; Type "two": latency 4.  Cycle 1 takes either issue slot, then the FXU is
;; reserved for the following 3 cycles (1 + 2).
(define_insn_reservation "cell-two" 4
166
  (and (eq_attr "type" "two")
167
       (eq_attr "cpu" "cell"))
168
  "slot01,fxu_cell,fxu_cell*2")
169
 
170
;; Three integer latency is 6 cycles
171
;; Type "three": latency 6.  Cycle 1 takes either issue slot, then the FXU is
;; reserved for the following 5 cycles (1 + 4).
(define_insn_reservation "cell-three" 6
172
  (and (eq_attr "type" "three")
173
       (eq_attr "cpu" "cell"))
174
  "slot01,fxu_cell,fxu_cell*4")
175
 
176
;; rlwimi, alter cr0
177
;; Type "insert_word": latency 2.  Cycle 1 takes either issue slot, cycle 2
;; the FXU (same shape as "cell-integer").
(define_insn_reservation "cell-insert" 2
178
  (and (eq_attr "type" "insert_word")
179
       (eq_attr "cpu" "cell"))
180
 "slot01,fxu_cell")
181
 
182
;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
183
;; Type "cmp": latency 1.  The FXU and either issue slot are reserved in the
;; same single cycle.
(define_insn_reservation "cell-cmp" 1
184
  (and (eq_attr "type" "cmp")
185
       (eq_attr "cpu" "cell"))
186
  "fxu_cell+slot01")
187
 
188
;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
189
;; Compare-setting types whose cell_micro attribute is "not": latency 2.
;; Cycle 1 takes either issue slot, cycle 2 the FXU.
;; NOTE(review): together with "cell-cmp-microcoded" only the cell_micro
;; values "not" and "always" are covered; an insn of these types with any
;; other cell_micro value matches neither reservation -- confirm intended.
(define_insn_reservation "cell-fast-cmp" 2
190
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
191
                            var_delayed_compare")
192
            (eq_attr "cpu" "cell"))
193
        (eq_attr "cell_micro" "not"))
194
  "slot01,fxu_cell")
195
 
196
;; The same compare-setting types when cell_micro is "always": latency 9.
;; Cycle 1 reserves BOTH issue slots (slot0+slot1), so nothing is dual
;; issued with it (the header notes micro instructions are not dual issued);
;; the FXU is then reserved for the following 8 cycles (1 + 7).
(define_insn_reservation "cell-cmp-microcoded" 9
197
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
198
                            var_delayed_compare")
199
            (eq_attr "cpu" "cell"))
200
        (eq_attr "cell_micro" "always"))
201
  "slot0+slot1,fxu_cell,fxu_cell*7")
202
 
203
;; mulld
204
;; Type "lmul": latency 15.  Issued in slot1 only (the header notes that
;; non-pipelined insns need slot1 to avoid a 1-cycle stall), then
;; "nonpipeline" is reserved for the following 14 cycles (1 + 13).
(define_insn_reservation "cell-lmul" 15
205
  (and (eq_attr "type" "lmul")
206
       (eq_attr "cpu" "cell"))
207
  "slot1,nonpipeline,nonpipeline*13")
208
 
209
;; mulld. is microcoded
210
;; Type "lmul_compare": latency 22.  Cycle 1 reserves both issue slots, then
;; "nonpipeline" is reserved for the following 21 cycles (1 + 20).
(define_insn_reservation "cell-lmul-cmp" 22
211
  (and (eq_attr "type" "lmul_compare")
212
       (eq_attr "cpu" "cell"))
213
  "slot0+slot1,nonpipeline,nonpipeline*20")
214
 
215
;; mulli, 6 cycles
216
;; Types "imul2" and "imul3": latency 6.  Issued in slot1 only, then
;; "nonpipeline" is reserved for the following 5 cycles (1 + 4).
(define_insn_reservation "cell-imul23" 6
217
  (and (eq_attr "type" "imul2,imul3")
218
       (eq_attr "cpu" "cell"))
219
  "slot1,nonpipeline,nonpipeline*4")
220
 
221
;; mullw, 9
222
;; Type "imul": latency 9.  Issued in slot1 only, then "nonpipeline" is
;; reserved for the following 8 cycles (1 + 7).
(define_insn_reservation "cell-imul" 9
223
  (and (eq_attr "type" "imul")
224
       (eq_attr "cpu" "cell"))
225
  "slot1,nonpipeline,nonpipeline*7")
226
 
227
;; divide
228
;; Type "idiv": latency 32.  Issued in slot1 only, then "nonpipeline" is
;; reserved for the following 31 cycles (1 + 30).
(define_insn_reservation "cell-idiv" 32
229
  (and (eq_attr "type" "idiv")
230
       (eq_attr "cpu" "cell"))
231
  "slot1,nonpipeline,nonpipeline*30")
232
 
233
;; Type "ldiv": latency 64.  Issued in slot1 only, then "nonpipeline" is
;; reserved for the following 63 cycles (1 + 62).
(define_insn_reservation "cell-ldiv" 64
234
  (and (eq_attr "type" "ldiv")
235
       (eq_attr "cpu" "cell"))
236
  "slot1,nonpipeline,nonpipeline*62")
237
 
238
;;mflr and mfctr are pipelined
239
;; Type "mfjmpr": latency 1.  Either issue slot and the BRU are reserved in
;; the same single cycle.
(define_insn_reservation "cell-mfjmpr" 1
240
  (and (eq_attr "type" "mfjmpr")
241
       (eq_attr "cpu" "cell"))
242
  "slot01+bru_cell")
243
 
244
;;mtlr and mtctr,
245
;;mtspr fully pipelined
246
;; Type "mtjmpr": default latency 1 (a bypass later in this file raises it
;; for "cell-branchreg" consumers).  The BRU and either issue slot are
;; reserved in the same single cycle.
(define_insn_reservation "cell-mtjmpr" 1
247
 (and (eq_attr "type" "mtjmpr")
248
       (eq_attr "cpu" "cell"))
249
  "bru_cell+slot01")
250
 
251
;; Branches
252
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
253
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
254
;; Type "branch": latency 1.  The BRU and slot1 are reserved in the same
;; cycle; note that only slot1 is accepted here, not slot0.
(define_insn_reservation "cell-branch" 1
255
  (and (eq_attr "type" "branch")
256
       (eq_attr "cpu" "cell"))
257
  "bru_cell+slot1")
258
 
259
;; Type "jmpreg": latency 1.  Same shape as "cell-branch": the BRU and slot1
;; (only slot1) are reserved in the same cycle.
(define_insn_reservation "cell-branchreg" 1
260
  (and (eq_attr "type" "jmpreg")
261
       (eq_attr "cpu" "cell"))
262
  "bru_cell+slot1")
263
 
264
;; cr hazard
265
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
266
;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
267
;; Types "cr_logical" and "delayed_cr": latency 1.  The BRU and either issue
;; slot are reserved in the same single cycle.
(define_insn_reservation "cell-crlogical" 1
268
  (and (eq_attr "type" "cr_logical,delayed_cr")
269
       (eq_attr "cpu" "cell"))
270
  "bru_cell+slot01")
271
 
272
;; mfcrf and mfcr is about 34 cycles and nonpipelined
273
;; Types "mfcrf" and "mfcr": latency 34.  Issued in slot1 only, then
;; "nonpipeline" is reserved for the following 33 cycles (1 + 32).
(define_insn_reservation "cell-mfcr" 34
274
  (and (eq_attr "type" "mfcrf,mfcr")
275
       (eq_attr "cpu" "cell"))
276
   "slot1,nonpipeline,nonpipeline*32")
277
 
278
;; mtcrf (1 field)
279
;; Type "mtcr": latency 1.  The FXU and either issue slot are reserved in
;; the same single cycle.
(define_insn_reservation "cell-mtcrf" 1
280
  (and (eq_attr "type" "mtcr")
281
       (eq_attr "cpu" "cell"))
282
  "fxu_cell+slot01")
283
 
284
;; Basic FP latency is 10 cycles, throughput is 1/cycle
;; NOTE(review): the reservation below takes either issue slot in cycle 1 and
;; then keeps vsu1_cell reserved for the following 9 cycles (1 + 8), which
;; does not look like 1/cycle throughput on that unit -- confirm intent.
285
(define_insn_reservation "cell-fp" 10
286
  (and (eq_attr "type" "fp,dmul")
287
       (eq_attr "cpu" "cell"))
288
  "slot01,vsu1_cell,vsu1_cell*8")
289
 
290
;; Type "fpcompare": default latency 1 (a bypass later in this file raises
;; it for CR consumers).  vsu1_cell and either issue slot are reserved in
;; the same single cycle.
(define_insn_reservation "cell-fpcompare" 1
291
  (and (eq_attr "type" "fpcompare")
292
       (eq_attr "cpu" "cell"))
293
  "vsu1_cell+slot01")
294
 
295
;; sdiv throughput 1/74, not pipelined but only in the FPU
;; Types "sdiv" and "ddiv": latency 74.  Issued in slot1 only, then
;; "nonpipeline" is reserved for the following 73 cycles (1 + 72).
;; NOTE(review): "nonpipeline" also holds the FXU and LSU, which is stricter
;; than "only in the FPU" above -- confirm this is the intended model.
296
(define_insn_reservation "cell-sdiv" 74
297
  (and (eq_attr "type" "sdiv,ddiv")
298
       (eq_attr "cpu" "cell"))
299
  "slot1,nonpipeline,nonpipeline*72")
300
 
301
;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; Types "ssqrt" and "dsqrt": latency 84.  Issued in slot1 only, then
;; "nonpipeline" is reserved for the following 83 cycles (1 + 82).
;; NOTE(review): "nonpipeline" also holds the FXU and LSU, which is stricter
;; than "only in the FPU" above -- confirm this is the intended model.
302
(define_insn_reservation "cell-sqrt" 84
303
  (and (eq_attr "type" "ssqrt,dsqrt")
304
       (eq_attr "cpu" "cell"))
305
  "slot1,nonpipeline,nonpipeline*82")
306
 
307
; VMX
308
;; Type "vecsimple": latency 4.  Cycle 1 takes either issue slot, then
;; vsu1_cell is reserved for the following 3 cycles (1 + 2).
(define_insn_reservation "cell-vecsimple" 4
309
  (and (eq_attr "type" "vecsimple")
310
       (eq_attr "cpu" "cell"))
311
  "slot01,vsu1_cell,vsu1_cell*2")
312
 
313
;; mult, div, madd
314
;; Type "veccomplex": latency 10.  Cycle 1 takes either issue slot, then
;; vsu1_cell is reserved for the following 9 cycles (1 + 8).
(define_insn_reservation "cell-veccomplex" 10
315
  (and (eq_attr "type" "veccomplex")
316
       (eq_attr "cpu" "cell"))
317
  "slot01,vsu1_cell,vsu1_cell*8")
318
 
319
;; TODO: add support for recording instructions
320
;; Type "veccmp": latency 4.  Same shape as "cell-vecsimple": either issue
;; slot in cycle 1, then vsu1_cell for the following 3 cycles (1 + 2).
(define_insn_reservation "cell-veccmp" 4
321
  (and (eq_attr "type" "veccmp")
322
       (eq_attr "cpu" "cell"))
323
  "slot01,vsu1_cell,vsu1_cell*2")
324
 
325
;; Type "vecfloat": latency 12.  Cycle 1 takes either issue slot, then
;; vsu1_cell is reserved for the following 11 cycles (1 + 10).
(define_insn_reservation "cell-vecfloat" 12
326
  (and (eq_attr "type" "vecfloat")
327
       (eq_attr "cpu" "cell"))
328
  "slot01,vsu1_cell,vsu1_cell*10")
329
 
330
;; Type "vecperm": latency 4.  Cycle 1 takes either issue slot, then
;; vsu2_cell (not vsu1_cell) is reserved for the following 3 cycles (1 + 2).
(define_insn_reservation "cell-vecperm" 4
331
  (and (eq_attr "type" "vecperm")
332
       (eq_attr "cpu" "cell"))
333
  "slot01,vsu2_cell,vsu2_cell*2")
334
 
335
;; New for 4.2, syncs
336
 
337
;; Type "sync": latency 11.  Cycle 1 takes either issue slot, then the LSU
;; is reserved for the following 10 cycles (1 + 9).
(define_insn_reservation "cell-sync" 11
338
  (and (eq_attr "type" "sync")
339
       (eq_attr "cpu" "cell"))
340
  "slot01,lsu_cell,lsu_cell*9")
341
 
342
;; Type "isync": latency 11.  Same shape as "cell-sync": either issue slot
;; in cycle 1, then the LSU for the following 10 cycles (1 + 9).
(define_insn_reservation "cell-isync" 11
343
  (and (eq_attr "type" "isync")
344
       (eq_attr "cpu" "cell"))
345
  "slot01,lsu_cell,lsu_cell*9")
346
 
347
;; Type "load_l": latency 11.  Same shape as "cell-sync": either issue slot
;; in cycle 1, then the LSU for the following 10 cycles (1 + 9).
(define_insn_reservation "cell-load_l" 11
348
  (and (eq_attr "type" "load_l")
349
       (eq_attr "cpu" "cell"))
350
  "slot01,lsu_cell,lsu_cell*9")
351
 
352
;; Type "store_c": latency 11.  Same shape as "cell-sync": either issue slot
;; in cycle 1, then the LSU for the following 10 cycles (1 + 9).
(define_insn_reservation "cell-store_c" 11
353
  (and (eq_attr "type" "store_c")
354
       (eq_attr "cpu" "cell"))
355
  "slot01,lsu_cell,lsu_cell*9")
356
 
357
;; RAW register dependency
358
 
359
;; addi r3, r3, 1
360
;; lw r4,offset(r3)
361
;; there is a 5 cycle delay for r3 bypassing
362
;; there is a 5 cycle delay for a dependent load after a load
363
;; Each bypass below replaces the producer's default latency of 2 with 5 for
;; the listed consumers.
;; NOTE(review): "cell-load-ux" appears in none of these bypasses, neither
;; as producer nor as consumer -- confirm that is intended.
(define_bypass 5 "cell-integer" "cell-load")
364
(define_bypass 5 "cell-integer" "cell-load-ext")
365
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
366
 
367
;; there is a 6 cycle delay after a fp compare until you can use the cr.
368
;; Replaces "cell-fpcompare"'s default latency of 1 with 6 when the consumer
;; is a branch, a branch-to-register, mfcr/mfcrf or a CR-logical insn.
(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
369
 
370
;; VXU float RAW
371
;; vecfloat feeding vecfloat: latency 11 instead of the default 12.
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")
372
 
373
;; VXU and FPU
374
;; veccomplex feeding vecsimple: latency 6 instead of the default 10.
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
375
;; Disabled.  Note that no reservation named "cell-veccompare" exists in this
;; file (the vector compare reservation is "cell-veccmp"):
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
376
;; vecfloat feeding veccomplex: latency 3 instead of the default 12.
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
377
; this is not correct,
378
;;  this is a stall in general and not dependent on result
379
;; vecstore followed by a dependent fpstore: latency 13 instead of
;; "cell-vecstore"'s default 1.
(define_bypass 13 "cell-vecstore" "cell-fpstore")
380
; this is not correct, this can never be true, not dependent on result
381
;; fp/dmul followed by a dependent fpload: latency 7 instead of "cell-fp"'s
;; default 10.
(define_bypass 7 "cell-fp" "cell-fpload")
382
;; vsu1 should avoid writing to the same target register as vsu2 insn
383
;;   within 12 cycles.
384
 
385
;; WAW hazard
386
 
387
;; the target of VSU estimate should not be reused within 10 dispatch groups
388
;; the target of VSU float should not be reused within 8 dispatch groups
389
;; the target of VSU complex should not be reused within 5 dispatch groups
390
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
391
 
392
;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
393
;;  ex4 stage(10 cycles)
394
;; mtjmpr feeding a branch-to-register: latency 10 instead of
;; "cell-mtjmpr"'s default 1.
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
395
 
396
;;Things are not simulated:
397
;; update instruction, update address gpr are not simulated
398
;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
399
;;  insns
400
 

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.