URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [rs6000/] [cell.md] - Blame information for rev 801

Go to most recent revision | Details | Compare with Previous | View Log


;; Scheduling description for cell processor.
;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
;; Free Software Foundation, Inc.
;; Contributed by Sony Computer Entertainment, Inc.,
 
 
;; This file is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3 of the License, or (at your option)
;; any later version.
 
;; This file is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.
 
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.
 
;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
 
;; BE Architecture *DD3.0 and DD3.1*
;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
;; manual P27, stall and flush points
;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
;;  order, the grouped address are aligned by 8
;; This file only simulates the single-thread situation
;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
;;   and load/store unit)
;; VSU executes all scalar floating points insn(a float unit),
;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
 
;; Dual issue combination
 
;;      FXU     LSU     BR              VMX                    VMX
;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
;;FXU   X
;;LSU           X                       X                       X
;;BR                    X
;;VMX(sx,cx,vsu_fp,fp_arth)             X
;;VMX(perm,vsu_ls, fp_ls)                                       X
;;    X are illegal combination.
 
;; Dual issue exceptions:
;;(1) non-pipelined FXU instr in slot 0
;;(2) non-pipelined FPU inst in slot 0
;; CSI instr (context-synchronizing insn)
;; Microcode insn
 
;; BRU unit: bru(none register stall), bru_cr(cr register stall)
;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
;;  nonpipelined simulation
;; micro insns will stall at least 7 cycles to get the first instr from ROM,
;;  micro instructions are not dual issued.
 
;; slot0 is older than slot1
;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
 
;; There are different stall points
;; IB2, only stall one thread if stall here, so try to stall here as much as
;; we can
;; condition(1) insert nop, OR and ORI instruction form
;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;   CR0-access while stdcx, or stwcx
;; IS2 stall ;; Page91 for details
;; VQ8 stall
;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
;;  the vsu issue queue
 
;;(define_automaton "cellxu")
 
;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
 
;; ndfa
;; Four automata are declared; each CPU unit below is assigned to exactly
;; one of them.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Execution units.  In the reservations of this file vsu1_cell is used by
;; the fp, fpcompare, vecsimple, veccomplex, veccmp and vecfloat types, and
;; vsu2_cell by vecperm and by the fp/vector loads and stores.
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
(define_cpu_unit "bru_cell" "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two issue slots (slot0 is older than slot1).
(define_cpu_unit "slot0,slot1" "cell_mis")

;; slot0 can be reserved only while slot1 is not reserved.
(absence_set "slot0" "slot1")

;; "nonpipeline" reserves both XU units and both VSU units in the same
;; cycle; "slot01" accepts either issue slot.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
(define_reservation "slot01" "slot0|slot1")
 
 
;; Load/store
;; lmw, lswi and lswx are only generated when optimizing for space (MC);
;;   these instructions are not simulated.
;; Plain loads: default latency 2; an issue slot in the first cycle, then
;;   lsu_cell in the following cycle.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu: the hardware breaks these down into two
;;  instructions; if with 32bytes alignment, CMC.
;; fxu_cell and lsu_cell are reserved together in the cycle after issue.
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "type" "load_ux,load_u")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;;   11/7, 11/8, 11/12
;; Modelled with the same latency and units as "cell-load-ux".
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell+lsu_cell")

;; lfs, lfsx, lfd, lfdx: 1 cycle.
;; vsu2_cell, lsu_cell and an issue slot are all reserved in the same cycle.
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")
 
;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr).
;; Only the update forms are matched here.  Plain "fpload" is already
;; claimed by "cell-fpload"; listing it again made the two reservations
;; overlap and contradicted the instruction list above.
;; fxu_cell is reserved in addition to the units used by "cell-fpload".
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "type" "fpload_u,fpload_ux")
       (eq_attr "cpu" "cell"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")
 
;; Vector loads: default latency 2; an issue slot in the first cycle, then
;; vsu2_cell together with lsu_cell in the following cycle.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "type" "vecload")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell+lsu_cell")
 
;; st? stw(MC)
;; All store reservations below have default latency 1 and reserve their
;; units in the same cycle as the issue slot.
(define_insn_reservation "cell-store" 1
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "cell"))
  "lsu_cell+slot01")

;; stdux, stdu (the hardware breaks these into a store and an add);
;; 2 for the update register.
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "type" "store_ux,store_u")
       (eq_attr "cpu" "cell"))
  "fxu_cell+lsu_cell+slot01")

;; Floating-point stores use vsu2_cell together with lsu_cell.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")

;; Floating-point stores with update additionally reserve fxu_cell.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "type" "fpstore_ux,fpstore_u")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")

;; Vector stores use the same units as "cell-fpstore".
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "type" "vecstore")
       (eq_attr "cpu" "cell"))
  "vsu2_cell+lsu_cell+slot01")
 
;; Integer latency is 2 cycles: an issue slot, then fxu_cell for one cycle.
(define_insn_reservation "cell-integer" 2
  (and (eq_attr "type" "integer,insert_dword,shift,trap,\
                        var_shift_rotate,cntlz,exts,isel")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell")

;; Type "two": latency is 4 cycles; fxu_cell is held for the three cycles
;; after issue.
(define_insn_reservation "cell-two" 4
  (and (eq_attr "type" "two")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*2")

;; Type "three": latency is 6 cycles; fxu_cell is held for the five cycles
;; after issue.
(define_insn_reservation "cell-three" 6
  (and (eq_attr "type" "three")
       (eq_attr "cpu" "cell"))
  "slot01,fxu_cell,fxu_cell*4")

;; rlwimi, alter cr0
;; Same latency and units as "cell-integer".
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "type" "insert_word")
       (eq_attr "cpu" "cell"))
 "slot01,fxu_cell")
 
;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
;; Default latency 1; fxu_cell and an issue slot in the same cycle.
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "type" "cmp")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")

;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Applies only when the "cell_micro" attribute is "not".
(define_insn_reservation "cell-fast-cmp" 2
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                            var_delayed_compare")
            (eq_attr "cpu" "cell"))
        (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")

;; Same insn types when "cell_micro" is "always": both issue slots are
;; reserved in the first cycle, then fxu_cell is held for eight cycles.
;; NOTE(review): no reservation here matches any other "cell_micro" value
;; for these types -- confirm that this is intended.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
                            var_delayed_compare")
            (eq_attr "cpu" "cell"))
        (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell,fxu_cell*7")
 
;; The multiply/divide reservations below issue in slot1 only (non-pipelined
;; insns need to be in slot1 to avoid a 1-cycle stall) and then hold the
;; "nonpipeline" reservation for the remaining cycles of their latency.

;; mulld
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "type" "lmul")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*13")

;; mulld. is microcoded: both issue slots are reserved in the first cycle.
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "type" "lmul_compare")
       (eq_attr "cpu" "cell"))
  "slot0+slot1,nonpipeline,nonpipeline*20")

;; mulli, 6 cycles
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "type" "imul2,imul3")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*4")

;; mullw, 9 cycles
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "type" "imul")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*7")

;; divide: type "idiv" has latency 32, type "ldiv" has latency 64.
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "type" "idiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*30")

(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "type" "ldiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*62")
 
;; mflr and mfctr are pipelined.
;; Default latency 1; bru_cell and an issue slot in the same cycle.
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "type" "mfjmpr")
       (eq_attr "cpu" "cell"))
  "slot01+bru_cell")

;; mtlr and mtctr,
;; mtspr fully pipelined.
;; Reserves the same units as "cell-mfjmpr".
(define_insn_reservation "cell-mtjmpr" 1
 (and (eq_attr "type" "mtjmpr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")
 
;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Both branch reservations take bru_cell and are restricted to slot1.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "type" "branch")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")

;; Branches through a register (type "jmpreg").
(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "type" "jmpreg")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot1")
 
;; cr hazard
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
;; CR logical operations are modelled on bru_cell with either issue slot.
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "type" "cr_logical,delayed_cr")
       (eq_attr "cpu" "cell"))
  "bru_cell+slot01")

;; mfcrf and mfcr take about 34 cycles and are nonpipelined: issue in slot1,
;; then hold "nonpipeline" for the remaining 33 cycles.
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "type" "mfcrf,mfcr")
       (eq_attr "cpu" "cell"))
   "slot1,nonpipeline,nonpipeline*32")

;; mtcrf (1 field)
;; Default latency 1; fxu_cell and an issue slot in the same cycle.
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "type" "mtcr")
       (eq_attr "cpu" "cell"))
  "fxu_cell+slot01")
 
; Basic FP latency is 10 cycles, throughput is 1/cycle
; NOTE(review): the reservation below holds vsu1_cell for nine cycles after
; issue, which does not obviously match "1/cycle" -- confirm.
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "type" "fp,dmul")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; FP compares: default latency 1; vsu1_cell and an issue slot in the same
;; cycle.
(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "type" "fpcompare")
       (eq_attr "cpu" "cell"))
  "vsu1_cell+slot01")

;; sdiv throughput 1/74, not pipelined but only in the FPU
;; NOTE(review): "nonpipeline" also reserves fxu_cell and lsu_cell, so this
;; model blocks more than the FPU.
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "type" "sdiv,ddiv")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*72")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; NOTE(review): same remark as for "cell-sdiv" regarding "nonpipeline".
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "type" "ssqrt,dsqrt")
       (eq_attr "cpu" "cell"))
  "slot1,nonpipeline,nonpipeline*82")
 
; VMX
; Each reservation below takes an issue slot in the first cycle and then
; holds its VSU unit for the remaining cycles of the default latency.
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "type" "vecsimple")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; mult, div, madd
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "type" "veccomplex")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; TODO: add support for recording instructions
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "type" "veccmp")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; Vector float: default latency 12.
(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "type" "vecfloat")
       (eq_attr "cpu" "cell"))
  "slot01,vsu1_cell,vsu1_cell*10")

;; Permutes are the only VMX arithmetic type placed on vsu2_cell.
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "type" "vecperm")
       (eq_attr "cpu" "cell"))
  "slot01,vsu2_cell,vsu2_cell*2")
 
;; New for 4.2, syncs
;; All four reservations share one model: default latency 11, an issue slot
;; in the first cycle, then lsu_cell held for the following ten cycles.

(define_insn_reservation "cell-sync" 11
  (and (eq_attr "type" "sync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-isync" 11
  (and (eq_attr "type" "isync")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "type" "load_l")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "type" "store_c")
       (eq_attr "cpu" "cell"))
  "slot01,lsu_cell,lsu_cell*9")
 
;; RAW register dependency
;; Each define_bypass below overrides the producer's default latency for
;; the listed consumer reservations.

;; addi r3, r3, 1
;; lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
(define_bypass 5 "cell-integer" "cell-load")
(define_bypass 5 "cell-integer" "cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; NOTE(review): the disabled bypass below names "cell-veccompare"; the
;; vector-compare reservation visible in this file is "cell-veccmp".
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
; this is not correct,
;;  this is a stall in general and not dependent on result
(define_bypass 13 "cell-vecstore" "cell-fpstore")
; this is not correct, this can never be true, not dependent on result
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as vsu2 insn
;;   within 12 cycles.

;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;;  ex4 stage(10 cycles)
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")
 
;;Things are not simulated:
;; update instruction, update address gpr are not simulated
;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
;;  insns
 

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [rs6000/] [cell.md] - Blame information for rev 801

Line No.	Rev	Author	Line
1	709	jeremybenn	`;; Scheduling description for cell processor.`
2			`;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009`
3			`;; Free Software Foundation, Inc.`
4			`;; Contributed by Sony Computer Entertainment, Inc.,`
5
6
7			`;; This file is free software; you can redistribute it and/or modify it under`
8			`;; the terms of the GNU General Public License as published by the Free`
9			`;; Software Foundation; either version 3 of the License, or (at your option)`
10			`;; any later version.`
11
12			`;; This file is distributed in the hope that it will be useful, but WITHOUT`
13			`;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or`
14			`;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
15			`;; for more details.`
16
17			`;; You should have received a copy of the GNU General Public License`
18			`;; along with GCC; see the file COPYING3. If not see`
19			`;; .`
20
21			`;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)`
22
23			`;; BE Architecture DD3.0 and DD3.1`
24			`;; This file simulate PPU processor unit backend of pipeline, maualP24.`
25			`;; manual P27, stall and flush points`
26			`;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program`
27			`;; order, the grouped address are aligned by 8`
28			`;; This file only simulate one thread situation`
29			`;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,`
30			`;; and load/store unit)`
31			`;; VSU executes all scalar floating points insn(a float unit),`
32			`;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)`
33
34			`;; Dual issue combination`
35
36			`;; FXU LSU BR VMX VMX`
37			`;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)`
38			`;;FXU X`
39			`;;LSU X X X`
40			`;;BR X`
41			`;;VMX(sx,cx,vsu_fp,fp_arth) X`
42			`;;VMX(perm,vsu_ls, fp_ls) X`
43			`;; X are illegal combination.`
44
45			`;; Dual issue exceptions:`
46			`;;(1) nop-pipelined FXU instr in slot 0`
47			`;;(2) non-pipelined FPU inst in slot 0`
48			`;; CSI instr(contex-synchronizing insn)`
49			`;; Microcode insn`
50
51			`;; BRU unit: bru(none register stall), bru_cr(cr register stall)`
52			`;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),`
53			`;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for`
54			`;; nonpipelined simulation`
55			`;; micr insns will stall at least 7 cycles to get the first instr from ROM,`
56			`;; micro instructions are not dual issued.`
57
58			`;; slot0 is older than slot1`
59			`;; non-pipelined insn need to be in slot1 to avoid 1cycle stall`
60
61			`;; There different stall point`
62			`;; IB2, only stall one thread if stall here, so try to stall here as much as`
63			`;; we can`
64			`;; condition(1) insert nop, OR and ORI instruction form`
65			`;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or`
66			`;; CR0-access while stdcx, or stwcx`
67			`;; IS2 stall ;; Page91 for details`
68			`;; VQ8 stall`
69			`;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to`
70			`;; the vsu issue queue`
71
72			`;;(define_automaton "cellxu")`
73
74			`;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")`
75
76			`;; ndfa`
77			`(define_automaton "cellxu,cellvsu,cellbru,cell_mis")`
78
79			`(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")`
80			`(define_cpu_unit "bru_cell" "cellbru")`
81			`(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")`
82
83			`(define_cpu_unit "slot0,slot1" "cell_mis")`
84
85			`(absence_set "slot0" "slot1")`
86
87			`(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")`
88			`(define_reservation "slot01" "slot0\|slot1")`
89
90
91			`;; Load/store`
92			`;; lmw, lswi, lswx are only generated for optimize for space, MC,`
93			`;; these instr are not simulated`
94			`(define_insn_reservation "cell-load" 2`
95			`(and (eq_attr "type" "load")`
96			`(eq_attr "cpu" "cell"))`
97			`"slot01,lsu_cell")`
98
99			`;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,`
100			`;; if with 32bytes alignment, CMC`
101			`(define_insn_reservation "cell-load-ux" 2`
102			`(and (eq_attr "type" "load_ux,load_u")`
103			`(eq_attr "cpu" "cell"))`
104			`"slot01,fxu_cell+lsu_cell")`
105
106			`;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown`
107			`;; 11/7, 11/8, 11/12`
108			`(define_insn_reservation "cell-load-ext" 2`
109			`(and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")`
110			`(eq_attr "cpu" "cell"))`
111			`"slot01,fxu_cell+lsu_cell")`
112
113			`;;lfs,lfsx,lfd,lfdx, 1 cycle`
114			`(define_insn_reservation "cell-fpload" 1`
115			`(and (eq_attr "type" "fpload")`
116			`(eq_attr "cpu" "cell"))`
117			`"vsu2_cell+lsu_cell+slot01")`
118
119			`;; lfsu,lfsux,lfdu,lfdux 1cycle(fpr) 2 cycle(gpr)`
120			`(define_insn_reservation "cell-fpload-update" 1`
121			`(and (eq_attr "type" "fpload,fpload_u,fpload_ux")`
122			`(eq_attr "cpu" "cell"))`
123			`"fxu_cell+vsu2_cell+lsu_cell+slot01")`
124
125			`(define_insn_reservation "cell-vecload" 2`
126			`(and (eq_attr "type" "vecload")`
127			`(eq_attr "cpu" "cell"))`
128			`"slot01,vsu2_cell+lsu_cell")`
129
130			`;;st? stw(MC)`
131			`(define_insn_reservation "cell-store" 1`
132			`(and (eq_attr "type" "store")`
133			`(eq_attr "cpu" "cell"))`
134			`"lsu_cell+slot01")`
135
136			`;;stdux, stdu, (hardware breaks into store and add) 2 for update reg`
137			`(define_insn_reservation "cell-store-update" 1`
138			`(and (eq_attr "type" "store_ux,store_u")`
139			`(eq_attr "cpu" "cell"))`
140			`"fxu_cell+lsu_cell+slot01")`
141
142			`(define_insn_reservation "cell-fpstore" 1`
143			`(and (eq_attr "type" "fpstore")`
144			`(eq_attr "cpu" "cell"))`
145			`"vsu2_cell+lsu_cell+slot01")`
146
147			`(define_insn_reservation "cell-fpstore-update" 1`
148			`(and (eq_attr "type" "fpstore_ux,fpstore_u")`
149			`(eq_attr "cpu" "cell"))`
150			`"vsu2_cell+fxu_cell+lsu_cell+slot01")`
151
152			`(define_insn_reservation "cell-vecstore" 1`
153			`(and (eq_attr "type" "vecstore")`
154			`(eq_attr "cpu" "cell"))`
155			`"vsu2_cell+lsu_cell+slot01")`
156
157			`;; Integer latency is 2 cycles`
158			`(define_insn_reservation "cell-integer" 2`
159			`(and (eq_attr "type" "integer,insert_dword,shift,trap,\`
160			`var_shift_rotate,cntlz,exts,isel")`
161			`(eq_attr "cpu" "cell"))`
162			`"slot01,fxu_cell")`
163
164			`;; Two integer latency is 4 cycles`
165			`(define_insn_reservation "cell-two" 4`
166			`(and (eq_attr "type" "two")`
167			`(eq_attr "cpu" "cell"))`
168			`"slot01,fxu_cell,fxu_cell*2")`
169
170			`;; Three integer latency is 6 cycles`
171			`(define_insn_reservation "cell-three" 6`
172			`(and (eq_attr "type" "three")`
173			`(eq_attr "cpu" "cell"))`
174			`"slot01,fxu_cell,fxu_cell*4")`
175
176			`;; rlwimi, alter cr0`
177			`(define_insn_reservation "cell-insert" 2`
178			`(and (eq_attr "type" "insert_word")`
179			`(eq_attr "cpu" "cell"))`
180			`"slot01,fxu_cell")`
181
182			`;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0`
183			`(define_insn_reservation "cell-cmp" 1`
184			`(and (eq_attr "type" "cmp")`
185			`(eq_attr "cpu" "cell"))`
186			`"fxu_cell+slot01")`
187
188			`;; add, addo, sub, subo, alter cr0, rldcli, rlwinm`
189			`(define_insn_reservation "cell-fast-cmp" 2`
190			`(and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\`
191			`var_delayed_compare")`
192			`(eq_attr "cpu" "cell"))`
193			`(eq_attr "cell_micro" "not"))`
194			`"slot01,fxu_cell")`
195
196			`(define_insn_reservation "cell-cmp-microcoded" 9`
197			`(and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\`
198			`var_delayed_compare")`
199			`(eq_attr "cpu" "cell"))`
200			`(eq_attr "cell_micro" "always"))`
201			`"slot0+slot1,fxu_cell,fxu_cell*7")`
202
203			`;; mulld`
204			`(define_insn_reservation "cell-lmul" 15`
205			`(and (eq_attr "type" "lmul")`
206			`(eq_attr "cpu" "cell"))`
207			`"slot1,nonpipeline,nonpipeline*13")`
208
209			`;; mulld. is microcoded`
210			`(define_insn_reservation "cell-lmul-cmp" 22`
211			`(and (eq_attr "type" "lmul_compare")`
212			`(eq_attr "cpu" "cell"))`
213			`"slot0+slot1,nonpipeline,nonpipeline*20")`
214
215			`;; mulli, 6 cycles`
216			`(define_insn_reservation "cell-imul23" 6`
217			`(and (eq_attr "type" "imul2,imul3")`
218			`(eq_attr "cpu" "cell"))`
219			`"slot1,nonpipeline,nonpipeline*4")`
220
221			`;; mullw, 9`
222			`(define_insn_reservation "cell-imul" 9`
223			`(and (eq_attr "type" "imul")`
224			`(eq_attr "cpu" "cell"))`
225			`"slot1,nonpipeline,nonpipeline*7")`
226
227			`;; divide`
228			`(define_insn_reservation "cell-idiv" 32`
229			`(and (eq_attr "type" "idiv")`
230			`(eq_attr "cpu" "cell"))`
231			`"slot1,nonpipeline,nonpipeline*30")`
232
233			`(define_insn_reservation "cell-ldiv" 64`
234			`(and (eq_attr "type" "ldiv")`
235			`(eq_attr "cpu" "cell"))`
236			`"slot1,nonpipeline,nonpipeline*62")`
237
238			`;;mflr and mfctr are pipelined`
239			`(define_insn_reservation "cell-mfjmpr" 1`
240			`(and (eq_attr "type" "mfjmpr")`
241			`(eq_attr "cpu" "cell"))`
242			`"slot01+bru_cell")`
243
244			`;;mtlr and mtctr,`
245			`;;mtspr fully pipelined`
246			`(define_insn_reservation "cell-mtjmpr" 1`
247			`(and (eq_attr "type" "mtjmpr")`
248			`(eq_attr "cpu" "cell"))`
249			`"bru_cell+slot01")`
250
251			`;; Branches`
252			`;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency`
253			`;; bcctr, bcctrl, latency 2, actually adjust by be to 4`
254			`(define_insn_reservation "cell-branch" 1`
255			`(and (eq_attr "type" "branch")`
256			`(eq_attr "cpu" "cell"))`
257			`"bru_cell+slot1")`
258
259			`(define_insn_reservation "cell-branchreg" 1`
260			`(and (eq_attr "type" "jmpreg")`
261			`(eq_attr "cpu" "cell"))`
262			`"bru_cell+slot1")`
263
264			`;; cr hazard`
265			`;; page 90, special cases for CR hazard, only one instr can access cr per cycle`
266			`;; if insn reads CR following a stwcx, pipeline stall till stwcx finish`
267			`(define_insn_reservation "cell-crlogical" 1`
268			`(and (eq_attr "type" "cr_logical,delayed_cr")`
269			`(eq_attr "cpu" "cell"))`
270			`"bru_cell+slot01")`
271
272			`;; mfcrf and mfcr is about 34 cycles and nonpipelined`
273			`(define_insn_reservation "cell-mfcr" 34`
274			`(and (eq_attr "type" "mfcrf,mfcr")`
275			`(eq_attr "cpu" "cell"))`
276			`"slot1,nonpipeline,nonpipeline*32")`
277
278			`;; mtcrf (1 field)`
279			`(define_insn_reservation "cell-mtcrf" 1`
280			`(and (eq_attr "type" "mtcr")`
281			`(eq_attr "cpu" "cell"))`
282			`"fxu_cell+slot01")`
283
284			`; Basic FP latency is 10 cycles, thoughput is 1/cycle`
285			`(define_insn_reservation "cell-fp" 10`
286			`(and (eq_attr "type" "fp,dmul")`
287			`(eq_attr "cpu" "cell"))`
288			`"slot01,vsu1_cell,vsu1_cell*8")`
289
290			`(define_insn_reservation "cell-fpcompare" 1`
291			`(and (eq_attr "type" "fpcompare")`
292			`(eq_attr "cpu" "cell"))`
293			`"vsu1_cell+slot01")`
294
295			`;; sdiv thoughput 1/74, not pipelined but only in the FPU`
296			`(define_insn_reservation "cell-sdiv" 74`
297			`(and (eq_attr "type" "sdiv,ddiv")`
298			`(eq_attr "cpu" "cell"))`
299			`"slot1,nonpipeline,nonpipeline*72")`
300
301			`;; fsqrt thoughput 1/84, not pipelined but only in the FPU`
302			`(define_insn_reservation "cell-sqrt" 84`
303			`(and (eq_attr "type" "ssqrt,dsqrt")`
304			`(eq_attr "cpu" "cell"))`
305			`"slot1,nonpipeline,nonpipeline*82")`
306
307			`; VMX`
308			`(define_insn_reservation "cell-vecsimple" 4`
309			`(and (eq_attr "type" "vecsimple")`
310			`(eq_attr "cpu" "cell"))`
311			`"slot01,vsu1_cell,vsu1_cell*2")`
312
313			`;; mult, div, madd`
314			`(define_insn_reservation "cell-veccomplex" 10`
315			`(and (eq_attr "type" "veccomplex")`
316			`(eq_attr "cpu" "cell"))`
317			`"slot01,vsu1_cell,vsu1_cell*8")`
318
319			`;; TODO: add support for recording instructions`
320			`(define_insn_reservation "cell-veccmp" 4`
321			`(and (eq_attr "type" "veccmp")`
322			`(eq_attr "cpu" "cell"))`
323			`"slot01,vsu1_cell,vsu1_cell*2")`
324
325			`(define_insn_reservation "cell-vecfloat" 12`
326			`(and (eq_attr "type" "vecfloat")`
327			`(eq_attr "cpu" "cell"))`
328			`"slot01,vsu1_cell,vsu1_cell*10")`
329
330			`(define_insn_reservation "cell-vecperm" 4`
331			`(and (eq_attr "type" "vecperm")`
332			`(eq_attr "cpu" "cell"))`
333			`"slot01,vsu2_cell,vsu2_cell*2")`
334
335			`;; New for 4.2, syncs`
336
337			`(define_insn_reservation "cell-sync" 11`
338			`(and (eq_attr "type" "sync")`
339			`(eq_attr "cpu" "cell"))`
340			`"slot01,lsu_cell,lsu_cell*9")`
341
342			`(define_insn_reservation "cell-isync" 11`
343			`(and (eq_attr "type" "isync")`
344			`(eq_attr "cpu" "cell"))`
345			`"slot01,lsu_cell,lsu_cell*9")`
346
347			`(define_insn_reservation "cell-load_l" 11`
348			`(and (eq_attr "type" "load_l")`
349			`(eq_attr "cpu" "cell"))`
350			`"slot01,lsu_cell,lsu_cell*9")`
351
352			`(define_insn_reservation "cell-store_c" 11`
353			`(and (eq_attr "type" "store_c")`
354			`(eq_attr "cpu" "cell"))`
355			`"slot01,lsu_cell,lsu_cell*9")`
356
357			`;; RAW register dependency`
358
359			`;; addi r3, r3, 1`
360			`;; lw r4,offset(r3)`
361			`;; there are 5 cycle deplay for r3 bypassing`
362			`;; there are 5 cycle delay for a dependent load after a load`
363			`(define_bypass 5 "cell-integer" "cell-load")`
364			`(define_bypass 5 "cell-integer" "cell-load-ext")`
365			`(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")`
366
367			`;; there is a 6 cycle delay after a fp compare until you can use the cr.`
368			`(define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")`
369
370			`;; VXU float RAW`
371			`(define_bypass 11 "cell-vecfloat" "cell-vecfloat")`
372
373			`;; VXU and FPU`
374			`(define_bypass 6 "cell-veccomplex" "cell-vecsimple")`
375			`;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")`
376			`(define_bypass 3 "cell-vecfloat" "cell-veccomplex")`
377			`; this is not correct,`
378			`;; this is a stall in general and not dependent on result`
379			`(define_bypass 13 "cell-vecstore" "cell-fpstore")`
380			`; this is not correct, this can never be true, not dependent on result`
381			`(define_bypass 7 "cell-fp" "cell-fpload")`
382			`;; vsu1 should avoid writing to the same target register as vsu2 insn`
383			`;; within 12 cycles.`
384
385			`;; WAW hazard`
386
387			`;; the target of VSU estimate should not be reused within 10 dispatch groups`
388			`;; the target of VSU float should not be reused within 8 dispatch groups`
389			`;; the target of VSU complex should not be reused within 5 dispatch groups`
390			`;; FP LOAD should not reuse an FPU Arithmetic target with 6 dispatch gropus`
391
392			`;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at`
393			`;; ex4 stage(10 cycles)`
394			`(define_bypass 10 "cell-mtjmpr" "cell-branchreg")`
395
396			`;;Things are not simulated:`
397			`;; update instruction, update address gpr are not simulated`
398			`;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float`
399			`;; insns`
400