1 |
709 |
jeremybenn |
;; Faraday FA726TE Pipeline Description
|
2 |
|
|
;; Copyright (C) 2010 Free Software Foundation, Inc.
|
3 |
|
|
;; Written by I-Jui Sung, based on ARM926EJ-S Pipeline Description.
|
4 |
|
|
;;
|
5 |
|
|
;; This file is part of GCC.
|
6 |
|
|
;;
|
7 |
|
|
;; GCC is free software; you can redistribute it and/or modify it under
|
8 |
|
|
;; the terms of the GNU General Public License as published by the Free
|
9 |
|
|
;; Software Foundation; either version 3, or (at your option) any later
|
10 |
|
|
;; version.
|
11 |
|
|
;;
|
12 |
|
|
;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
13 |
|
|
;; WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
14 |
|
|
;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
15 |
|
|
;; for more details.
|
16 |
|
|
;;
|
17 |
|
|
;; You should have received a copy of the GNU General Public License
|
18 |
|
|
;; along with GCC; see the file COPYING3. If not see
|
19 |
|
|
;; <http://www.gnu.org/licenses/>.  */
|
20 |
|
|
|
21 |
|
|
;; These descriptions are based on the information contained in the
|
22 |
|
|
;; FA726TE Core Design Note, Copyright (c) 2010 Faraday Technology Corp.
|
23 |
|
|
|
24 |
|
|
;; This automaton provides a pipeline description for the Faraday
|
25 |
|
|
;; FA726TE core.
|
26 |
|
|
;;
|
27 |
|
|
;; The model given here assumes that the condition for all conditional
|
28 |
|
|
;; instructions is "true", i.e., that all of the instructions are
|
29 |
|
|
;; actually executed.
|
30 |
|
|
|
31 |
|
|
;; The automaton that the cpu units declared below are attached to.
(define_automaton "fa726te")
|
32 |
|
|
|
33 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
34 |
|
|
;; Pipelines
|
35 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
36 |
|
|
|
37 |
|
|
;; The ALU pipeline has fetch, decode, execute, memory, and
|
38 |
|
|
;; write stages. We only need to model the execute, memory and write
|
39 |
|
|
;; stages.
|
40 |
|
|
|
41 |
|
|
;; E1 E2 E3 E4 E5 WB
|
42 |
|
|
;;______________________________________________________
|
43 |
|
|
;;
|
44 |
|
|
;; <-------------- LD/ST ----------->
|
45 |
|
|
;; shifter + LU <-- AU -->
|
46 |
|
|
;; <-- AU --> shifter + LU CPSR (Pipe 0)
|
47 |
|
|
;;______________________________________________________
|
48 |
|
|
;;
|
49 |
|
|
;; <---------- MUL --------->
|
50 |
|
|
;; shifter + LU <-- AU -->
|
51 |
|
|
;; <-- AU --> shifter + LU CPSR (Pipe 1)
|
52 |
|
|
|
53 |
|
|
|
54 |
|
|
;; Two ALU pipe units; the ALU reservations in this file take either one.
(define_cpu_unit "fa726te_alu0_pipe,fa726te_alu1_pipe" "fa726te")
|
55 |
|
|
;; The single multiply unit, reserved by 726te_mult_op.
(define_cpu_unit "fa726te_mac_pipe" "fa726te")
|
56 |
|
|
;; Primary load/store unit, modeled as two units (_e and _w) that the
;; first alternative of 726te_load1_op reserves together.
(define_cpu_unit "fa726te_lsu_pipe_e,fa726te_lsu_pipe_w" "fa726te")
|
57 |
|
|
|
58 |
|
|
;; Pretend we have 2 LSUs (the second is ONLY for LDR), which can possibly
|
59 |
|
|
;; improve code quality.
|
60 |
|
|
;; Declared as query units, so their reservation state can be queried
;; from the automaton.  Below, they appear only in the second alternative
;; of 726te_load1_op.
(define_query_cpu_unit "fa726te_lsu1_pipe_e,fa726te_lsu1_pipe_w" "fa726te")
|
61 |
|
|
;; The two issue slots.  fa726te_issue takes either one; fa726te_blockage
;; takes both.
(define_cpu_unit "fa726te_is0,fa726te_is1" "fa726te")
|
62 |
|
|
|
63 |
|
|
;; One issue slot: either fa726te_is0 or fa726te_is1.
(define_reservation "fa726te_issue" "(fa726te_is0|fa726te_is1)")
|
64 |
|
|
;; Reservation to restrict issue to 1.
|
65 |
|
|
;; Takes both issue slots in the same cycle, so no insn that needs
;; fa726te_issue can be issued alongside the insn that uses this.
(define_reservation "fa726te_blockage" "(fa726te_is0+fa726te_is1)")
|
66 |
|
|
|
67 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
68 |
|
|
;; ALU Instructions
|
69 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
70 |
|
|
|
71 |
|
|
;; ALU instructions require three cycles to execute, and use the ALU
|
72 |
|
|
;; pipeline in each of the three stages. The results are available
|
73 |
|
|
;; after the execute stage has finished.
|
74 |
|
|
;;
|
75 |
|
|
;; If the destination register is the PC, the pipelines are stalled
|
76 |
|
|
;; for several cycles. That case is not modeled here.
|
77 |
|
|
|
78 |
|
|
;; Move instructions.
|
79 |
|
|
;; mov/mvn, selected by the "insn" attribute whatever the "type" is.
;; Default latency 1; takes one issue slot and either ALU pipe for one
;; cycle.
(define_insn_reservation "726te_shift_op" 1
  (and (eq_attr "tune" "fa726te")
       (eq_attr "insn" "mov,mvn"))
  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
|
83 |
|
|
|
84 |
|
|
;; ALU operations with no shifted operand will finish in 1 cycle.
|
85 |
|
|
;; Other ALU instructions take 2 cycles.
|
86 |
|
|
;; Insns of type "alu" other than mov/mvn (those match 726te_shift_op).
;; Default latency 1; takes one issue slot and either ALU pipe for one
;; cycle.
(define_insn_reservation "726te_alu_op" 1
  (and (eq_attr "tune" "fa726te")
       (and (eq_attr "type" "alu")
            (not (eq_attr "insn" "mov,mvn"))))
  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
|
91 |
|
|
|
92 |
|
|
;; ALU operations with a shift-by-register operand.
|
93 |
|
|
;; These really stall in the decoder, in order to read the shift value
|
94 |
|
|
;; in the first cycle. If the instruction uses both shifter and AU,
|
95 |
|
|
;; it takes 3 cycles.
|
96 |
|
|
;; Insns of type "alu_shift" other than mov/mvn.  Default latency 3;
;; takes one issue slot and either ALU pipe for one cycle.
(define_insn_reservation "726te_alu_shift_op" 3
  (and (eq_attr "tune" "fa726te")
       (and (eq_attr "type" "alu_shift")
            (not (eq_attr "insn" "mov,mvn"))))
  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
|
101 |
|
|
|
102 |
|
|
;; Insns of type "alu_shift_reg" other than mov/mvn.  Default latency 3;
;; takes one issue slot and either ALU pipe for one cycle.
(define_insn_reservation "726te_alu_shift_reg_op" 3
  (and (eq_attr "tune" "fa726te")
       (and (eq_attr "type" "alu_shift_reg")
            (not (eq_attr "insn" "mov,mvn"))))
  "fa726te_issue+(fa726te_alu0_pipe|fa726te_alu1_pipe)")
|
107 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
108 |
|
|
;; Multiplication Instructions
|
109 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
110 |
|
|
|
111 |
|
|
;; Multiplication instructions loop in the execute stage until the
|
112 |
|
|
;; instruction has been passed through the multiplier array enough
|
113 |
|
|
;; times. Multiply operations occur in both the execute and memory
|
114 |
|
|
;; stages of the pipeline.
|
115 |
|
|
|
116 |
|
|
;; Every multiply and multiply-accumulate variant named in the "insn"
;; list.  Default latency 3; takes one issue slot and the single MAC pipe
;; for one cycle, so at most one of these can start per cycle.
(define_insn_reservation "726te_mult_op" 3
  (and (eq_attr "tune" "fa726te")
       (eq_attr "insn" "smlalxy,mul,mla,muls,mlas,umull,umlal,smull,smlal,\
                        umulls,umlals,smulls,smlals,smlawx,smulxy,smlaxy"))
  "fa726te_issue+fa726te_mac_pipe")
|
121 |
|
|
|
122 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
123 |
|
|
;; Load/Store Instructions
|
124 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
125 |
|
|
|
126 |
|
|
;; The models for load/store instructions do not accurately describe
|
127 |
|
|
;; the difference between operations with a base register writeback
|
128 |
|
|
;; (such as "ldm!"). These models assume that all memory references
|
129 |
|
|
;; hit in dcache.
|
130 |
|
|
|
131 |
|
|
;; Loads with a shifted offset take 3 cycles, and are (a) probably the
|
132 |
|
|
;; most common and (b) the pessimistic assumption will lead to fewer stalls.
|
133 |
|
|
|
134 |
|
|
;; Scalar loads are pipelined in FA726TE LSU pipe.
|
135 |
|
|
;; Here we model the resource conflict between Load@E3-stage & Store@W-stage.
|
136 |
|
|
;; The 2nd LSU (lsu1) is to model the fact that if 2 loads are scheduled in the
|
137 |
|
|
;; same "bundle", the 2nd load will introduce another ISSUE stall but is
|
138 |
|
|
;; still ok to execute (and may be beneficial sometimes).
|
139 |
|
|
|
140 |
|
|
;; Loads of type load1 and load_byte.  Default latency 3.  The
;; reservation has two alternatives:
;;   - one issue slot plus both units of the primary LSU, in one cycle; or
;;   - one issue slot plus both units of the second LSU (lsu1), followed
;;     by a cycle in which both issue slots are taken (fa726te_blockage).
(define_insn_reservation "726te_load1_op" 3
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "load1,load_byte"))
  "(fa726te_issue+fa726te_lsu_pipe_e+fa726te_lsu_pipe_w)\
   | (fa726te_issue+fa726te_lsu1_pipe_e+fa726te_lsu1_pipe_w,fa726te_blockage)")
|
145 |
|
|
|
146 |
|
|
;; Stores of type store1.  Default latency 1; holds both issue slots for
;; two consecutive cycles.
(define_insn_reservation "726te_store1_op" 1
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "store1"))
  "fa726te_blockage*2")
|
150 |
|
|
|
151 |
|
|
;; Load/Store Multiple blocks all pipelines in EX stages until WB.
|
152 |
|
|
;; No other instructions can be issued together. Since they essentially
|
153 |
|
|
;; prevent all scheduling opportunities, we model them together here.
|
154 |
|
|
|
155 |
|
|
;; The LDM is broken into multiple load instructions; a later instruction in
|
156 |
|
|
;; the pipe 1 is stalled.
|
157 |
|
|
;; Loads of type load2 and load3.  Default latency 4; holds both issue
;; slots for four consecutive cycles.
(define_insn_reservation "726te_ldm2_op" 4
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "load2,load3"))
  "fa726te_blockage*4")
|
161 |
|
|
|
162 |
|
|
;; Loads of type load4.  Default latency 5; holds both issue slots for
;; five consecutive cycles.
(define_insn_reservation "726te_ldm3_op" 5
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "load4"))
  "fa726te_blockage*5")
|
166 |
|
|
|
167 |
|
|
;; Stores of type store2 and store3.  Default latency 2; holds both issue
;; slots for three consecutive cycles.
(define_insn_reservation "726te_stm2_op" 2
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "store2,store3"))
  "fa726te_blockage*3")
|
171 |
|
|
|
172 |
|
|
;; Stores of type store4.  Default latency 3; holds both issue slots for
;; four consecutive cycles.
(define_insn_reservation "726te_stm3_op" 3
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "store4"))
  "fa726te_blockage*4")
|
176 |
|
|
|
177 |
|
|
;; Any load feeding any store: latency 1 instead of the load's default
;; latency, but only when the guard arm_no_early_store_addr_dep returns
;; nonzero for the pair.  The guard is defined outside this file.
;; NOTE(review): presumably it accepts the pair when the loaded value is
;; not needed to form the store address -- confirm in the ARM backend.
(define_bypass 1 "726te_load1_op,726te_ldm2_op,726te_ldm3_op"
                 "726te_store1_op,726te_stm2_op,726te_stm3_op"
                 "arm_no_early_store_addr_dep")
|
179 |
|
|
;; Any ALU or multiply result feeding a store1: latency 0, but only when
;; the guard arm_no_early_store_addr_dep (defined outside this file)
;; returns nonzero for the pair.
(define_bypass 0 "726te_shift_op,726te_alu_op,726te_alu_shift_op,\
                  726te_alu_shift_reg_op,726te_mult_op" "726te_store1_op"
                  "arm_no_early_store_addr_dep")
|
182 |
|
|
;; mov/mvn or unshifted ALU result feeding another mov/mvn or unshifted
;; ALU insn: latency 0 instead of the default 1.
(define_bypass 0 "726te_shift_op,726te_alu_op" "726te_shift_op,726te_alu_op")
|
183 |
|
|
;; Shifted ALU result feeding a mov/mvn or unshifted ALU insn: latency 1
;; instead of the default 3.
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op"
                 "726te_shift_op,726te_alu_op")
|
185 |
|
|
;; Shifted ALU or multiply result feeding an alu_shift insn: latency 1
;; instead of the default 3, but only when the guard
;; arm_no_early_alu_shift_dep (defined outside this file) returns nonzero
;; for the pair.
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
                 "726te_alu_shift_op" "arm_no_early_alu_shift_dep")
|
187 |
|
|
;; Shifted ALU or multiply result feeding an alu_shift_reg insn: latency 1
;; instead of the default 3, but only when the guard
;; arm_no_early_alu_shift_value_dep (defined outside this file) returns
;; nonzero for the pair.
(define_bypass 1 "726te_alu_shift_op,726te_alu_shift_reg_op,726te_mult_op"
                 "726te_alu_shift_reg_op" "arm_no_early_alu_shift_value_dep")
|
189 |
|
|
;; Multiply result feeding a mov/mvn or unshifted ALU insn: latency 1
;; instead of the default 3.
(define_bypass 1 "726te_mult_op" "726te_shift_op,726te_alu_op")
|
190 |
|
|
|
191 |
|
|
;; load1/load_byte result feeding a multiply: latency 4, one cycle more
;; than the load's default latency of 3.
(define_bypass 4 "726te_load1_op" "726te_mult_op")
|
192 |
|
|
;; load2/load3 result feeding a multiply: latency 5, one cycle more than
;; the load's default latency of 4.
(define_bypass 5 "726te_ldm2_op" "726te_mult_op")
|
193 |
|
|
;; load4 result feeding a multiply: latency 6, one cycle more than the
;; load's default latency of 5.
(define_bypass 6 "726te_ldm3_op" "726te_mult_op")
|
194 |
|
|
|
195 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
196 |
|
|
;; Branch and Call Instructions
|
197 |
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
198 |
|
|
|
199 |
|
|
;; Branch instructions are difficult to model accurately. The FA726TE
|
200 |
|
|
;; core can predict most branches. If the branch is predicted
|
201 |
|
|
;; correctly, and predicted early enough, the branch can be completely
|
202 |
|
|
;; eliminated from the instruction stream. Some branches can
|
203 |
|
|
;; therefore appear to require zero cycles to execute. We assume that
|
204 |
|
|
;; all branches are predicted correctly, and that the latency is
|
205 |
|
|
;; therefore the minimum value.
|
206 |
|
|
|
207 |
|
|
;; Insns of type branch.  Default latency 0; holds both issue slots for
;; one cycle, so nothing is issued alongside a branch.
(define_insn_reservation "726te_branch_op" 0
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "branch"))
  "fa726te_blockage")
|
211 |
|
|
|
212 |
|
|
;; The latency for a call is actually the latency when the result is available.
|
213 |
|
|
;; i.e. R0 is ready for int return value.
|
214 |
|
|
;; Insns of type call.  Default latency 1; holds both issue slots for one
;; cycle, so nothing is issued alongside a call.
(define_insn_reservation "726te_call_op" 1
  (and (eq_attr "tune" "fa726te")
       (eq_attr "type" "call"))
  "fa726te_blockage")
|
218 |
|
|
|