OpenCores
URL https://opencores.org/ocsvn/scarts/scarts/trunk

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [gcc/] [config/] [arm/] [arm1020e.md] - Blame information for rev 12

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 12 jlechner
;; ARM 1020E & ARM 1022E Pipeline Description
2
;; Copyright (C) 2005 Free Software Foundation, Inc.
3
;; Contributed by Richard Earnshaw (richard.earnshaw@arm.com)
4
;;
5
;; This file is part of GCC.
6
;;
7
;; GCC is free software; you can redistribute it and/or modify it
8
;; under the terms of the GNU General Public License as published by
9
;; the Free Software Foundation; either version 2, or (at your option)
10
;; any later version.
11
;;
12
;; GCC is distributed in the hope that it will be useful, but
13
;; WITHOUT ANY WARRANTY; without even the implied warranty of
14
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
;; General Public License for more details.
16
;;
17
;; You should have received a copy of the GNU General Public License
18
;; along with GCC; see the file COPYING.  If not, write to the Free
19
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
20
;; 02110-1301, USA.
21
 
22
;; These descriptions are based on the information contained in the
23
;; ARM1020E Technical Reference Manual, Copyright (c) 2003 ARM
24
;; Limited.
25
;;
26
 
27
;; This automaton provides a pipeline description for the ARM
28
;; 1020E core.
29
;;
30
;; The model given here assumes that the condition for all conditional
31
;; instructions is "true", i.e., that all of the instructions are
32
;; actually executed.
33
 
34
;; Declare the "arm1020e" automaton; the CPU units defined in this
;; description are all attached to it.
(define_automaton "arm1020e")
35
 
36
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
37
;; Pipelines
38
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
39
 
40
;; There are two pipelines:
41
;;
42
;; - An Arithmetic Logic Unit (ALU) pipeline.
43
;;
44
;;   The ALU pipeline has fetch, issue, decode, execute, memory, and
45
;;   write stages. We only need to model the execute, memory and write
46
;;   stages.
47
;;
48
;; - A Load-Store Unit (LSU) pipeline.
49
;;
50
;;   The LSU pipeline has decode, execute, memory, and write stages.
51
;;   We only model the execute, memory and write stages.
52
 
53
;; 1020a_{e,m,w}: execute, memory and write stages of the ALU pipeline.
(define_cpu_unit "1020a_e,1020a_m,1020a_w" "arm1020e")
54
;; 1020l_{e,m,w}: execute, memory and write stages of the LSU pipeline.
(define_cpu_unit "1020l_e,1020l_m,1020l_w" "arm1020e")
55
 
56
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
57
;; ALU Instructions
58
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
59
 
60
;; ALU instructions require three cycles to execute, and use the ALU
61
;; pipeline in each of the three stages.  The results are available
62
;; after the execute stage has finished.
63
;;
64
;; If the destination register is the PC, the pipelines are stalled
65
;; for several cycles.  That case is not modeled here.
66
 
67
;; ALU operations with no shifted operand
68
;; Default latency 1.  Reserves the ALU execute, memory and write units
;; on three consecutive cycles.
(define_insn_reservation "1020alu_op" 1
69
 (and (eq_attr "tune" "arm1020e,arm1022e")
70
      (eq_attr "type" "alu"))
71
 "1020a_e,1020a_m,1020a_w")
72
 
73
;; ALU operations with a shift-by-constant operand
74
;; Same latency (1) and unit usage as "1020alu_op"; only the matched
;; "type" attribute ("alu_shift") differs.
(define_insn_reservation "1020alu_shift_op" 1
75
 (and (eq_attr "tune" "arm1020e,arm1022e")
76
      (eq_attr "type" "alu_shift"))
77
 "1020a_e,1020a_m,1020a_w")
78
 
79
;; ALU operations with a shift-by-register operand
80
;; These really stall in the decoder, in order to read
81
;; the shift value in a second cycle. Pretend we take two cycles in
82
;; the execute stage.
83
;; Default latency 2.  The ALU execute unit is held for two cycles
;; ("1020a_e*2"), followed by one cycle each in memory and write.
(define_insn_reservation "1020alu_shift_reg_op" 2
84
 (and (eq_attr "tune" "arm1020e,arm1022e")
85
      (eq_attr "type" "alu_shift_reg"))
86
 "1020a_e*2,1020a_m,1020a_w")
87
 
88
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
89
;; Multiplication Instructions
90
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
91
 
92
;; Multiplication instructions loop in the execute stage until the
93
;; instruction has been passed through the multiplier array enough
94
;; times.
95
 
96
;; The result of the "smul" and "smulw" instructions is not available
97
;; until after the memory stage.
98
;; Default latency 2.  One cycle in each of the ALU execute, memory and
;; write units; selected by the "insn" attribute rather than "type".
(define_insn_reservation "1020mult1" 2
99
 (and (eq_attr "tune" "arm1020e,arm1022e")
100
      (eq_attr "insn" "smulxy,smulwy"))
101
 "1020a_e,1020a_m,1020a_w")
102
 
103
;; The "smlaxy" and "smlawx" instructions require two iterations through
104
;; the execute stage; the result is available immediately following
105
;; the execute stage.  ("smlalxy" has a longer latency; see 1020mult3.)
106
(define_insn_reservation "1020mult2" 2
107
 (and (eq_attr "tune" "arm1020e,arm1022e")
108
      (eq_attr "insn" "smlaxy,smlawx"))
109
 "1020a_e*2,1020a_m,1020a_w")
110
 
111
;; The "smlalxy", "mul", and "mla" instructions require two iterations
112
;; through the execute stage; the result is not available until after
113
;; the memory stage.
114
;; Default latency 3.  Two cycles in the ALU execute unit, then one
;; cycle each in memory and write.
(define_insn_reservation "1020mult3" 3
115
 (and (eq_attr "tune" "arm1020e,arm1022e")
116
      (eq_attr "insn" "smlalxy,mul,mla"))
117
 "1020a_e*2,1020a_m,1020a_w")
118
 
119
;; The "muls" and "mlas" instructions loop in the execute stage for
120
;; four iterations in order to set the flags.  The value result is
121
;; available after three iterations.
122
;; Default latency 3 (the value result), while the ALU execute unit
;; stays reserved for four cycles, then memory and write.
(define_insn_reservation "1020mult4" 3
123
 (and (eq_attr "tune" "arm1020e,arm1022e")
124
      (eq_attr "insn" "muls,mlas"))
125
 "1020a_e*4,1020a_m,1020a_w")
126
 
127
;; Long multiply instructions that produce two registers of
128
;; output (such as umull) make their results available in two cycles;
129
;; the least significant word is available before the most significant
130
;; word.  That fact is not modeled; instead, the instructions are
131
;; described as if the entire result was available at the end of the
132
;; cycle in which both words are available.
133
 
134
;; The "umull", "umlal", "smull", and "smlal" instructions all take
135
;; three iterations through the execute cycle, and make their results
136
;; available after the memory cycle.
137
;; Default latency 4.  Three cycles in the ALU execute unit, then one
;; cycle each in memory and write.
(define_insn_reservation "1020mult5" 4
138
 (and (eq_attr "tune" "arm1020e,arm1022e")
139
      (eq_attr "insn" "umull,umlal,smull,smlal"))
140
 "1020a_e*3,1020a_m,1020a_w")
141
 
142
;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
143
;; the execute stage for five iterations in order to set the flags.
144
;; The value result is available after four iterations.
145
;; Default latency 4 (the value result), while the ALU execute unit
;; stays reserved for five cycles, then memory and write.
(define_insn_reservation "1020mult6" 4
146
 (and (eq_attr "tune" "arm1020e,arm1022e")
147
      (eq_attr "insn" "umulls,umlals,smulls,smlals"))
148
 "1020a_e*5,1020a_m,1020a_w")
149
 
150
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
151
;; Load/Store Instructions
152
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
153
 
154
;; The models for load/store instructions do not accurately describe
155
;; the difference between operations with and without a base register writeback
156
;; (such as "ldm!").  These models assume that all memory references
157
;; hit in dcache.
158
 
159
;; LSU instructions require six cycles to execute.  They use the ALU
160
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
161
;; three through six.
162
;; Loads and stores which use a scaled register offset or scaled
163
;; register pre-indexed addressing mode take three cycles EXCEPT for
164
;; those that are base + offset with LSL of 0 or 2, or base - offset
165
;; with LSL of zero.  The remainder take 1 cycle to execute.
166
;; For 4-byte loads there is a bypass from the load stage.
167
 
168
;; Byte and single-word loads.  Default latency 2.  The first cycle
;; reserves both the ALU and LSU execute units; the LSU memory and
;; write units follow on the next two cycles.
(define_insn_reservation "1020load1_op" 2
169
 (and (eq_attr "tune" "arm1020e,arm1022e")
170
      (eq_attr "type" "load_byte,load1"))
171
 "1020a_e+1020l_e,1020l_m,1020l_w")
172
 
173
;; Single-register stores.  Default latency 0; unit usage is the same
;; as for "1020load1_op".
(define_insn_reservation "1020store1_op" 0
174
 (and (eq_attr "tune" "arm1020e,arm1022e")
175
      (eq_attr "type" "store1"))
176
 "1020a_e+1020l_e,1020l_m,1020l_w")
177
 
178
;; A load's result can be stored by an immediately following store
179
;; Latency 1 (instead of the default 2) from "1020load1_op" to
;; "1020store1_op" when the guard function accepts the pair.
;; NOTE(review): arm_no_early_store_addr_dep is defined outside this
;; file; presumably it checks that the loaded value is the store's data
;; and not part of its address -- confirm against its definition.
(define_bypass 1 "1020load1_op" "1020store1_op" "arm_no_early_store_addr_dep")
180
 
181
;; On a LDM/STM operation, the LSU pipeline iterates until all of the
182
;; registers have been processed.
183
;;
184
;; The time it takes to load the data depends on whether or not the
185
;; base address is 64-bit aligned; if it is not, an additional cycle
186
;; is required.  This model assumes that the address is always 64-bit
187
;; aligned.  Because the processor can load two registers per cycle,
188
;; that assumption means that we use the same instruction reservations
189
;; for loading 2k and 2k - 1 registers.
190
;;
191
;; The ALU pipeline is decoupled after the first cycle unless there is
192
;; a register dependency; the dependency is cleared as soon as the LDM/STM
193
;; has dealt with the corresponding register.  So for example,
194
;;  stmia sp, {r0-r3}
195
;;  add r0, r0, #4
196
;; will have one fewer stalls than
197
;;  stmia sp, {r0-r3}
198
;;  add r3, r3, #4
199
;;
200
;; As with ALU operations, if one of the destination registers is the
201
;; PC, there are additional stalls; that is not modeled.
202
 
203
;; Two-register loads.  Default latency 2.  First cycle reserves the
;; ALU and LSU execute units together; LSU memory and write follow.
(define_insn_reservation "1020load2_op" 2
204
 (and (eq_attr "tune" "arm1020e,arm1022e")
205
      (eq_attr "type" "load2"))
206
 "1020a_e+1020l_e,1020l_m,1020l_w")
207
 
208
;; Two-register stores.  Default latency 0; unit usage is the same as
;; for "1020load2_op".
(define_insn_reservation "1020store2_op" 0
209
 (and (eq_attr "tune" "arm1020e,arm1022e")
210
      (eq_attr "type" "store2"))
211
 "1020a_e+1020l_e,1020l_m,1020l_w")
212
 
213
;; Three- and four-register loads.  Default latency 3.  Cycle 1: ALU
;; execute + LSU execute; cycle 2: LSU execute + LSU memory; cycle 3:
;; LSU memory; cycle 4: LSU write.
(define_insn_reservation "1020load34_op" 3
214
 (and (eq_attr "tune" "arm1020e,arm1022e")
215
      (eq_attr "type" "load3,load4"))
216
 "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")
217
 
218
;; Three- and four-register stores.  Default latency 0; the four-cycle
;; unit usage is the same as for "1020load34_op".
(define_insn_reservation "1020store34_op" 0
219
 (and (eq_attr "tune" "arm1020e,arm1022e")
220
      (eq_attr "type" "store3,store4"))
221
 "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")
222
 
223
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
224
;; Branch and Call Instructions
225
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
226
 
227
;; Branch instructions are difficult to model accurately.  The ARM
228
;; core can predict most branches.  If the branch is predicted
229
;; correctly, and predicted early enough, the branch can be completely
230
;; eliminated from the instruction stream.  Some branches can
231
;; therefore appear to require zero cycles to execute.  We assume that
232
;; all branches are predicted correctly, and that the latency is
233
;; therefore the minimum value.
234
 
235
;; Default latency 0; only the ALU execute unit is reserved, for a
;; single cycle.
(define_insn_reservation "1020branch_op" 0
236
 (and (eq_attr "tune" "arm1020e,arm1022e")
237
      (eq_attr "type" "branch"))
238
 "1020a_e")
239
 
240
;; The latency for a call is not predictable.  Therefore, we use 32 as
241
;; roughly equivalent to positive infinity.
242
 
243
;; Default latency 32, and the ALU execute unit is reserved for the
;; same 32 cycles.
(define_insn_reservation "1020call_op" 32
244
 (and (eq_attr "tune" "arm1020e,arm1022e")
245
      (eq_attr "type" "call"))
246
 "1020a_e*32")
247
 
248
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
249
;; VFP
250
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
251
 
252
;; Unit reserved by the ffarith, farith, f_cvt and fmul reservations
;; (and in the first cycle of fdivd).
(define_cpu_unit "v10_fmac" "arm1020e")
253
 
254
;; Unit reserved by the fdivs and fdivd (sqrt/division) reservations.
(define_cpu_unit "v10_ds" "arm1020e")
255
 
256
;; Unit reserved in the first cycle of the f_flag (fmstat) reservation.
(define_cpu_unit "v10_fmstat" "arm1020e")
257
 
258
;; VFP load/store units, reserved by the f_loads/f_loadd,
;; f_stores/f_stored and r_2_f reservations.
(define_cpu_unit "v10_ls1,v10_ls2,v10_ls3" "arm1020e")
259
 
260
;; fmstat is a serializing instruction.  It will stall the core until
261
;; the mac and ds units have completed.
262
;; v10_fmstat may not be reserved in the same cycle as v10_fmac or
;; v10_ds (and vice versa).
(exclusion_set "v10_fmac,v10_ds" "v10_fmstat")
263
 
264
;; "vfp10" is "yes" when tuning for arm1020e/arm1022e and the "fpu"
;; attribute is "vfp", otherwise "no".  The (const ...) wrapper marks
;; the value as independent of the individual insn.  All VFP
;; reservations below test this attribute.
(define_attr "vfp10" "yes,no"
265
  (const (if_then_else (and (eq_attr "tune" "arm1020e,arm1022e")
266
                            (eq_attr "fpu" "vfp"))
267
                       (const_string "yes") (const_string "no"))))
268
 
269
;; The VFP "type" attributes differ from those used in the FPA model.
270
;; ffarith      Fast floating point insns, e.g. abs, neg, cpy, cmp.
271
;; farith       Most arithmetic insns.
272
;; fmul         Double precision multiply.
273
;; fdivs        Single precision sqrt or division.
274
;; fdivd        Double precision sqrt or division.
275
;; f_flag       fmstat operation
276
;; f_loads/d    Single/double precision floating point load from memory.
277
;; f_stores/d   Single/double precision floating point store to memory.
278
;; f_2_r        Transfer vfp to arm reg.
279
;; r_2_f        Transfer arm to vfp reg.
280
 
281
;; Note, no instruction can issue to the VFP if the core is stalled in the
282
;; first execute state.  We model this by using 1020a_e in the first cycle.
283
;; Default latency 5.  Reserves the ALU execute unit and v10_fmac
;; together for one cycle.
(define_insn_reservation "v10_ffarith" 5
284
 (and (eq_attr "vfp10" "yes")
285
      (eq_attr "type" "ffarith"))
286
 "1020a_e+v10_fmac")
287
 
288
;; Default latency 5.  Same unit usage as "v10_ffarith": ALU execute
;; and v10_fmac together for one cycle.
(define_insn_reservation "v10_farith" 5
289
 (and (eq_attr "vfp10" "yes")
290
      (eq_attr "type" "farith"))
291
 "1020a_e+v10_fmac")
292
 
293
;; Insns of type "f_cvt".  Default latency 5; ALU execute and v10_fmac
;; together for one cycle.
(define_insn_reservation "v10_cvt" 5
294
 (and (eq_attr "vfp10" "yes")
295
      (eq_attr "type" "f_cvt"))
296
 "1020a_e+v10_fmac")
297
 
298
;; Default latency 6.  v10_fmac is held for two cycles; the ALU execute
;; unit is reserved only in the first of them.
(define_insn_reservation "v10_fmul" 6
299
 (and (eq_attr "vfp10" "yes")
300
      (eq_attr "type" "fmul"))
301
 "1020a_e+v10_fmac*2")
302
 
303
;; Default latency 18.  v10_ds is held for 14 cycles; the ALU execute
;; unit is reserved only in the first of them.
(define_insn_reservation "v10_fdivs" 18
304
 (and (eq_attr "vfp10" "yes")
305
      (eq_attr "type" "fdivs"))
306
 "1020a_e+v10_ds*14")
307
 
308
;; Default latency 32.  v10_ds is held for 28 cycles; the ALU execute
;; unit and v10_fmac are reserved only in the first of them.
(define_insn_reservation "v10_fdivd" 32
309
 (and (eq_attr "vfp10" "yes")
310
      (eq_attr "type" "fdivd"))
311
 "1020a_e+v10_fmac+v10_ds*28")
312
 
313
;; Default latency 4.  Cycle 1: ALU execute + LSU execute + v10_ls1;
;; cycle 2: v10_ls2.
(define_insn_reservation "v10_floads" 4
314
 (and (eq_attr "vfp10" "yes")
315
      (eq_attr "type" "f_loads"))
316
 "1020a_e+1020l_e+v10_ls1,v10_ls2")
317
 
318
;; We model a load of a double as needing all the vfp ls* stage in cycle 1.
319
;; This gives the correct mix between single- and double-precision loads where a flds
320
;; followed by an fldd will stall for one cycle, but two back-to-back fldd
321
;; insns stall for two cycles.
322
;; Default latency 5.  Cycle 1: ALU execute + LSU execute + all three
;; v10_ls units; cycle 2: v10_ls2 + v10_ls3; cycle 3: v10_ls3.
(define_insn_reservation "v10_floadd" 5
323
 (and (eq_attr "vfp10" "yes")
324
      (eq_attr "type" "f_loadd"))
325
 "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")
326
 
327
;; Moves to/from arm regs also use the load/store pipeline.
328
 
329
;; ARM-to-VFP register transfer (type "r_2_f").  Default latency 4;
;; same unit usage as "v10_floads".
(define_insn_reservation "v10_c2v" 4
330
 (and (eq_attr "vfp10" "yes")
331
      (eq_attr "type" "r_2_f"))
332
 "1020a_e+1020l_e+v10_ls1,v10_ls2")
333
 
334
;; Default latency 1.  Cycle 1: ALU execute + LSU execute + v10_ls1;
;; cycle 2: v10_ls2.
(define_insn_reservation "v10_fstores" 1
335
 (and (eq_attr "vfp10" "yes")
336
      (eq_attr "type" "f_stores"))
337
 "1020a_e+1020l_e+v10_ls1,v10_ls2")
338
 
339
;; Default latency 1.  Unit usage matches "v10_floadd": all three
;; v10_ls units in cycle 1, then v10_ls2 + v10_ls3, then v10_ls3.
(define_insn_reservation "v10_fstored" 1
340
 (and (eq_attr "vfp10" "yes")
341
      (eq_attr "type" "f_stored"))
342
 "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")
343
 
344
;; VFP-to-ARM register transfer (type "f_2_r").  Default latency 1.
;; Reserves the core units only (ALU execute + LSU execute, then LSU
;; memory, then LSU write); none of the v10_ls units are used.
(define_insn_reservation "v10_v2c" 1
345
 (and (eq_attr "vfp10" "yes")
346
      (eq_attr "type" "f_2_r"))
347
 "1020a_e+1020l_e,1020l_m,1020l_w")
348
 
349
;; fmstat (type "f_flag").  Default latency 2.  Cycle 1: ALU execute +
;; v10_fmstat; cycle 2: ALU execute + LSU execute; then LSU memory and
;; LSU write.
(define_insn_reservation "v10_to_cpsr" 2
350
 (and (eq_attr "vfp10" "yes")
351
      (eq_attr "type" "f_flag"))
352
 "1020a_e+v10_fmstat,1020a_e+1020l_e,1020l_m,1020l_w")
353
 
354
;; VFP bypasses
355
 
356
;; There are bypasses for most operations other than store
357
 
358
;; Latency 3 (instead of the default 4) from v10_c2v / v10_floads to
;; any of the listed arithmetic, divide and convert reservations.
(define_bypass 3
359
 "v10_c2v,v10_floads"
360
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd,v10_cvt")
361
 
362
;; Latency 4 (instead of the default 5) from v10_floadd to the listed
;; arithmetic and divide reservations.
;; NOTE(review): unlike the v10_c2v/v10_floads bypass, the consumer
;; list here omits v10_cvt -- confirm whether that is intentional.
(define_bypass 4
363
 "v10_floadd"
364
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")
365
 
366
;; Arithmetic to other arithmetic saves a cycle due to forwarding
367
;; Latency 4 (instead of the default 5) from v10_ffarith / v10_farith
;; to the listed arithmetic and divide reservations.
(define_bypass 4
368
 "v10_ffarith,v10_farith"
369
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")
370
 
371
;; Latency 5 (instead of the default 6) from v10_fmul to the listed
;; arithmetic and divide reservations.
(define_bypass 5
372
 "v10_fmul"
373
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")
374
 
375
;; Latency 17 (instead of the default 18) from v10_fdivs to the listed
;; arithmetic and divide reservations.
(define_bypass 17
376
 "v10_fdivs"
377
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")
378
 
379
;; Latency 31 (instead of the default 32) from v10_fdivd to the listed
;; arithmetic and divide reservations.
(define_bypass 31
380
 "v10_fdivd"
381
 "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")
382
 
383
;; VFP anti-dependencies.
384
 
385
;; There is one anti-dependence in the following case (not yet modelled):
386
;; - After a store: one extra cycle for both fsts and fstd
387
;; Note, back-to-back fstd instructions will overload the load/store datapath
388
;; causing a two-cycle stall.

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.