URL https://opencores.org/ocsvn/scarts/scarts/trunk

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [gcc/] [config/] [arm/] [arm1136jfs.md] - Blame information for rev 12

Details | Compare with Previous | View Log


;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.
 
;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;
 
;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.
 
;; The single automaton to which every CPU unit declared in this file
;; is attached.
(define_automaton "arm1136jfs")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; There are three distinct pipelines (page 1-26 and following):
;;
;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
;;   fetch (2), decode, and issue stages.  Since this is always involved,
;;   we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
;;   and saturation stages.  The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
;;   MAC1 through MAC3, and a fourth writeback stage.
;;
;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
;;   which operate in lockstep.  Results from either pipeline will be
;;   moved into the writeback stage.  Because the two pipelines operate
;;   in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
;;   data cache (2), and writeback stages.  (Note that this pipeline,
;;   including the writeback stage, is independent from the ALU & MAC pipes.)
 
;; Units of the combined "execute" pipeline (ALU and MAC in lockstep),
;; one unit per stage; e_wb is the writeback stage they share.
(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")     ; ALU and MAC
; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
;; Units of the load/store pipeline, one unit per stage.  The names
;; presumably follow the stage order given in the pipeline description:
;; l_a = address generation, l_dc1/l_dc2 = data cache (1)/(2),
;; l_wb = LSU writeback.
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; ALU instructions pass through eight pipeline stages in total: the four
;; decode stages (not modelled here) followed by the four stages of the
;; ALU pipeline.  The results are available after the alu stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modelled here.
 
;; ALU operations with no shifted operand.
;; Occupies e_1, e_2, e_3 and e_wb for one cycle each, in that order;
;; the default result latency is 2 cycles.
(define_insn_reservation "11_alu_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu"))
 "e_1,e_2,e_3,e_wb")
 
;; ALU operations with a shift-by-constant operand.
;; Same reservation and default latency (2) as 11_alu_op; it is kept as a
;; separate reservation so that bypasses can treat it differently as a
;; consumer (see the guarded define_bypass entries that name it).
(define_insn_reservation "11_alu_shift_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift"))
 "e_1,e_2,e_3,e_wb")
 
;; ALU operations with a shift-by-register operand.
;; These really stall in the decoder, in order to read
;; the shift value in a second cycle.  The decoder is not modelled, so
;; pretend we take two cycles in the shift stage instead: e_1 is held for
;; two consecutive cycles ("e_1*2") and the default latency is 3, one
;; more than for the other ALU reservations.
(define_insn_reservation "11_alu_shift_reg_op" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift_reg"))
 "e_1*2,e_2,e_3,e_wb")
 
;; alu_ops can start sooner, if there is no shifter dependency.
;;
;; Each bypass below shortens the producer's default latency by one cycle
;; (2 -> 1 for 11_alu_op and 11_alu_shift_op, 3 -> 2 for
;; 11_alu_shift_reg_op).  Where a third string is given it names a guard
;; function; the shorter latency is used only when that guard accepts the
;; producer/consumer pair, otherwise the default latency applies.  The
;; guard functions are defined outside this file.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

;; The same one-cycle reduction when the consumer is any of the multiply
;; reservations, subject to the arm_no_early_mul_dep guard.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.
 
;; Multiply and multiply-accumulate results are available after four stages.
;; The first execute unit (e_1) is held for two cycles before the
;; instruction moves on through e_2, e_3 and e_wb; default latency is 4.
(define_insn_reservation "11_mult1" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "mul,mla"))
 "e_1*2,e_2,e_3,e_wb")
 
;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): as written, this reservation has the same default
;; latency (4) and the same unit usage as 11_mult1, so the "three more
;; cycles" mentioned above are not modelled here.  Confirm against the
;; ARM1136JF-S TRM whether the comment or the numbers should change.
(define_insn_reservation "11_mult2" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "muls,mlas"))
 "e_1*2,e_2,e_3,e_wb")
 
;; Results of 11_mult1/11_mult2 reach the consumers listed below one cycle
;; early (latency 3 instead of the default 4).  Bypasses with a guard
;; apply only when the named guard function accepts the pair.  Of the
;; store reservations, only 11_store1 is listed as a consumer.
(define_bypass 3 "11_mult1,11_mult2"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_store1"
               "arm_no_early_store_addr_dep")
 
;; Signed and unsigned multiply long results are available across two cycles;
;; the less significant word is available one cycle before the more significant
;; word.  Here we conservatively wait until both are available, which is
;; after three iterations and the memory cycle.  The same is also true of
;; the two multiply-accumulate instructions.
;;
;; In this model that is three cycles in e_1 ("e_1*3"), then e_2 and e_3,
;; then two cycles in e_wb ("e_wb*2"); default latency is 5.
;; NOTE(review): no unit in this automaton is called a "memory" stage;
;; the wording above presumably refers to the writeback stage -- confirm.
(define_insn_reservation "11_mult3" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smull,umull,smlal,umlal"))
 "e_1*3,e_2,e_3,e_wb*2")
 
;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): as written, this reservation has the same default
;; latency (5) and the same unit usage as 11_mult3, so the "three more
;; cycles" mentioned above are not modelled here.  Confirm against the
;; ARM1136JF-S TRM whether the comment or the numbers should change.
(define_insn_reservation "11_mult4" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulls,umulls,smlals,umlals"))
 "e_1*3,e_2,e_3,e_wb*2")
 
;; Results of 11_mult3/11_mult4 reach the consumers listed below one cycle
;; early (latency 4 instead of the default 5).  Bypasses with a guard
;; apply only when the named guard function accepts the pair.
(define_bypass 4 "11_mult3,11_mult4"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_store1"
               "arm_no_early_store_addr_dep")
 
;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
;; of high and low halves of the argument registers.  They take a single
;; pass through the pipeline and make the result available after three
;; cycles.
;; Unlike the other multiply reservations, e_1 is not repeated: each
;; execute unit is occupied for exactly one cycle.  Default latency is 3.
(define_insn_reservation "11_mult5" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
 "e_1,e_2,e_3,e_wb")
 
;; Results of 11_mult5 reach the consumers listed below one cycle early
;; (latency 2 instead of the default 3).  Bypasses with a guard apply
;; only when the named guard function accepts the pair.
(define_bypass 2 "11_mult5"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5"
               "11_alu_op")
(define_bypass 2 "11_mult5"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5"
               "11_store1"
               "arm_no_early_store_addr_dep")
 
;; The same idea as the 16x16 multiplies, then the 32-bit result is added
;; to a 64-bit quantity.
;; Modelled as two cycles in e_1, one each in e_2 and e_3, and two cycles
;; in e_wb; default latency is 4.
(define_insn_reservation "11_mult6" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smlalxy"))
 "e_1*2,e_2,e_3,e_wb*2")
 
;; Signed 32x32 multiply, then the most significant 32 bits are extracted
;; and are available after the memory stage.
;; Modelled with the same unit usage and default latency (4) as 11_mult1.
;; NOTE(review): no unit in this automaton is called a "memory" stage;
;; the wording above presumably refers to a later execute stage -- confirm.
(define_insn_reservation "11_mult7" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smmul,smmulr"))
 "e_1*2,e_2,e_3,e_wb")
 
;; Results of 11_mult6/11_mult7 reach the consumers listed below one cycle
;; early (latency 3 instead of the default 4).  Bypasses with a guard
;; apply only when the named guard function accepts the pair.
(define_bypass 3 "11_mult6,11_mult7"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_store1"
               "arm_no_early_store_addr_dep")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; These vary greatly depending on their arguments and the results of
;; branch prediction.  Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.
;;
;; Accordingly a branch has latency 0 and reserves no units ("nothing").

(define_insn_reservation "11_branches" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "branch"))
 "nothing")
 
;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
;; Like branches, a call reserves no units ("nothing"); only the latency
;; of 32 affects scheduling.
(define_insn_reservation "11_call" 32
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "call"))
 "nothing")
 
;; Branches are predicted. A correctly predicted branch will be no
;; cost, but we're conservative here, and use the timings a
;; late-register would give us.
;;
;; Each bypass is one cycle shorter than the producer's default latency
;; and is unconditional (no guard function).
;; NOTE(review): 11_loadb has no bypass to 11_branches here, so its
;; default latency applies -- confirm that this is intentional.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_branches")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_branches")
(define_bypass 2 "11_load1,11_load2"
               "11_branches")
(define_bypass 3 "11_load34"
               "11_branches")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback.
;; These models assume that all memory references hit in dcache.  Also,
;; if the PC is one of the registers involved, there are additional stalls
;; not modelled here.  Addressing modes are also not modelled.
 
;; Loads of type "load1".  The first cycle reserves the LSU address unit
;; (l_a) and the first execute unit (e_1) together ("l_a+e_1"); the
;; instruction then spends one cycle each in l_dc1, l_dc2 and l_wb.
;; Default latency is 3.
(define_insn_reservation "11_load1" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load byte results are not available until the writeback stage, where
;; the correct byte is extracted.
;; Unit usage is the same as for 11_load1; only the default latency
;; differs (4 instead of 3).

(define_insn_reservation "11_loadb" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load_byte"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Stores of type "store1".  Unit usage is the same as for 11_load1; the
;; default latency is 0.  This is the only store reservation that the
;; define_bypass entries in this file name as a consumer.
(define_insn_reservation "11_store1" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load/store double words into adjacent registers.  The timing and
;; latencies are different depending on whether the address is 64-bit
;; aligned.  This model assumes that it is.
;; Under that assumption both reservations use the same units as the
;; single-register forms, with default latency 3 for the load and 0 for
;; the store.
(define_insn_reservation "11_load2" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store2" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load/store multiple registers.  Two registers are stored per cycle.
;; Actual timing depends on how many registers are affected, so we
;; optimistically schedule a low latency.
;; Both reservations hold l_dc1 for two cycles ("l_dc1*2"); the load has
;; default latency 4 and the store 0.  Types load3/load4 share one
;; reservation, as do store3/store4.
(define_insn_reservation "11_load34" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load3,load4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")

(define_insn_reservation "11_store34" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store3,store4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")
 
;; A store can start immediately after an alu op, if that alu op does
;; not provide part of the address to access.
;; The latency is one cycle below the ALU producer's default (2 -> 1,
;; 3 -> 2) when arm_no_early_store_addr_dep accepts the pair.
;; NOTE(review): only 11_store1 is listed as a consumer; 11_store2 and
;; 11_store34 keep the default latency -- confirm that this is intended.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_store1"
               "arm_no_early_store_addr_dep")
 
;; An alu op can start sooner after a load, if that alu op does not
;; have an early register dependency on the load.
;; The latency is one cycle below the load's default: 3 -> 2 for
;; 11_load1 and 4 -> 3 for 11_loadb.
;; NOTE(review): there are no corresponding entries for 11_load2 or
;; 11_load34, which therefore keep their default latencies towards ALU
;; consumers -- confirm that this is intended.
(define_bypass 2 "11_load1"
               "11_alu_op")
(define_bypass 2 "11_load1"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

(define_bypass 3 "11_loadb"
               "11_alu_op")
(define_bypass 3 "11_loadb"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
 
;; A mul op can start sooner after a load, if that mul op does not
;; have an early multiply dependency.
;; The latency is one cycle below the load's default (3 -> 2 for
;; 11_load1, 4 -> 3 for 11_load34 and 11_loadb) when arm_no_early_mul_dep
;; accepts the pair.
;; NOTE(review): 11_load2 is not listed as a producer here -- confirm.
(define_bypass 2 "11_load1"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_load34"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
 
;; A store can start sooner after a load, if that load does not
;; produce part of the address to access.
;; The latency is one cycle below the load's default (3 -> 2 for
;; 11_load1, 4 -> 3 for 11_loadb) when arm_no_early_store_addr_dep
;; accepts the pair.
;; NOTE(review): only 11_load1/11_loadb producers and the 11_store1
;; consumer are covered -- confirm that this is intended.
(define_bypass 2 "11_load1"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb"
               "11_store1"
               "arm_no_early_store_addr_dep")

Browse

Tools

Subversion Repositories scarts

[/] [scarts/] [trunk/] [toolchain/] [scarts-gcc/] [gcc-4.1.1/] [gcc/] [config/] [arm/] [arm1136jfs.md] - Blame information for rev 12

Line No.	Rev	Author	Line
1	12	jlechner	`;; ARM 1136J[F]-S Pipeline Description`
2			`;; Copyright (C) 2003 Free Software Foundation, Inc.`
3			`;; Written by CodeSourcery, LLC.`
4			`;;`
5			`;; This file is part of GCC.`
6			`;;`
7			`;; GCC is free software; you can redistribute it and/or modify it`
8			`;; under the terms of the GNU General Public License as published by`
9			`;; the Free Software Foundation; either version 2, or (at your option)`
10			`;; any later version.`
11			`;;`
12			`;; GCC is distributed in the hope that it will be useful, but`
13			`;; WITHOUT ANY WARRANTY; without even the implied warranty of`
14			`;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
15			`;; General Public License for more details.`
16			`;;`
17			`;; You should have received a copy of the GNU General Public License`
18			`;; along with GCC; see the file COPYING. If not, write to the Free`
19			`;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA`
20			`;; 02110-1301, USA. */`
21
22			`;; These descriptions are based on the information contained in the`
23			`;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM`
24			`;; Limited.`
25			`;;`
26
27			`;; This automaton provides a pipeline description for the ARM`
28			`;; 1136J-S and 1136JF-S cores.`
29			`;;`
30			`;; The model given here assumes that the condition for all conditional`
31			`;; instructions is "true", i.e., that all of the instructions are`
32			`;; actually executed.`
33
34			`(define_automaton "arm1136jfs")`
35
36			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
37			`;; Pipelines`
38			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
39
40			`;; There are three distinct pipelines (page 1-26 and following):`
41			`;;`
42			`;; - A 4-stage decode pipeline, shared by all three. It has fetch (1),`
43			`;; fetch (2), decode, and issue stages. Since this is always involved,`
44			`;; we do not model it in the scheduler.`
45			`;;`
46			`;; - A 4-stage ALU pipeline. It has shifter, ALU (main integer operations),`
47			`;; and saturation stages. The fourth stage is writeback; see below.`
48			`;;`
49			`;; - A 4-stage multiply-accumulate pipeline. It has three stages, called`
50			`;; MAC1 through MAC3, and a fourth writeback stage.`
51			`;;`
52			`;; The 4th-stage writeback is shared between the ALU and MAC pipelines,`
53			`;; which operate in lockstep. Results from either pipeline will be`
54			`;; moved into the writeback stage. Because the two pipelines operate`
55			`;; in lockstep, we schedule them as a single "execute" pipeline.`
56			`;;`
57			`;; - A 4-stage LSU pipeline. It has address generation, data cache (1),`
58			`;; data cache (2), and writeback stages. (Note that this pipeline,`
59			`;; including the writeback stage, is independent from the ALU & LSU pipes.)`
60
61			`(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs") ; ALU and MAC`
62			`; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3`
63			`(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store`
64
65			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
66			`;; ALU Instructions`
67			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
68
69			`;; ALU instructions require eight cycles to execute, and use the ALU`
70			`;; pipeline in each of the eight stages. The results are available`
71			`;; after the alu stage has finished.`
72			`;;`
73			`;; If the destination register is the PC, the pipelines are stalled`
74			`;; for several cycles. That case is not modelled here.`
75
76			`;; ALU operations with no shifted operand`
77			`(define_insn_reservation "11_alu_op" 2`
78			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
79			`(eq_attr "type" "alu"))`
80			`"e_1,e_2,e_3,e_wb")`
81
82			`;; ALU operations with a shift-by-constant operand`
83			`(define_insn_reservation "11_alu_shift_op" 2`
84			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
85			`(eq_attr "type" "alu_shift"))`
86			`"e_1,e_2,e_3,e_wb")`
87
88			`;; ALU operations with a shift-by-register operand`
89			`;; These really stall in the decoder, in order to read`
90			`;; the shift value in a second cycle. Pretend we take two cycles in`
91			`;; the shift stage.`
92			`(define_insn_reservation "11_alu_shift_reg_op" 3`
93			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
94			`(eq_attr "type" "alu_shift_reg"))`
95			`"e_1*2,e_2,e_3,e_wb")`
96
97			`;; alu_ops can start sooner, if there is no shifter dependency`
98			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
99			`"11_alu_op")`
100			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
101			`"11_alu_shift_op"`
102			`"arm_no_early_alu_shift_value_dep")`
103			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
104			`"11_alu_shift_reg_op"`
105			`"arm_no_early_alu_shift_dep")`
106			`(define_bypass 2 "11_alu_shift_reg_op"`
107			`"11_alu_op")`
108			`(define_bypass 2 "11_alu_shift_reg_op"`
109			`"11_alu_shift_op"`
110			`"arm_no_early_alu_shift_value_dep")`
111			`(define_bypass 2 "11_alu_shift_reg_op"`
112			`"11_alu_shift_reg_op"`
113			`"arm_no_early_alu_shift_dep")`
114
115			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
116			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
117			`"arm_no_early_mul_dep")`
118			`(define_bypass 2 "11_alu_shift_reg_op"`
119			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
120			`"arm_no_early_mul_dep")`
121
122			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
123			`;; Multiplication Instructions`
124			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
125
126			`;; Multiplication instructions loop in the first two execute stages until`
127			`;; the instruction has been passed through the multiplier array enough`
128			`;; times.`
129
130			`;; Multiply and multiply-accumulate results are available after four stages.`
131			`(define_insn_reservation "11_mult1" 4`
132			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
133			`(eq_attr "insn" "mul,mla"))`
134			`"e_1*2,e_2,e_3,e_wb")`
135
136			`;; The *S variants set the condition flags, which requires three more cycles.`
137			`(define_insn_reservation "11_mult2" 4`
138			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
139			`(eq_attr "insn" "muls,mlas"))`
140			`"e_1*2,e_2,e_3,e_wb")`
141
142			`(define_bypass 3 "11_mult1,11_mult2"`
143			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
144			`"arm_no_early_mul_dep")`
145			`(define_bypass 3 "11_mult1,11_mult2"`
146			`"11_alu_op")`
147			`(define_bypass 3 "11_mult1,11_mult2"`
148			`"11_alu_shift_op"`
149			`"arm_no_early_alu_shift_value_dep")`
150			`(define_bypass 3 "11_mult1,11_mult2"`
151			`"11_alu_shift_reg_op"`
152			`"arm_no_early_alu_shift_dep")`
153			`(define_bypass 3 "11_mult1,11_mult2"`
154			`"11_store1"`
155			`"arm_no_early_store_addr_dep")`
156
157			`;; Signed and unsigned multiply long results are available across two cycles;`
158			`;; the less significant word is available one cycle before the more significant`
159			`;; word. Here we conservatively wait until both are available, which is`
160			`;; after three iterations and the memory cycle. The same is also true of`
161			`;; the two multiply-accumulate instructions.`
162			`(define_insn_reservation "11_mult3" 5`
163			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
164			`(eq_attr "insn" "smull,umull,smlal,umlal"))`
165			`"e_1*3,e_2,e_3,e_wb*2")`
166
167			`;; The *S variants set the condition flags, which requires three more cycles.`
168			`(define_insn_reservation "11_mult4" 5`
169			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
170			`(eq_attr "insn" "smulls,umulls,smlals,umlals"))`
171			`"e_1*3,e_2,e_3,e_wb*2")`
172
173			`(define_bypass 4 "11_mult3,11_mult4"`
174			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
175			`"arm_no_early_mul_dep")`
176			`(define_bypass 4 "11_mult3,11_mult4"`
177			`"11_alu_op")`
178			`(define_bypass 4 "11_mult3,11_mult4"`
179			`"11_alu_shift_op"`
180			`"arm_no_early_alu_shift_value_dep")`
181			`(define_bypass 4 "11_mult3,11_mult4"`
182			`"11_alu_shift_reg_op"`
183			`"arm_no_early_alu_shift_dep")`
184			`(define_bypass 4 "11_mult3,11_mult4"`
185			`"11_store1"`
186			`"arm_no_early_store_addr_dep")`
187
188			`;; Various 16x16->32 multiplies and multiply-accumulates, using combinations`
189			`;; of high and low halves of the argument registers. They take a single`
190			`;; pass through the pipeline and make the result available after three`
191			`;; cycles.`
192			`(define_insn_reservation "11_mult5" 3`
193			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
194			`(eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))`
195			`"e_1,e_2,e_3,e_wb")`
196
197			`(define_bypass 2 "11_mult5"`
198			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
199			`"arm_no_early_mul_dep")`
200			`(define_bypass 2 "11_mult5"`
201			`"11_alu_op")`
202			`(define_bypass 2 "11_mult5"`
203			`"11_alu_shift_op"`
204			`"arm_no_early_alu_shift_value_dep")`
205			`(define_bypass 2 "11_mult5"`
206			`"11_alu_shift_reg_op"`
207			`"arm_no_early_alu_shift_dep")`
208			`(define_bypass 2 "11_mult5"`
209			`"11_store1"`
210			`"arm_no_early_store_addr_dep")`
211
212			`;; The same idea, then the 32-bit result is added to a 64-bit quantity.`
213			`(define_insn_reservation "11_mult6" 4`
214			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
215			`(eq_attr "insn" "smlalxy"))`
216			`"e_1*2,e_2,e_3,e_wb*2")`
217
218			`;; Signed 32x32 multiply, then the most significant 32 bits are extracted`
219			`;; and are available after the memory stage.`
220			`(define_insn_reservation "11_mult7" 4`
221			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
222			`(eq_attr "insn" "smmul,smmulr"))`
223			`"e_1*2,e_2,e_3,e_wb")`
224
225			`(define_bypass 3 "11_mult6,11_mult7"`
226			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
227			`"arm_no_early_mul_dep")`
228			`(define_bypass 3 "11_mult6,11_mult7"`
229			`"11_alu_op")`
230			`(define_bypass 3 "11_mult6,11_mult7"`
231			`"11_alu_shift_op"`
232			`"arm_no_early_alu_shift_value_dep")`
233			`(define_bypass 3 "11_mult6,11_mult7"`
234			`"11_alu_shift_reg_op"`
235			`"arm_no_early_alu_shift_dep")`
236			`(define_bypass 3 "11_mult6,11_mult7"`
237			`"11_store1"`
238			`"arm_no_early_store_addr_dep")`
239
240			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
241			`;; Branch Instructions`
242			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
243
244			`;; These vary greatly depending on their arguments and the results of`
245			`;; stat prediction. Cycle count ranges from zero (unconditional branch,`
246			`;; folded dynamic prediction) to seven (incorrect predictions, etc). We`
247			`;; assume an optimal case for now, because the cost of a cache miss`
248			`;; overwhelms the cost of everything else anyhow.`
249
250			`(define_insn_reservation "11_branches" 0`
251			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
252			`(eq_attr "type" "branch"))`
253			`"nothing")`
254
255			`;; Call latencies are not predictable. A semi-arbitrary very large`
256			`;; number is used as "positive infinity" so that everything should be`
257			`;; finished by the time of return.`
258			`(define_insn_reservation "11_call" 32`
259			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
260			`(eq_attr "type" "call"))`
261			`"nothing")`
262
263			`;; Branches are predicted. A correctly predicted branch will be no`
264			`;; cost, but we're conservative here, and use the timings a`
265			`;; late-register would give us.`
266			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
267			`"11_branches")`
268			`(define_bypass 2 "11_alu_shift_reg_op"`
269			`"11_branches")`
270			`(define_bypass 2 "11_load1,11_load2"`
271			`"11_branches")`
272			`(define_bypass 3 "11_load34"`
273			`"11_branches")`
274
275			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
276			`;; Load/Store Instructions`
277			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
278
279			`;; The models for load/store instructions do not accurately describe`
280			`;; the difference between operations with a base register writeback.`
281			`;; These models assume that all memory references hit in dcache. Also,`
282			`;; if the PC is one of the registers involved, there are additional stalls`
283			`;; not modelled here. Addressing modes are also not modelled.`
284
285			`(define_insn_reservation "11_load1" 3`
286			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
287			`(eq_attr "type" "load1"))`
288			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
289
290			`;; Load byte results are not available until the writeback stage, where`
291			`;; the correct byte is extracted.`
292
293			`(define_insn_reservation "11_loadb" 4`
294			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
295			`(eq_attr "type" "load_byte"))`
296			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
297
298			`(define_insn_reservation "11_store1" 0`
299			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
300			`(eq_attr "type" "store1"))`
301			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
302
303			`;; Load/store double words into adjacent registers. The timing and`
304			`;; latencies are different depending on whether the address is 64-bit`
305			`;; aligned. This model assumes that it is.`
306			`(define_insn_reservation "11_load2" 3`
307			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
308			`(eq_attr "type" "load2"))`
309			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
310
311			`(define_insn_reservation "11_store2" 0`
312			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
313			`(eq_attr "type" "store2"))`
314			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
315
316			`;; Load/store multiple registers. Two registers are stored per cycle.`
317			`;; Actual timing depends on how many registers are affected, so we`
318			`;; optimistically schedule a low latency.`
319			`(define_insn_reservation "11_load34" 4`
320			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
321			`(eq_attr "type" "load3,load4"))`
322			`"l_a+e_1,l_dc1*2,l_dc2,l_wb")`
323
324			`(define_insn_reservation "11_store34" 0`
325			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
326			`(eq_attr "type" "store3,store4"))`
327			`"l_a+e_1,l_dc1*2,l_dc2,l_wb")`
328
329			`;; A store can start immediately after an alu op, if that alu op does`
330			`;; not provide part of the address to access.`
331			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
332			`"11_store1"`
333			`"arm_no_early_store_addr_dep")`
334			`(define_bypass 2 "11_alu_shift_reg_op"`
335			`"11_store1"`
336			`"arm_no_early_store_addr_dep")`
337
338			`;; An alu op can start sooner after a load, if that alu op does not`
339			`;; have an early register dependency on the load`
340			`(define_bypass 2 "11_load1"`
341			`"11_alu_op")`
342			`(define_bypass 2 "11_load1"`
343			`"11_alu_shift_op"`
344			`"arm_no_early_alu_shift_value_dep")`
345			`(define_bypass 2 "11_load1"`
346			`"11_alu_shift_reg_op"`
347			`"arm_no_early_alu_shift_dep")`
348
349			`(define_bypass 3 "11_loadb"`
350			`"11_alu_op")`
351			`(define_bypass 3 "11_loadb"`
352			`"11_alu_shift_op"`
353			`"arm_no_early_alu_shift_value_dep")`
354			`(define_bypass 3 "11_loadb"`
355			`"11_alu_shift_reg_op"`
356			`"arm_no_early_alu_shift_dep")`
357
358			`;; A mul op can start sooner after a load, if that mul op does not`
359			`;; have an early multiply dependency`
360			`(define_bypass 2 "11_load1"`
361			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
362			`"arm_no_early_mul_dep")`
363			`(define_bypass 3 "11_load34"`
364			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
365			`"arm_no_early_mul_dep")`
366			`(define_bypass 3 "11_loadb"`
367			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
368			`"arm_no_early_mul_dep")`
369
370			`;; A store can start sooner after a load, if that load does not`
371			`;; produce part of the address to access`
372			`(define_bypass 2 "11_load1"`
373			`"11_store1"`
374			`"arm_no_early_store_addr_dep")`
375			`(define_bypass 3 "11_loadb"`
376			`"11_store1"`
377			`"arm_no_early_store_addr_dep")`