URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [arm/] [arm1136jfs.md] - Blame information for rev 709

Details | Compare with Previous | View Log


;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003, 2007 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.  */
 
;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;
 
;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.
 
;; Declare the automaton to which every CPU unit in this file is attached.
(define_automaton "arm1136jfs")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; There are three distinct pipelines (page 1-26 and following):
;;
;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
;;   fetch (2), decode, and issue stages.  Since this is always involved,
;;   we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
;;   and saturation stages.  The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
;;   MAC1 through MAC3, and a fourth writeback stage.
;;
;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
;;   which operate in lockstep.  Results from either pipeline will be
;;   moved into the writeback stage.  Because the two pipelines operate
;;   in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
;;   data cache (2), and writeback stages.  (Note that this pipeline,
;;   including the writeback stage, is independent from the ALU & MAC pipes.)
 
;; Units of the combined "execute" pipeline (ALU and MAC in lockstep):
;; three execute stages plus the shared writeback stage.
(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")     ; ALU and MAC
; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
;; Units of the load/store pipeline.  The names presumably follow the
;; LSU stages listed above: address generation, data cache (1),
;; data cache (2) and the LSU's own writeback.
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; ALU instructions require four cycles to execute, and use the ALU
;; pipeline in each of the four stages.  The results are available
;; after the alu stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modelled here.
 
;; ALU operations with no shifted operand.
;; Applies to insns of type "alu" when tuning for arm1136js/arm1136jfs.
;; Default result latency is 2 cycles; the insn occupies e_1, e_2, e_3
;; and e_wb on four consecutive cycles.
(define_insn_reservation "11_alu_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu"))
 "e_1,e_2,e_3,e_wb")
 
;; ALU operations with a shift-by-constant operand.
;; Same default latency (2) and unit usage as "11_alu_op"; it is a
;; separate reservation so that bypasses can name it as a consumer with
;; its own guard function.
(define_insn_reservation "11_alu_shift_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift"))
 "e_1,e_2,e_3,e_wb")
 
;; ALU operations with a shift-by-register operand.
;; These really stall in the decoder, in order to read the shift value
;; in a second cycle.  The decoder is not modelled, so the stall is
;; approximated by holding the shift stage (e_1) for two cycles; the
;; default latency is accordingly 3 rather than 2.
(define_insn_reservation "11_alu_shift_reg_op" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift_reg"))
 "e_1*2,e_2,e_3,e_wb")
 
;; Result forwarding out of the ALU reservations.  A dependent insn may
;; issue one cycle earlier than the producer's default latency: after 1
;; cycle for "11_alu_op" / "11_alu_shift_op" producers and after 2 cycles
;; for "11_alu_shift_reg_op" producers.  A bypass that names a guard (the
;; arm_no_early_* functions, defined outside this file) applies only when
;; that guard accepts the producer/consumer pair; otherwise the default
;; latency is used.

;; Producer: plain or shift-by-constant ALU operation.
(define_bypass 1 "11_alu_op,11_alu_shift_op" "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")

;; Producer: shift-by-register ALU operation.
(define_bypass 2 "11_alu_shift_reg_op" "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_alu_shift_reg_op"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.
 
;; Multiply and multiply-accumulate results are available after four stages.
;; Default latency 4; the first execute stage (e_1) is held for two
;; cycles before the insn moves on through e_2, e_3 and e_wb.
(define_insn_reservation "11_mult1" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "mul,mla"))
 "e_1*2,e_2,e_3,e_wb")
 
;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the latency (4) and the reservation string below are
;; identical to "11_mult1", so the extra flag-setting cycles mentioned
;; above are not reflected in this model -- confirm against the TRM.
(define_insn_reservation "11_mult2" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "muls,mlas"))
 "e_1*2,e_2,e_3,e_wb")
 
;; Forwarding from mul/mla and muls/mlas ("11_mult1", "11_mult2"): the
;; consumers listed here may issue after 3 cycles instead of the default
;; 4.  Guarded entries apply only when the named arm_no_early_* function
;; (defined outside this file) accepts the producer/consumer pair.
(define_bypass 3 "11_mult1,11_mult2"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2" "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2" "11_store1"
  "arm_no_early_store_addr_dep")
 
;; Signed and unsigned multiply long results are available across two cycles;
;; the less significant word is available one cycle before the more significant
;; word.  Here we conservatively wait until both are available, which is
;; after three iterations and the memory cycle.  The same is also true of
;; the two multiply-accumulate instructions.
;; Default latency 5; e_1 is held for three cycles and the writeback
;; stage (e_wb) for two.
(define_insn_reservation "11_mult3" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smull,umull,smlal,umlal"))
 "e_1*3,e_2,e_3,e_wb*2")
 
;; The *S variants set the condition flags, which requires three more cycles.
;; NOTE(review): the latency (5) and the reservation string below are
;; identical to "11_mult3", so the extra flag-setting cycles mentioned
;; above are not reflected in this model -- confirm against the TRM.
(define_insn_reservation "11_mult4" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulls,umulls,smlals,umlals"))
 "e_1*3,e_2,e_3,e_wb*2")
 
;; Forwarding from the long multiplies ("11_mult3", "11_mult4"): the
;; consumers listed here may issue after 4 cycles instead of the default
;; 5.  Guarded entries apply only when the named arm_no_early_* function
;; (defined outside this file) accepts the producer/consumer pair.
(define_bypass 4 "11_mult3,11_mult4"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4" "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4" "11_store1"
  "arm_no_early_store_addr_dep")
 
;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
;; of high and low halves of the argument registers.  They take a single
;; pass through the pipeline and make the result available after three
;; cycles.
;; Default latency 3; each of e_1, e_2, e_3 and e_wb is used for one cycle.
(define_insn_reservation "11_mult5" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
 "e_1,e_2,e_3,e_wb")
 
;; Forwarding from the single-pass 16-bit multiplies ("11_mult5"): the
;; consumers listed here may issue after 2 cycles instead of the default
;; 3.  Guarded entries apply only when the named arm_no_early_* function
;; (defined outside this file) accepts the producer/consumer pair.
(define_bypass 2 "11_mult5"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5" "11_alu_op")
(define_bypass 2 "11_mult5" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5" "11_store1"
  "arm_no_early_store_addr_dep")
 
;; The same idea, then the 32-bit result is added to a 64-bit quantity.
;; Default latency 4; e_1 is held for two cycles and the writeback
;; stage (e_wb) for two.
(define_insn_reservation "11_mult6" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smlalxy"))
 "e_1*2,e_2,e_3,e_wb*2")
 
;; Signed 32x32 multiply, then the most significant 32 bits are extracted
;; and are available after the memory stage.
;; Default latency 4; e_1 is held for two cycles, then e_2, e_3 and e_wb
;; are used for one cycle each.
(define_insn_reservation "11_mult7" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smmul,smmulr"))
 "e_1*2,e_2,e_3,e_wb")
 
;; Forwarding from smlalxy and smmul/smmulr ("11_mult6", "11_mult7"): the
;; consumers listed here may issue after 3 cycles instead of the default
;; 4.  Guarded entries apply only when the named arm_no_early_* function
;; (defined outside this file) accepts the producer/consumer pair.
(define_bypass 3 "11_mult6,11_mult7"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7" "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7" "11_store1"
  "arm_no_early_store_addr_dep")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; These vary greatly depending on their arguments and the results of
;; branch prediction.  Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.
 
;; Branches are modelled with zero latency and reserve no pipeline unit.
(define_insn_reservation "11_branches" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "branch"))
 "nothing")
 
;; Call latencies are not predictable.  A semi-arbitrary very large
;; number (32) is used as "positive infinity" so that everything should be
;; finished by the time of return.  Calls reserve no pipeline unit.
(define_insn_reservation "11_call" 32
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "call"))
 "nothing")
 
;; Branches are predicted, and a correctly predicted branch costs nothing.
;; The model is nevertheless conservative: a branch that depends on an
;; ALU or load result uses the timings of a late register dependency,
;; i.e. one cycle less than the producer's default latency.
(define_bypass 1 "11_alu_op,11_alu_shift_op" "11_branches")
(define_bypass 2 "11_alu_shift_reg_op" "11_branches")
(define_bypass 2 "11_load1,11_load2" "11_branches")
(define_bypass 3 "11_load34" "11_branches")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback.
;; These models assume that all memory references hit in dcache.  Also,
;; if the PC is one of the registers involved, there are additional stalls
;; not modelled here.  Addressing modes are also not modelled.
 
;; Insns of type "load1": default latency 3.  The first cycle reserves
;; l_a together with e_1 (so the load also occupies the first execute
;; stage for one cycle); l_dc1, l_dc2 and l_wb follow on successive cycles.
(define_insn_reservation "11_load1" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load byte results are not available until the writeback stage, where
;; the correct byte is extracted.  Hence the default latency of 4, one
;; more than "11_load1", with the same unit reservation.
 
(define_insn_reservation "11_loadb" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load_byte"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Insns of type "store1": default latency 0, with the same unit usage
;; as "11_load1" (l_a together with e_1, then l_dc1, l_dc2, l_wb).
(define_insn_reservation "11_store1" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load/store double words into adjacent registers.  The timing and
;; latencies are different depending on whether the address is 64-bit
;; aligned.  This model assumes that it is, and therefore uses the same
;; default latency (3) and unit reservation as "11_load1".
(define_insn_reservation "11_load2" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Insns of type "store2": default latency 0, with the same unit
;; reservation as "11_store1".
(define_insn_reservation "11_store2" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")
 
;; Load/store multiple registers.  Two registers are stored per cycle.
;; Actual timing depends on how many registers are affected, so we
;; optimistically schedule a low latency.
;; Default latency 4; compared with "11_load1" the first data-cache
;; stage (l_dc1) is held for two cycles instead of one.
(define_insn_reservation "11_load34" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load3,load4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")
 
;; Insns of type "store3" or "store4": default latency 0, with the same
;; unit reservation as "11_load34" (l_dc1 held for two cycles).
(define_insn_reservation "11_store34" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store3,store4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")
 
;; ALU result feeding a store.  When the ALU result is not part of the
;; address being accessed (checked by arm_no_early_store_addr_dep, defined
;; outside this file), the store may issue one cycle earlier than the
;; ALU op's default latency.
(define_bypass 1 "11_alu_op,11_alu_shift_op" "11_store1"
  "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op" "11_store1"
  "arm_no_early_store_addr_dep")
 
;; Load result feeding an ALU operation.  If the ALU op has no early
;; register dependency on the load, it may issue one cycle earlier than
;; the load's default latency.  Guarded entries apply only when the named
;; arm_no_early_* function (defined outside this file) accepts the pair.

;; Producer: "11_load1" (2 cycles instead of 3).
(define_bypass 2 "11_load1" "11_alu_op")
(define_bypass 2 "11_load1" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")

;; Producer: "11_loadb" (3 cycles instead of 4).
(define_bypass 3 "11_loadb" "11_alu_op")
(define_bypass 3 "11_loadb" "11_alu_shift_op"
  "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb" "11_alu_shift_reg_op"
  "arm_no_early_alu_shift_dep")
 
;; Load result feeding a multiply.  If the multiply has no early multiply
;; dependency on the load (checked by arm_no_early_mul_dep, defined
;; outside this file), it may issue one cycle earlier than the load's
;; default latency: 2 cycles after "11_load1", 3 cycles after
;; "11_load34" or "11_loadb".
(define_bypass 2 "11_load1"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_load34"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
  "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
  "arm_no_early_mul_dep")
 
;; Load result feeding a store.  When the loaded value is not part of the
;; address being accessed (checked by arm_no_early_store_addr_dep, defined
;; outside this file), the store may issue one cycle earlier than the
;; load's default latency.
(define_bypass 2 "11_load1" "11_store1"
  "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb" "11_store1"
  "arm_no_early_store_addr_dep")

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [arm/] [arm1136jfs.md] - Blame information for rev 709

Line No.	Rev	Author	Line
1	709	jeremybenn	`;; ARM 1136J[F]-S Pipeline Description`
2			`;; Copyright (C) 2003, 2007 Free Software Foundation, Inc.`
3			`;; Written by CodeSourcery, LLC.`
4			`;;`
5			`;; This file is part of GCC.`
6			`;;`
7			`;; GCC is free software; you can redistribute it and/or modify it`
8			`;; under the terms of the GNU General Public License as published by`
9			`;; the Free Software Foundation; either version 3, or (at your option)`
10			`;; any later version.`
11			`;;`
12			`;; GCC is distributed in the hope that it will be useful, but`
13			`;; WITHOUT ANY WARRANTY; without even the implied warranty of`
14			`;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
15			`;; General Public License for more details.`
16			`;;`
17			`;; You should have received a copy of the GNU General Public License`
18			`;; along with GCC; see the file COPYING3. If not see`
19			`;; <http://www.gnu.org/licenses/>. */`
20
21			`;; These descriptions are based on the information contained in the`
22			`;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM`
23			`;; Limited.`
24			`;;`
25
26			`;; This automaton provides a pipeline description for the ARM`
27			`;; 1136J-S and 1136JF-S cores.`
28			`;;`
29			`;; The model given here assumes that the condition for all conditional`
30			`;; instructions is "true", i.e., that all of the instructions are`
31			`;; actually executed.`
32
33			`(define_automaton "arm1136jfs")`
34
35			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
36			`;; Pipelines`
37			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
38
39			`;; There are three distinct pipelines (page 1-26 and following):`
40			`;;`
41			`;; - A 4-stage decode pipeline, shared by all three. It has fetch (1),`
42			`;; fetch (2), decode, and issue stages. Since this is always involved,`
43			`;; we do not model it in the scheduler.`
44			`;;`
45			`;; - A 4-stage ALU pipeline. It has shifter, ALU (main integer operations),`
46			`;; and saturation stages. The fourth stage is writeback; see below.`
47			`;;`
48			`;; - A 4-stage multiply-accumulate pipeline. It has three stages, called`
49			`;; MAC1 through MAC3, and a fourth writeback stage.`
50			`;;`
51			`;; The 4th-stage writeback is shared between the ALU and MAC pipelines,`
52			`;; which operate in lockstep. Results from either pipeline will be`
53			`;; moved into the writeback stage. Because the two pipelines operate`
54			`;; in lockstep, we schedule them as a single "execute" pipeline.`
55			`;;`
56			`;; - A 4-stage LSU pipeline. It has address generation, data cache (1),`
57			`;; data cache (2), and writeback stages. (Note that this pipeline,`
58			`;; including the writeback stage, is independent from the ALU & LSU pipes.)`
59
60			`(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs") ; ALU and MAC`
61			`; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3`
62			`(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store`
63
64			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
65			`;; ALU Instructions`
66			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
67
68			`;; ALU instructions require eight cycles to execute, and use the ALU`
69			`;; pipeline in each of the eight stages. The results are available`
70			`;; after the alu stage has finished.`
71			`;;`
72			`;; If the destination register is the PC, the pipelines are stalled`
73			`;; for several cycles. That case is not modelled here.`
74
75			`;; ALU operations with no shifted operand`
76			`(define_insn_reservation "11_alu_op" 2`
77			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
78			`(eq_attr "type" "alu"))`
79			`"e_1,e_2,e_3,e_wb")`
80
81			`;; ALU operations with a shift-by-constant operand`
82			`(define_insn_reservation "11_alu_shift_op" 2`
83			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
84			`(eq_attr "type" "alu_shift"))`
85			`"e_1,e_2,e_3,e_wb")`
86
87			`;; ALU operations with a shift-by-register operand`
88			`;; These really stall in the decoder, in order to read`
89			`;; the shift value in a second cycle. Pretend we take two cycles in`
90			`;; the shift stage.`
91			`(define_insn_reservation "11_alu_shift_reg_op" 3`
92			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
93			`(eq_attr "type" "alu_shift_reg"))`
94			`"e_1*2,e_2,e_3,e_wb")`
95
96			`;; alu_ops can start sooner, if there is no shifter dependency`
97			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
98			`"11_alu_op")`
99			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
100			`"11_alu_shift_op"`
101			`"arm_no_early_alu_shift_value_dep")`
102			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
103			`"11_alu_shift_reg_op"`
104			`"arm_no_early_alu_shift_dep")`
105			`(define_bypass 2 "11_alu_shift_reg_op"`
106			`"11_alu_op")`
107			`(define_bypass 2 "11_alu_shift_reg_op"`
108			`"11_alu_shift_op"`
109			`"arm_no_early_alu_shift_value_dep")`
110			`(define_bypass 2 "11_alu_shift_reg_op"`
111			`"11_alu_shift_reg_op"`
112			`"arm_no_early_alu_shift_dep")`
113
114			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
115			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
116			`"arm_no_early_mul_dep")`
117			`(define_bypass 2 "11_alu_shift_reg_op"`
118			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
119			`"arm_no_early_mul_dep")`
120
121			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
122			`;; Multiplication Instructions`
123			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
124
125			`;; Multiplication instructions loop in the first two execute stages until`
126			`;; the instruction has been passed through the multiplier array enough`
127			`;; times.`
128
129			`;; Multiply and multiply-accumulate results are available after four stages.`
130			`(define_insn_reservation "11_mult1" 4`
131			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
132			`(eq_attr "insn" "mul,mla"))`
133			`"e_1*2,e_2,e_3,e_wb")`
134
135			`;; The *S variants set the condition flags, which requires three more cycles.`
136			`(define_insn_reservation "11_mult2" 4`
137			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
138			`(eq_attr "insn" "muls,mlas"))`
139			`"e_1*2,e_2,e_3,e_wb")`
140
141			`(define_bypass 3 "11_mult1,11_mult2"`
142			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
143			`"arm_no_early_mul_dep")`
144			`(define_bypass 3 "11_mult1,11_mult2"`
145			`"11_alu_op")`
146			`(define_bypass 3 "11_mult1,11_mult2"`
147			`"11_alu_shift_op"`
148			`"arm_no_early_alu_shift_value_dep")`
149			`(define_bypass 3 "11_mult1,11_mult2"`
150			`"11_alu_shift_reg_op"`
151			`"arm_no_early_alu_shift_dep")`
152			`(define_bypass 3 "11_mult1,11_mult2"`
153			`"11_store1"`
154			`"arm_no_early_store_addr_dep")`
155
156			`;; Signed and unsigned multiply long results are available across two cycles;`
157			`;; the less significant word is available one cycle before the more significant`
158			`;; word. Here we conservatively wait until both are available, which is`
159			`;; after three iterations and the memory cycle. The same is also true of`
160			`;; the two multiply-accumulate instructions.`
161			`(define_insn_reservation "11_mult3" 5`
162			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
163			`(eq_attr "insn" "smull,umull,smlal,umlal"))`
164			`"e_1*3,e_2,e_3,e_wb*2")`
165
166			`;; The *S variants set the condition flags, which requires three more cycles.`
167			`(define_insn_reservation "11_mult4" 5`
168			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
169			`(eq_attr "insn" "smulls,umulls,smlals,umlals"))`
170			`"e_1*3,e_2,e_3,e_wb*2")`
171
172			`(define_bypass 4 "11_mult3,11_mult4"`
173			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
174			`"arm_no_early_mul_dep")`
175			`(define_bypass 4 "11_mult3,11_mult4"`
176			`"11_alu_op")`
177			`(define_bypass 4 "11_mult3,11_mult4"`
178			`"11_alu_shift_op"`
179			`"arm_no_early_alu_shift_value_dep")`
180			`(define_bypass 4 "11_mult3,11_mult4"`
181			`"11_alu_shift_reg_op"`
182			`"arm_no_early_alu_shift_dep")`
183			`(define_bypass 4 "11_mult3,11_mult4"`
184			`"11_store1"`
185			`"arm_no_early_store_addr_dep")`
186
187			`;; Various 16x16->32 multiplies and multiply-accumulates, using combinations`
188			`;; of high and low halves of the argument registers. They take a single`
189			`;; pass through the pipeline and make the result available after three`
190			`;; cycles.`
191			`(define_insn_reservation "11_mult5" 3`
192			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
193			`(eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))`
194			`"e_1,e_2,e_3,e_wb")`
195
196			`(define_bypass 2 "11_mult5"`
197			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
198			`"arm_no_early_mul_dep")`
199			`(define_bypass 2 "11_mult5"`
200			`"11_alu_op")`
201			`(define_bypass 2 "11_mult5"`
202			`"11_alu_shift_op"`
203			`"arm_no_early_alu_shift_value_dep")`
204			`(define_bypass 2 "11_mult5"`
205			`"11_alu_shift_reg_op"`
206			`"arm_no_early_alu_shift_dep")`
207			`(define_bypass 2 "11_mult5"`
208			`"11_store1"`
209			`"arm_no_early_store_addr_dep")`
210
211			`;; The same idea, then the 32-bit result is added to a 64-bit quantity.`
212			`(define_insn_reservation "11_mult6" 4`
213			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
214			`(eq_attr "insn" "smlalxy"))`
215			`"e_1*2,e_2,e_3,e_wb*2")`
216
217			`;; Signed 32x32 multiply, then the most significant 32 bits are extracted`
218			`;; and are available after the memory stage.`
219			`(define_insn_reservation "11_mult7" 4`
220			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
221			`(eq_attr "insn" "smmul,smmulr"))`
222			`"e_1*2,e_2,e_3,e_wb")`
223
224			`(define_bypass 3 "11_mult6,11_mult7"`
225			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
226			`"arm_no_early_mul_dep")`
227			`(define_bypass 3 "11_mult6,11_mult7"`
228			`"11_alu_op")`
229			`(define_bypass 3 "11_mult6,11_mult7"`
230			`"11_alu_shift_op"`
231			`"arm_no_early_alu_shift_value_dep")`
232			`(define_bypass 3 "11_mult6,11_mult7"`
233			`"11_alu_shift_reg_op"`
234			`"arm_no_early_alu_shift_dep")`
235			`(define_bypass 3 "11_mult6,11_mult7"`
236			`"11_store1"`
237			`"arm_no_early_store_addr_dep")`
238
239			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
240			`;; Branch Instructions`
241			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
242
243			`;; These vary greatly depending on their arguments and the results of`
244			`;; stat prediction. Cycle count ranges from zero (unconditional branch,`
245			`;; folded dynamic prediction) to seven (incorrect predictions, etc). We`
246			`;; assume an optimal case for now, because the cost of a cache miss`
247			`;; overwhelms the cost of everything else anyhow.`
248
249			`(define_insn_reservation "11_branches" 0`
250			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
251			`(eq_attr "type" "branch"))`
252			`"nothing")`
253
254			`;; Call latencies are not predictable. A semi-arbitrary very large`
255			`;; number is used as "positive infinity" so that everything should be`
256			`;; finished by the time of return.`
257			`(define_insn_reservation "11_call" 32`
258			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
259			`(eq_attr "type" "call"))`
260			`"nothing")`
261
262			`;; Branches are predicted. A correctly predicted branch will be no`
263			`;; cost, but we're conservative here, and use the timings a`
264			`;; late-register would give us.`
265			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
266			`"11_branches")`
267			`(define_bypass 2 "11_alu_shift_reg_op"`
268			`"11_branches")`
269			`(define_bypass 2 "11_load1,11_load2"`
270			`"11_branches")`
271			`(define_bypass 3 "11_load34"`
272			`"11_branches")`
273
274			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
275			`;; Load/Store Instructions`
276			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
277
278			`;; The models for load/store instructions do not accurately describe`
279			`;; the difference between operations with a base register writeback.`
280			`;; These models assume that all memory references hit in dcache. Also,`
281			`;; if the PC is one of the registers involved, there are additional stalls`
282			`;; not modelled here. Addressing modes are also not modelled.`
283
284			`(define_insn_reservation "11_load1" 3`
285			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
286			`(eq_attr "type" "load1"))`
287			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
288
289			`;; Load byte results are not available until the writeback stage, where`
290			`;; the correct byte is extracted.`
291
292			`(define_insn_reservation "11_loadb" 4`
293			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
294			`(eq_attr "type" "load_byte"))`
295			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
296
297			`(define_insn_reservation "11_store1" 0`
298			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
299			`(eq_attr "type" "store1"))`
300			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
301
302			`;; Load/store double words into adjacent registers. The timing and`
303			`;; latencies are different depending on whether the address is 64-bit`
304			`;; aligned. This model assumes that it is.`
305			`(define_insn_reservation "11_load2" 3`
306			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
307			`(eq_attr "type" "load2"))`
308			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
309
310			`(define_insn_reservation "11_store2" 0`
311			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
312			`(eq_attr "type" "store2"))`
313			`"l_a+e_1,l_dc1,l_dc2,l_wb")`
314
315			`;; Load/store multiple registers. Two registers are stored per cycle.`
316			`;; Actual timing depends on how many registers are affected, so we`
317			`;; optimistically schedule a low latency.`
318			`(define_insn_reservation "11_load34" 4`
319			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
320			`(eq_attr "type" "load3,load4"))`
321			`"l_a+e_1,l_dc1*2,l_dc2,l_wb")`
322
323			`(define_insn_reservation "11_store34" 0`
324			`(and (eq_attr "tune" "arm1136js,arm1136jfs")`
325			`(eq_attr "type" "store3,store4"))`
326			`"l_a+e_1,l_dc1*2,l_dc2,l_wb")`
327
328			`;; A store can start immediately after an alu op, if that alu op does`
329			`;; not provide part of the address to access.`
330			`(define_bypass 1 "11_alu_op,11_alu_shift_op"`
331			`"11_store1"`
332			`"arm_no_early_store_addr_dep")`
333			`(define_bypass 2 "11_alu_shift_reg_op"`
334			`"11_store1"`
335			`"arm_no_early_store_addr_dep")`
336
337			`;; An alu op can start sooner after a load, if that alu op does not`
338			`;; have an early register dependency on the load`
339			`(define_bypass 2 "11_load1"`
340			`"11_alu_op")`
341			`(define_bypass 2 "11_load1"`
342			`"11_alu_shift_op"`
343			`"arm_no_early_alu_shift_value_dep")`
344			`(define_bypass 2 "11_load1"`
345			`"11_alu_shift_reg_op"`
346			`"arm_no_early_alu_shift_dep")`
347
348			`(define_bypass 3 "11_loadb"`
349			`"11_alu_op")`
350			`(define_bypass 3 "11_loadb"`
351			`"11_alu_shift_op"`
352			`"arm_no_early_alu_shift_value_dep")`
353			`(define_bypass 3 "11_loadb"`
354			`"11_alu_shift_reg_op"`
355			`"arm_no_early_alu_shift_dep")`
356
357			`;; A mul op can start sooner after a load, if that mul op does not`
358			`;; have an early multiply dependency`
359			`(define_bypass 2 "11_load1"`
360			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
361			`"arm_no_early_mul_dep")`
362			`(define_bypass 3 "11_load34"`
363			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
364			`"arm_no_early_mul_dep")`
365			`(define_bypass 3 "11_loadb"`
366			`"11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"`
367			`"arm_no_early_mul_dep")`
368
369			`;; A store can start sooner after a load, if that load does not`
370			`;; produce part of the address to access`
371			`(define_bypass 2 "11_load1"`
372			`"11_store1"`
373			`"arm_no_early_store_addr_dep")`
374			`(define_bypass 3 "11_loadb"`
375			`"11_store1"`
376			`"arm_no_early_store_addr_dep")`