URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [arm/] [cortex-a5.md] - Blame information for rev 867

Go to most recent revision | Details | Compare with Previous | View Log


;; ARM Cortex-A5 pipeline description
;; Copyright (C) 2010 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.
 
;; The single DFA automaton to which every Cortex-A5 functional unit
;; declared in this file is assigned.
(define_automaton "cortex_a5")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; The integer (ALU) pipeline.  There are five DPU pipeline stages, but
;; the decode/issue stages operate the same for all instructions, so we
;; do not model them.  Only the first execute stage needs to be modelled,
;; because instructions always advance one stage per cycle, in order.
;; Since only branch instructions may dual-issue, this single unit covers
;; all of the LS, ALU, MAC and FPU pipelines.

(define_cpu_unit "cortex_a5_ex1" "cortex_a5")
 
;; The branch pipeline.  Branches can dual-issue with other instructions
;; (except when those instructions take multiple cycles to issue).  The
;; multi-cycle load/store reservations in this file also claim this unit
;; on every cycle except their last.

(define_cpu_unit "cortex_a5_branch" "cortex_a5")
 
;; Pseudo-unit for blocking the multiply pipeline when a double-precision
;; multiply is in progress.  It is claimed by the cortex_a5_fpmuls,
;; cortex_a5_fpmacs, cortex_a5_fpmuld and cortex_a5_fpmacd reservations.

(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")
 
;; The floating-point add pipeline (ex1/f1 stage), used to model the usage
;; of the add pipeline by fmac instructions, etc.  It is claimed at issue
;; by cortex_a5_fpalu and cortex_a5_fconst, and in the final cycle of the
;; cortex_a5_fpmacs and cortex_a5_fpmacd reservations.

(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")
 
;; Floating-point div/sqrt (long latency, out-of-order completion).
;; Held by cortex_a5_fdivs and cortex_a5_fdivd for the cycles that follow
;; their single issue cycle on ex1.

(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Plain ALU operations (type "alu"): one cycle on ex1, default latency 2.
(define_insn_reservation "cortex_a5_alu" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "alu"))
  "cortex_a5_ex1")
 
;; ALU operations with a shifted operand, by immediate or by register
;; (types "alu_shift" and "alu_shift_reg"): one cycle on ex1, default
;; latency 2.
(define_insn_reservation "cortex_a5_alu_shift" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "alu_shift,alu_shift_reg"))
  "cortex_a5_ex1")
 
;; Forwarding path for unshifted operands: a result produced by
;; cortex_a5_alu or cortex_a5_alu_shift is available to a following
;; cortex_a5_alu instruction after 1 cycle instead of the default 2.

(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
  "cortex_a5_alu")
 
;; 1-cycle forwarding from cortex_a5_alu or cortex_a5_alu_shift into a
;; cortex_a5_alu_shift consumer, applied only when the guard function
;; arm_no_early_alu_shift_dep accepts the producer/consumer pair.
;; NOTE(review): the guard is defined outside this file; presumably it
;; rejects pairs where the result feeds the shifted operand -- confirm.
(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"
  "cortex_a5_alu_shift"
  "arm_no_early_alu_shift_dep")
 
;; The multiplier pipeline can forward results from the wr stage only, so
;; there is no need to specify bypasses.  Integer multiplies (type "mult")
;; take one cycle on ex1 with a default latency of 2.

(define_insn_reservation "cortex_a5_mul" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "mult"))
  "cortex_a5_ex1")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Address-generation happens in the issue stage, which is one stage behind
;; the ex1 stage (the first stage we care about for scheduling purposes). The
;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr.
 
;; Single-register loads, including byte loads (types "load_byte" and
;; "load1"): one cycle on ex1, default latency 2.
(define_insn_reservation "cortex_a5_load1" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load_byte,load1"))
  "cortex_a5_ex1")
 
;; Single-register stores (type "store1"): one cycle on ex1.  The default
;; latency is 0, as for every store reservation in this file.
(define_insn_reservation "cortex_a5_store1" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store1"))
  "cortex_a5_ex1")
 
;; Two-register loads (type "load2"): two cycles on ex1, default
;; latency 3.  The branch unit is also claimed on the first cycle, so a
;; branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_load2" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;; Two-register stores (type "store2"): two cycles on ex1, default
;; latency 0.  The branch unit is also claimed on the first cycle, so a
;; branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_store2" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store2"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;; Three-register loads (type "load3"): three cycles on ex1, default
;; latency 4.  The branch unit is also claimed on the first two cycles,
;; so a branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_load3" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
 
;; Three-register stores (type "store3"): three cycles on ex1, default
;; latency 0.  The branch unit is also claimed on the first two cycles,
;; so a branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_store3" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store3"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1")
 
;; Four-register loads (type "load4"): four cycles on ex1, default
;; latency 5.  The branch unit is also claimed on the first three cycles,
;; so a branch can only be issued alongside the final cycle.
;; The condition must test "load4": testing "load3" would merely duplicate
;; cortex_a5_load3 and leave four-register loads without a reservation.
(define_insn_reservation "cortex_a5_load4" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "load4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;; Four-register stores (type "store4"): four cycles on ex1, default
;; latency 0.  The branch unit is also claimed on the first three cycles,
;; so a branch can only be issued alongside the final cycle.
;; The condition must test "store4": testing "store3" would merely
;; duplicate cortex_a5_store3 and leave four-register stores without a
;; reservation.
(define_insn_reservation "cortex_a5_store4" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "store4"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\
   cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Direct branches are the only instructions we can dual-issue (also IT and
;; nop, but those aren't very interesting for scheduling).  (The latency here
;; is meant to represent when the branch actually takes place, but may not be
;; entirely correct.)
 
;; Branches and calls (types "branch" and "call") reserve only the branch
;; unit for one cycle, leaving ex1 free for another instruction in the
;; same cycle.  Default latency 3.
(define_insn_reservation "cortex_a5_branch" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "branch,call"))
  "cortex_a5_branch")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; General floating-point arithmetic, copies, conversions and compares:
;; one cycle on ex1 together with the FP add pipeline, default latency 4.
;; NOTE(review): "fmuls" is listed here and is also the only type matched
;; by cortex_a5_fpmuls later in this file, so the two conditions overlap.
;; Confirm which reservation fmuls is meant to use.
(define_insn_reservation "cortex_a5_fpalu" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, fmuls, f_cvt,\
                        fcmps, fcmpd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
 
;; For fconsts and fconstd, 8-bit immediate data is passed directly from
;; f1 to f3, which is assumed here to reduce the latency by one cycle
;; (3, rather than the 4 used by cortex_a5_fpalu).

(define_insn_reservation "cortex_a5_fconst" 3
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a5_ex1+cortex_a5_fpadd_pipe")
 
;; We should try not to issue a single-precision multiplication in the
;; middle of a double-precision multiplication operation; claiming
;; cortex_a5_fpmul_pipe here is what prevents that.
;; NOTE(review): "fmuls" also appears in the type list of cortex_a5_fpalu
;; earlier in this file.  If the first matching reservation is the one
;; selected, this reservation is never chosen -- confirm.

(define_insn_reservation "cortex_a5_fpmuls" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuls"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe")
 
;; For single-precision multiply-accumulate, the add (accumulate) is issued
;; whilst the multiply is in F4.  The multiply result can then be forwarded
;; from F5 to F1.  The issue unit is only used once (when we first start
;; processing the instruction), but the usage of the FP add pipeline could
;; block other instructions attempting to use it simultaneously.  We try to
;; avoid that using cortex_a5_fpadd_pipe.
;;
;; The reservation is therefore: ex1 and the multiply pipe for one cycle,
;; three cycles in which no unit is claimed, then the add pipe for one
;; cycle.

(define_insn_reservation "cortex_a5_fpmacs" 8
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacs"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
 
;; Non-multiply instructions can issue in the middle two cycles of a
;; double-precision multiply: ex1 is claimed only on the first and last of
;; the four cycles, while cortex_a5_fpmul_pipe is held throughout.  Note
;; that it isn't entirely clear when a branch can dual-issue when a
;; multi-cycle multiplication is in progress; we ignore that for now
;; though.

(define_insn_reservation "cortex_a5_fpmuld" 7
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmuld"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe")
 
;; Double-precision multiply-accumulate (type "fmacd"), default latency 11.
;; The first four cycles use the same unit pattern as cortex_a5_fpmuld;
;; then three cycles pass with no unit claimed before the FP add pipeline
;; is claimed for one cycle.
(define_insn_reservation "cortex_a5_fpmacd" 11
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fmacd"))
  "cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\
   cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; ??? Not sure if the 14 cycles taken for single-precision divide to complete
;; includes the time taken for the special instruction used to collect the
;; result to travel down the multiply pipeline, or not.  Assuming so.  (If
;; that's wrong, the latency should be increased by a few cycles.)
 
;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the
;; multiply pipeline to collect the divide/square-root result.
 
;; Single-precision divide (type "fdivs"): one issue cycle on ex1, then
;; the div/sqrt unit is held for 13 further cycles, 14 in total, which
;; matches the default latency.
(define_insn_reservation "cortex_a5_fdivs" 14
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivs"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")
 
;; ??? Similarly for fdivd: it is not certain whether the 29-cycle figure
;; already includes collecting the result.  One issue cycle on ex1, then
;; the div/sqrt unit is held for 28 further cycles.

(define_insn_reservation "cortex_a5_fdivd" 29
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "fdivd"))
  "cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; FP loads take data from wr/rot/f3.
 
;; Core-to-VFP transfers use the multiply pipeline.  One cycle on ex1,
;; default latency 4.
;; NOTE(review): the reservation claims only cortex_a5_ex1, not
;; cortex_a5_fpmul_pipe -- confirm that this is intended.

(define_insn_reservation "cortex_a5_r2f" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "r_2_f"))
  "cortex_a5_ex1")
 
;; VFP-to-core register transfers (type "f_2_r"): one cycle on ex1,
;; default latency 2.
(define_insn_reservation "cortex_a5_f2r" 2
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_2_r"))
  "cortex_a5_ex1")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; ??? The flag forwarding from fmstat to the ex2 stage of the second
;; instruction is not modeled at present.  Flag transfers (type "f_flag")
;; are described simply as one cycle on ex1 with a default latency of 4.

(define_insn_reservation "cortex_a5_f_flags" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_flag"))
  "cortex_a5_ex1")
 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
;; Single-precision VFP loads (type "f_loads"): one cycle on ex1, default
;; latency 4.
(define_insn_reservation "cortex_a5_f_loads" 4
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loads"))
  "cortex_a5_ex1")
 
;; Double-precision VFP loads (type "f_loadd"): two cycles on ex1, default
;; latency 5.  The branch unit is also claimed on the first cycle, so a
;; branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_f_loadd" 5
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_loadd"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;; Single-precision VFP stores (type "f_stores"): one cycle on ex1,
;; default latency 0.
(define_insn_reservation "cortex_a5_f_stores" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stores"))
  "cortex_a5_ex1")
 
;; Double-precision VFP stores (type "f_stored"): two cycles on ex1,
;; default latency 0.  The branch unit is also claimed on the first cycle,
;; so a branch can only be issued alongside the final cycle.
(define_insn_reservation "cortex_a5_f_stored" 0
  (and (eq_attr "tune" "cortexa5")
       (eq_attr "type" "f_stored"))
  "cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")
 
;; Load-to-use for floating-point values has a penalty of one cycle,
;; i.e. a latency of two.  This overrides the default latency of 4 for
;; cortex_a5_f_loads when the consumer is one of the reservations listed.
;; NOTE(review): cortex_a5_fpmuls is not in the consumer list -- confirm
;; whether that is deliberate.

(define_bypass 2 "cortex_a5_f_loads"
                 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                  cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                  cortex_a5_f2r")
 
;; Double-precision loads: the listed consumers see the loaded value after
;; 3 cycles instead of the default latency of 5 for cortex_a5_f_loadd.
;; NOTE(review): cortex_a5_fpmuls is not in the consumer list -- confirm
;; whether that is deliberate.
(define_bypass 3 "cortex_a5_f_loadd"
                 "cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\
                  cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\
                  cortex_a5_f2r")

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [gcc/] [config/] [arm/] [cortex-a5.md] - Blame information for rev 867

Line No.	Rev	Author	Line
1	709	jeremybenn	`;; ARM Cortex-A5 pipeline description`
2			`;; Copyright (C) 2010 Free Software Foundation, Inc.`
3			`;; Contributed by CodeSourcery.`
4			`;;`
5			`;; This file is part of GCC.`
6			`;;`
7			`;; GCC is free software; you can redistribute it and/or modify it`
8			`;; under the terms of the GNU General Public License as published by`
9			`;; the Free Software Foundation; either version 3, or (at your option)`
10			`;; any later version.`
11			`;;`
12			`;; GCC is distributed in the hope that it will be useful, but`
13			`;; WITHOUT ANY WARRANTY; without even the implied warranty of`
14			`;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
15			`;; General Public License for more details.`
16			`;;`
17			`;; You should have received a copy of the GNU General Public License`
18			`;; along with GCC; see the file COPYING3. If not see`
19			`;; .`
20
21			`(define_automaton "cortex_a5")`
22
23			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
24			`;; Functional units.`
25			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
26
27			`;; The integer (ALU) pipeline. There are five DPU pipeline`
28			`;; stages. However the decode/issue stages operate the same for all`
29			`;; instructions, so do not model them. We only need to model the`
30			`;; first execute stage because instructions always advance one stage`
31			`;; per cycle in order. Only branch instructions may dual-issue, so a`
32			`;; single unit covers all of the LS, ALU, MAC and FPU pipelines.`
33
34			`(define_cpu_unit "cortex_a5_ex1" "cortex_a5")`
35
36			`;; The branch pipeline. Branches can dual-issue with other instructions`
37			`;; (except when those instructions take multiple cycles to issue).`
38
39			`(define_cpu_unit "cortex_a5_branch" "cortex_a5")`
40
41			`;; Pseudo-unit for blocking the multiply pipeline when a double-precision`
42			`;; multiply is in progress.`
43
44			`(define_cpu_unit "cortex_a5_fpmul_pipe" "cortex_a5")`
45
46			`;; The floating-point add pipeline (ex1/f1 stage), used to model the usage`
47			`;; of the add pipeline by fmac instructions, etc.`
48
49			`(define_cpu_unit "cortex_a5_fpadd_pipe" "cortex_a5")`
50
51			`;; Floating-point div/sqrt (long latency, out-of-order completion).`
52
53			`(define_cpu_unit "cortex_a5_fp_div_sqrt" "cortex_a5")`
54
55			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
56			`;; ALU instructions.`
57			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
58
59			`(define_insn_reservation "cortex_a5_alu" 2`
60			`(and (eq_attr "tune" "cortexa5")`
61			`(eq_attr "type" "alu"))`
62			`"cortex_a5_ex1")`
63
64			`(define_insn_reservation "cortex_a5_alu_shift" 2`
65			`(and (eq_attr "tune" "cortexa5")`
66			`(eq_attr "type" "alu_shift,alu_shift_reg"))`
67			`"cortex_a5_ex1")`
68
69			`;; Forwarding path for unshifted operands.`
70
71			`(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"`
72			`"cortex_a5_alu")`
73
74			`(define_bypass 1 "cortex_a5_alu,cortex_a5_alu_shift"`
75			`"cortex_a5_alu_shift"`
76			`"arm_no_early_alu_shift_dep")`
77
78			`;; The multiplier pipeline can forward results from wr stage only so`
79			`;; there's no need to specify bypasses).`
80
81			`(define_insn_reservation "cortex_a5_mul" 2`
82			`(and (eq_attr "tune" "cortexa5")`
83			`(eq_attr "type" "mult"))`
84			`"cortex_a5_ex1")`
85
86			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
87			`;; Load/store instructions.`
88			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
89
90			`;; Address-generation happens in the issue stage, which is one stage behind`
91			`;; the ex1 stage (the first stage we care about for scheduling purposes). The`
92			`;; dc1 stage is parallel with ex1, dc2 with ex2 and rot with wr.`
93
94			`(define_insn_reservation "cortex_a5_load1" 2`
95			`(and (eq_attr "tune" "cortexa5")`
96			`(eq_attr "type" "load_byte,load1"))`
97			`"cortex_a5_ex1")`
98
99			`(define_insn_reservation "cortex_a5_store1" 0`
100			`(and (eq_attr "tune" "cortexa5")`
101			`(eq_attr "type" "store1"))`
102			`"cortex_a5_ex1")`
103
104			`(define_insn_reservation "cortex_a5_load2" 3`
105			`(and (eq_attr "tune" "cortexa5")`
106			`(eq_attr "type" "load2"))`
107			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
108
109			`(define_insn_reservation "cortex_a5_store2" 0`
110			`(and (eq_attr "tune" "cortexa5")`
111			`(eq_attr "type" "store2"))`
112			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
113
114			`(define_insn_reservation "cortex_a5_load3" 4`
115			`(and (eq_attr "tune" "cortexa5")`
116			`(eq_attr "type" "load3"))`
117			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\`
118			`cortex_a5_ex1")`
119
120			`(define_insn_reservation "cortex_a5_store3" 0`
121			`(and (eq_attr "tune" "cortexa5")`
122			`(eq_attr "type" "store3"))`
123			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\`
124			`cortex_a5_ex1")`
125
126			`(define_insn_reservation "cortex_a5_load4" 5`
127			`(and (eq_attr "tune" "cortexa5")`
128			`(eq_attr "type" "load3"))`
129			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\`
130			`cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
131
132			`(define_insn_reservation "cortex_a5_store4" 0`
133			`(and (eq_attr "tune" "cortexa5")`
134			`(eq_attr "type" "store3"))`
135			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1+cortex_a5_branch,\`
136			`cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
137
138			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
139			`;; Branches.`
140			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
141
142			`;; Direct branches are the only instructions we can dual-issue (also IT and`
143			`;; nop, but those aren't very interesting for scheduling). (The latency here`
144			`;; is meant to represent when the branch actually takes place, but may not be`
145			`;; entirely correct.)`
146
147			`(define_insn_reservation "cortex_a5_branch" 3`
148			`(and (eq_attr "tune" "cortexa5")`
149			`(eq_attr "type" "branch,call"))`
150			`"cortex_a5_branch")`
151
152			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
153			`;; Floating-point arithmetic.`
154			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
155
156			`(define_insn_reservation "cortex_a5_fpalu" 4`
157			`(and (eq_attr "tune" "cortexa5")`
158			`(eq_attr "type" "ffariths, fadds, ffarithd, faddd, fcpys, fmuls, f_cvt,\`
159			`fcmps, fcmpd"))`
160			`"cortex_a5_ex1+cortex_a5_fpadd_pipe")`
161
162			`;; For fconsts and fconstd, 8-bit immediate data is passed directly from`
163			`;; f1 to f3 (which I think reduces the latency by one cycle).`
164
165			`(define_insn_reservation "cortex_a5_fconst" 3`
166			`(and (eq_attr "tune" "cortexa5")`
167			`(eq_attr "type" "fconsts,fconstd"))`
168			`"cortex_a5_ex1+cortex_a5_fpadd_pipe")`
169
170			`;; We should try not to attempt to issue a single-precision multiplication in`
171			`;; the middle of a double-precision multiplication operation (the usage of`
172			`;; cortex_a5_fpmul_pipe).`
173
174			`(define_insn_reservation "cortex_a5_fpmuls" 4`
175			`(and (eq_attr "tune" "cortexa5")`
176			`(eq_attr "type" "fmuls"))`
177			`"cortex_a5_ex1+cortex_a5_fpmul_pipe")`
178
179			`;; For single-precision multiply-accumulate, the add (accumulate) is issued`
180			`;; whilst the multiply is in F4. The multiply result can then be forwarded`
181			`;; from F5 to F1. The issue unit is only used once (when we first start`
182			`;; processing the instruction), but the usage of the FP add pipeline could`
183			`;; block other instructions attempting to use it simultaneously. We try to`
184			`;; avoid that using cortex_a5_fpadd_pipe.`
185
186			`(define_insn_reservation "cortex_a5_fpmacs" 8`
187			`(and (eq_attr "tune" "cortexa5")`
188			`(eq_attr "type" "fmacs"))`
189			`"cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")`
190
191			`;; Non-multiply instructions can issue in the middle two instructions of a`
192			`;; double-precision multiply. Note that it isn't entirely clear when a branch`
193			`;; can dual-issue when a multi-cycle multiplication is in progress; we ignore`
194			`;; that for now though.`
195
196			`(define_insn_reservation "cortex_a5_fpmuld" 7`
197			`(and (eq_attr "tune" "cortexa5")`
198			`(eq_attr "type" "fmuld"))`
199			`"cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\`
200			`cortex_a5_ex1+cortex_a5_fpmul_pipe")`
201
202			`(define_insn_reservation "cortex_a5_fpmacd" 11`
203			`(and (eq_attr "tune" "cortexa5")`
204			`(eq_attr "type" "fmacd"))`
205			`"cortex_a5_ex1+cortex_a5_fpmul_pipe, cortex_a5_fpmul_pipe*2,\`
206			`cortex_a5_ex1+cortex_a5_fpmul_pipe, nothing*3, cortex_a5_fpadd_pipe")`
207
208			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
209			`;; Floating-point divide/square root instructions.`
210			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
211
212			`;; ??? Not sure if the 14 cycles taken for single-precision divide to complete`
213			`;; includes the time taken for the special instruction used to collect the`
214			`;; result to travel down the multiply pipeline, or not. Assuming so. (If`
215			`;; that's wrong, the latency should be increased by a few cycles.)`
216
217			`;; fsqrt takes one cycle less, but that is not modelled, nor is the use of the`
218			`;; multiply pipeline to collect the divide/square-root result.`
219
220			`(define_insn_reservation "cortex_a5_fdivs" 14`
221			`(and (eq_attr "tune" "cortexa5")`
222			`(eq_attr "type" "fdivs"))`
223			`"cortex_a5_ex1, cortex_a5_fp_div_sqrt * 13")`
224
225			`;; ??? Similarly for fdivd.`
226
227			`(define_insn_reservation "cortex_a5_fdivd" 29`
228			`(and (eq_attr "tune" "cortexa5")`
229			`(eq_attr "type" "fdivd"))`
230			`"cortex_a5_ex1, cortex_a5_fp_div_sqrt * 28")`
231
232			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
233			`;; VFP to/from core transfers.`
234			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
235
236			`;; FP loads take data from wr/rot/f3.`
237
238			`;; Core-to-VFP transfers use the multiply pipeline.`
239
240			`(define_insn_reservation "cortex_a5_r2f" 4`
241			`(and (eq_attr "tune" "cortexa5")`
242			`(eq_attr "type" "r_2_f"))`
243			`"cortex_a5_ex1")`
244
245			`(define_insn_reservation "cortex_a5_f2r" 2`
246			`(and (eq_attr "tune" "cortexa5")`
247			`(eq_attr "type" "f_2_r"))`
248			`"cortex_a5_ex1")`
249
250			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
251			`;; VFP flag transfer.`
252			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
253
254			`;; ??? The flag forwarding from fmstat to the ex2 stage of the second`
255			`;; instruction is not modeled at present.`
256
257			`(define_insn_reservation "cortex_a5_f_flags" 4`
258			`(and (eq_attr "tune" "cortexa5")`
259			`(eq_attr "type" "f_flag"))`
260			`"cortex_a5_ex1")`
261
262			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
263			`;; VFP load/store.`
264			`;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;`
265
266			`(define_insn_reservation "cortex_a5_f_loads" 4`
267			`(and (eq_attr "tune" "cortexa5")`
268			`(eq_attr "type" "f_loads"))`
269			`"cortex_a5_ex1")`
270
271			`(define_insn_reservation "cortex_a5_f_loadd" 5`
272			`(and (eq_attr "tune" "cortexa5")`
273			`(eq_attr "type" "f_loadd"))`
274			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
275
276			`(define_insn_reservation "cortex_a5_f_stores" 0`
277			`(and (eq_attr "tune" "cortexa5")`
278			`(eq_attr "type" "f_stores"))`
279			`"cortex_a5_ex1")`
280
281			`(define_insn_reservation "cortex_a5_f_stored" 0`
282			`(and (eq_attr "tune" "cortexa5")`
283			`(eq_attr "type" "f_stored"))`
284			`"cortex_a5_ex1+cortex_a5_branch, cortex_a5_ex1")`
285
286			`;; Load-to-use for floating-point values has a penalty of one cycle,`
287			`;; i.e. a latency of two.`
288
289			`(define_bypass 2 "cortex_a5_f_loads"`
290			`"cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\`
291			`cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\`
292			`cortex_a5_f2r")`
293
294			`(define_bypass 3 "cortex_a5_f_loadd"`
295			`"cortex_a5_fpalu, cortex_a5_fpmacs, cortex_a5_fpmuld,\`
296			`cortex_a5_fpmacd, cortex_a5_fdivs, cortex_a5_fdivd,\`
297			`cortex_a5_f2r")`