1 |
282 |
jeremybenn |
(* Emission of the core of the Cortex-A8 NEON scheduling description.
|
2 |
|
|
Copyright (C) 2007, 2010 Free Software Foundation, Inc.
|
3 |
|
|
Contributed by CodeSourcery.
|
4 |
|
|
|
5 |
|
|
This file is part of GCC.
|
6 |
|
|
|
7 |
|
|
GCC is free software; you can redistribute it and/or modify it under
|
8 |
|
|
the terms of the GNU General Public License as published by the Free
|
9 |
|
|
Software Foundation; either version 3, or (at your option) any later
|
10 |
|
|
version.
|
11 |
|
|
|
12 |
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
13 |
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
14 |
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
15 |
|
|
for more details.
|
16 |
|
|
|
17 |
|
|
You should have received a copy of the GNU General Public License
|
18 |
|
|
along with GCC; see the file COPYING3. If not see
|
19 |
|
|
.
|
20 |
|
|
*)
|
21 |
|
|
|
22 |
|
|
(* This scheduling description generator works as follows.
|
23 |
|
|
- Each group of instructions has source and destination requirements
|
24 |
|
|
specified. The source requirements may be specified using
|
25 |
|
|
Source (the stage at which all source operands not otherwise
|
26 |
|
|
described are read), Source_m (the stage at which Rm operands are
|
27 |
|
|
read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
|
28 |
|
|
- For each group of instructions the earliest stage where a source
|
29 |
|
|
operand may be required is calculated.
|
30 |
|
|
- Each group of instructions is selected in turn as a producer.
|
31 |
|
|
The latencies between this group and every other group are then
|
32 |
|
|
calculated, yielding up to four values for each combination:
|
33 |
|
|
1. Producer -> consumer Rn latency
|
34 |
|
|
2. Producer -> consumer Rm latency
|
35 |
|
|
3. Producer -> consumer Rd (as a source) latency
|
36 |
|
|
4. Producer -> consumer worst-case latency.
|
37 |
|
|
Value 4 is calculated from the destination availability requirements
|
38 |
|
|
of the consumer and the earliest source availability requirements
|
39 |
|
|
of the producer.
|
40 |
|
|
- The largest Value 4 calculated for the current producer is the
|
41 |
|
|
worse-case latency, L, for that instruction group. This value is written
|
42 |
|
|
out in a define_insn_reservation for the producer group.
|
43 |
|
|
- For each producer and consumer pair, the latencies calculated above
|
44 |
|
|
are collated. The average (of up to four values) is calculated and
|
45 |
|
|
if this average is different from the worst-case latency, an
|
46 |
|
|
unguarded define_bypass construction is issued for that pair.
|
47 |
|
|
(For each pair only one define_bypass construction will be emitted,
|
48 |
|
|
and at present we do not emit specific guards.)
|
49 |
|
|
*)
|
50 |
|
|
|
51 |
|
|
let find_with_result fn lst =
|
52 |
|
|
let rec scan = function
|
53 |
|
|
[] -> raise Not_found
|
54 |
|
|
| l::ls ->
|
55 |
|
|
match fn l with
|
56 |
|
|
Some result -> result
|
57 |
|
|
| _ -> scan ls in
|
58 |
|
|
scan lst
|
59 |
|
|
|
60 |
|
|
let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
|
61 |
|
|
and n7 = 7 and n8 = 8 and n9 = 9
|
62 |
|
|
|
63 |
|
|
type availability = Source of int
|
64 |
|
|
| Source_n of int
|
65 |
|
|
| Source_m of int
|
66 |
|
|
| Source_d of int
|
67 |
|
|
| Dest of int
|
68 |
|
|
| Dest_n_after of int * int
|
69 |
|
|
|
70 |
|
|
type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
|
71 |
|
|
|
72 |
|
|
(* Reservation behaviors. All but the last row here correspond to one
|
73 |
|
|
pipeline each. Each constructor will correspond to one
|
74 |
|
|
define_reservation. *)
|
75 |
|
|
type reservation =
|
76 |
|
|
Mul | Mul_2cycle | Mul_4cycle
|
77 |
|
|
| Shift | Shift_2cycle
|
78 |
|
|
| ALU | ALU_2cycle
|
79 |
|
|
| Fmul | Fmul_2cycle
|
80 |
|
|
| Fadd | Fadd_2cycle
|
81 |
|
|
(* | VFP *)
|
82 |
|
|
| Permute of int
|
83 |
|
|
| Ls of int
|
84 |
|
|
| Fmul_then_fadd | Fmul_then_fadd_2
|
85 |
|
|
|
86 |
|
|
(* This table must be kept as short as possible by conflating
|
87 |
|
|
entries with the same availability behavior.
|
88 |
|
|
|
89 |
|
|
First components: instruction group names
|
90 |
|
|
Second components: availability requirements, in the order in which
|
91 |
|
|
they should appear in the comments in the .md file.
|
92 |
|
|
Third components: reservation info
|
93 |
|
|
*)
|
94 |
|
|
let availability_table = [
|
95 |
|
|
(* NEON integer ALU instructions. *)
|
96 |
|
|
(* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
|
97 |
|
|
veor vbic vorn ddd qqq *)
|
98 |
|
|
"neon_int_1", [Source n2; Dest n3], ALU;
|
99 |
|
|
(* vadd vsub qqd vsub ddd qqq *)
|
100 |
|
|
"neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
|
101 |
|
|
(* vsum vneg dd qq vadd vsub qdd *)
|
102 |
|
|
"neon_int_3", [Source n1; Dest n3], ALU;
|
103 |
|
|
(* vabs vceqz vcgez vcbtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
|
104 |
|
|
(* vhadd vrhadd vqadd vtst ddd qqq *)
|
105 |
|
|
"neon_int_4", [Source n2; Dest n4], ALU;
|
106 |
|
|
(* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
|
107 |
|
|
"neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
|
108 |
|
|
(* vqneg vqabs dd qq *)
|
109 |
|
|
"neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
|
110 |
|
|
(* vmov vmvn *)
|
111 |
|
|
"neon_vmov", [Dest n3], ALU;
|
112 |
|
|
(* vaba *)
|
113 |
|
|
"neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
|
114 |
|
|
"neon_vaba_qqq",
|
115 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
|
116 |
|
|
(* vsma *)
|
117 |
|
|
"neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
|
118 |
|
|
|
119 |
|
|
(* NEON integer multiply instructions. *)
|
120 |
|
|
(* vmul, vqdmlh, vqrdmlh *)
|
121 |
|
|
(* vmul, vqdmul, qdd 16/8 long 32/16 long *)
|
122 |
|
|
"neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
|
123 |
|
|
"neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
|
124 |
|
|
(* vmul, vqdmul again *)
|
125 |
|
|
"neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
|
126 |
|
|
[Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
|
127 |
|
|
(* vmla, vmls *)
|
128 |
|
|
"neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
|
129 |
|
|
[Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
|
130 |
|
|
"neon_mla_qqq_8_16",
|
131 |
|
|
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
132 |
|
|
"neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
|
133 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
|
134 |
|
|
"neon_mla_qqq_32_qqd_32_scalar",
|
135 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
|
136 |
|
|
(* vmul, vqdmulh, vqrdmulh *)
|
137 |
|
|
(* vmul, vqdmul *)
|
138 |
|
|
"neon_mul_ddd_16_scalar_32_16_long_scalar",
|
139 |
|
|
[Source_n n2; Source_m n1; Dest n6], Mul;
|
140 |
|
|
"neon_mul_qqd_32_scalar",
|
141 |
|
|
[Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
|
142 |
|
|
(* vmla, vmls *)
|
143 |
|
|
(* vmla, vmla, vqdmla, vqdmls *)
|
144 |
|
|
"neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
|
145 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
|
146 |
|
|
|
147 |
|
|
(* NEON integer shift instructions. *)
|
148 |
|
|
(* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
|
149 |
|
|
"neon_shift_1", [Source n1; Dest n3], Shift;
|
150 |
|
|
(* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
|
151 |
|
|
vqshl_vrshl_vqrshl_ddd *)
|
152 |
|
|
"neon_shift_2", [Source n1; Dest n4], Shift;
|
153 |
|
|
(* vsli, vsri and vshl for qqq *)
|
154 |
|
|
"neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
|
155 |
|
|
"neon_vshl_ddd", [Source n1; Dest n1], Shift;
|
156 |
|
|
"neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
|
157 |
|
|
Shift_2cycle;
|
158 |
|
|
"neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
|
159 |
|
|
|
160 |
|
|
(* NEON floating-point instructions. *)
|
161 |
|
|
(* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
|
162 |
|
|
(* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
|
163 |
|
|
"neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
|
164 |
|
|
"neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
|
165 |
|
|
Fadd_2cycle;
|
166 |
|
|
(* vsum, fvmx, vfmn *)
|
167 |
|
|
"neon_fp_vsum", [Source n1; Dest n5], Fadd;
|
168 |
|
|
"neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
|
169 |
|
|
"neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
|
170 |
|
|
Fmul_2cycle;
|
171 |
|
|
(* vmla, vmls *)
|
172 |
|
|
"neon_fp_vmla_ddd",
|
173 |
|
|
[Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
|
174 |
|
|
"neon_fp_vmla_qqq",
|
175 |
|
|
[Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
|
176 |
|
|
Fmul_then_fadd_2;
|
177 |
|
|
"neon_fp_vmla_ddd_scalar",
|
178 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
|
179 |
|
|
"neon_fp_vmla_qqq_scalar",
|
180 |
|
|
[Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
|
181 |
|
|
Fmul_then_fadd_2;
|
182 |
|
|
"neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
|
183 |
|
|
"neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
|
184 |
|
|
Fmul_then_fadd_2;
|
185 |
|
|
|
186 |
|
|
(* NEON byte permute instructions. *)
|
187 |
|
|
(* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
|
188 |
|
|
"neon_bp_simple", [Source n1; Dest n2], Permute 1;
|
189 |
|
|
(* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
|
190 |
|
|
similarly for vtbx *)
|
191 |
|
|
"neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
|
192 |
|
|
(* all the rest *)
|
193 |
|
|
"neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
|
194 |
|
|
|
195 |
|
|
(* NEON load/store instructions. *)
|
196 |
|
|
"neon_ldr", [Dest n1], Ls 1;
|
197 |
|
|
"neon_str", [Source n1], Ls 1;
|
198 |
|
|
"neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
|
199 |
|
|
"neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
|
200 |
|
|
"neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
|
201 |
|
|
"neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
|
202 |
|
|
"neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
|
203 |
|
|
"neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
|
204 |
|
|
"neon_vst1_3_4_regs", [Source n1], Ls 3;
|
205 |
|
|
"neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
|
206 |
|
|
"neon_vst3_vst4", [Source n1], Ls 4;
|
207 |
|
|
"neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
|
208 |
|
|
"neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
|
209 |
|
|
"neon_vst1_vst2_lane", [Source n1], Ls 2;
|
210 |
|
|
"neon_vst3_vst4_lane", [Source n1], Ls 3;
|
211 |
|
|
"neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
|
212 |
|
|
|
213 |
|
|
(* NEON register transfer instructions. *)
|
214 |
|
|
"neon_mcr", [Dest n2], Permute 1;
|
215 |
|
|
"neon_mcr_2_mcrr", [Dest n2], Permute 2;
|
216 |
|
|
(* MRC instructions are in the .tpl file. *)
|
217 |
|
|
]
|
218 |
|
|
|
219 |
|
|
(* Augment the tuples in the availability table with an extra component
|
220 |
|
|
that describes the earliest stage where a source operand may be
|
221 |
|
|
required. (It is also possible that an entry in the table has no
|
222 |
|
|
source requirements.) *)
|
223 |
|
|
let calculate_sources =
|
224 |
|
|
List.map (fun (name, avail, res) ->
|
225 |
|
|
let earliest_stage =
|
226 |
|
|
List.fold_left
|
227 |
|
|
(fun cur -> fun info ->
|
228 |
|
|
match info with
|
229 |
|
|
Source stage
|
230 |
|
|
| Source_n stage
|
231 |
|
|
| Source_m stage
|
232 |
|
|
| Source_d stage ->
|
233 |
|
|
(match cur with
|
234 |
|
|
None -> Some stage
|
235 |
|
|
| Some stage' when stage < stage' -> Some stage
|
236 |
|
|
| _ -> cur)
|
237 |
|
|
| _ -> cur) None avail
|
238 |
|
|
in
|
239 |
|
|
(name, avail, res, earliest_stage))
|
240 |
|
|
|
241 |
|
|
(* Find the stage, if any, at the end of which a group produces a result. *)
|
242 |
|
|
let find_dest (attr, avail, _, _) =
|
243 |
|
|
try
|
244 |
|
|
find_with_result
|
245 |
|
|
(fun av -> match av with
|
246 |
|
|
Dest st -> Some (Some st)
|
247 |
|
|
| Dest_n_after (after, st) -> Some (Some (after + st))
|
248 |
|
|
| _ -> None) avail
|
249 |
|
|
with Not_found -> None
|
250 |
|
|
|
251 |
|
|
(* Find the worst-case latency between a producer and a consumer. *)
|
252 |
|
|
let worst_case_latency producer (_, _, _, earliest_required) =
|
253 |
|
|
let dest = find_dest producer in
|
254 |
|
|
match earliest_required, dest with
|
255 |
|
|
None, _ ->
|
256 |
|
|
(* The consumer doesn't have any source requirements. *)
|
257 |
|
|
None
|
258 |
|
|
| _, None ->
|
259 |
|
|
(* The producer doesn't produce any results (e.g. a store insn). *)
|
260 |
|
|
None
|
261 |
|
|
| Some consumed, Some produced -> Some (produced - consumed + 1)
|
262 |
|
|
|
263 |
|
|
(* Helper function for below. *)
|
264 |
|
|
let latency_calc f producer (_, avail, _, _) =
|
265 |
|
|
try
|
266 |
|
|
let source_avail = find_with_result f avail in
|
267 |
|
|
match find_dest producer with
|
268 |
|
|
None ->
|
269 |
|
|
(* The producer does not produce a result. *)
|
270 |
|
|
Some 0
|
271 |
|
|
| Some produced ->
|
272 |
|
|
let latency = produced - source_avail + 1 in
|
273 |
|
|
(* Latencies below zero are raised to zero since we don't have
|
274 |
|
|
delay slots. *)
|
275 |
|
|
if latency < 0 then Some 0 else Some latency
|
276 |
|
|
with Not_found -> None
|
277 |
|
|
|
278 |
|
|
(* Find any Rm latency between a producer and a consumer. If no
|
279 |
|
|
Rm source requirement is explicitly specified for the consumer,
|
280 |
|
|
return "positive infinity". Also return "positive infinity" if
|
281 |
|
|
the latency matches the supplied worst-case latency for this
|
282 |
|
|
producer. *)
|
283 |
|
|
let get_m_latency producer consumer =
|
284 |
|
|
match latency_calc (fun av -> match av with Source_m stage -> Some stage
|
285 |
|
|
| _ -> None) producer consumer
|
286 |
|
|
with None -> [] | Some latency -> [(Guard_only_m, latency)]
|
287 |
|
|
|
288 |
|
|
(* Likewise for Rn. *)
|
289 |
|
|
let get_n_latency producer consumer =
|
290 |
|
|
match latency_calc (fun av -> match av with Source_n stage -> Some stage
|
291 |
|
|
| _ -> None) producer consumer
|
292 |
|
|
with None -> [] | Some latency -> [(Guard_only_n, latency)]
|
293 |
|
|
|
294 |
|
|
(* Likewise for Rd. *)
|
295 |
|
|
let get_d_latency producer consumer =
|
296 |
|
|
match
|
297 |
|
|
latency_calc (fun av -> match av with Source_d stage -> Some stage
|
298 |
|
|
| _ -> None) producer consumer
|
299 |
|
|
with None -> [] | Some latency -> [(Guard_only_d, latency)]
|
300 |
|
|
|
301 |
|
|
(* Given a producer and a consumer, work out the latency of the producer
|
302 |
|
|
to the consumer in each of the four cases (availability information
|
303 |
|
|
permitting) identified at the top of this file. Return the
|
304 |
|
|
consumer, the worst-case unguarded latency and any guarded latencies. *)
|
305 |
|
|
let calculate_latencies producer consumer =
|
306 |
|
|
let worst = worst_case_latency producer consumer in
|
307 |
|
|
let m_latency = get_m_latency producer consumer in
|
308 |
|
|
let n_latency = get_n_latency producer consumer in
|
309 |
|
|
let d_latency = get_d_latency producer consumer in
|
310 |
|
|
(consumer, worst, m_latency @ n_latency @ d_latency)
|
311 |
|
|
|
312 |
|
|
(* Helper function for below. *)
|
313 |
|
|
let pick_latency largest worst guards =
|
314 |
|
|
let guards =
|
315 |
|
|
match worst with
|
316 |
|
|
None -> guards
|
317 |
|
|
| Some worst -> (Guard_none, worst) :: guards
|
318 |
|
|
in
|
319 |
|
|
if List.length guards = 0 then None else
|
320 |
|
|
let total_latency =
|
321 |
|
|
List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
|
322 |
|
|
in
|
323 |
|
|
let average_latency = (float_of_int total_latency) /.
|
324 |
|
|
(float_of_int (List.length guards)) in
|
325 |
|
|
let rounded_latency = int_of_float (ceil average_latency) in
|
326 |
|
|
if rounded_latency = largest then None
|
327 |
|
|
else Some (Guard_none, rounded_latency)
|
328 |
|
|
|
329 |
|
|
(* Collate all bypasses for a particular producer as required in
|
330 |
|
|
worst_case_latencies_and_bypasses. (By this stage there is a maximum
|
331 |
|
|
of one bypass from this producer to any particular consumer listed
|
332 |
|
|
in LATENCIES.) Use a hash table to collate bypasses with the
|
333 |
|
|
same latency and guard. *)
|
334 |
|
|
let collate_bypasses (producer_name, _, _, _) largest latencies =
|
335 |
|
|
let ht = Hashtbl.create 42 in
|
336 |
|
|
let keys = ref [] in
|
337 |
|
|
List.iter (
|
338 |
|
|
fun ((consumer, _, _, _), worst, guards) ->
|
339 |
|
|
(* Find out which latency to use. Ignoring latencies that match
|
340 |
|
|
the *overall* worst-case latency for this producer (which will
|
341 |
|
|
be in define_insn_reservation), we have to examine:
|
342 |
|
|
1. the latency with no guard between this producer and this
|
343 |
|
|
consumer; and
|
344 |
|
|
2. any guarded latency. *)
|
345 |
|
|
let guard_latency_opt = pick_latency largest worst guards in
|
346 |
|
|
match guard_latency_opt with
|
347 |
|
|
None -> ()
|
348 |
|
|
| Some (guard, latency) ->
|
349 |
|
|
begin
|
350 |
|
|
(if (try ignore (Hashtbl.find ht (guard, latency)); false
|
351 |
|
|
with Not_found -> true) then
|
352 |
|
|
keys := (guard, latency) :: !keys);
|
353 |
|
|
Hashtbl.add ht (guard, latency) consumer
|
354 |
|
|
end
|
355 |
|
|
) latencies;
|
356 |
|
|
(* The hash table now has bypasses collated so that ones with the
|
357 |
|
|
same latency and guard have the same keys. Walk through all the
|
358 |
|
|
keys, extract the associated bypasses, and concatenate the names
|
359 |
|
|
of the consumers for each bypass. *)
|
360 |
|
|
List.map (
|
361 |
|
|
fun ((guard, latency) as key) ->
|
362 |
|
|
let consumers = Hashtbl.find_all ht key in
|
363 |
|
|
(producer_name,
|
364 |
|
|
String.concat ",\\\n " consumers,
|
365 |
|
|
latency,
|
366 |
|
|
guard)
|
367 |
|
|
) !keys
|
368 |
|
|
|
369 |
|
|
(* For every producer, find the worst-case latency between it and
|
370 |
|
|
*any* consumer. Also determine (if such a thing exists) the
|
371 |
|
|
lowest-latency bypass from each producer to each consumer. Group
|
372 |
|
|
the output in such a way that all bypasses with the same producer
|
373 |
|
|
and latency are together, and so that bypasses with the worst-case
|
374 |
|
|
latency are ignored. *)
|
375 |
|
|
let worst_case_latencies_and_bypasses =
|
376 |
|
|
let rec f (worst_acc, bypasses_acc) prev xs =
|
377 |
|
|
match xs with
|
378 |
|
|
[] -> (worst_acc, bypasses_acc)
|
379 |
|
|
| ((producer_name, producer_avail, res_string, _) as producer)::next ->
|
380 |
|
|
(* For this particular producer, work out the latencies between
|
381 |
|
|
it and every consumer. *)
|
382 |
|
|
let latencies =
|
383 |
|
|
List.fold_left (fun acc -> fun consumer ->
|
384 |
|
|
(calculate_latencies producer consumer) :: acc)
|
385 |
|
|
[] (prev @ xs)
|
386 |
|
|
in
|
387 |
|
|
(* Now work out what the overall worst case latency was for this
|
388 |
|
|
particular producer. *)
|
389 |
|
|
match latencies with
|
390 |
|
|
[] -> assert false
|
391 |
|
|
| _ ->
|
392 |
|
|
let comp_fn (_, l1, _) (_, l2, _) =
|
393 |
|
|
if l1 > l2 then -1 else if l1 = l2 then 0 else 1
|
394 |
|
|
in
|
395 |
|
|
let largest =
|
396 |
|
|
match List.hd (List.sort comp_fn latencies) with
|
397 |
|
|
(_, None, _) -> 0 (* Producer has no consumers. *)
|
398 |
|
|
| (_, Some worst, _) -> worst
|
399 |
|
|
in
|
400 |
|
|
(* Having got the largest latency, collect all bypasses for
|
401 |
|
|
this producer and filter out those with that larger
|
402 |
|
|
latency. Record the others for later emission. *)
|
403 |
|
|
let bypasses = collate_bypasses producer largest latencies in
|
404 |
|
|
(* Go on to process remaining producers, having noted
|
405 |
|
|
the result for this one. *)
|
406 |
|
|
f ((producer_name, producer_avail, largest,
|
407 |
|
|
res_string) :: worst_acc,
|
408 |
|
|
bypasses @ bypasses_acc)
|
409 |
|
|
(prev @ [producer]) next
|
410 |
|
|
in
|
411 |
|
|
f ([], []) []
|
412 |
|
|
|
413 |
|
|
(* Emit a helpful comment for a define_insn_reservation. *)
|
414 |
|
|
let write_comment producer avail =
|
415 |
|
|
let seen_source = ref false in
|
416 |
|
|
let describe info =
|
417 |
|
|
let read = if !seen_source then "" else "read " in
|
418 |
|
|
match info with
|
419 |
|
|
Source stage ->
|
420 |
|
|
seen_source := true;
|
421 |
|
|
Printf.printf "%stheir source operands at N%d" read stage
|
422 |
|
|
| Source_n stage ->
|
423 |
|
|
seen_source := true;
|
424 |
|
|
Printf.printf "%stheir (D|Q)n operands at N%d" read stage
|
425 |
|
|
| Source_m stage ->
|
426 |
|
|
seen_source := true;
|
427 |
|
|
Printf.printf "%stheir (D|Q)m operands at N%d" read stage
|
428 |
|
|
| Source_d stage ->
|
429 |
|
|
Printf.printf "%stheir (D|Q)d operands at N%d" read stage
|
430 |
|
|
| Dest stage ->
|
431 |
|
|
Printf.printf "produce a result at N%d" stage
|
432 |
|
|
| Dest_n_after (after, stage) ->
|
433 |
|
|
Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
|
434 |
|
|
in
|
435 |
|
|
Printf.printf ";; Instructions using this reservation ";
|
436 |
|
|
let rec f infos x =
|
437 |
|
|
let sep = if x mod 2 = 1 then "" else "\n;;" in
|
438 |
|
|
match infos with
|
439 |
|
|
[] -> assert false
|
440 |
|
|
| [info] -> describe info; Printf.printf ".\n"
|
441 |
|
|
| info::(_::[] as infos) ->
|
442 |
|
|
describe info; Printf.printf ", and%s " sep; f infos (x+1)
|
443 |
|
|
| info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
|
444 |
|
|
in
|
445 |
|
|
f avail 0
|
446 |
|
|
|
447 |
|
|
(* Emit a define_insn_reservation for each producer. The latency
|
448 |
|
|
written in will be its worst-case latency. *)
|
449 |
|
|
let emit_insn_reservations =
|
450 |
|
|
List.iter (
|
451 |
|
|
fun (producer, avail, latency, reservation) ->
|
452 |
|
|
write_comment producer avail;
|
453 |
|
|
Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
|
454 |
|
|
Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
|
455 |
|
|
Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
|
456 |
|
|
let str =
|
457 |
|
|
match reservation with
|
458 |
|
|
Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
|
459 |
|
|
| Shift -> "dp" | Shift_2cycle -> "dp_2"
|
460 |
|
|
| ALU -> "dp" | ALU_2cycle -> "dp_2"
|
461 |
|
|
| Fmul -> "dp" | Fmul_2cycle -> "dp_2"
|
462 |
|
|
| Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
|
463 |
|
|
| Ls 1 -> "ls"
|
464 |
|
|
| Ls n -> "ls_" ^ (string_of_int n)
|
465 |
|
|
| Permute 1 -> "perm"
|
466 |
|
|
| Permute n -> "perm_" ^ (string_of_int n)
|
467 |
|
|
| Fmul_then_fadd -> "fmul_then_fadd"
|
468 |
|
|
| Fmul_then_fadd_2 -> "fmul_then_fadd_2"
|
469 |
|
|
in
|
470 |
|
|
Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
|
471 |
|
|
)
|
472 |
|
|
|
473 |
|
|
(* Given a guard description, return the name of the C function to
|
474 |
|
|
be used as the guard for define_bypass. *)
|
475 |
|
|
let guard_fn g =
|
476 |
|
|
match g with
|
477 |
|
|
Guard_only_m -> "arm_neon_only_m_dependency"
|
478 |
|
|
| Guard_only_n -> "arm_neon_only_n_dependency"
|
479 |
|
|
| Guard_only_d -> "arm_neon_only_d_dependency"
|
480 |
|
|
| Guard_none -> assert false
|
481 |
|
|
|
482 |
|
|
(* Emit a define_bypass for each bypass. *)
|
483 |
|
|
let emit_bypasses =
|
484 |
|
|
List.iter (
|
485 |
|
|
fun (producer, consumers, latency, guard) ->
|
486 |
|
|
Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
|
487 |
|
|
if guard = Guard_none then
|
488 |
|
|
Printf.printf " \"%s\")\n\n" consumers
|
489 |
|
|
else
|
490 |
|
|
begin
|
491 |
|
|
Printf.printf " \"%s\"\n" consumers;
|
492 |
|
|
Printf.printf " \"%s\")\n\n" (guard_fn guard)
|
493 |
|
|
end
|
494 |
|
|
)
|
495 |
|
|
|
496 |
|
|
(* Program entry point. *)
|
497 |
|
|
let main =
|
498 |
|
|
let table = calculate_sources availability_table in
|
499 |
|
|
let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
|
500 |
|
|
emit_insn_reservations (List.rev worst_cases);
|
501 |
|
|
Printf.printf ";; Exceptions to the default latencies.\n\n";
|
502 |
|
|
emit_bypasses bypasses
|
503 |
|
|
|