/* Array prefetching.
   Copyright (C) 2005, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "tm_p.h"
#include "basic-block.h"
#include "output.h"
#include "tree-pretty-print.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "tree-pass.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "diagnostic-core.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"


/* FIXME: Needed for optabs, but this should all be moved to a TBD interface
   between the GIMPLE and RTL worlds.  */
#include "expr.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do that, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
           it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (5) is the same with probability only
           7/32.
       (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance till the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We prefetch as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   A cost model is implemented to determine whether or not prefetching is
   profitable for a given loop.  The cost model has three heuristics:

   1. Function trip_count_to_ahead_ratio_too_small_p implements a
      heuristic that determines whether or not the loop has too few
      iterations (compared to ahead).  Prefetching is not likely to be
      beneficial if the trip count to ahead ratio is below a certain
      minimum.

   2. Function mem_ref_count_reasonable_p implements a heuristic that
      determines whether the given loop has enough CPU ops that can be
      overlapped with cache missing memory ops.  If not, the loop
      won't benefit from prefetching.  In the implementation,
      prefetching is not considered beneficial if the ratio between
      the instruction count and the mem ref count is below a certain
      minimum.

   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
         in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put their own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
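
/* Illustrative sketch (not part of the pass): assuming a prefetch distance
   ("ahead") of 8 iterations, an unroll factor of 4 and a read-only array,
   the transformation this pass aims for on a loop such as

     for (i = 0; i < n; i++)
       sum += a[i];

   is roughly equivalent to

     for (i = 0; i + 3 < n; i += 4)
       {
         __builtin_prefetch (&a[i + 8], 0, 3);
         sum += a[i];
         sum += a[i + 1];
         sum += a[i + 2];
         sum += a[i + 3];
       }
     for (; i < n; i++)
       sum += a[i];

   The actual prefetch distance, unroll factor and locality hint (the third
   argument of __builtin_prefetch) are chosen by the heuristics described
   above; the numbers here are only an example.  */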
 
/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater than or equal to
   40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  tree step;                    /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)

/* Do not generate a prefetch if the unroll factor is significantly less
   than what is required by the prefetch.  This is to avoid redundant
   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
   2, prefetching requires unrolling the loop 16 times, but
   the loop is actually unrolled twice.  In this case (ratio = 8),
   prefetching is not likely to be beneficial.  */

#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif

/* Some of the prefetch computations have quadratic complexity.  We want to
   avoid huge compile times and, therefore, want to limit the amount of
   memory references per loop where we consider prefetching.  */

#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif

/* The memory reference.  */

struct mem_ref
{
  gimple stmt;                  /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only first PREFETCH_BEFORE
                                   iterations.  */
  unsigned reuse_distance;      /* The amount of data accessed before the first
                                   reuse of this value.  */
  struct mem_ref *next;         /* The next reference in the group.  */
  unsigned write_p : 1;         /* Is it a write?  */
  unsigned independent_p : 1;   /* True if the reference is independent of
                                   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;        /* Should we really issue the prefetch?  */
  unsigned storent_p : 1;       /* True if we changed the store to a
                                   nontemporal one.  */
};

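/* Illustrative sketch (not part of the pass): for access (2) of the overview
   comment, a[i + 64] with char *a and a 64-byte cache line, the recorded
   mem_ref conceptually ends up as

     group->base     = a             (the invariant base address)
     group->step     = 1             (one char per iteration)
     delta           = 64
     write_p         = 1             (it is a store)
     prefetch_mod    = 64            (one prefetch per cache line)
     prefetch_before = PREFETCH_ALL

   record_ref fills in the first four fields; prefetch_mod and
   prefetch_before start at 1 and PREFETCH_ALL and are refined by the
   pruning functions below.  */
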
/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  if (cst_and_fits_in_hwi (ref->group->step))
    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
  else
    print_generic_expr (file, ref->group->step, TDF_TREE);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if (operand_equal_p ((*groups)->step, step, 0)
          && operand_equal_p ((*groups)->base, base, 0))
        return *groups;

      /* If step is an integer constant, keep the list of groups sorted
         by decreasing step.  */
      if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
          && int_cst_value ((*groups)->step) < int_cst_value (step))
        break;
    }

  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
            HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* A write reference need not be able to reuse a read prefetch, or
         vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
          && write_p
          && !(*aref)->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !write_p
          && (*aref)->write_p)
        continue;

      if ((*aref)->delta == delta)
        return;
    }

  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
        {
          next_r = ref->next;
          free (ref);
        }
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;                    /* Loop of the reference.  */
  gimple stmt;                          /* Statement of the reference.  */
  tree *step;                           /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;                 /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT idelta = 0, imult = 1;
  affine_iv iv;

  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
                  *index, &iv, true))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
        return false;
      imult = int_cst_value (stepsize);
      step = fold_build2 (MULT_EXPR, sizetype,
                          fold_convert (sizetype, step),
                          fold_convert (sizetype, stepsize));
      idelta *= imult;
    }

  if (*ar_data->step == NULL_TREE)
    *ar_data->step = step;
  else
    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
                                  fold_convert (sizetype, *ar_data->step),
                                  fold_convert (sizetype, step));
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
             tree *step, HOST_WIDE_INT *delta,
             gimple stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = NULL_TREE;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imaginary parts of a complex, so that
     they can have the same base.  */
  if (TREE_CODE (ref) == REALPART_EXPR
      || TREE_CODE (ref) == IMAGPART_EXPR
      || (TREE_CODE (ref) == COMPONENT_REF
          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
    {
      if (TREE_CODE (ref) == IMAGPART_EXPR)
        *delta += int_size_in_bytes (TREE_TYPE (ref));
      ref = TREE_OPERAND (ref, 0);
    }

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
                              tree ref, bool write_p, gimple stmt)
{
  tree base, step;
  HOST_WIDE_INT delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;
  /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
  if (step == NULL_TREE)
    return false;

  /* Stop if the address of BASE could not be taken.  */
  if (may_be_nonaddressable_p (base))
    return false;

  /* Limit non-constant step prefetching only to the innermost loops.  */
  if (!cst_and_fits_in_hwi (step) && loop->inner != NULL)
    return false;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}
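
/* Illustrative sketch (not part of the pass): for a loop such as

     struct pair { int key; int val; } *p;
     for (i = 0; i < n; i++)
       p[i].val = 0;

   the store p[i].val is, conceptually, decomposed by analyze_ref and
   idx_analyze_ref into

     base  = p                             (invariant base address)
     step  = sizeof (struct pair)          (8 bytes here)
     delta = offsetof (struct pair, val)   (4 bytes here)

   so that the address in iteration `iter' is &base + step * iter + delta,
   which is the shape recorded by the call to record_ref above.  The type
   and field names are made up for the example.  */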
 
/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the earlier references precede
     the later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
        continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
        {
          stmt = gsi_stmt (bsi);

          if (gimple_code (stmt) != GIMPLE_ASSIGN)
            {
              if (gimple_vuse (stmt)
                  || (is_gimple_call (stmt)
                      && !(gimple_call_flags (stmt) & ECF_CONST)))
                *no_other_refs = false;
              continue;
            }

          lhs = gimple_assign_lhs (stmt);
          rhs = gimple_assign_rhs1 (stmt);

          if (REFERENCE_CLASS_P (rhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              rhs, false, stmt);
              *ref_count += 1;
            }
          if (REFERENCE_CLASS_P (lhs))
            {
              *no_other_refs &= gather_memory_references_ref (loop, &refs,
                                                              lhs, true, stmt);
              *ref_count += 1;
            }
        }
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step;
  bool backward;

  /* If the step size is non-constant, we cannot calculate prefetch_mod.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
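
/* Worked example (illustrative only), matching access (3) of the overview
   comment: for a char array accessed as a[16*i] with PREFETCH_BLOCK = 64,
   the step is 16; it is positive, nonzero and not larger than the block,
   so the code above sets prefetch_mod = 64 / 16 = 4, i.e. one prefetch
   covers four consecutive iterations.  */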
 
/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  Return true if the computed miss rate
   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
   number of distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static bool
is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
                         HOST_WIDE_INT step, HOST_WIDE_INT delta,
                         unsigned HOST_WIDE_INT distinct_iters,
                         int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, max_allowed_miss_positions;
  int address1, address2, cache_line1, cache_line2;

  /* It always misses if delta is greater than or equal to the cache
     line size.  */
  if (delta >= (HOST_WIDE_INT) cache_line_size)
    return false;

  miss_positions = 0;
  total_positions = (cache_line_size / align_unit) * distinct_iters;
  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
        address1 = align + step * iter;
        address2 = address1 + delta;
        cache_line1 = address1 / cache_line_size;
        cache_line2 = address2 / cache_line_size;
        if (cache_line1 != cache_line2)
          {
            miss_positions += 1;
            if (miss_positions > max_allowed_miss_positions)
              return false;
          }
      }
  return true;
}
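
/* Worked example (illustrative only), matching accesses (4) and (5) of the
   overview comment: cache_line_size = 64, step = 187, delta = 50 and
   align_unit = 1 (a char array).  Since 187 is odd, the pattern repeats
   after 64 distinct iterations, and the two addresses share a cache line
   only when the first one falls within the first 64 - 50 = 14 bytes of
   its line, i.e. with probability 14/64 = 7/32.  The resulting miss rate
   of 25/32 (roughly 781 per thousand) is far above ACCEPTABLE_MISS_RATE
   (50 per thousand), so the function returns false and the caller does
   not set PREFETCH_BEFORE for that pair.  */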
 
/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
                          bool by_is_before)
{
  HOST_WIDE_INT step;
  bool backward;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  /* If the step is non-constant we cannot calculate prefetch_before.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
         former.  */
      if (by_is_before)
        ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
         same cache line, prefetch just the first one.  */
      if (!by_is_before)
        return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
          != ddown (by->delta, PREFETCH_BLOCK))
        return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
        return;

      /* Transform the data so that we may assume that the accesses
         are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
        return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how distant the iterations in which it occurs are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case with step > prefetch_block.  First reduce
     the ratio between the step and the cache line size to its simplest
     terms.  The resulting denominator will then represent the number of
     distinct iterations after which each address will go back to its
     initial location within the cache line.  This computation assumes
     that PREFETCH_BLOCK is a power of two.  */
  prefetch_block = PREFETCH_BLOCK;
  reduced_prefetch_block = prefetch_block;
  reduced_step = step;
  while ((reduced_step & 1) == 0
         && reduced_prefetch_block > 1)
    {
      reduced_step >>= 1;
      reduced_prefetch_block >>= 1;
    }

  prefetch_before = delta / step;
  delta %= step;
  ref_type = TREE_TYPE (ref->mem);
  align_unit = TYPE_ALIGN (ref_type) / 8;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
                               reduced_prefetch_block, align_unit))
    {
      /* Do not reduce prefetch_before if we meet beyond cache size.  */
      if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
        prefetch_before = PREFETCH_ALL;
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if (is_miss_rate_acceptable (prefetch_block, step, delta,
                               reduced_prefetch_block, align_unit))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* The ref probably does not reuse by.  */
  return;
}

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
        {
          before = false;
          continue;
        }

      if (!WRITE_CAN_USE_READ_PREFETCH
          && ref->write_p
          && !prune_by->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !ref->write_p
          && prune_by->write_p)
        continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

          if (ref_pruned->prefetch_before == PREFETCH_ALL
              && ref_pruned->prefetch_mod == 1)
            fprintf (dump_file, " no restrictions");
          else if (ref_pruned->prefetch_before == 0)
            fprintf (dump_file, " do not prefetch");
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
            fprintf (dump_file, " prefetch once");
          else
            {
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
                {
                  fprintf (dump_file, " prefetch before ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_before);
                }
              if (ref_pruned->prefetch_mod != 1)
                {
                  fprintf (dump_file, " prefetch mod ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_mod);
                }
            }
          fprintf (dump_file, "\n");
        }
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for only the first few of the
     iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
                 (void *) ref);
      return false;
    }

  /* Do not prefetch nontemporal stores.  */
  if (ref->storent_p)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
      return false;
    }

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
                     unsigned ahead)
{
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
  unsigned slots_per_prefetch;
  struct mem_ref *ref;
  bool any = false;

  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;

  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
     it will need a prefetch slot.  */
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
             slots_per_prefetch);

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
        if (!should_issue_prefetch_p (ref))
          continue;

        /* The loop is far from being sufficiently unrolled for this
           prefetch.  Do not generate the prefetch, to avoid many redundant
           prefetches.  */
        if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
          continue;

        /* If we need to prefetch the reference each PREFETCH_MOD iterations,
           and we unroll the loop UNROLL_FACTOR times, we need to insert
           ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
           iteration.  */
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                        / ref->prefetch_mod);
        prefetch_slots = n_prefetches * slots_per_prefetch;

        /* If more than half of the prefetches would be lost anyway, do not
           issue the prefetch.  */
        if (2 * remaining_prefetch_slots < prefetch_slots)
          continue;

        ref->issue_prefetch_p = true;

        if (remaining_prefetch_slots <= prefetch_slots)
          return true;
        remaining_prefetch_slots -= prefetch_slots;
        any = true;
      }

  return any;
}
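
/* Worked example (illustrative only): with SIMULTANEOUS_PREFETCHES = 6,
   ahead = 8 and unroll_factor = 4, each prefetch instruction occupies
   slots_per_prefetch = (8 + 2) / 4 = 2 slots.  A reference with
   prefetch_mod = 1 needs n_prefetches = 4 copies per unrolled iteration,
   i.e. 8 slots; since 2 * 6 >= 8 it is still scheduled, and because
   6 <= 8 it exhausts the budget, so no further candidates are accepted.
   The parameter values are made up for the example.  */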
 
/* Return TRUE if no prefetch is going to be generated in the given
   GROUPS.  */

static bool
nothing_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        return false;

  return true;
}

/* Estimate the number of prefetches in the given GROUPS.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static int
estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
{
  struct mem_ref *ref;
  unsigned n_prefetches;
  int prefetch_count = 0;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        {
          n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                          / ref->prefetch_mod);
          prefetch_count += n_prefetches;
        }

  return prefetch_count;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, write_p, local, forward;
  gimple prefetch;
  gimple_stmt_iterator bsi;
  unsigned n_prefetches, ap;
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
             nontemporal ? " nontemporal" : "",
             (void *) ref);

  bsi = gsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
                                        true, NULL, true, GSI_SAME_STMT);
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
  local = nontemporal ? integer_zero_node : integer_three_node;

  for (ap = 0; ap < n_prefetches; ap++)
    {
      if (cst_and_fits_in_hwi (ref->group->step))
        {
          /* Determine the address to prefetch.  */
          delta = (ahead + ap * ref->prefetch_mod) *
                   int_cst_value (ref->group->step);
          addr = fold_build_pointer_plus_hwi (addr_base, delta);
          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
                                           true, GSI_SAME_STMT);
        }
      else
        {
          /* The step size is non-constant but loop-invariant.  As a
             heuristic, we simply prefetch AHEAD iterations ahead.  */
          forward = fold_build2 (MULT_EXPR, sizetype,
                                 fold_convert (sizetype, ref->group->step),
                                 fold_convert (sizetype, size_int (ahead)));
          addr = fold_build_pointer_plus (addr_base, forward);
          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
                                           NULL, true, GSI_SAME_STMT);
        }
      /* Create the prefetch instruction.  */
      prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
                                    3, addr, write_p, local);
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
    }
}
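
/* Illustrative sketch (not part of the pass): for a read a[i] from an int
   array (constant step 4) with prefetch_mod = 16, unroll_factor = 16 and
   ahead = 32, the loop above emits a single call per unrolled body,
   conceptually

     __builtin_prefetch (&a[i] + 32 * 4, 0, 3);

   i.e. BUILT_IN_PREFETCH with the address AHEAD iterations further on,
   write_p = 0 for a read and locality 3 (0 would be used for a reference
   marked nontemporal).  The concrete numbers are only an example.  */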
 
/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
                  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
        issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */

static bool
nontemporal_store_p (struct mem_ref *ref)
{
  enum machine_mode mode;
  enum insn_code code;

  /* REF must be a write that is not reused.  We require it to be independent
     of all other memory references in the loop, as the nontemporal stores may
     be reordered with respect to other memory references.  */
  if (!ref->write_p
      || !ref->independent_p
      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
    return false;

  /* Check that we have the storent instruction for the mode.  */
  mode = TYPE_MODE (TREE_TYPE (ref->mem));
  if (mode == BLKmode)
    return false;

  code = optab_handler (storent_optab, mode);
  return code != CODE_FOR_nothing;
}

/* If REF is a nontemporal store, we mark the corresponding modify statement
   and return true.  Otherwise, we return false.  */

static bool
mark_nontemporal_store (struct mem_ref *ref)
{
  if (!nontemporal_store_p (ref))
    return false;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
             (void *) ref);

  gimple_assign_set_nontemporal_move (ref->stmt, true);
  ref->storent_p = true;

  return true;
}

/* Issue a memory fence instruction after LOOP.  */

static void
emit_mfence_after_loop (struct loop *loop)
{
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
  edge exit;
  gimple call;
  gimple_stmt_iterator bsi;
  unsigned i;

  FOR_EACH_VEC_ELT (edge, exits, i, exit)
    {
      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);

      if (!single_pred_p (exit->dest)
          /* If possible, we prefer not to insert the fence on other paths
             in cfg.  */
          && !(exit->flags & EDGE_ABNORMAL))
        split_loop_exit_edge (exit);
      bsi = gsi_after_labels (exit->dest);

      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
      mark_virtual_ops_for_renaming (call);
    }

  VEC_free (edge, heap, exits);
  update_ssa (TODO_update_ssa_only_virtuals);
}

/* Returns true if we can use storent in LOOP, false otherwise.  */

static bool
may_use_storent_in_loop_p (struct loop *loop)
{
  bool ret = true;

  if (loop->inner != NULL)
    return false;

  /* If we must issue an mfence insn after using storent, check that there
     is a suitable place for it at each of the loop exits.  */
  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
    {
      VEC (edge, heap) *exits = get_loop_exit_edges (loop);
      unsigned i;
      edge exit;

      FOR_EACH_VEC_ELT (edge, exits, i, exit)
        if ((exit->flags & EDGE_ABNORMAL)
            && exit->dest == EXIT_BLOCK_PTR)
          ret = false;

      VEC_free (edge, heap, exits);
    }

  return ret;
}

/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
   references in the loop.  */

static void
mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
{
  struct mem_ref *ref;
  bool any = false;

  if (!may_use_storent_in_loop_p (loop))
    return;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      any |= mark_nontemporal_store (ref);

  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
    emit_mfence_after_loop (loop);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
                      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
                         unsigned ninsns, struct tree_niter_desc *desc,
                         HOST_WIDE_INT est_niter)
{
  unsigned upper_bound;
  unsigned nfactor, factor, mod_constraint;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  /* First check whether the loop is not too large to unroll.  We ignore
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
     from unrolling them enough to make exactly one cache line covered by each
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
     us from unrolling the loops too many times in cases where we only expect
     gains from better scheduling and decreasing loop overhead, which is not
     the case here.  */
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;

  /* If we unrolled the loop more times than it iterates, the unrolled version
     of the loop would never be entered.  */
  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
    upper_bound = est_niter;

  if (upper_bound <= 1)
    return 1;

  /* Choose the factor so that we may prefetch each cache line just once,
     but bound the unrolling by UPPER_BOUND.  */
  factor = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        {
          mod_constraint = ref->prefetch_mod;
          nfactor = least_common_multiple (mod_constraint, factor);
          if (nfactor <= upper_bound)
            factor = nfactor;
        }

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
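
/* Worked example (illustrative only): with two prefetched references whose
   prefetch_mod values are 4 and 16 and with upper_bound = 32, the loop
   above computes factor = lcm (4, 1) = 4 and then lcm (16, 4) = 16, so
   the loop is unrolled 16 times; the mod-16 reference then needs one
   prefetch per unrolled body and the mod-4 reference four.  If
   upper_bound were only 8, the second candidate would be rejected and
   the factor would stay at 4.  The numbers are made up for the
   example.  */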
 
/* Returns the total volume of the memory references REFS, taking into account
   reuses in the innermost loop and cache line size.  TODO -- we should also
   take into account reuses across the iterations of the loops in the loop
   nest.  */

static unsigned
volume_of_references (struct mem_ref_group *refs)
{
  unsigned volume = 0;
  struct mem_ref_group *gr;
  struct mem_ref *ref;

  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
        /* Almost always reuses another value?  */
        if (ref->prefetch_before != PREFETCH_ALL)
          continue;

        /* If several iterations access the same cache line, use the size of
           the line divided by this number.  Otherwise, a cache line is
           accessed in each iteration.  TODO -- in the latter case, we should
           take the size of the reference into account, rounding it up to a
           multiple of the cache line size.  */
        volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
      }
  return volume;
}

/* Returns the volume of memory references accessed across VEC iterations of
   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */

static unsigned
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
{
  unsigned i;

  for (i = 0; i < n; i++)
    if (vec[i] != 0)
      break;

  if (i == n)
    return 0;

  gcc_assert (vec[i] > 0);

  /* We ignore the parts of the distance vector in subloops, since usually
     the numbers of iterations are much smaller.  */
  return loop_sizes[i] * vec[i];
}

/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */

static void
add_subscript_strides (tree access_fn, unsigned stride,
                       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
{
  struct loop *aloop;
  tree step;
  HOST_WIDE_INT astep;
  unsigned min_depth = loop_depth (loop) - n;

  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
    {
      aloop = get_chrec_loop (access_fn);
      step = CHREC_RIGHT (access_fn);
      access_fn = CHREC_LEFT (access_fn);

      if ((unsigned) loop_depth (aloop) <= min_depth)
        continue;

      if (host_integerp (step, 0))
        astep = tree_low_cst (step, 0);
      else
        astep = L1_CACHE_LINE_SIZE;

      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
    }
}

/* Returns the volume of memory references accessed between two consecutive
   self-reuses of the reference DR.  We consider the subscripts of DR in N
   loops, and LOOP_SIZES contains the volumes of accesses in each of the
   loops.  LOOP is the innermost loop of the current loop nest.  */

static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
                     struct loop *loop)
{
  tree stride, access_fn;
  HOST_WIDE_INT *strides, astride;
  VEC (tree, heap) *access_fns;
  tree ref = DR_REF (dr);
  unsigned i, ret = ~0u;

  /* In the following example:

     for (i = 0; i < N; i++)
       for (j = 0; j < N; j++)
         use (a[j][i]);
     the same cache line is accessed each N steps (except if the change from
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
     we cannot rely purely on the results of the data dependence analysis.

     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than the cache size.  */

  strides = XCNEWVEC (HOST_WIDE_INT, n);
  access_fns = DR_ACCESS_FNS (dr);

  FOR_EACH_VEC_ELT (tree, access_fns, i, access_fn)
    {
      /* Keep track of the reference corresponding to the subscript, so that we
         know its stride.  */
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
        ref = TREE_OPERAND (ref, 0);

      if (TREE_CODE (ref) == ARRAY_REF)
        {
          stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
          if (host_integerp (stride, 1))
            astride = tree_low_cst (stride, 1);
          else
            astride = L1_CACHE_LINE_SIZE;

          ref = TREE_OPERAND (ref, 0);
        }
      else
        astride = 1;

      add_subscript_strides (access_fn, astride, strides, n, loop);
    }

  for (i = n; i-- > 0; )
    {
      unsigned HOST_WIDE_INT s;

      s = strides[i] < 0 ? -strides[i] : strides[i];

      if (s < (unsigned) L1_CACHE_LINE_SIZE
          && (loop_sizes[i]
              > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
        {
          ret = loop_sizes[i];
          break;
        }
    }

  free (strides);
  return ret;
}

/* Determines the distance till the first reuse of each reference in REFS
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
   memory references in the loop.  */

static void
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
                           bool no_other_refs)
{
  struct loop *nest, *aloop;
  VEC (data_reference_p, heap) *datarefs = NULL;
  VEC (ddr_p, heap) *dependences = NULL;
  struct mem_ref_group *gr;
  struct mem_ref *ref, *refb;
  VEC (loop_p, heap) *vloops = NULL;
  unsigned *loop_data_size;
  unsigned i, j, n;
  unsigned volume, dist, adist;
  HOST_WIDE_INT vol;
  data_reference_p dr;
  ddr_p dep;

  if (loop->inner)
    return;

  /* Find the outermost loop of the loop nest of LOOP (we require that
     there are no sibling loops inside the nest).  */
  nest = loop;
  while (1)
    {
      aloop = loop_outer (nest);

      if (aloop == current_loops->tree_root
          || aloop->inner->next)
        break;

      nest = aloop;
    }

  /* For each loop, determine the amount of data accessed in each iteration.
     We use this to estimate whether the reference is evicted from the
     cache before its reuse.  */
  find_loop_nest (nest, &vloops);
  n = VEC_length (loop_p, vloops);
  loop_data_size = XNEWVEC (unsigned, n);
  volume = volume_of_references (refs);
  i = n;
  while (i-- != 0)
    {
      loop_data_size[i] = volume;
      /* Bound the volume by the L2 cache size, since above this bound,
         all dependence distances are equivalent.  */
      if (volume > L2_CACHE_SIZE_BYTES)
        continue;

      aloop = VEC_index (loop_p, vloops, i);
      vol = max_stmt_executions_int (aloop, false);
      if (vol < 0)
        vol = expected_loop_iterations (aloop);
      volume *= vol;
    }

  /* Prepare the references in the form suitable for data dependence
     analysis.  We ignore unanalyzable data references (the results
     are used just as a heuristic to estimate temporality of the
     references, hence we do not need to worry about correctness).  */
  for (gr = refs; gr; gr = gr->next)
    for (ref = gr->refs; ref; ref = ref->next)
      {
        dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
                              ref->mem, ref->stmt, !ref->write_p);

        if (dr)
          {
            ref->reuse_distance = volume;
            dr->aux = ref;
            VEC_safe_push (data_reference_p, heap, datarefs, dr);
          }
        else
          no_other_refs = false;
      }

  FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
    {
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
      ref = (struct mem_ref *) dr->aux;
      if (ref->reuse_distance > dist)
        ref->reuse_distance = dist;

      if (no_other_refs)
        ref->independent_p = true;
    }

  compute_all_dependences (datarefs, &dependences, vloops, true);

  FOR_EACH_VEC_ELT (ddr_p, dependences, i, dep)
    {
      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
        continue;

      ref = (struct mem_ref *) DDR_A (dep)->aux;
      refb = (struct mem_ref *) DDR_B (dep)->aux;

      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
          || DDR_NUM_DIST_VECTS (dep) == 0)
        {
          /* If the dependence cannot be analyzed, assume that there might be
             a reuse.  */
          dist = 0;

          ref->independent_p = false;
          refb->independent_p = false;
        }
      else
        {
          /* The distance vectors are normalized to be always lexicographically
             positive, hence we cannot tell just from them whether DDR_A comes
             before DDR_B or vice versa.  However, it is not important,
             anyway -- if DDR_A is close to DDR_B, then it is either reused in
             DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
             in cache (and marking it as nontemporal would not affect
             anything).  */

          dist = volume;
          for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
            {
              adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
                                             loop_data_size, n);

              /* If this is a dependence in the innermost loop (i.e., the
                 distances in all superloops are zero) and it is not
                 the trivial self-dependence with distance zero, record that
                 the references are not completely independent.  */
1628
              if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1629
                  && (ref != refb
1630
                      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1631
                {
1632
                  ref->independent_p = false;
1633
                  refb->independent_p = false;
1634
                }
1635
 
1636
              /* Ignore accesses closer than
1637
                 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1638
                 so that we use nontemporal prefetches e.g. if single memory
1639
                 location is accessed several times in a single iteration of
1640
                 the loop.  */
1641
              if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1642
                continue;
1643
 
1644
              if (adist < dist)
1645
                dist = adist;
1646
            }
1647
        }
1648
 
1649
      if (ref->reuse_distance > dist)
1650
        ref->reuse_distance = dist;
1651
      if (refb->reuse_distance > dist)
1652
        refb->reuse_distance = dist;
1653
    }
1654
 
1655
  free_dependence_relations (dependences);
1656
  free_data_refs (datarefs);
1657
  free (loop_data_size);
1658
 
1659
  if (dump_file && (dump_flags & TDF_DETAILS))
1660
    {
1661
      fprintf (dump_file, "Reuse distances:\n");
1662
      for (gr = refs; gr; gr = gr->next)
1663
        for (ref = gr->refs; ref; ref = ref->next)
1664
          fprintf (dump_file, " ref %p distance %u\n",
1665
                   (void *) ref, ref->reuse_distance);
1666
    }
1667
}
1668
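
/* Illustration (hypothetical example, assumed loop and array):  in

     for (i = 1; i < n; i++)
       for (j = 0; j < n; j++)
         b[i][j] = b[i - 1][j];

   the two references to B form a dependence whose distance vector is
   (1, 0), so the reuse distance computed above is roughly the amount of
   data touched by one iteration of the I loop.  If the rows are short,
   that distance is small and the references stay temporal; if a row
   already overflows the cache, the large reuse distance is what later
   lets the pass treat the accesses as nontemporal candidates.  */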
 
/* Determine whether or not the trip count to ahead ratio is too small based
   on profitability considerations.
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count.  */

static bool
trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
{
  /* Assume the trip count to ahead ratio is big enough if the trip count could
     not be estimated at compile time.  */
  if (est_niter < 0)
    return false;

  if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- loop estimated to roll only %d times\n",
                 (int) est_niter);
      return true;
    }

  return false;
}
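
/* Worked example (illustrative numbers only):  with a PREFETCH_LATENCY of
   200 cycles and a loop body costing about 10 time units, the caller
   computes AHEAD = (200 + 10 - 1) / 10 = 20 iterations.  If
   TRIP_COUNT_TO_AHEAD_RATIO were 4, any loop estimated to roll fewer than
   4 * 20 = 80 times would be rejected here, since a large share of its
   prefetches would fetch data that the loop never gets to use.  The
   latency, time and ratio values are assumptions for this example.  */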
 
/* Determine whether or not the number of memory references in the loop is
   reasonable based on the profitability and compilation time considerations.
   NINSNS: estimated number of instructions in the loop,
   MEM_REF_COUNT: total number of memory references in the loop.  */

static bool
mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
{
  int insn_to_mem_ratio;

  if (mem_ref_count == 0)
    return false;

  /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
     (compute_all_dependences) have costs that grow quadratically with the
     number of references.  To avoid huge compilation time, we give up
     prefetching if mem_ref_count is too large.  */
  if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
    return false;

  /* Prefetching improves performance by overlapping cache-missing
     memory accesses with CPU operations.  If the loop does not have
     enough CPU operations to overlap with memory operations, prefetching
     won't give a significant benefit.  One approximate way of checking
     this is to require the ratio of instructions to memory references to
     be above a certain limit.  This approximation works well in practice.
     TODO: Implement a more precise computation by estimating the time
     for each CPU or memory op in the loop.  Time estimates for memory ops
     should account for cache misses.  */
  insn_to_mem_ratio = ninsns / mem_ref_count;

  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
                 insn_to_mem_ratio);
      return false;
    }

  return true;
}
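
/* Worked example (illustrative numbers only):  a loop with NINSNS = 40 and
   MEM_REF_COUNT = 10 has an instruction to memory reference ratio of 4,
   so with an assumed PREFETCH_MIN_INSN_TO_MEM_RATIO of 3 it is still a
   candidate.  With NINSNS = 20 and the same ten references the ratio drops
   to 2 and the loop is rejected: there is too little non-memory work to
   hide the prefetch latency behind.  The parameter value 3 is only an
   example, not necessarily the configured default.  */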
 
/* Determine whether or not the instruction to prefetch ratio in the loop is
   too small based on profitability considerations.
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches,
   UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */

static bool
insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
                                     unsigned unroll_factor)
{
  int insn_to_prefetch_ratio;

  /* Prefetching most likely causes performance degradation when the instruction
     to prefetch ratio is too small.  Too many prefetch instructions in a loop
     may reduce I-cache performance.
     (unroll_factor * ninsns) is used to estimate the number of instructions in
     the unrolled loop.  This implementation is a bit simplistic -- the number
     of issued prefetch instructions is also affected by unrolling.  So,
     prefetch_mod and the unroll factor should be taken into account when
     determining prefetch_count.  Also, the number of insns of the unrolled
     loop will usually be significantly smaller than the number of insns of the
     original loop * unroll_factor (at least the induction variable increments
     and the exit branches will get eliminated), so it might be better to use
     tree_estimate_loop_size + estimated_unrolled_size.  */
  insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
  if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
                 insn_to_prefetch_ratio);
      return true;
    }

  return false;
}
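
/* Worked example (illustrative numbers only):  a loop with NINSNS = 12
   unrolled by UNROLL_FACTOR = 4 is estimated at 48 instructions.  With
   PREFETCH_COUNT = 4 the instruction to prefetch ratio is 12; with
   PREFETCH_COUNT = 8 it drops to 6.  If MIN_INSN_TO_PREFETCH_RATIO were 9,
   the first case would be prefetched and the second rejected, since the
   extra prefetch instructions would mostly cost I-cache space and issue
   slots.  The ratio value 9 is an assumption for this example.  */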
 
/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  struct tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  if (time == 0)
    return false;

  ahead = (PREFETCH_LATENCY + time - 1) / time;
  est_niter = max_stmt_executions_int (loop, false);

  /* Prefetching is not likely to be profitable if the trip count to ahead
     ratio is too small.  */
  if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
    return false;

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Give up prefetching if the number of memory references in the
     loop is not reasonable based on profitability and compilation time
     considerations.  */
  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
    goto fail;

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (nothing_to_prefetch_p (refs))
    goto fail;

  determine_loop_nest_reuse (loop, refs, no_other_refs);

  /* Step 3: determine unroll factor.  */
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
                                           est_niter);

  /* Estimate prefetch count for the unrolled loop.  */
  prefetch_count = estimate_prefetch_count (refs, unroll_factor);
  if (prefetch_count == 0)
    goto fail;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
             HOST_WIDE_INT_PRINT_DEC "\n"
             "insn count %d, mem ref count %d, prefetch count %d\n",
             ahead, unroll_factor, est_niter,
             ninsns, mem_ref_count, prefetch_count);

  /* Prefetching is not likely to be profitable if the instruction to prefetch
     ratio is too small.  */
  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
                                          unroll_factor))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}
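
/* Schematic of the overall transformation (hypothetical example, not a
   dump from the pass):  a loop such as

     for (i = 0; i < n; i++)
       sum += a[i];

   is, roughly, unrolled by a factor chosen in step 3 (say 4, so that one
   prefetch covers one cache line of four elements) and a prefetch is
   issued per line in step 6:

     for (i = 0; i < n; i += 4)
       {
         __builtin_prefetch (&a[i + 4 * ahead]);
         sum += a[i];
         sum += a[i + 1];
         sum += a[i + 2];
         sum += a[i + 3];
       }

   where AHEAD is the iteration distance derived from PREFETCH_LATENCY
   above.  The array A, its element size and the unroll factor 4 are
   assumptions for this sketch; the real output also handles the loop tail
   and the extra arguments of __builtin_prefetch.  */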
 
/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
         -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
         of processor costs and i486 does not have prefetch, but
         -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
               SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
               L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
               MIN_INSN_TO_PREFETCH_RATIO);
      fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
               PREFETCH_MIN_INSN_TO_MEM_RATIO);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
    {
      tree type = build_function_type_list (void_type_node,
                                            const_ptr_type_node, NULL_TREE);
      tree decl = add_builtin_function ("__builtin_prefetch", type,
                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                                        NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
    }

  /* We assume that the cache line size is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}
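
/* Usage note (informational; exact spellings and defaults depend on the
   GCC version and target):  this pass is enabled with
   -fprefetch-loop-arrays on targets that provide a prefetch instruction.
   The cache and latency assumptions printed above can typically be tuned
   with --param options such as l1-cache-line-size, l1-cache-size,
   l2-cache-size, prefetch-latency, simultaneous-prefetches and
   min-insn-to-prefetch-ratio, and the per-loop decisions are written to
   the -fdump-tree-aprefetch-details dump.  */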
