/* Array prefetching.
2
   Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.
3
 
4
This file is part of GCC.
5
 
6
GCC is free software; you can redistribute it and/or modify it
7
under the terms of the GNU General Public License as published by the
8
Free Software Foundation; either version 3, or (at your option) any
9
later version.
10
 
11
GCC is distributed in the hope that it will be useful, but WITHOUT
12
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14
for more details.
15
 
16
You should have received a copy of the GNU General Public License
17
along with GCC; see the file COPYING3.  If not see
18
<http://www.gnu.org/licenses/>.  */
19
 
20
#include "config.h"
21
#include "system.h"
22
#include "coretypes.h"
23
#include "tm.h"
24
#include "tree.h"
25
#include "rtl.h"
26
#include "tm_p.h"
27
#include "hard-reg-set.h"
28
#include "basic-block.h"
29
#include "output.h"
30
#include "diagnostic.h"
31
#include "tree-flow.h"
32
#include "tree-dump.h"
33
#include "timevar.h"
34
#include "cfgloop.h"
35
#include "varray.h"
36
#include "expr.h"
37
#include "tree-pass.h"
38
#include "ggc.h"
39
#include "insn-config.h"
40
#include "recog.h"
41
#include "hashtab.h"
42
#include "tree-chrec.h"
43
#include "tree-scalar-evolution.h"
44
#include "toplev.h"
45
#include "params.h"
46
#include "langhooks.h"
47
#include "tree-inline.h"
48
#include "tree-data-ref.h"
49
#include "optabs.h"
50
 
51
/* This pass inserts prefetch instructions to optimize cache usage during
52
   accesses to arrays in loops.  It processes loops sequentially and:
53
 
54
   1) Gathers all memory references in the single loop.
55
   2) For each of the references it decides when it is profitable to prefetch
56
      it.  To do so, we evaluate the reuse among the accesses, and determine
57
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
58
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
59
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
60
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
61
      (assuming cache line size is 64 bytes, char has size 1 byte and there
62
      is no hardware sequential prefetch):
63
 
64
      char *a;
65
      for (i = 0; i < max; i++)
66
        {
67
          a[255] = ...;         (0)
68
          a[i] = ...;           (1)
69
          a[i + 64] = ...;      (2)
70
          a[16*i] = ...;        (3)
71
          a[187*i] = ...;       (4)
72
          a[187*i + 50] = ...;  (5)
73
        }
74
 
75
       (0) obviously has PREFETCH_BEFORE 1
76
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
77
           location 64 iterations before it, and PREFETCH_MOD 64 (since
78
           it hits the same cache line otherwise).
79
       (2) has PREFETCH_MOD 64
80
       (3) has PREFETCH_MOD 4
81
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
82
           the cache line accessed by (4) is the same with probability only
83
           7/32.
84
       (5) has PREFETCH_MOD 1 as well.
85
 
86
      Additionally, we use data dependence analysis to determine for each
87
      reference the distance till the first reuse; this information is used
88
      to determine the temporality of the issued prefetch instruction.
89
 
90
   3) We determine how much ahead we need to prefetch.  The number of
91
      iterations needed is time to fetch / time spent in one iteration of
92
      the loop.  The problem is that we do not know either of these values,
93
      so we just make a heuristic guess based on a magic (possibly)
94
      target-specific constant and size of the loop.
95
 
96
   4) Determine which of the references we prefetch.  We take into account
97
      that there is a maximum number of simultaneous prefetches (provided
98
      by machine description).  We prefetch as many prefetches as possible
99
      while still within this bound (starting with those with lowest
100
      prefetch_mod, since they are responsible for most of the cache
101
      misses).
102
 
103
   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
104
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
105
      prefetching nonaccessed memory.
106
      TODO -- actually implement peeling.
107
 
108
   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
109
      prefetch instructions with guards in cases where 5) was not sufficient
110
      to satisfy the constraints?
111
 
112
   The function is_loop_prefetching_profitable() implements a cost model
   to determine if prefetching is profitable for a given loop.  The cost
   model has two heuristics:
   1. A heuristic that determines whether the given loop has enough CPU
      ops that can be overlapped with cache missing memory ops.
      If not, the loop won't benefit from prefetching.  This is implemented
      by requiring the ratio between the instruction count and the mem ref
      count to be above a certain minimum.
   2. A heuristic that disables prefetching in a loop with an unknown trip
      count if the prefetching cost is above a certain limit.  The relative
      prefetching cost is estimated by taking the ratio between the
      prefetch count and the total instruction count (this models the I-cache
      cost).
   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
         in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
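
/* For illustration of step 3 above: with an assumed prefetch latency of 200
   time units and a loop body estimated at 40 time units per iteration,
   prefetches would be issued about (200 + 40 - 1) / 40 == 5 iterations ahead
   of the use.  */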
 
138
/* Magic constants follow.  These should be replaced by machine specific
139
   numbers.  */
140
 
141
/* True if write can be prefetched by a read prefetch.  */
142
 
143
#ifndef WRITE_CAN_USE_READ_PREFETCH
144
#define WRITE_CAN_USE_READ_PREFETCH 1
145
#endif
146
 
147
/* True if read can be prefetched by a write prefetch. */
148
 
149
#ifndef READ_CAN_USE_WRITE_PREFETCH
150
#define READ_CAN_USE_WRITE_PREFETCH 0
151
#endif
152
 
153
/* The size of the block loaded by a single prefetch.  Usually, this is
154
   the same as cache line size (at the moment, we only consider one level
155
   of cache hierarchy).  */
156
 
157
#ifndef PREFETCH_BLOCK
158
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
159
#endif
160
 
161
/* Do we have a forward hardware sequential prefetching?  */
162
 
163
#ifndef HAVE_FORWARD_PREFETCH
164
#define HAVE_FORWARD_PREFETCH 0
165
#endif
166
 
167
/* Do we have a backward hardware sequential prefetching?  */
168
 
169
#ifndef HAVE_BACKWARD_PREFETCH
170
#define HAVE_BACKWARD_PREFETCH 0
171
#endif
172
 
173
/* In some cases we are only able to determine that there is a certain
174
   probability that the two accesses hit the same cache line.  In this
175
   case, we issue the prefetches for both of them if this probability
176
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
177
 
178
#ifndef ACCEPTABLE_MISS_RATE
179
#define ACCEPTABLE_MISS_RATE 50
180
#endif
181
 
182
#ifndef HAVE_prefetch
183
#define HAVE_prefetch 0
184
#endif
185
 
186
#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
187
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
188
 
189
/* We consider a memory access nontemporal if it is not reused sooner than
190
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
191
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
192
   so that we use nontemporal prefetches e.g. if a single memory location
   is accessed several times in a single iteration of the loop.  */
194
#define NONTEMPORAL_FRACTION 16
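
/* For example, assuming a 32 kB L1 cache and a 512 kB L2 cache (illustrative
   parameter values), a reference is treated as nontemporal only if more than
   512 * 1024 bytes are accessed before its next reuse, while reuses within
   32 * 1024 / 16 == 2048 bytes are ignored when computing that distance.  */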
195
 
196
/* In case we have to emit a memory fence instruction after the loop that
197
   uses nontemporal stores, this defines the builtin to use.  */
198
 
199
#ifndef FENCE_FOLLOWING_MOVNT
200
#define FENCE_FOLLOWING_MOVNT NULL_TREE
201
#endif
202
 
203
/* The group of references between which reuse may occur.  */
204
 
205
struct mem_ref_group
206
{
207
  tree base;                    /* Base of the reference.  */
208
  HOST_WIDE_INT step;           /* Step of the reference.  */
209
  struct mem_ref *refs;         /* References in the group.  */
210
  struct mem_ref_group *next;   /* Next group of references.  */
211
};
212
 
213
/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
214
 
215
#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)
216
 
217
/* The memory reference.  */
218
 
219
struct mem_ref
220
{
221
  gimple stmt;                  /* Statement in which the reference appears.  */
222
  tree mem;                     /* The reference.  */
223
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
224
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
225
  unsigned HOST_WIDE_INT prefetch_mod;
226
                                /* Prefetch only each PREFETCH_MOD-th
227
                                   iteration.  */
228
  unsigned HOST_WIDE_INT prefetch_before;
229
                                /* Prefetch only first PREFETCH_BEFORE
230
                                   iterations.  */
231
  unsigned reuse_distance;      /* The amount of data accessed before the first
232
                                   reuse of this value.  */
233
  struct mem_ref *next;         /* The next reference in the group.  */
234
  unsigned write_p : 1;         /* Is it a write?  */
235
  unsigned independent_p : 1;   /* True if the reference is independent of
                                   all other references inside the loop.  */
237
  unsigned issue_prefetch_p : 1;        /* Should we really issue the prefetch?  */
238
  unsigned storent_p : 1;       /* True if we changed the store to a
239
                                   nontemporal one.  */
240
};
241
 
242
/* Dumps information about reference REF to FILE.  */
243
 
244
static void
245
dump_mem_ref (FILE *file, struct mem_ref *ref)
246
{
247
  fprintf (file, "Reference %p:\n", (void *) ref);
248
 
249
  fprintf (file, "  group %p (base ", (void *) ref->group);
250
  print_generic_expr (file, ref->group->base, TDF_SLIM);
251
  fprintf (file, ", step ");
252
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
253
  fprintf (file, ")\n");
254
 
255
  fprintf (file, "  delta ");
256
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
257
  fprintf (file, "\n");
258
 
259
  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");
260
 
261
  fprintf (file, "\n");
262
}
263
 
264
/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
265
   exist.  */
266
 
267
static struct mem_ref_group *
268
find_or_create_group (struct mem_ref_group **groups, tree base,
269
                      HOST_WIDE_INT step)
270
{
271
  struct mem_ref_group *group;
272
 
273
  for (; *groups; groups = &(*groups)->next)
274
    {
275
      if ((*groups)->step == step
276
          && operand_equal_p ((*groups)->base, base, 0))
277
        return *groups;
278
 
279
      /* Keep the list of groups sorted by decreasing step.  */
280
      if ((*groups)->step < step)
281
        break;
282
    }
283
 
284
  group = XNEW (struct mem_ref_group);
285
  group->base = base;
286
  group->step = step;
287
  group->refs = NULL;
288
  group->next = *groups;
289
  *groups = group;
290
 
291
  return group;
292
}
293
 
294
/* Records a memory reference MEM in GROUP with offset DELTA and write status
295
   WRITE_P.  The reference occurs in statement STMT.  */
296
 
297
static void
298
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
299
            HOST_WIDE_INT delta, bool write_p)
300
{
301
  struct mem_ref **aref;
302
 
303
  /* Do not record the same address twice.  */
304
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
305
    {
306
      /* A write reference need not be able to reuse a read prefetch, or
         vice versa.  */
308
      if (!WRITE_CAN_USE_READ_PREFETCH
309
          && write_p
310
          && !(*aref)->write_p)
311
        continue;
312
      if (!READ_CAN_USE_WRITE_PREFETCH
313
          && !write_p
314
          && (*aref)->write_p)
315
        continue;
316
 
317
      if ((*aref)->delta == delta)
318
        return;
319
    }
320
 
321
  (*aref) = XNEW (struct mem_ref);
322
  (*aref)->stmt = stmt;
323
  (*aref)->mem = mem;
324
  (*aref)->delta = delta;
325
  (*aref)->write_p = write_p;
326
  (*aref)->prefetch_before = PREFETCH_ALL;
327
  (*aref)->prefetch_mod = 1;
328
  (*aref)->reuse_distance = 0;
329
  (*aref)->issue_prefetch_p = false;
330
  (*aref)->group = group;
331
  (*aref)->next = NULL;
332
  (*aref)->independent_p = false;
333
  (*aref)->storent_p = false;
334
 
335
  if (dump_file && (dump_flags & TDF_DETAILS))
336
    dump_mem_ref (dump_file, *aref);
337
}
338
 
339
/* Release memory references in GROUPS.  */
340
 
341
static void
342
release_mem_refs (struct mem_ref_group *groups)
343
{
344
  struct mem_ref_group *next_g;
345
  struct mem_ref *ref, *next_r;
346
 
347
  for (; groups; groups = next_g)
348
    {
349
      next_g = groups->next;
350
      for (ref = groups->refs; ref; ref = next_r)
351
        {
352
          next_r = ref->next;
353
          free (ref);
354
        }
355
      free (groups);
356
    }
357
}
358
 
359
/* A structure used to pass arguments to idx_analyze_ref.  */
360
 
361
struct ar_data
362
{
363
  struct loop *loop;                    /* Loop of the reference.  */
364
  gimple stmt;                          /* Statement of the reference.  */
365
  HOST_WIDE_INT *step;                  /* Step of the memory reference.  */
366
  HOST_WIDE_INT *delta;                 /* Offset of the memory reference.  */
367
};
368
 
369
/* Analyzes a single INDEX of a memory reference to obtain information
370
   described at analyze_ref.  Callback for for_each_index.  */
371
 
372
static bool
373
idx_analyze_ref (tree base, tree *index, void *data)
374
{
375
  struct ar_data *ar_data = (struct ar_data *) data;
376
  tree ibase, step, stepsize;
377
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
378
  affine_iv iv;
379
 
380
  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
381
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
382
    return false;
383
 
384
  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
385
                  *index, &iv, false))
386
    return false;
387
  ibase = iv.base;
388
  step = iv.step;
389
 
390
  if (!cst_and_fits_in_hwi (step))
391
    return false;
392
  istep = int_cst_value (step);
393
 
394
  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
395
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
396
    {
397
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
398
      ibase = TREE_OPERAND (ibase, 0);
399
    }
400
  if (cst_and_fits_in_hwi (ibase))
401
    {
402
      idelta += int_cst_value (ibase);
403
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
404
    }
405
 
406
  if (TREE_CODE (base) == ARRAY_REF)
407
    {
408
      stepsize = array_ref_element_size (base);
409
      if (!cst_and_fits_in_hwi (stepsize))
410
        return false;
411
      imult = int_cst_value (stepsize);
412
 
413
      istep *= imult;
414
      idelta *= imult;
415
    }
416
 
417
  *ar_data->step += istep;
418
  *ar_data->delta += idelta;
419
  *index = ibase;
420
 
421
  return true;
422
}
423
 
424
/* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA
   and STEP are integer constants and iter is the number of iterations of
   LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */
428
 
429
static bool
430
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
431
             HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
432
             gimple stmt)
433
{
434
  struct ar_data ar_data;
435
  tree off;
436
  HOST_WIDE_INT bit_offset;
437
  tree ref = *ref_p;
438
 
439
  *step = 0;
440
  *delta = 0;
441
 
442
  /* First strip off the component references.  Ignore bitfields.  */
443
  if (TREE_CODE (ref) == COMPONENT_REF
444
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
445
    ref = TREE_OPERAND (ref, 0);
446
 
447
  *ref_p = ref;
448
 
449
  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
450
    {
451
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
452
      bit_offset = TREE_INT_CST_LOW (off);
453
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);
454
 
455
      *delta += bit_offset / BITS_PER_UNIT;
456
    }
457
 
458
  *base = unshare_expr (ref);
459
  ar_data.loop = loop;
460
  ar_data.stmt = stmt;
461
  ar_data.step = step;
462
  ar_data.delta = delta;
463
  return for_each_index (base, idx_analyze_ref, &ar_data);
464
}
465
 
466
/* Record a memory reference REF to the list REFS.  The reference occurs in
467
   LOOP in statement STMT and it is a write if WRITE_P.  Returns true if the
468
   reference was recorded, false otherwise.  */
469
 
470
static bool
471
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
472
                              tree ref, bool write_p, gimple stmt)
473
{
474
  tree base;
475
  HOST_WIDE_INT step, delta;
476
  struct mem_ref_group *agrp;
477
 
478
  if (get_base_address (ref) == NULL)
479
    return false;
480
 
481
  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
482
    return false;
483
 
  /* Stop if the address of BASE could not be taken.  */
485
  if (may_be_nonaddressable_p (base))
486
    return false;
487
 
  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
489
     are integer constants.  */
490
  agrp = find_or_create_group (refs, base, step);
491
  record_ref (agrp, stmt, ref, delta, write_p);
492
 
493
  return true;
494
}
495
 
496
/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
497
   true if there are no other memory references inside the loop.  */
498
 
499
static struct mem_ref_group *
500
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
501
{
502
  basic_block *body = get_loop_body_in_dom_order (loop);
503
  basic_block bb;
504
  unsigned i;
505
  gimple_stmt_iterator bsi;
506
  gimple stmt;
507
  tree lhs, rhs;
508
  struct mem_ref_group *refs = NULL;
509
 
510
  *no_other_refs = true;
511
  *ref_count = 0;
512
 
513
  /* Scan the loop body in order, so that the former references precede the
514
     later ones.  */
515
  for (i = 0; i < loop->num_nodes; i++)
516
    {
517
      bb = body[i];
518
      if (bb->loop_father != loop)
519
        continue;
520
 
521
      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
522
        {
523
          stmt = gsi_stmt (bsi);
524
 
525
          if (gimple_code (stmt) != GIMPLE_ASSIGN)
526
            {
527
              if (gimple_vuse (stmt)
528
                  || (is_gimple_call (stmt)
529
                      && !(gimple_call_flags (stmt) & ECF_CONST)))
530
                *no_other_refs = false;
531
              continue;
532
            }
533
 
534
          lhs = gimple_assign_lhs (stmt);
535
          rhs = gimple_assign_rhs1 (stmt);
536
 
537
          if (REFERENCE_CLASS_P (rhs))
538
            {
539
            *no_other_refs &= gather_memory_references_ref (loop, &refs,
540
                                                            rhs, false, stmt);
541
            *ref_count += 1;
542
            }
543
          if (REFERENCE_CLASS_P (lhs))
544
            {
545
            *no_other_refs &= gather_memory_references_ref (loop, &refs,
546
                                                            lhs, true, stmt);
547
            *ref_count += 1;
548
            }
549
        }
550
    }
551
  free (body);
552
 
553
  return refs;
554
}
555
 
556
/* Prune the prefetch candidate REF using the self-reuse.  */
557
 
558
static void
559
prune_ref_by_self_reuse (struct mem_ref *ref)
560
{
561
  HOST_WIDE_INT step = ref->group->step;
562
  bool backward = step < 0;
563
 
564
  if (step == 0)
565
    {
566
      /* Prefetch references to invariant address just once.  */
567
      ref->prefetch_before = 1;
568
      return;
569
    }
570
 
571
  if (backward)
572
    step = -step;
573
 
574
  if (step > PREFETCH_BLOCK)
575
    return;
576
 
577
  if ((backward && HAVE_BACKWARD_PREFETCH)
578
      || (!backward && HAVE_FORWARD_PREFETCH))
579
    {
580
      ref->prefetch_before = 1;
581
      return;
582
    }
583
 
584
  ref->prefetch_mod = PREFETCH_BLOCK / step;
585
}
586
 
587
/* Divides X by BY, rounding down.  */
588
 
589
static HOST_WIDE_INT
590
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
591
{
592
  gcc_assert (by > 0);
593
 
594
  if (x >= 0)
595
    return x / by;
596
  else
597
    return (x + by - 1) / by;
598
}
599
 
600
/* Given a CACHE_LINE_SIZE and two inductive memory references
601
   with a common STEP greater than CACHE_LINE_SIZE and an address
602
   difference DELTA, compute the probability that they will fall
603
   in different cache lines.  DISTINCT_ITERS is the number of
604
   distinct iterations after which the pattern repeats itself.
605
   ALIGN_UNIT is the unit of alignment in bytes.  */
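
/* As an illustrative example: with a 64-byte cache line, STEP == 96,
   DELTA == 16 and ALIGN_UNIT == 4, the pattern repeats after
   DISTINCT_ITERS == 2 iterations (96/64 reduces to 3/2).  Of the
   16 * 2 == 32 (alignment, iteration) pairs examined, 8 place the two
   accesses in different cache lines, for a miss rate of
   1000 * 8 / 32 == 250 per thousand.  */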
606
 
607
static int
608
compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
609
                   HOST_WIDE_INT step, HOST_WIDE_INT delta,
610
                   unsigned HOST_WIDE_INT distinct_iters,
611
                   int align_unit)
612
{
613
  unsigned align, iter;
614
  int total_positions, miss_positions, miss_rate;
615
  int address1, address2, cache_line1, cache_line2;
616
 
617
  total_positions = 0;
618
  miss_positions = 0;
619
 
620
  /* Iterate through all possible alignments of the first
621
     memory reference within its cache line.  */
622
  for (align = 0; align < cache_line_size; align += align_unit)
623
 
624
    /* Iterate through all distinct iterations.  */
625
    for (iter = 0; iter < distinct_iters; iter++)
626
      {
627
        address1 = align + step * iter;
628
        address2 = address1 + delta;
629
        cache_line1 = address1 / cache_line_size;
630
        cache_line2 = address2 / cache_line_size;
631
        total_positions += 1;
632
        if (cache_line1 != cache_line2)
633
          miss_positions += 1;
634
      }
635
  miss_rate = 1000 * miss_positions / total_positions;
636
  return miss_rate;
637
}
638
 
639
/* Prune the prefetch candidate REF using the reuse with BY.
640
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */
641
 
642
static void
643
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
644
                          bool by_is_before)
645
{
646
  HOST_WIDE_INT step = ref->group->step;
647
  bool backward = step < 0;
648
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
649
  HOST_WIDE_INT delta = delta_b - delta_r;
650
  HOST_WIDE_INT hit_from;
651
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
652
  int miss_rate;
653
  HOST_WIDE_INT reduced_step;
654
  unsigned HOST_WIDE_INT reduced_prefetch_block;
655
  tree ref_type;
656
  int align_unit;
657
 
658
  if (delta == 0)
659
    {
660
      /* If the references have the same address, only prefetch the
         former.  */
662
      if (by_is_before)
663
        ref->prefetch_before = 0;
664
 
665
      return;
666
    }
667
 
668
  if (!step)
669
    {
670
      /* If the reference addresses are invariant and fall into the
671
         same cache line, prefetch just the first one.  */
672
      if (!by_is_before)
673
        return;
674
 
675
      if (ddown (ref->delta, PREFETCH_BLOCK)
676
          != ddown (by->delta, PREFETCH_BLOCK))
677
        return;
678
 
679
      ref->prefetch_before = 0;
680
      return;
681
    }
682
 
683
  /* Only prune the reference that is behind in the array.  */
684
  if (backward)
685
    {
686
      if (delta > 0)
687
        return;
688
 
689
      /* Transform the data so that we may assume that the accesses
690
         are forward.  */
691
      delta = - delta;
692
      step = -step;
693
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
694
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
695
    }
696
  else
697
    {
698
      if (delta < 0)
699
        return;
700
    }
701
 
702
  /* Check whether the two references are likely to hit the same cache
703
     line, and how distant the iterations in which it occurs are from
704
     each other.  */
705
 
706
  if (step <= PREFETCH_BLOCK)
707
    {
708
      /* The accesses are sure to meet.  Let us check when.  */
709
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
710
      prefetch_before = (hit_from - delta_r + step - 1) / step;
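      /* E.g., with PREFETCH_BLOCK == 64, step == 8, delta_r == 0 and
         delta_b == 100 (illustrative values), hit_from == 64 and
         prefetch_before == (64 + 8 - 1) / 8 == 8, i.e. REF starts hitting
         cache lines already touched by BY after roughly 8 iterations.  */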
711
 
712
      if (prefetch_before < ref->prefetch_before)
713
        ref->prefetch_before = prefetch_before;
714
 
715
      return;
716
    }
717
 
718
  /* A more complicated case with step > prefetch_block.  First reduce
719
     the ratio between the step and the cache line size to its simplest
720
     terms.  The resulting denominator will then represent the number of
721
     distinct iterations after which each address will go back to its
722
     initial location within the cache line.  This computation assumes
723
     that PREFETCH_BLOCK is a power of two.  */
724
  prefetch_block = PREFETCH_BLOCK;
725
  reduced_prefetch_block = prefetch_block;
726
  reduced_step = step;
727
  while ((reduced_step & 1) == 0
728
         && reduced_prefetch_block > 1)
729
    {
730
      reduced_step >>= 1;
731
      reduced_prefetch_block >>= 1;
732
    }
733
 
734
  prefetch_before = delta / step;
735
  delta %= step;
736
  ref_type = TREE_TYPE (ref->mem);
737
  align_unit = TYPE_ALIGN (ref_type) / 8;
738
  miss_rate = compute_miss_rate(prefetch_block, step, delta,
739
                                reduced_prefetch_block, align_unit);
740
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
741
    {
742
      if (prefetch_before < ref->prefetch_before)
743
        ref->prefetch_before = prefetch_before;
744
 
745
      return;
746
    }
747
 
748
  /* Try also the following iteration.  */
749
  prefetch_before++;
750
  delta = step - delta;
751
  miss_rate = compute_miss_rate(prefetch_block, step, delta,
752
                                reduced_prefetch_block, align_unit);
753
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
754
    {
755
      if (prefetch_before < ref->prefetch_before)
756
        ref->prefetch_before = prefetch_before;
757
 
758
      return;
759
    }
760
 
761
  /* The ref probably does not reuse BY.  */
762
  return;
763
}
764
 
765
/* Prune the prefetch candidate REF using the reuses with other references
766
   in REFS.  */
767
 
768
static void
769
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
770
{
771
  struct mem_ref *prune_by;
772
  bool before = true;
773
 
774
  prune_ref_by_self_reuse (ref);
775
 
776
  for (prune_by = refs; prune_by; prune_by = prune_by->next)
777
    {
778
      if (prune_by == ref)
779
        {
780
          before = false;
781
          continue;
782
        }
783
 
784
      if (!WRITE_CAN_USE_READ_PREFETCH
785
          && ref->write_p
786
          && !prune_by->write_p)
787
        continue;
788
      if (!READ_CAN_USE_WRITE_PREFETCH
789
          && !ref->write_p
790
          && prune_by->write_p)
791
        continue;
792
 
793
      prune_ref_by_group_reuse (ref, prune_by, before);
794
    }
795
}
796
 
797
/* Prune the prefetch candidates in GROUP using the reuse analysis.  */
798
 
799
static void
800
prune_group_by_reuse (struct mem_ref_group *group)
801
{
802
  struct mem_ref *ref_pruned;
803
 
804
  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
805
    {
806
      prune_ref_by_reuse (ref_pruned, group->refs);
807
 
808
      if (dump_file && (dump_flags & TDF_DETAILS))
809
        {
810
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
811
 
812
          if (ref_pruned->prefetch_before == PREFETCH_ALL
813
              && ref_pruned->prefetch_mod == 1)
814
            fprintf (dump_file, " no restrictions");
815
          else if (ref_pruned->prefetch_before == 0)
816
            fprintf (dump_file, " do not prefetch");
817
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
818
            fprintf (dump_file, " prefetch once");
819
          else
820
            {
821
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
822
                {
823
                  fprintf (dump_file, " prefetch before ");
824
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
825
                           ref_pruned->prefetch_before);
826
                }
827
              if (ref_pruned->prefetch_mod != 1)
828
                {
829
                  fprintf (dump_file, " prefetch mod ");
830
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
831
                           ref_pruned->prefetch_mod);
832
                }
833
            }
834
          fprintf (dump_file, "\n");
835
        }
836
    }
837
}
838
 
839
/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
840
 
841
static void
842
prune_by_reuse (struct mem_ref_group *groups)
843
{
844
  for (; groups; groups = groups->next)
845
    prune_group_by_reuse (groups);
846
}
847
 
848
/* Returns true if we should issue prefetch for REF.  */
849
 
850
static bool
851
should_issue_prefetch_p (struct mem_ref *ref)
852
{
853
  /* For now do not issue prefetches that are useful only for the first
     few iterations.  */
855
  if (ref->prefetch_before != PREFETCH_ALL)
856
    return false;
857
 
858
  /* Do not prefetch nontemporal stores.  */
859
  if (ref->storent_p)
860
    return false;
861
 
862
  return true;
863
}
864
 
865
/* Decide which of the prefetch candidates in GROUPS to prefetch.
866
   AHEAD is the number of iterations to prefetch ahead (which corresponds
867
   to the number of simultaneous instances of one prefetch running at a
868
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
869
   unrolled.  Returns true if there is anything to prefetch.  */
870
 
871
static bool
872
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
873
                     unsigned ahead)
874
{
875
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
876
  unsigned slots_per_prefetch;
877
  struct mem_ref *ref;
878
  bool any = false;
879
 
880
  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
881
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
882
 
883
  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
884
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
885
     it will need a prefetch slot.  */
886
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
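  /* For instance, with AHEAD == 8 and UNROLL_FACTOR == 4 (illustrative
     values), each prefetch instruction occupies (8 + 4 / 2) / 4 == 2 of the
     SIMULTANEOUS_PREFETCHES slots.  */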
887
  if (dump_file && (dump_flags & TDF_DETAILS))
888
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
889
             slots_per_prefetch);
890
 
891
  /* For now we just take memory references one by one and issue
892
     prefetches for as many as possible.  The groups are sorted
893
     starting with the largest step, since the references with
894
     large step are more likely to cause many cache misses.  */
895
 
896
  for (; groups; groups = groups->next)
897
    for (ref = groups->refs; ref; ref = ref->next)
898
      {
899
        if (!should_issue_prefetch_p (ref))
900
          continue;
901
 
902
        /* If we need to prefetch the reference each PREFETCH_MOD iterations,
903
           and we unroll the loop UNROLL_FACTOR times, we need to insert
904
           ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
905
           iteration.  */
906
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
907
                        / ref->prefetch_mod);
908
        prefetch_slots = n_prefetches * slots_per_prefetch;
909
 
910
        /* If more than half of the prefetches would be lost anyway, do not
911
           issue the prefetch.  */
912
        if (2 * remaining_prefetch_slots < prefetch_slots)
913
          continue;
914
 
915
        ref->issue_prefetch_p = true;
916
 
917
        if (remaining_prefetch_slots <= prefetch_slots)
918
          return true;
919
        remaining_prefetch_slots -= prefetch_slots;
920
        any = true;
921
      }
922
 
923
  return any;
924
}
925
 
926
/* Estimate the number of prefetches in the given GROUPS.  */
927
 
928
static int
929
estimate_prefetch_count (struct mem_ref_group *groups)
930
{
931
  struct mem_ref *ref;
932
  int prefetch_count = 0;
933
 
934
  for (; groups; groups = groups->next)
935
    for (ref = groups->refs; ref; ref = ref->next)
936
      if (should_issue_prefetch_p (ref))
937
          prefetch_count++;
938
 
939
  return prefetch_count;
940
}
941
 
942
/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which the loop was unrolled.  */
945
 
946
static void
947
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
948
{
949
  HOST_WIDE_INT delta;
950
  tree addr, addr_base, write_p, local;
951
  gimple prefetch;
952
  gimple_stmt_iterator bsi;
953
  unsigned n_prefetches, ap;
954
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
955
 
956
  if (dump_file && (dump_flags & TDF_DETAILS))
957
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
958
             nontemporal ? " nontemporal" : "",
959
             (void *) ref);
960
 
961
  bsi = gsi_for_stmt (ref->stmt);
962
 
963
  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
964
                  / ref->prefetch_mod);
965
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
966
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
967
                                        true, NULL, true, GSI_SAME_STMT);
968
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
969
  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
970
 
971
  for (ap = 0; ap < n_prefetches; ap++)
972
    {
973
      /* Determine the address to prefetch.  */
974
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
975
      addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
976
                          addr_base, size_int (delta));
977
      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
978
                                       true, GSI_SAME_STMT);
979
 
980
      /* Create the prefetch instruction.  */
981
      prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
982
                                    3, addr, write_p, local);
983
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
984
    }
985
}
986
 
987
/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which the loop was unrolled.  */
990
 
991
static void
992
issue_prefetches (struct mem_ref_group *groups,
993
                  unsigned unroll_factor, unsigned ahead)
994
{
995
  struct mem_ref *ref;
996
 
997
  for (; groups; groups = groups->next)
998
    for (ref = groups->refs; ref; ref = ref->next)
999
      if (ref->issue_prefetch_p)
1000
        issue_prefetch_ref (ref, unroll_factor, ahead);
1001
}
1002
 
1003
/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */
1005
 
1006
static bool
1007
nontemporal_store_p (struct mem_ref *ref)
1008
{
1009
  enum machine_mode mode;
1010
  enum insn_code code;
1011
 
1012
  /* REF must be a write that is not reused.  We require it to be independent
1013
     of all other memory references in the loop, as the nontemporal stores may
1014
     be reordered with respect to other memory references.  */
1015
  if (!ref->write_p
1016
      || !ref->independent_p
1017
      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1018
    return false;
1019
 
1020
  /* Check that we have the storent instruction for the mode.  */
1021
  mode = TYPE_MODE (TREE_TYPE (ref->mem));
1022
  if (mode == BLKmode)
1023
    return false;
1024
 
1025
  code = optab_handler (storent_optab, mode)->insn_code;
1026
  return code != CODE_FOR_nothing;
1027
}
1028
 
1029
/* If REF is a nontemporal store, we mark the corresponding modify statement
1030
   and return true.  Otherwise, we return false.  */
1031
 
1032
static bool
1033
mark_nontemporal_store (struct mem_ref *ref)
1034
{
1035
  if (!nontemporal_store_p (ref))
1036
    return false;
1037
 
1038
  if (dump_file && (dump_flags & TDF_DETAILS))
1039
    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1040
             (void *) ref);
1041
 
1042
  gimple_assign_set_nontemporal_move (ref->stmt, true);
1043
  ref->storent_p = true;
1044
 
1045
  return true;
1046
}
1047
 
1048
/* Issue a memory fence instruction after LOOP.  */
1049
 
1050
static void
1051
emit_mfence_after_loop (struct loop *loop)
1052
{
1053
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1054
  edge exit;
1055
  gimple call;
1056
  gimple_stmt_iterator bsi;
1057
  unsigned i;
1058
 
1059
  for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1060
    {
1061
      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1062
 
1063
      if (!single_pred_p (exit->dest)
1064
          /* If possible, we prefer not to insert the fence on other paths
1065
             in cfg.  */
1066
          && !(exit->flags & EDGE_ABNORMAL))
1067
        split_loop_exit_edge (exit);
1068
      bsi = gsi_after_labels (exit->dest);
1069
 
1070
      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1071
      mark_virtual_ops_for_renaming (call);
1072
    }
1073
 
1074
  VEC_free (edge, heap, exits);
1075
  update_ssa (TODO_update_ssa_only_virtuals);
1076
}
1077
 
1078
/* Returns true if we can use storent in loop, false otherwise.  */
1079
 
1080
static bool
1081
may_use_storent_in_loop_p (struct loop *loop)
1082
{
1083
  bool ret = true;
1084
 
1085
  if (loop->inner != NULL)
1086
    return false;
1087
 
1088
  /* If we must issue a mfence insn after using storent, check that there
1089
     is a suitable place for it at each of the loop exits.  */
1090
  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1091
    {
1092
      VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1093
      unsigned i;
1094
      edge exit;
1095
 
1096
      for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1097
        if ((exit->flags & EDGE_ABNORMAL)
1098
            && exit->dest == EXIT_BLOCK_PTR)
1099
          ret = false;
1100
 
1101
      VEC_free (edge, heap, exits);
1102
    }
1103
 
1104
  return ret;
1105
}
1106
 
1107
/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1108
   references in the loop.  */
1109
 
1110
static void
1111
mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1112
{
1113
  struct mem_ref *ref;
1114
  bool any = false;
1115
 
1116
  if (!may_use_storent_in_loop_p (loop))
1117
    return;
1118
 
1119
  for (; groups; groups = groups->next)
1120
    for (ref = groups->refs; ref; ref = ref->next)
1121
      any |= mark_nontemporal_store (ref);
1122
 
1123
  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1124
    emit_mfence_after_loop (loop);
1125
}
1126
 
1127
/* Determines whether we can profitably unroll LOOP FACTOR times, and if
1128
   this is the case, fill in DESC with the description of the number of
   iterations.  */
1130
 
1131
static bool
1132
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1133
                      unsigned factor)
1134
{
1135
  if (!can_unroll_loop_p (loop, factor, desc))
1136
    return false;
1137
 
1138
  /* We only consider loops without control flow for unrolling.  This is not
1139
     a hard restriction -- tree_unroll_loop works with arbitrary loops
1140
     as well; but the unrolling/prefetching is usually more profitable for
1141
     loops consisting of a single basic block, and we want to limit the
1142
     code growth.  */
1143
  if (loop->num_nodes > 2)
1144
    return false;
1145
 
1146
  return true;
1147
}
1148
 
1149
/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
   the loop, or -1 if no estimate is available.  */
1154
 
1155
static unsigned
1156
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1157
                         unsigned ninsns, struct tree_niter_desc *desc,
1158
                         HOST_WIDE_INT est_niter)
1159
{
1160
  unsigned upper_bound;
1161
  unsigned nfactor, factor, mod_constraint;
1162
  struct mem_ref_group *agp;
1163
  struct mem_ref *ref;
1164
 
1165
  /* First check whether the loop is not too large to unroll.  We ignore
1166
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1167
     from unrolling them enough to make exactly one cache line covered by each
1168
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1169
     us from unrolling the loops too many times in cases where we only expect
1170
     gains from better scheduling and decreasing loop overhead, which is not
1171
     the case here.  */
1172
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1173
 
1174
  /* If we unrolled the loop more times than it iterates, the unrolled version
1175
     of the loop would never be entered.  */
1176
  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1177
    upper_bound = est_niter;
1178
 
1179
  if (upper_bound <= 1)
1180
    return 1;
1181
 
1182
  /* Choose the factor so that we may prefetch each cache line just once,
1183
     but bound the unrolling by UPPER_BOUND.  */
1184
  factor = 1;
1185
  for (agp = refs; agp; agp = agp->next)
1186
    for (ref = agp->refs; ref; ref = ref->next)
1187
      if (should_issue_prefetch_p (ref))
1188
        {
1189
          mod_constraint = ref->prefetch_mod;
1190
          nfactor = least_common_multiple (mod_constraint, factor);
1191
          if (nfactor <= upper_bound)
1192
            factor = nfactor;
1193
        }
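  /* For instance (illustrative values), with candidate references whose
     PREFETCH_MOD values are 4 and 6 and an UPPER_BOUND of 16, the factor
     grows to least_common_multiple (4, 1) == 4 and then
     least_common_multiple (6, 4) == 12, which still fits within the bound.  */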
1194
 
1195
  if (!should_unroll_loop_p (loop, desc, factor))
1196
    return 1;
1197
 
1198
  return factor;
1199
}
1200
 
1201
/* Returns the total volume of the memory references REFS, taking into account
1202
   reuses in the innermost loop and cache line size.  TODO -- we should also
1203
   take into account reuses across the iterations of the loops in the loop
1204
   nest.  */
1205
 
1206
static unsigned
1207
volume_of_references (struct mem_ref_group *refs)
1208
{
1209
  unsigned volume = 0;
1210
  struct mem_ref_group *gr;
1211
  struct mem_ref *ref;
1212
 
1213
  for (gr = refs; gr; gr = gr->next)
1214
    for (ref = gr->refs; ref; ref = ref->next)
1215
      {
1216
        /* Almost always reuses another value?  */
1217
        if (ref->prefetch_before != PREFETCH_ALL)
1218
          continue;
1219
 
1220
        /* If several iterations access the same cache line, use the size of
1221
           the line divided by this number.  Otherwise, a cache line is
1222
           accessed in each iteration.  TODO -- in the latter case, we should
1223
           take the size of the reference into account, rounding it up on cache
1224
           line size multiple.  */
1225
        volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1226
      }
1227
  return volume;
1228
}
1229
 
1230
/* Returns the volume of memory references accessed across VEC iterations of
1231
   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1232
   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1233
 
1234
static unsigned
1235
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1236
{
1237
  unsigned i;
1238
 
1239
  for (i = 0; i < n; i++)
1240
    if (vec[i] != 0)
1241
      break;
1242
 
1243
  if (i == n)
1244
    return 0;
1245
 
1246
  gcc_assert (vec[i] > 0);
1247
 
1248
  /* We ignore the parts of the distance vector in subloops, since usually
1249
     the numbers of iterations are much smaller.  */
1250
  return loop_sizes[i] * vec[i];
1251
}
1252
 
1253
/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */
1256
 
1257
static void
1258
add_subscript_strides (tree access_fn, unsigned stride,
1259
                       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1260
{
1261
  struct loop *aloop;
1262
  tree step;
1263
  HOST_WIDE_INT astep;
1264
  unsigned min_depth = loop_depth (loop) - n;
1265
 
1266
  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1267
    {
1268
      aloop = get_chrec_loop (access_fn);
1269
      step = CHREC_RIGHT (access_fn);
1270
      access_fn = CHREC_LEFT (access_fn);
1271
 
1272
      if ((unsigned) loop_depth (aloop) <= min_depth)
1273
        continue;
1274
 
1275
      if (host_integerp (step, 0))
1276
        astep = tree_low_cst (step, 0);
1277
      else
1278
        astep = L1_CACHE_LINE_SIZE;
1279
 
1280
      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1281
 
1282
    }
1283
}
1284
 
1285
/* Returns the volume of memory references accessed between two consecutive
1286
   self-reuses of the reference DR.  We consider the subscripts of DR in N
1287
   loops, and LOOP_SIZES contains the volumes of accesses in each of the
1288
   loops.  LOOP is the innermost loop of the current loop nest.  */
1289
 
1290
static unsigned
1291
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1292
                     struct loop *loop)
1293
{
1294
  tree stride, access_fn;
1295
  HOST_WIDE_INT *strides, astride;
1296
  VEC (tree, heap) *access_fns;
1297
  tree ref = DR_REF (dr);
1298
  unsigned i, ret = ~0u;
1299
 
1300
  /* In the following example:
1301
 
1302
     for (i = 0; i < N; i++)
1303
       for (j = 0; j < N; j++)
1304
         use (a[j][i]);
1305
     the same cache line is accessed each N steps (except if the change from
1306
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1307
     we cannot rely purely on the results of the data dependence analysis.
1308
 
1309
     Instead, we compute the stride of the reference in each loop, and consider
1310
     the innermost loop in which the stride is less than the cache size.  */
1311
 
1312
  strides = XCNEWVEC (HOST_WIDE_INT, n);
1313
  access_fns = DR_ACCESS_FNS (dr);
1314
 
1315
  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1316
    {
1317
      /* Keep track of the reference corresponding to the subscript, so that we
1318
         know its stride.  */
1319
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1320
        ref = TREE_OPERAND (ref, 0);
1321
 
1322
      if (TREE_CODE (ref) == ARRAY_REF)
1323
        {
1324
          stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1325
          if (host_integerp (stride, 1))
1326
            astride = tree_low_cst (stride, 1);
1327
          else
1328
            astride = L1_CACHE_LINE_SIZE;
1329
 
1330
          ref = TREE_OPERAND (ref, 0);
1331
        }
1332
      else
1333
        astride = 1;
1334
 
1335
      add_subscript_strides (access_fn, astride, strides, n, loop);
1336
    }
1337
 
1338
  for (i = n; i-- > 0; )
1339
    {
1340
      unsigned HOST_WIDE_INT s;
1341
 
1342
      s = strides[i] < 0 ?  -strides[i] : strides[i];
1343
 
1344
      if (s < (unsigned) L1_CACHE_LINE_SIZE
1345
          && (loop_sizes[i]
1346
              > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1347
        {
1348
          ret = loop_sizes[i];
1349
          break;
1350
        }
1351
    }
1352
 
1353
  free (strides);
1354
  return ret;
1355
}
1356
 
1357
/* Determines the distance till the first reuse of each reference in REFS
1358
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1359
   memory references in the loop.  */
1360
 
1361
static void
1362
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1363
                           bool no_other_refs)
1364
{
1365
  struct loop *nest, *aloop;
1366
  VEC (data_reference_p, heap) *datarefs = NULL;
1367
  VEC (ddr_p, heap) *dependences = NULL;
1368
  struct mem_ref_group *gr;
1369
  struct mem_ref *ref, *refb;
1370
  VEC (loop_p, heap) *vloops = NULL;
1371
  unsigned *loop_data_size;
1372
  unsigned i, j, n;
1373
  unsigned volume, dist, adist;
1374
  HOST_WIDE_INT vol;
1375
  data_reference_p dr;
1376
  ddr_p dep;
1377
 
1378
  if (loop->inner)
1379
    return;
1380
 
1381
  /* Find the outermost loop of the loop nest of loop (we require that
1382
     there are no sibling loops inside the nest).  */
1383
  nest = loop;
1384
  while (1)
1385
    {
1386
      aloop = loop_outer (nest);
1387
 
1388
      if (aloop == current_loops->tree_root
1389
          || aloop->inner->next)
1390
        break;
1391
 
1392
      nest = aloop;
1393
    }
1394
 
1395
  /* For each loop, determine the amount of data accessed in each iteration.
1396
     We use this to estimate whether the reference is evicted from the
1397
     cache before its reuse.  */
1398
  find_loop_nest (nest, &vloops);
1399
  n = VEC_length (loop_p, vloops);
1400
  loop_data_size = XNEWVEC (unsigned, n);
1401
  volume = volume_of_references (refs);
1402
  i = n;
1403
  while (i-- != 0)
1404
    {
1405
      loop_data_size[i] = volume;
1406
      /* Bound the volume by the L2 cache size, since above this bound,
1407
         all dependence distances are equivalent.  */
1408
      if (volume > L2_CACHE_SIZE_BYTES)
1409
        continue;
1410
 
1411
      aloop = VEC_index (loop_p, vloops, i);
1412
      vol = estimated_loop_iterations_int (aloop, false);
1413
      if (vol < 0)
1414
        vol = expected_loop_iterations (aloop);
1415
      volume *= vol;
1416
    }
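  /* E.g., if the innermost loop accesses 256 bytes of data per iteration and
     is expected to iterate 100 times (illustrative numbers), each iteration
     of the enclosing loop is estimated to access about 25600 bytes.  */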
1417
 
1418
  /* Prepare the references in the form suitable for data dependence
1419
     analysis.  We ignore unanalyzable data references (the results
1420
     are used just as a heuristics to estimate temporality of the
1421
     references, hence we do not need to worry about correctness).  */
1422
  for (gr = refs; gr; gr = gr->next)
1423
    for (ref = gr->refs; ref; ref = ref->next)
1424
      {
1425
        dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1426
 
1427
        if (dr)
1428
          {
1429
            ref->reuse_distance = volume;
1430
            dr->aux = ref;
1431
            VEC_safe_push (data_reference_p, heap, datarefs, dr);
1432
          }
1433
        else
1434
          no_other_refs = false;
1435
      }
1436
 
1437
  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1438
    {
1439
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
1440
      ref = (struct mem_ref *) dr->aux;
1441
      if (ref->reuse_distance > dist)
1442
        ref->reuse_distance = dist;
1443
 
1444
      if (no_other_refs)
1445
        ref->independent_p = true;
1446
    }
1447
 
1448
  compute_all_dependences (datarefs, &dependences, vloops, true);
1449
 
1450
  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1451
    {
1452
      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1453
        continue;
1454
 
1455
      ref = (struct mem_ref *) DDR_A (dep)->aux;
1456
      refb = (struct mem_ref *) DDR_B (dep)->aux;
1457
 
1458
      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1459
          || DDR_NUM_DIST_VECTS (dep) == 0)
1460
        {
1461
          /* If the dependence cannot be analyzed, assume that there might be
1462
             a reuse.  */
1463
          dist = 0;
1464
 
1465
          ref->independent_p = false;
1466
          refb->independent_p = false;
1467
        }
1468
      else
1469
        {
1470
          /* The distance vectors are normalized to be always lexicographically
1471
             positive, hence we cannot tell just from them whether DDR_A comes
1472
             before DDR_B or vice versa.  However, it is not important,
1473
             anyway -- if DDR_A is close to DDR_B, then it is either reused in
1474
             DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1475
             in cache (and marking it as nontemporal would not affect
1476
             anything).  */
1477
 
1478
          dist = volume;
1479
          for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1480
            {
1481
              adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1482
                                             loop_data_size, n);
1483
 
1484
              /* If this is a dependence in the innermost loop (i.e., the
1485
                 distances in all superloops are zero) and it is not
1486
                 the trivial self-dependence with distance zero, record that
1487
                 the references are not completely independent.  */
1488
              if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1489
                  && (ref != refb
1490
                      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1491
                {
1492
                  ref->independent_p = false;
1493
                  refb->independent_p = false;
1494
                }
1495
 
1496
              /* Ignore accesses closer than
1497
                 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1498
                 so that we use nontemporal prefetches e.g. if a single memory
1499
                 location is accessed several times in a single iteration of
1500
                 the loop.  */
1501
              if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1502
                continue;
1503
 
1504
              if (adist < dist)
1505
                dist = adist;
1506
            }
1507
        }
1508
 
1509
      if (ref->reuse_distance > dist)
1510
        ref->reuse_distance = dist;
1511
      if (refb->reuse_distance > dist)
1512
        refb->reuse_distance = dist;
1513
    }
1514
 
1515
  free_dependence_relations (dependences);
1516
  free_data_refs (datarefs);
1517
  free (loop_data_size);
1518
 
1519
  if (dump_file && (dump_flags & TDF_DETAILS))
1520
    {
1521
      fprintf (dump_file, "Reuse distances:\n");
1522
      for (gr = refs; gr; gr = gr->next)
1523
        for (ref = gr->refs; ref; ref = ref->next)
1524
          fprintf (dump_file, " ref %p distance %u\n",
1525
                   (void *) ref, ref->reuse_distance);
1526
    }
1527
}

/* Do a cost-benefit analysis to determine if prefetching is profitable
   for the current loop given the following parameters:
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count,
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches,
   MEM_REF_COUNT: total number of memory references in the loop.  */

static bool
is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
                                unsigned ninsns, unsigned prefetch_count,
                                unsigned mem_ref_count)
{
  int insn_to_mem_ratio, insn_to_prefetch_ratio;

  if (mem_ref_count == 0)
    return false;

  /* Prefetching improves performance by overlapping cache-missing
     memory accesses with CPU operations.  If the loop does not have
     enough CPU operations to overlap with memory operations, prefetching
     won't give a significant benefit.  One approximate way of checking
     this is to require the ratio of instructions to memory references to
     be above a certain limit.  This approximation works well in practice.
     TODO: Implement a more precise computation by estimating the time
     for each CPU or memory op in the loop.  Time estimates for memory ops
     should account for cache misses.  */
  insn_to_mem_ratio = ninsns / mem_ref_count;
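  /* Illustrative example (hypothetical numbers): with ninsns == 30 and
     mem_ref_count == 6, insn_to_mem_ratio is 5, so the check below passes
     whenever PREFETCH_MIN_INSN_TO_MEM_RATIO <= 5; a tight copy loop with,
     say, 4 insns and 2 memory references (ratio 2) is likely rejected.  */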

  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
    return false;

  /* Profitability of prefetching is highly dependent on the trip count.
     For a given AHEAD distance, the first AHEAD iterations do not benefit
     from prefetching, and the last AHEAD iterations execute useless
     prefetches.  So, if the trip count is not large enough relative to AHEAD,
     prefetching may cause serious performance degradation.  To avoid this
     problem when the trip count is not known at compile time, we
     conservatively skip loops with high prefetching costs.  For now, only
     the I-cache cost is considered.  The relative I-cache cost is estimated
     by taking the ratio between the number of prefetches and the total
     number of instructions.  Since we are using integer arithmetic, we
     compute the reciprocal of this ratio.
     TODO: Account for loop unrolling, which may reduce the costs of
     shorter stride prefetches.  Note that not accounting for loop
     unrolling over-estimates the cost and hence gives more conservative
     results.  */
  if (est_niter < 0)
    {
      insn_to_prefetch_ratio = ninsns / prefetch_count;
      return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
    }
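
  /* Illustrative example (hypothetical numbers): a loop body of 100 insns
     issuing 4 prefetches has an insn-to-prefetch ratio of 25 and is kept
     above when the trip count is unknown, provided
     MIN_INSN_TO_PREFETCH_RATIO <= 25; a 20-insn body issuing 5 prefetches
     (ratio 4) is likely rejected.  */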

  if (est_niter <= (HOST_WIDE_INT) ahead)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- loop estimated to roll only %d times\n",
                 (int) est_niter);
      return false;
    }
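
  /* For instance, with a (hypothetical) ahead distance of 4, a loop
     estimated to roll only 3 times is rejected above: any prefetch it
     issued would either target data used before the prefetch could
     complete or data past the final iteration.  */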
  return true;
}


/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  struct tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  prefetch_count = estimate_prefetch_count (refs);
  if (prefetch_count == 0)
    goto fail;

  determine_loop_nest_reuse (loop, refs, no_other_refs);

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  ahead = (PREFETCH_LATENCY + time - 1) / time;
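  /* The expression above computes the integer ceiling of
     PREFETCH_LATENCY / time: e.g., with a (hypothetical) PREFETCH_LATENCY
     of 200 and an estimated loop-body time of 60, ahead is 4, i.e. data is
     prefetched four iterations before its use.  */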
  est_niter = estimated_loop_iterations_int (loop, false);

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
                                           est_niter);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
             HOST_WIDE_INT_PRINT_DEC "\n"
             "insn count %d, mem ref count %d, prefetch count %d\n",
             ahead, unroll_factor, est_niter,
             ninsns, mem_ref_count, prefetch_count);

  if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
                                       prefetch_count, mem_ref_count))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);
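
  /* Note: on success, control falls through to the fail: label below, which
     only releases the gathered memory references, so it doubles as the
     normal exit path.  */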

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
         -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
         of processor costs and i486 does not have prefetch, but
         -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
               SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
               L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
               MIN_INSN_TO_PREFETCH_RATIO);
      fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
               PREFETCH_MIN_INSN_TO_MEM_RATIO);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
                                       tree_cons (NULL_TREE,
                                                  const_ptr_type_node,
                                                  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                                        NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }
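
  /* Illustrative note: the declaration created above corresponds to the
     documented builtin __builtin_prefetch (const void *addr, ...), whose
     optional arguments select write vs. read access and the degree of
     temporal locality; the actual address arguments are supplied later,
     when the prefetches are issued.  */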

  /* We assume that the size of a cache line is a power of two, so verify
     this here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
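  /* The test above relies on the fact that for a power of two, x & (x - 1)
     clears the single set bit and yields zero (e.g. 64 & 63 == 0), whereas
     any other nonzero value keeps at least one bit set (e.g. 48 & 47 == 32).
     The example block sizes are illustrative only.  */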

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}
