/* Array prefetching.
   Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"
#include "optabs.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
           it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (4) is the same with probability only
           7/32.
       (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance until the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how far ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   The function is_loop_prefetching_profitable() implements a cost model
   to determine if prefetching is profitable for a given loop.  The cost
   model has two heuristics:
   1. A heuristic that determines whether the given loop has enough CPU
      ops that can be overlapped with cache missing memory ops.
      If not, the loop won't benefit from prefetching.  This is implemented
      by requiring the ratio between the instruction count and the mem ref
      count to be above a certain minimum.
   2. A heuristic that disables prefetching in a loop with an unknown trip
      count if the prefetching cost is above a certain limit.  The relative
      prefetching cost is estimated by taking the ratio between the
      prefetch count and the total instruction count (this models the I-cache
      cost).
   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (which could also be used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops into which the user has put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
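/* Illustrative sketch (not part of the original pass): how the example
   above maps onto PREFETCH_MOD and the 7/32 figure, assuming a 64-byte
   cache line.  The snippet is kept under #if 0 so it serves only as
   documentation.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int line = 64;

  /* Example (3): a[16*i] advances 16 bytes per iteration, so a new cache
     line is entered every 64 / 16 = 4 iterations, hence PREFETCH_MOD 4.  */
  printf ("PREFETCH_MOD for a[16*i]: %d\n", line / 16);

  /* Examples (4)/(5): the two accesses differ by 50 bytes, so with a
     random alignment they share a cache line with probability
     (64 - 50) / 64 = 14/64 = 7/32 -- too low to set PREFETCH_BEFORE.  */
  printf ("P(same line) = %d/%d\n", line - 50, line);
  return 0;
}
#endif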
137
 
138
/* Magic constants follow.  These should be replaced by machine specific
139
   numbers.  */
140
 
141
/* True if write can be prefetched by a read prefetch.  */
142
 
143
#ifndef WRITE_CAN_USE_READ_PREFETCH
144
#define WRITE_CAN_USE_READ_PREFETCH 1
145
#endif
146
 
147
/* True if read can be prefetched by a write prefetch. */
148
 
149
#ifndef READ_CAN_USE_WRITE_PREFETCH
150
#define READ_CAN_USE_WRITE_PREFETCH 0
151
#endif
152
 
153
/* The size of the block loaded by a single prefetch.  Usually, this is
154
   the same as cache line size (at the moment, we only consider one level
155
   of cache hierarchy).  */
156
 
157
#ifndef PREFETCH_BLOCK
158
#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
159
#endif
160
 
161
/* Do we have forward hardware sequential prefetching?  */
162
 
163
#ifndef HAVE_FORWARD_PREFETCH
164
#define HAVE_FORWARD_PREFETCH 0
165
#endif
166
 
167
/* Do we have backward hardware sequential prefetching?  */
168
 
169
#ifndef HAVE_BACKWARD_PREFETCH
170
#define HAVE_BACKWARD_PREFETCH 0
171
#endif
172
 
173
/* In some cases we are only able to determine that there is a certain
174
   probability that the two accesses hit the same cache line.  In this
175
   case, we issue the prefetches for both of them if this probability
176
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
177
 
178
#ifndef ACCEPTABLE_MISS_RATE
179
#define ACCEPTABLE_MISS_RATE 50
180
#endif
181
 
182
#ifndef HAVE_prefetch
183
#define HAVE_prefetch 0
184
#endif
185
 
186
#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
187
#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
188
 
189
/* We consider a memory access nontemporal if it is not reused sooner than
190
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
191
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
192
   so that we use nontemporal prefetches e.g. if a single memory location
193
   is accessed several times in a single iteration of the loop.  */
194
#define NONTEMPORAL_FRACTION 16
195
 
196
/* In case we have to emit a memory fence instruction after the loop that
197
   uses nontemporal stores, this defines the builtin to use.  */
198
 
199
#ifndef FENCE_FOLLOWING_MOVNT
200
#define FENCE_FOLLOWING_MOVNT NULL_TREE
201
#endif
202
 
203
/* The group of references among which reuse may occur.  */
204
 
205
struct mem_ref_group
206
{
207
  tree base;                    /* Base of the reference.  */
208
  HOST_WIDE_INT step;           /* Step of the reference.  */
209
  struct mem_ref *refs;         /* References in the group.  */
210
  struct mem_ref_group *next;   /* Next group of references.  */
211
};
212
 
213
/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
214
 
215
#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)
216
 
217
/* The memory reference.  */
218
 
219
struct mem_ref
220
{
221
  gimple stmt;                  /* Statement in which the reference appears.  */
222
  tree mem;                     /* The reference.  */
223
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
224
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
225
  unsigned HOST_WIDE_INT prefetch_mod;
226
                                /* Prefetch only each PREFETCH_MOD-th
227
                                   iteration.  */
228
  unsigned HOST_WIDE_INT prefetch_before;
229
                                /* Prefetch only first PREFETCH_BEFORE
230
                                   iterations.  */
231
  unsigned reuse_distance;      /* The amount of data accessed before the first
232
                                   reuse of this value.  */
233
  struct mem_ref *next;         /* The next reference in the group.  */
234
  unsigned write_p : 1;         /* Is it a write?  */
235
  unsigned independent_p : 1;   /* True if the reference is independent of
                                   all other references inside the loop.  */
237
  unsigned issue_prefetch_p : 1;        /* Should we really issue the prefetch?  */
238
  unsigned storent_p : 1;       /* True if we changed the store to a
239
                                   nontemporal one.  */
240
};
241
 
242
/* Dumps information about reference REF to FILE.  */
243
 
244
static void
245
dump_mem_ref (FILE *file, struct mem_ref *ref)
246
{
247
  fprintf (file, "Reference %p:\n", (void *) ref);
248
 
249
  fprintf (file, "  group %p (base ", (void *) ref->group);
250
  print_generic_expr (file, ref->group->base, TDF_SLIM);
251
  fprintf (file, ", step ");
252
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
253
  fprintf (file, ")\n");
254
 
255
  fprintf (file, "  delta ");
256
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
257
  fprintf (file, "\n");
258
 
259
  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");
260
 
261
  fprintf (file, "\n");
262
}
263
 
264
/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
265
   exist.  */
266
 
267
static struct mem_ref_group *
268
find_or_create_group (struct mem_ref_group **groups, tree base,
269
                      HOST_WIDE_INT step)
270
{
271
  struct mem_ref_group *group;
272
 
273
  for (; *groups; groups = &(*groups)->next)
274
    {
275
      if ((*groups)->step == step
276
          && operand_equal_p ((*groups)->base, base, 0))
277
        return *groups;
278
 
279
      /* Keep the list of groups sorted by decreasing step.  */
280
      if ((*groups)->step < step)
281
        break;
282
    }
283
 
284
  group = XNEW (struct mem_ref_group);
285
  group->base = base;
286
  group->step = step;
287
  group->refs = NULL;
288
  group->next = *groups;
289
  *groups = group;
290
 
291
  return group;
292
}
293
 
294
/* Records a memory reference MEM in GROUP with offset DELTA and write status
295
   WRITE_P.  The reference occurs in statement STMT.  */
296
 
297
static void
298
record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
299
            HOST_WIDE_INT delta, bool write_p)
300
{
301
  struct mem_ref **aref;
302
 
303
  /* Do not record the same address twice.  */
304
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
305
    {
306
      /* It may not be possible for a write reference to reuse the read
         prefetch, or vice versa.  */
308
      if (!WRITE_CAN_USE_READ_PREFETCH
309
          && write_p
310
          && !(*aref)->write_p)
311
        continue;
312
      if (!READ_CAN_USE_WRITE_PREFETCH
313
          && !write_p
314
          && (*aref)->write_p)
315
        continue;
316
 
317
      if ((*aref)->delta == delta)
318
        return;
319
    }
320
 
321
  (*aref) = XNEW (struct mem_ref);
322
  (*aref)->stmt = stmt;
323
  (*aref)->mem = mem;
324
  (*aref)->delta = delta;
325
  (*aref)->write_p = write_p;
326
  (*aref)->prefetch_before = PREFETCH_ALL;
327
  (*aref)->prefetch_mod = 1;
328
  (*aref)->reuse_distance = 0;
329
  (*aref)->issue_prefetch_p = false;
330
  (*aref)->group = group;
331
  (*aref)->next = NULL;
332
  (*aref)->independent_p = false;
333
  (*aref)->storent_p = false;
334
 
335
  if (dump_file && (dump_flags & TDF_DETAILS))
336
    dump_mem_ref (dump_file, *aref);
337
}
338
 
339
/* Release memory references in GROUPS.  */
340
 
341
static void
342
release_mem_refs (struct mem_ref_group *groups)
343
{
344
  struct mem_ref_group *next_g;
345
  struct mem_ref *ref, *next_r;
346
 
347
  for (; groups; groups = next_g)
348
    {
349
      next_g = groups->next;
350
      for (ref = groups->refs; ref; ref = next_r)
351
        {
352
          next_r = ref->next;
353
          free (ref);
354
        }
355
      free (groups);
356
    }
357
}
358
 
359
/* A structure used to pass arguments to idx_analyze_ref.  */
360
 
361
struct ar_data
362
{
363
  struct loop *loop;                    /* Loop of the reference.  */
364
  gimple stmt;                          /* Statement of the reference.  */
365
  HOST_WIDE_INT *step;                  /* Step of the memory reference.  */
366
  HOST_WIDE_INT *delta;                 /* Offset of the memory reference.  */
367
};
368
 
369
/* Analyzes a single INDEX of a memory reference to obtain information
370
   described at analyze_ref.  Callback for for_each_index.  */
371
 
372
static bool
373
idx_analyze_ref (tree base, tree *index, void *data)
374
{
375
  struct ar_data *ar_data = (struct ar_data *) data;
376
  tree ibase, step, stepsize;
377
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
378
  affine_iv iv;
379
 
380
  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
381
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
382
    return false;
383
 
384
  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
385
                  *index, &iv, false))
386
    return false;
387
  ibase = iv.base;
388
  step = iv.step;
389
 
390
  if (!cst_and_fits_in_hwi (step))
391
    return false;
392
  istep = int_cst_value (step);
393
 
394
  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
395
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
396
    {
397
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
398
      ibase = TREE_OPERAND (ibase, 0);
399
    }
400
  if (cst_and_fits_in_hwi (ibase))
401
    {
402
      idelta += int_cst_value (ibase);
403
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
404
    }
405
 
406
  if (TREE_CODE (base) == ARRAY_REF)
407
    {
408
      stepsize = array_ref_element_size (base);
409
      if (!cst_and_fits_in_hwi (stepsize))
410
        return false;
411
      imult = int_cst_value (stepsize);
412
 
413
      istep *= imult;
414
      idelta *= imult;
415
    }
416
 
417
  *ar_data->step += istep;
418
  *ar_data->delta += idelta;
419
  *index = ibase;
420
 
421
  return true;
422
}
423
 
424
/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
425
   STEP are integer constants and iter is number of iterations of LOOP.  The
426
   reference occurs in statement STMT.  Strips nonaddressable component
427
   references from REF_P.  */
428
 
429
static bool
430
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
431
             HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
432
             gimple stmt)
433
{
434
  struct ar_data ar_data;
435
  tree off;
436
  HOST_WIDE_INT bit_offset;
437
  tree ref = *ref_p;
438
 
439
  *step = 0;
440
  *delta = 0;
441
 
442
  /* First strip off the component references.  Ignore bitfields.  */
443
  if (TREE_CODE (ref) == COMPONENT_REF
444
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
445
    ref = TREE_OPERAND (ref, 0);
446
 
447
  *ref_p = ref;
448
 
449
  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
450
    {
451
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
452
      bit_offset = TREE_INT_CST_LOW (off);
453
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);
454
 
455
      *delta += bit_offset / BITS_PER_UNIT;
456
    }
457
 
458
  *base = unshare_expr (ref);
459
  ar_data.loop = loop;
460
  ar_data.stmt = stmt;
461
  ar_data.step = step;
462
  ar_data.delta = delta;
463
  return for_each_index (base, idx_analyze_ref, &ar_data);
464
}
465
 
466
/* Record a memory reference REF to the list REFS.  The reference occurs in
467
   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
468
   reference was recorded, false otherwise.  */
469
 
470
static bool
471
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
472
                              tree ref, bool write_p, gimple stmt)
473
{
474
  tree base;
475
  HOST_WIDE_INT step, delta;
476
  struct mem_ref_group *agrp;
477
 
478
  if (get_base_address (ref) == NULL)
479
    return false;
480
 
481
  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
482
    return false;
483
 
484
  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
485
     are integer constants.  */
486
  agrp = find_or_create_group (refs, base, step);
487
  record_ref (agrp, stmt, ref, delta, write_p);
488
 
489
  return true;
490
}
491
 
492
/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
493
   true if there are no other memory references inside the loop.  */
494
 
495
static struct mem_ref_group *
496
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
497
{
498
  basic_block *body = get_loop_body_in_dom_order (loop);
499
  basic_block bb;
500
  unsigned i;
501
  gimple_stmt_iterator bsi;
502
  gimple stmt;
503
  tree lhs, rhs;
504
  struct mem_ref_group *refs = NULL;
505
 
506
  *no_other_refs = true;
507
  *ref_count = 0;
508
 
509
  /* Scan the loop body in order, so that the former references precede the
510
     later ones.  */
511
  for (i = 0; i < loop->num_nodes; i++)
512
    {
513
      bb = body[i];
514
      if (bb->loop_father != loop)
515
        continue;
516
 
517
      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
518
        {
519
          stmt = gsi_stmt (bsi);
520
 
521
          if (gimple_code (stmt) != GIMPLE_ASSIGN)
522
            {
523
              if (gimple_vuse (stmt)
524
                  || (is_gimple_call (stmt)
525
                      && !(gimple_call_flags (stmt) & ECF_CONST)))
526
                *no_other_refs = false;
527
              continue;
528
            }
529
 
530
          lhs = gimple_assign_lhs (stmt);
531
          rhs = gimple_assign_rhs1 (stmt);
532
 
533
          if (REFERENCE_CLASS_P (rhs))
534
            {
535
            *no_other_refs &= gather_memory_references_ref (loop, &refs,
536
                                                            rhs, false, stmt);
537
            *ref_count += 1;
538
            }
539
          if (REFERENCE_CLASS_P (lhs))
540
            {
541
            *no_other_refs &= gather_memory_references_ref (loop, &refs,
542
                                                            lhs, true, stmt);
543
            *ref_count += 1;
544
            }
545
        }
546
    }
547
  free (body);
548
 
549
  return refs;
550
}
551
 
552
/* Prune the prefetch candidate REF using the self-reuse.  */
553
 
554
static void
555
prune_ref_by_self_reuse (struct mem_ref *ref)
556
{
557
  HOST_WIDE_INT step = ref->group->step;
558
  bool backward = step < 0;
559
 
560
  if (step == 0)
561
    {
562
      /* Prefetch references to invariant address just once.  */
563
      ref->prefetch_before = 1;
564
      return;
565
    }
566
 
567
  if (backward)
568
    step = -step;
569
 
570
  if (step > PREFETCH_BLOCK)
571
    return;
572
 
573
  if ((backward && HAVE_BACKWARD_PREFETCH)
574
      || (!backward && HAVE_FORWARD_PREFETCH))
575
    {
576
      ref->prefetch_before = 1;
577
      return;
578
    }
579
 
580
  ref->prefetch_mod = PREFETCH_BLOCK / step;
581
}
582
 
583
/* Divides X by BY, rounding down.  */
584
 
585
static HOST_WIDE_INT
586
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
587
{
588
  gcc_assert (by > 0);
589
 
590
  if (x >= 0)
591
    return x / by;
592
  else
593
    return (x + by - 1) / by;
594
}
595
 
596
/* Given a CACHE_LINE_SIZE and two inductive memory references
597
   with a common STEP greater than CACHE_LINE_SIZE and an address
598
   difference DELTA, compute the probability that they will fall
599
   in different cache lines.  DISTINCT_ITERS is the number of
600
   distinct iterations after which the pattern repeats itself.
601
   ALIGN_UNIT is the unit of alignment in bytes.  */
602
 
603
static int
604
compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
605
                   HOST_WIDE_INT step, HOST_WIDE_INT delta,
606
                   unsigned HOST_WIDE_INT distinct_iters,
607
                   int align_unit)
608
{
609
  unsigned align, iter;
610
  int total_positions, miss_positions, miss_rate;
611
  int address1, address2, cache_line1, cache_line2;
612
 
613
  total_positions = 0;
614
  miss_positions = 0;
615
 
616
  /* Iterate through all possible alignments of the first
617
     memory reference within its cache line.  */
618
  for (align = 0; align < cache_line_size; align += align_unit)
619
 
620
    /* Iterate through all distinct iterations.  */
621
    for (iter = 0; iter < distinct_iters; iter++)
622
      {
623
        address1 = align + step * iter;
624
        address2 = address1 + delta;
625
        cache_line1 = address1 / cache_line_size;
626
        cache_line2 = address2 / cache_line_size;
627
        total_positions += 1;
628
        if (cache_line1 != cache_line2)
629
          miss_positions += 1;
630
      }
631
  miss_rate = 1000 * miss_positions / total_positions;
632
  return miss_rate;
633
}
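/* A standalone sketch (not part of the pass) of the computation above for
   the a[187*i] / a[187*i + 50] pair from the file header: step 187,
   delta 50, a 64-byte line, 64 distinct iterations and byte alignment
   (all assumed).  It repeats the same counting loop and yields a miss rate
   of roughly 781 per thousand, far above ACCEPTABLE_MISS_RATE, which is
   why no PREFETCH_BEFORE is recorded for that pair.  */
#if 0
#include <stdio.h>

int
main (void)
{
  int line = 64, step = 187, delta = 50, iters = 64;
  int align, iter, total = 0, miss = 0;

  for (align = 0; align < line; align++)
    for (iter = 0; iter < iters; iter++)
      {
        int a1 = align + step * iter;       /* First address.  */
        int a2 = a1 + delta;                /* Second address.  */
        total++;
        if (a1 / line != a2 / line)         /* Different cache lines?  */
          miss++;
      }
  printf ("miss rate: %d per thousand\n", 1000 * miss / total);
  return 0;
}
#endif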
634
 
635
/* Prune the prefetch candidate REF using the reuse with BY.
636
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */
637
 
638
static void
639
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
640
                          bool by_is_before)
641
{
642
  HOST_WIDE_INT step = ref->group->step;
643
  bool backward = step < 0;
644
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
645
  HOST_WIDE_INT delta = delta_b - delta_r;
646
  HOST_WIDE_INT hit_from;
647
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
648
  int miss_rate;
649
  HOST_WIDE_INT reduced_step;
650
  unsigned HOST_WIDE_INT reduced_prefetch_block;
651
  tree ref_type;
652
  int align_unit;
653
 
654
  if (delta == 0)
655
    {
656
      /* If the references have the same address, only prefetch the
         former.  */
658
      if (by_is_before)
659
        ref->prefetch_before = 0;
660
 
661
      return;
662
    }
663
 
664
  if (!step)
665
    {
666
      /* If the reference addresses are invariant and fall into the
667
         same cache line, prefetch just the first one.  */
668
      if (!by_is_before)
669
        return;
670
 
671
      if (ddown (ref->delta, PREFETCH_BLOCK)
672
          != ddown (by->delta, PREFETCH_BLOCK))
673
        return;
674
 
675
      ref->prefetch_before = 0;
676
      return;
677
    }
678
 
679
  /* Only prune the reference that is behind in the array.  */
680
  if (backward)
681
    {
682
      if (delta > 0)
683
        return;
684
 
685
      /* Transform the data so that we may assume that the accesses
686
         are forward.  */
687
      delta = - delta;
688
      step = -step;
689
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
690
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
691
    }
692
  else
693
    {
694
      if (delta < 0)
695
        return;
696
    }
697
 
698
  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which this happens are
     from each other.  */
701
 
702
  if (step <= PREFETCH_BLOCK)
703
    {
704
      /* The accesses are sure to meet.  Let us check when.  */
705
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
706
      prefetch_before = (hit_from - delta_r + step - 1) / step;
707
 
708
      if (prefetch_before < ref->prefetch_before)
709
        ref->prefetch_before = prefetch_before;
710
 
711
      return;
712
    }
713
 
714
  /* A more complicated case with step > prefetch_block.  First reduce
715
     the ratio between the step and the cache line size to its simplest
716
     terms.  The resulting denominator will then represent the number of
717
     distinct iterations after which each address will go back to its
718
     initial location within the cache line.  This computation assumes
719
     that PREFETCH_BLOCK is a power of two.  */
720
  prefetch_block = PREFETCH_BLOCK;
721
  reduced_prefetch_block = prefetch_block;
722
  reduced_step = step;
723
  while ((reduced_step & 1) == 0
724
         && reduced_prefetch_block > 1)
725
    {
726
      reduced_step >>= 1;
727
      reduced_prefetch_block >>= 1;
728
    }
729
 
730
  prefetch_before = delta / step;
731
  delta %= step;
732
  ref_type = TREE_TYPE (ref->mem);
733
  align_unit = TYPE_ALIGN (ref_type) / 8;
734
  miss_rate = compute_miss_rate(prefetch_block, step, delta,
735
                                reduced_prefetch_block, align_unit);
736
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
737
    {
738
      if (prefetch_before < ref->prefetch_before)
739
        ref->prefetch_before = prefetch_before;
740
 
741
      return;
742
    }
743
 
744
  /* Try also the following iteration.  */
745
  prefetch_before++;
746
  delta = step - delta;
747
  miss_rate = compute_miss_rate(prefetch_block, step, delta,
748
                                reduced_prefetch_block, align_unit);
749
  if (miss_rate <= ACCEPTABLE_MISS_RATE)
750
    {
751
      if (prefetch_before < ref->prefetch_before)
752
        ref->prefetch_before = prefetch_before;
753
 
754
      return;
755
    }
756
 
757
  /* REF probably does not reuse BY.  */
758
  return;
759
}
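/* Illustrative sketch (not part of the pass): the step <= PREFETCH_BLOCK
   branch above applied to the a[i] / a[i + 64] pair from the file header
   (step 1, deltas 0 and 64, 64-byte line, all assumed).  It recovers the
   PREFETCH_BEFORE 64 quoted there.  */
#if 0
#include <stdio.h>

int
main (void)
{
  long block = 64, step = 1, delta_r = 0, delta_b = 64;
  long hit_from = (delta_b / block) * block;                 /* 64 */
  long prefetch_before = (hit_from - delta_r + step - 1) / step;

  printf ("PREFETCH_BEFORE = %ld\n", prefetch_before);       /* 64 */
  return 0;
}
#endif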
760
 
761
/* Prune the prefetch candidate REF using the reuses with other references
762
   in REFS.  */
763
 
764
static void
765
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
766
{
767
  struct mem_ref *prune_by;
768
  bool before = true;
769
 
770
  prune_ref_by_self_reuse (ref);
771
 
772
  for (prune_by = refs; prune_by; prune_by = prune_by->next)
773
    {
774
      if (prune_by == ref)
775
        {
776
          before = false;
777
          continue;
778
        }
779
 
780
      if (!WRITE_CAN_USE_READ_PREFETCH
781
          && ref->write_p
782
          && !prune_by->write_p)
783
        continue;
784
      if (!READ_CAN_USE_WRITE_PREFETCH
785
          && !ref->write_p
786
          && prune_by->write_p)
787
        continue;
788
 
789
      prune_ref_by_group_reuse (ref, prune_by, before);
790
    }
791
}
792
 
793
/* Prune the prefetch candidates in GROUP using the reuse analysis.  */
794
 
795
static void
796
prune_group_by_reuse (struct mem_ref_group *group)
797
{
798
  struct mem_ref *ref_pruned;
799
 
800
  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
801
    {
802
      prune_ref_by_reuse (ref_pruned, group->refs);
803
 
804
      if (dump_file && (dump_flags & TDF_DETAILS))
805
        {
806
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
807
 
808
          if (ref_pruned->prefetch_before == PREFETCH_ALL
809
              && ref_pruned->prefetch_mod == 1)
810
            fprintf (dump_file, " no restrictions");
811
          else if (ref_pruned->prefetch_before == 0)
812
            fprintf (dump_file, " do not prefetch");
813
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
814
            fprintf (dump_file, " prefetch once");
815
          else
816
            {
817
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
818
                {
819
                  fprintf (dump_file, " prefetch before ");
820
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
821
                           ref_pruned->prefetch_before);
822
                }
823
              if (ref_pruned->prefetch_mod != 1)
824
                {
825
                  fprintf (dump_file, " prefetch mod ");
826
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
827
                           ref_pruned->prefetch_mod);
828
                }
829
            }
830
          fprintf (dump_file, "\n");
831
        }
832
    }
833
}
834
 
835
/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
836
 
837
static void
838
prune_by_reuse (struct mem_ref_group *groups)
839
{
840
  for (; groups; groups = groups->next)
841
    prune_group_by_reuse (groups);
842
}
843
 
844
/* Returns true if we should issue prefetch for REF.  */
845
 
846
static bool
847
should_issue_prefetch_p (struct mem_ref *ref)
848
{
849
  /* For now, do not issue prefetches that are useful only for the first
     few iterations.  */
851
  if (ref->prefetch_before != PREFETCH_ALL)
852
    return false;
853
 
854
  /* Do not prefetch nontemporal stores.  */
855
  if (ref->storent_p)
856
    return false;
857
 
858
  return true;
859
}
860
 
861
/* Decide which of the prefetch candidates in GROUPS to prefetch.
862
   AHEAD is the number of iterations to prefetch ahead (which corresponds
863
   to the number of simultaneous instances of one prefetch running at a
864
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
865
   unrolled.  Returns true if there is anything to prefetch.  */
866
 
867
static bool
868
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
869
                     unsigned ahead)
870
{
871
  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
872
  unsigned slots_per_prefetch;
873
  struct mem_ref *ref;
874
  bool any = false;
875
 
876
  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
877
  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
878
 
879
  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
880
     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
881
     it will need a prefetch slot.  */
882
  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
883
  if (dump_file && (dump_flags & TDF_DETAILS))
884
    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
885
             slots_per_prefetch);
886
 
887
  /* For now we just take memory references one by one and issue
888
     prefetches for as many as possible.  The groups are sorted
889
     starting with the largest step, since the references with
890
     large step are more likely to cause many cache misses.  */
891
 
892
  for (; groups; groups = groups->next)
893
    for (ref = groups->refs; ref; ref = ref->next)
894
      {
895
        if (!should_issue_prefetch_p (ref))
896
          continue;
897
 
898
        /* If we need to prefetch the reference each PREFETCH_MOD iterations,
899
           and we unroll the loop UNROLL_FACTOR times, we need to insert
900
           ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
901
           iteration.  */
902
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
903
                        / ref->prefetch_mod);
904
        prefetch_slots = n_prefetches * slots_per_prefetch;
905
 
906
        /* If more than half of the prefetches would be lost anyway, do not
907
           issue the prefetch.  */
908
        if (2 * remaining_prefetch_slots < prefetch_slots)
909
          continue;
910
 
911
        ref->issue_prefetch_p = true;
912
 
913
        if (remaining_prefetch_slots <= prefetch_slots)
914
          return true;
915
        remaining_prefetch_slots -= prefetch_slots;
916
        any = true;
917
      }
918
 
919
  return any;
920
}
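/* Illustrative sketch (not part of the pass): the slot bookkeeping above
   with assumed values AHEAD = 4, UNROLL_FACTOR = 2 and PREFETCH_MOD = 4.
   Each prefetch then occupies (4 + 1) / 2 = 2 slots and the reference
   needs ceil (2 / 4) = 1 prefetch instruction per unrolled iteration.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned ahead = 4, unroll_factor = 2, prefetch_mod = 4;
  unsigned slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
  unsigned n_prefetches = (unroll_factor + prefetch_mod - 1) / prefetch_mod;

  printf ("slots per prefetch %u, prefetches %u, slots used %u\n",
          slots_per_prefetch, n_prefetches,
          n_prefetches * slots_per_prefetch);
  return 0;
}
#endif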
921
 
922
/* Estimate the number of prefetches in the given GROUPS.  */
923
 
924
static int
925
estimate_prefetch_count (struct mem_ref_group *groups)
926
{
927
  struct mem_ref *ref;
928
  int prefetch_count = 0;
929
 
930
  for (; groups; groups = groups->next)
931
    for (ref = groups->refs; ref; ref = ref->next)
932
      if (should_issue_prefetch_p (ref))
933
          prefetch_count++;
934
 
935
  return prefetch_count;
936
}
937
 
938
/* Issue prefetches for the reference REF into loop as decided before.
939
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
940
   is the factor by which LOOP was unrolled.  */
941
 
942
static void
943
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
944
{
945
  HOST_WIDE_INT delta;
946
  tree addr, addr_base, write_p, local;
947
  gimple prefetch;
948
  gimple_stmt_iterator bsi;
949
  unsigned n_prefetches, ap;
950
  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
951
 
952
  if (dump_file && (dump_flags & TDF_DETAILS))
953
    fprintf (dump_file, "Issued%s prefetch for %p.\n",
954
             nontemporal ? " nontemporal" : "",
955
             (void *) ref);
956
 
957
  bsi = gsi_for_stmt (ref->stmt);
958
 
959
  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
960
                  / ref->prefetch_mod);
961
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
962
  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
963
                                        true, NULL, true, GSI_SAME_STMT);
964
  write_p = ref->write_p ? integer_one_node : integer_zero_node;
965
  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
966
 
967
  for (ap = 0; ap < n_prefetches; ap++)
968
    {
969
      /* Determine the address to prefetch.  */
970
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
971
      addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
972
                          addr_base, size_int (delta));
973
      addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
974
                                       true, GSI_SAME_STMT);
975
 
976
      /* Create the prefetch instruction.  */
977
      prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
978
                                    3, addr, write_p, local);
979
      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
980
    }
981
}
982
 
983
/* Issue prefetches for the references in GROUPS into loop as decided before.
984
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   factor by which LOOP was unrolled.  */
986
 
987
static void
988
issue_prefetches (struct mem_ref_group *groups,
989
                  unsigned unroll_factor, unsigned ahead)
990
{
991
  struct mem_ref *ref;
992
 
993
  for (; groups; groups = groups->next)
994
    for (ref = groups->refs; ref; ref = ref->next)
995
      if (ref->issue_prefetch_p)
996
        issue_prefetch_ref (ref, unroll_factor, ahead);
997
}
998
 
999
/* Returns true if REF is a memory write for which a nontemporal store insn
   can be used.  */
1001
 
1002
static bool
1003
nontemporal_store_p (struct mem_ref *ref)
1004
{
1005
  enum machine_mode mode;
1006
  enum insn_code code;
1007
 
1008
  /* REF must be a write that is not reused.  We require it to be independent
1009
     on all other memory references in the loop, as the nontemporal stores may
1010
     be reordered with respect to other memory references.  */
1011
  if (!ref->write_p
1012
      || !ref->independent_p
1013
      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1014
    return false;
1015
 
1016
  /* Check that we have the storent instruction for the mode.  */
1017
  mode = TYPE_MODE (TREE_TYPE (ref->mem));
1018
  if (mode == BLKmode)
1019
    return false;
1020
 
1021
  code = optab_handler (storent_optab, mode)->insn_code;
1022
  return code != CODE_FOR_nothing;
1023
}
1024
 
1025
/* If REF is a nontemporal store, we mark the corresponding modify statement
1026
   and return true.  Otherwise, we return false.  */
1027
 
1028
static bool
1029
mark_nontemporal_store (struct mem_ref *ref)
1030
{
1031
  if (!nontemporal_store_p (ref))
1032
    return false;
1033
 
1034
  if (dump_file && (dump_flags & TDF_DETAILS))
1035
    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1036
             (void *) ref);
1037
 
1038
  gimple_assign_set_nontemporal_move (ref->stmt, true);
1039
  ref->storent_p = true;
1040
 
1041
  return true;
1042
}
1043
 
1044
/* Issue a memory fence instruction after LOOP.  */
1045
 
1046
static void
1047
emit_mfence_after_loop (struct loop *loop)
1048
{
1049
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1050
  edge exit;
1051
  gimple call;
1052
  gimple_stmt_iterator bsi;
1053
  unsigned i;
1054
 
1055
  for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1056
    {
1057
      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1058
 
1059
      if (!single_pred_p (exit->dest)
1060
          /* If possible, we prefer not to insert the fence on other paths
1061
             in cfg.  */
1062
          && !(exit->flags & EDGE_ABNORMAL))
1063
        split_loop_exit_edge (exit);
1064
      bsi = gsi_after_labels (exit->dest);
1065
 
1066
      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1067
      mark_virtual_ops_for_renaming (call);
1068
    }
1069
 
1070
  VEC_free (edge, heap, exits);
1071
  update_ssa (TODO_update_ssa_only_virtuals);
1072
}
1073
 
1074
/* Returns true if we can use storent in loop, false otherwise.  */
1075
 
1076
static bool
1077
may_use_storent_in_loop_p (struct loop *loop)
1078
{
1079
  bool ret = true;
1080
 
1081
  if (loop->inner != NULL)
1082
    return false;
1083
 
1084
  /* If we must issue a mfence insn after using storent, check that there
1085
     is a suitable place for it at each of the loop exits.  */
1086
  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1087
    {
1088
      VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1089
      unsigned i;
1090
      edge exit;
1091
 
1092
      for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1093
        if ((exit->flags & EDGE_ABNORMAL)
1094
            && exit->dest == EXIT_BLOCK_PTR)
1095
          ret = false;
1096
 
1097
      VEC_free (edge, heap, exits);
1098
    }
1099
 
1100
  return ret;
1101
}
1102
 
1103
/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1104
   references in the loop.  */
1105
 
1106
static void
1107
mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1108
{
1109
  struct mem_ref *ref;
1110
  bool any = false;
1111
 
1112
  if (!may_use_storent_in_loop_p (loop))
1113
    return;
1114
 
1115
  for (; groups; groups = groups->next)
1116
    for (ref = groups->refs; ref; ref = ref->next)
1117
      any |= mark_nontemporal_store (ref);
1118
 
1119
  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1120
    emit_mfence_after_loop (loop);
1121
}
1122
 
1123
/* Determines whether we can profitably unroll LOOP FACTOR times, and if
1124
   this is the case, fill in DESC with the description of the number of
1125
   iterations.  */
1126
 
1127
static bool
1128
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1129
                      unsigned factor)
1130
{
1131
  if (!can_unroll_loop_p (loop, factor, desc))
1132
    return false;
1133
 
1134
  /* We only consider loops without control flow for unrolling.  This is not
1135
     a hard restriction -- tree_unroll_loop works with arbitrary loops
1136
     as well; but the unrolling/prefetching is usually more profitable for
1137
     loops consisting of a single basic block, and we want to limit the
1138
     code growth.  */
1139
  if (loop->num_nodes > 2)
1140
    return false;
1141
 
1142
  return true;
1143
}
1144
 
1145
/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  A description of the
   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1148
   insns of the LOOP.  EST_NITER is the estimated number of iterations of
1149
   the loop, or -1 if no estimate is available.  */
1150
 
1151
static unsigned
1152
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1153
                         unsigned ninsns, struct tree_niter_desc *desc,
1154
                         HOST_WIDE_INT est_niter)
1155
{
1156
  unsigned upper_bound;
1157
  unsigned nfactor, factor, mod_constraint;
1158
  struct mem_ref_group *agp;
1159
  struct mem_ref *ref;
1160
 
1161
  /* First check whether the loop is not too large to unroll.  We ignore
1162
     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1163
     from unrolling them enough to make exactly one cache line covered by each
1164
     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1165
     us from unrolling the loops too many times in cases where we only expect
1166
     gains from better scheduling and decreasing loop overhead, which is not
1167
     the case here.  */
1168
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1169
 
1170
  /* If we unrolled the loop more times than it iterates, the unrolled version
1171
     of the loop would never be entered.  */
1172
  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1173
    upper_bound = est_niter;
1174
 
1175
  if (upper_bound <= 1)
1176
    return 1;
1177
 
1178
  /* Choose the factor so that we may prefetch each cache line just once,
1179
     but bound the unrolling by UPPER_BOUND.  */
1180
  factor = 1;
1181
  for (agp = refs; agp; agp = agp->next)
1182
    for (ref = agp->refs; ref; ref = ref->next)
1183
      if (should_issue_prefetch_p (ref))
1184
        {
1185
          mod_constraint = ref->prefetch_mod;
1186
          nfactor = least_common_multiple (mod_constraint, factor);
1187
          if (nfactor <= upper_bound)
1188
            factor = nfactor;
1189
        }
1190
 
1191
  if (!should_unroll_loop_p (loop, desc, factor))
1192
    return 1;
1193
 
1194
  return factor;
1195
}
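/* Illustrative sketch (not part of the pass): the unroll factor above is
   built up as a least common multiple of the PREFETCH_MOD values, capped
   by UPPER_BOUND.  The figures (mods 4 and 6, cap 16) are made up.  */
#if 0
#include <stdio.h>

static unsigned
gcd (unsigned a, unsigned b)
{
  while (b)
    {
      unsigned t = a % b;
      a = b;
      b = t;
    }
  return a;
}

int
main (void)
{
  unsigned mods[] = { 4, 6 };
  unsigned upper_bound = 16, factor = 1, i;

  for (i = 0; i < 2; i++)
    {
      unsigned nfactor = mods[i] / gcd (mods[i], factor) * factor;
      if (nfactor <= upper_bound)
        factor = nfactor;       /* 4 after the first mod, lcm (4, 6) = 12.  */
    }
  printf ("unroll factor: %u\n", factor);
  return 0;
}
#endif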
1196
 
1197
/* Returns the total volume of the memory references REFS, taking into account
1198
   reuses in the innermost loop and cache line size.  TODO -- we should also
1199
   take into account reuses across the iterations of the loops in the loop
1200
   nest.  */
1201
 
1202
static unsigned
1203
volume_of_references (struct mem_ref_group *refs)
1204
{
1205
  unsigned volume = 0;
1206
  struct mem_ref_group *gr;
1207
  struct mem_ref *ref;
1208
 
1209
  for (gr = refs; gr; gr = gr->next)
1210
    for (ref = gr->refs; ref; ref = ref->next)
1211
      {
1212
        /* Almost always reuses another value?  */
1213
        if (ref->prefetch_before != PREFETCH_ALL)
1214
          continue;
1215
 
1216
        /* If several iterations access the same cache line, use the size of
1217
           the line divided by this number.  Otherwise, a cache line is
1218
           accessed in each iteration.  TODO -- in the latter case, we should
1219
           take the size of the reference into account, rounding it up to a
           cache line size multiple.  */
1221
        volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1222
      }
1223
  return volume;
1224
}
1225
 
1226
/* Returns the volume of memory references accessed across VEC iterations of
1227
   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1228
   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1229
 
1230
static unsigned
1231
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1232
{
1233
  unsigned i;
1234
 
1235
  for (i = 0; i < n; i++)
1236
    if (vec[i] != 0)
1237
      break;
1238
 
1239
  if (i == n)
1240
    return 0;
1241
 
1242
  gcc_assert (vec[i] > 0);
1243
 
1244
  /* We ignore the parts of the distance vector in subloops, since usually
1245
     the numbers of iterations are much smaller.  */
1246
  return loop_sizes[i] * vec[i];
1247
}
1248
 
1249
/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
   at the position corresponding to the loop of the step.  N is the depth
   of the considered loop nest, and LOOP is its innermost loop.  */
1252
 
1253
static void
1254
add_subscript_strides (tree access_fn, unsigned stride,
1255
                       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1256
{
1257
  struct loop *aloop;
1258
  tree step;
1259
  HOST_WIDE_INT astep;
1260
  unsigned min_depth = loop_depth (loop) - n;
1261
 
1262
  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1263
    {
1264
      aloop = get_chrec_loop (access_fn);
1265
      step = CHREC_RIGHT (access_fn);
1266
      access_fn = CHREC_LEFT (access_fn);
1267
 
1268
      if ((unsigned) loop_depth (aloop) <= min_depth)
1269
        continue;
1270
 
1271
      if (host_integerp (step, 0))
1272
        astep = tree_low_cst (step, 0);
1273
      else
1274
        astep = L1_CACHE_LINE_SIZE;
1275
 
1276
      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1277
 
1278
    }
1279
}
1280
 
1281
/* Returns the volume of memory references accessed between two consecutive
1282
   self-reuses of the reference DR.  We consider the subscripts of DR in N
1283
   loops, and LOOP_SIZES contains the volumes of accesses in each of the
1284
   loops.  LOOP is the innermost loop of the current loop nest.  */
1285
 
1286
static unsigned
1287
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1288
                     struct loop *loop)
1289
{
1290
  tree stride, access_fn;
1291
  HOST_WIDE_INT *strides, astride;
1292
  VEC (tree, heap) *access_fns;
1293
  tree ref = DR_REF (dr);
1294
  unsigned i, ret = ~0u;
1295
 
1296
  /* In the following example:
1297
 
1298
     for (i = 0; i < N; i++)
1299
       for (j = 0; j < N; j++)
1300
         use (a[j][i]);
1301
     the same cache line is accessed each N steps (except if the change from
1302
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1303
     we cannot rely purely on the results of the data dependence analysis.
1304
 
1305
     Instead, we compute the stride of the reference in each loop, and consider
1306
     the innermost loop in which the stride is less than the cache size.  */
1307
 
1308
  strides = XCNEWVEC (HOST_WIDE_INT, n);
1309
  access_fns = DR_ACCESS_FNS (dr);
1310
 
1311
  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1312
    {
1313
      /* Keep track of the reference corresponding to the subscript, so that we
1314
         know its stride.  */
1315
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1316
        ref = TREE_OPERAND (ref, 0);
1317
 
1318
      if (TREE_CODE (ref) == ARRAY_REF)
1319
        {
1320
          stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1321
          if (host_integerp (stride, 1))
1322
            astride = tree_low_cst (stride, 1);
1323
          else
1324
            astride = L1_CACHE_LINE_SIZE;
1325
 
1326
          ref = TREE_OPERAND (ref, 0);
1327
        }
1328
      else
1329
        astride = 1;
1330
 
1331
      add_subscript_strides (access_fn, astride, strides, n, loop);
1332
    }
1333
 
1334
  for (i = n; i-- > 0; )
1335
    {
1336
      unsigned HOST_WIDE_INT s;
1337
 
1338
      s = strides[i] < 0 ?  -strides[i] : strides[i];
1339
 
1340
      if (s < (unsigned) L1_CACHE_LINE_SIZE
1341
          && (loop_sizes[i]
1342
              > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1343
        {
1344
          ret = loop_sizes[i];
1345
          break;
1346
        }
1347
    }
1348
 
1349
  free (strides);
1350
  return ret;
1351
}
1352
 
1353
/* Determines the distance till the first reuse of each reference in REFS
1354
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1355
   memory references in the loop.  */
1356
 
1357
static void
1358
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1359
                           bool no_other_refs)
1360
{
1361
  struct loop *nest, *aloop;
1362
  VEC (data_reference_p, heap) *datarefs = NULL;
1363
  VEC (ddr_p, heap) *dependences = NULL;
1364
  struct mem_ref_group *gr;
1365
  struct mem_ref *ref, *refb;
1366
  VEC (loop_p, heap) *vloops = NULL;
1367
  unsigned *loop_data_size;
1368
  unsigned i, j, n;
1369
  unsigned volume, dist, adist;
1370
  HOST_WIDE_INT vol;
1371
  data_reference_p dr;
1372
  ddr_p dep;
1373
 
1374
  if (loop->inner)
1375
    return;
1376
 
1377
  /* Find the outermost loop of the loop nest of LOOP (we require that
1378
     there are no sibling loops inside the nest).  */
1379
  nest = loop;
1380
  while (1)
1381
    {
1382
      aloop = loop_outer (nest);
1383
 
1384
      if (aloop == current_loops->tree_root
1385
          || aloop->inner->next)
1386
        break;
1387
 
1388
      nest = aloop;
1389
    }
1390
 
1391
  /* For each loop, determine the amount of data accessed in each iteration.
1392
     We use this to estimate whether the reference is evicted from the
1393
     cache before its reuse.  */
1394
  find_loop_nest (nest, &vloops);
1395
  n = VEC_length (loop_p, vloops);
1396
  loop_data_size = XNEWVEC (unsigned, n);
1397
  volume = volume_of_references (refs);
1398
  i = n;
1399
  while (i-- != 0)
1400
    {
1401
      loop_data_size[i] = volume;
1402
      /* Bound the volume by the L2 cache size, since above this bound,
1403
         all dependence distances are equivalent.  */
1404
      if (volume > L2_CACHE_SIZE_BYTES)
1405
        continue;
1406
 
1407
      aloop = VEC_index (loop_p, vloops, i);
1408
      vol = estimated_loop_iterations_int (aloop, false);
1409
      if (vol < 0)
1410
        vol = expected_loop_iterations (aloop);
1411
      volume *= vol;
1412
    }
1413
 
1414
  /* Prepare the references in the form suitable for data dependence
1415
     analysis.  We ignore unanalyzable data references (the results
1416
     are used just as a heuristics to estimate temporality of the
1417
     references, hence we do not need to worry about correctness).  */
1418
  for (gr = refs; gr; gr = gr->next)
1419
    for (ref = gr->refs; ref; ref = ref->next)
1420
      {
1421
        dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1422
 
1423
        if (dr)
1424
          {
1425
            ref->reuse_distance = volume;
1426
            dr->aux = ref;
1427
            VEC_safe_push (data_reference_p, heap, datarefs, dr);
1428
          }
1429
        else
1430
          no_other_refs = false;
1431
      }
1432
 
1433
  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1434
    {
1435
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
1436
      ref = (struct mem_ref *) dr->aux;
1437
      if (ref->reuse_distance > dist)
1438
        ref->reuse_distance = dist;
1439
 
1440
      if (no_other_refs)
1441
        ref->independent_p = true;
1442
    }
1443
 
1444
  compute_all_dependences (datarefs, &dependences, vloops, true);
1445
 
1446
  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1447
    {
1448
      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1449
        continue;
1450
 
1451
      ref = (struct mem_ref *) DDR_A (dep)->aux;
1452
      refb = (struct mem_ref *) DDR_B (dep)->aux;
1453
 
1454
      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1455
          || DDR_NUM_DIST_VECTS (dep) == 0)
1456
        {
1457
          /* If the dependence cannot be analyzed, assume that there might be
1458
             a reuse.  */
1459
          dist = 0;
1460
 
1461
          ref->independent_p = false;
1462
          refb->independent_p = false;
1463
        }
1464
      else
1465
        {
1466
          /* The distance vectors are normalized to be always lexicographically
1467
             positive, hence we cannot tell just from them whether DDR_A comes
1468
             before DDR_B or vice versa.  However, it is not important,
1469
             anyway -- if DDR_A is close to DDR_B, then it is either reused in
1470
             DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1471
             in cache (and marking it as nontemporal would not affect
1472
             anything).  */
1473
 
1474
          dist = volume;
1475
          for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1476
            {
1477
              adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1478
                                             loop_data_size, n);
1479
 
1480
              /* If this is a dependence in the innermost loop (i.e., the
1481
                 distances in all superloops are zero) and it is not
1482
                 the trivial self-dependence with distance zero, record that
1483
                 the references are not completely independent.  */
1484
              if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1485
                  && (ref != refb
1486
                      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1487
                {
1488
                  ref->independent_p = false;
1489
                  refb->independent_p = false;
1490
                }
1491
 
1492
              /* Ignore accesses closer than
1493
                 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1494
                 so that we use nontemporal prefetches e.g. if single memory
1495
                 location is accessed several times in a single iteration of
1496
                 the loop.  */
1497
              if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1498
                continue;
1499
 
1500
              if (adist < dist)
1501
                dist = adist;
1502
            }
1503
        }
1504
 
1505
      if (ref->reuse_distance > dist)
1506
        ref->reuse_distance = dist;
1507
      if (refb->reuse_distance > dist)
1508
        refb->reuse_distance = dist;
1509
    }
1510
 
1511
  free_dependence_relations (dependences);
1512
  free_data_refs (datarefs);
1513
  free (loop_data_size);
1514
 
1515
  if (dump_file && (dump_flags & TDF_DETAILS))
1516
    {
1517
      fprintf (dump_file, "Reuse distances:\n");
1518
      for (gr = refs; gr; gr = gr->next)
1519
        for (ref = gr->refs; ref; ref = ref->next)
1520
          fprintf (dump_file, " ref %p distance %u\n",
1521
                   (void *) ref, ref->reuse_distance);
1522
    }
1523
}
1524
 
1525
/* Do a cost-benefit analysis to determine if prefetching is profitable
   for the current loop given the following parameters:
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count,
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches,
   MEM_REF_COUNT: total number of memory references in the loop.  */

static bool
is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
                                unsigned ninsns, unsigned prefetch_count,
                                unsigned mem_ref_count)
{
  int insn_to_mem_ratio, insn_to_prefetch_ratio;

  if (mem_ref_count == 0)
    return false;

  /* Prefetching improves performance by overlapping cache-missing
     memory accesses with CPU operations.  If the loop does not have
     enough CPU operations to overlap with memory operations, prefetching
     won't give a significant benefit.  One approximate way of checking
     this is to require the ratio of instructions to memory references to
     be above a certain limit.  This approximation works well in practice.
     TODO: Implement a more precise computation by estimating the time
     for each CPU or memory op in the loop.  Time estimates for memory ops
     should account for cache misses.  */
  insn_to_mem_ratio = ninsns / mem_ref_count;

  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
    return false;
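
  /* Worked example (hypothetical numbers): a loop with NINSNS == 20 and
     MEM_REF_COUNT == 8 yields an insn-to-mem ratio of 20 / 8 == 2 after
     integer division.  If PREFETCH_MIN_INSN_TO_MEM_RATIO were 3, such a
     memory-bound loop would be rejected by the check above, since it has
     too little computation to hide the prefetch latency behind.  */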

  /* Profitability of prefetching is highly dependent on the trip count.
     For a given AHEAD distance, the first AHEAD iterations do not benefit
     from prefetching, and the last AHEAD iterations execute useless
     prefetches.  So, if the trip count is not large enough relative to AHEAD,
     prefetching may cause serious performance degradation.  To avoid this
     problem when the trip count is not known at compile time, we
     conservatively skip loops with high prefetching costs.  For now, only
     the I-cache cost is considered.  The relative I-cache cost is estimated
     by taking the ratio between the number of prefetches and the total
     number of instructions.  Since we are using integer arithmetic, we
     compute the reciprocal of this ratio.
     TODO: Account for loop unrolling, which may reduce the costs of
     shorter stride prefetches.  Note that not accounting for loop
     unrolling over-estimates the cost and hence gives more conservative
     results.  */
  if (est_niter < 0)
    {
      insn_to_prefetch_ratio = ninsns / prefetch_count;
      return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
    }
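
  /* Illustrative numbers only: with an unknown trip count, NINSNS == 120
     and PREFETCH_COUNT == 10 give an insn-to-prefetch ratio of 12; if
     MIN_INSN_TO_PREFETCH_RATIO were 9, the prefetch overhead would be
     considered acceptable, whereas PREFETCH_COUNT == 20 gives a ratio of
     6 and the loop would be skipped above.  */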

  if (est_niter <= (HOST_WIDE_INT) ahead)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file,
                 "Not prefetching -- loop estimated to roll only %d times\n",
                 (int) est_niter);
      return false;
    }
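
  /* For example (illustrative values), with AHEAD == 5 a loop estimated
     to roll only 4 times is rejected by the check above: every iteration
     would either run before the prefetched data can arrive or issue a
     prefetch for data that is never used.  */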
  return true;
}


/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  */

static bool
loop_prefetch_arrays (struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  struct tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  prefetch_count = estimate_prefetch_count (refs);
  if (prefetch_count == 0)
    goto fail;

  determine_loop_nest_reuse (loop, refs, no_other_refs);

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  ahead = (PREFETCH_LATENCY + time - 1) / time;
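  /* The line above rounds PREFETCH_LATENCY / TIME up, i.e. AHEAD is the
     number of iterations a prefetch must be issued in advance for the data
     to arrive in time.  With illustrative values PREFETCH_LATENCY == 200
     and TIME == 45, ahead == (200 + 45 - 1) / 45 == 5.  */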
  est_niter = estimated_loop_iterations_int (loop, false);

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
                                           est_niter);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
             HOST_WIDE_INT_PRINT_DEC "\n"
             "insn count %d, mem ref count %d, prefetch count %d\n",
             ahead, unroll_factor, est_niter,
             ninsns, mem_ref_count, prefetch_count);

  if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
                                       prefetch_count, mem_ref_count))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in loops.  */

unsigned int
tree_ssa_prefetch_arrays (void)
{
  loop_iterator li;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486
         -march=pentium4.  -mtune=i486 causes PREFETCH_BLOCK to be 0, since
         this is part of the processor costs and the i486 does not have a
         prefetch instruction, but -march=pentium4 causes HAVE_prefetch to
         be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, "    simultaneous prefetches: %d\n",
               SIMULTANEOUS_PREFETCHES);
      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
               L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
      fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
               MIN_INSN_TO_PREFETCH_RATIO);
      fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
               PREFETCH_MIN_INSN_TO_MEM_RATIO);
      fprintf (dump_file, "\n");
    }

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
                                       tree_cons (NULL_TREE,
                                                  const_ptr_type_node,
                                                  NULL_TREE));
      tree decl = add_builtin_function ("__builtin_prefetch", type,
                                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                                        NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }
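
  /* The calls emitted later by issue_prefetches correspond to the
     documented builtin

         void __builtin_prefetch (const void *addr, ...);

     whose optional arguments select a read or write prefetch and the
     temporal-locality hint.  As a rough sketch (not the literal emitted
     GIMPLE, and with hypothetical names):

         __builtin_prefetch (&a[i + ahead * step], 1, 3);

     prefetches an address AHEAD iterations in advance for writing with
     full temporal locality, while a locality argument of 0 requests a
     nontemporal prefetch.  */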

  /* We assume that the cache line size is a power of two, so verify this
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
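  /* X & (X - 1) clears the lowest set bit of X, so it is zero exactly when
     X is a power of two (or zero, which was excluded above): e.g.
     64 & 63 == 0, while a bogus block size of 48 gives 48 & 47 == 32 and
     trips the assertion.  */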

  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}
