/* Array prefetching.
   Copyright (C) 2005, 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses, and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
        {
          a[255] = ...;         (0)
          a[i] = ...;           (1)
          a[i + 64] = ...;      (2)
          a[16*i] = ...;        (3)
          a[187*i] = ...;       (4)
          a[187*i + 50] = ...;  (5)
        }

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
           location 64 iterations before it, and PREFETCH_MOD 64 (since
           it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
           the cache line accessed by (4) is the same with probability only
           7/32.
       (5) has PREFETCH_MOD 1 as well.

   3) We determine how far ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We schedule as many prefetches as
      possible while still staying within this bound (starting with those
      with the lowest prefetch_mod, since they are responsible for most of
      the cache misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODO:
      -- write and use a more general reuse analysis (that could also be used
         in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
         (now we just ignore them; at the very least we should avoid
         optimizing loops in which the user has put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
         improved.  */
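
/* Illustration (not part of the original source): the transformation this
   pass performs on reference (1) in the example above amounts, roughly, to

       for (i = 0; i < max; i++)
         {
           __builtin_prefetch (&a[i + AHEAD], 1);
           a[i] = ...;
         }

   where AHEAD is the prefetch distance determined in step 3 and the second
   argument marks the access as a write; unrolling (step 5) then lets the
   call be emitted once per unrolled body instead of once per original
   iteration.  */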

/* Magic constants follow.  These should be replaced by machine-specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if a write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if a read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif
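
/* Illustrative note (not part of the original source): with the default
   ACCEPTABLE_MISS_RATE of 50, two accesses are treated as sharing a cache
   line only when the estimated probability of hitting the same line is at
   least 95%.  In prune_ref_by_group_reuse this is the test

       delta <= prefetch_block * ACCEPTABLE_MISS_RATE / 1000

   applied after step and prefetch_block have been reduced to coprime
   values, assuming a uniformly distributed line offset.  */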

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;                    /* Base of the reference.  */
  HOST_WIDE_INT step;           /* Step of the reference.  */
  struct mem_ref *refs;         /* References in the group.  */
  struct mem_ref_group *next;   /* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL            (~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;                    /* Statement in which the reference appears.  */
  tree mem;                     /* The reference.  */
  HOST_WIDE_INT delta;          /* Constant offset of the reference.  */
  bool write_p;                 /* Is it a write?  */
  struct mem_ref_group *group;  /* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
                                /* Prefetch only each PREFETCH_MOD-th
                                   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
                                /* Prefetch only the first PREFETCH_BEFORE
                                   iterations.  */
  bool issue_prefetch_p;        /* Should we really issue the prefetch?  */
  struct mem_ref *next;         /* The next reference in the group.  */
};
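
/* Illustrative example (not part of the original source): for the loop in
   the comment at the top of the file, the references are split into groups
   keyed by (base, step): a[i] and a[i + 64] share the group with base "a"
   and step 1 (deltas 0 and 64), a[255] gets a group with step 0 and delta
   255, and a[187*i] and a[187*i + 50] share the group with step 187
   (deltas 0 and 50).  */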

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
                      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
          && operand_equal_p ((*groups)->base, base, 0))
        return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
        break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
            HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It need not be possible for a write reference to reuse a read
         prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
          && write_p
          && !(*aref)->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !write_p
          && (*aref)->write_p)
        continue;

      if ((*aref)->delta == delta)
        return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
        {
          next_r = ref->next;
          free (ref);
        }
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;                    /* Loop of the reference.  */
  tree stmt;                            /* Statement of the reference.  */
  HOST_WIDE_INT *step;                  /* Step of the memory reference.  */
  HOST_WIDE_INT *delta;                 /* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
        return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
        return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA
   and STEP are integer constants and iter is the number of iterations of
   LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
             HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
             tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
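
/* Illustrative example (not part of the original source): for a reference
   a[i + 5], where "a" is an array of 4-byte elements and "i" is an
   induction variable that starts at 0 and increases by 1 per iteration,
   the decomposition yields STEP == 4 and DELTA == 20, with BASE being the
   array reference with its index rewritten to 0; the address is therefore
   &BASE + 4 * iter + 20.  */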

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P is true.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
                              tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that earlier references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
        continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
        {
          stmt = bsi_stmt (bsi);
          if (TREE_CODE (stmt) != MODIFY_EXPR)
            continue;

          lhs = TREE_OPERAND (stmt, 0);
          rhs = TREE_OPERAND (stmt, 1);

          if (REFERENCE_CLASS_P (rhs))
            gather_memory_references_ref (loop, &refs, rhs, false, stmt);
          if (REFERENCE_CLASS_P (lhs))
            gather_memory_references_ref (loop, &refs, lhs, true, stmt);
        }
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to an invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
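
/* Illustrative example (not part of the original source): with the default
   PREFETCH_BLOCK of 32, a reference whose group has step 4 gets
   prefetch_mod == 32 / 4 == 8, so one prefetch per eight iterations covers
   every cache line it touches.  A reference with step 64 exceeds the line
   size and keeps prefetch_mod == 1: every iteration touches a new cache
   line.  */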

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
                          bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
         former.  */
      if (by_is_before)
        ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
         same cache line, prefetch just the first one.  */
      if (!by_is_before)
        return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
          != ddown (by->delta, PREFETCH_BLOCK))
        return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
        return;

      /* Transform the data so that we may assume that the accesses
         are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
        return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which that happens are
     from each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that the size of the cache
     line and the step are coprime (here we assume that PREFETCH_BLOCK is a
     power of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
         && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
        ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}
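
/* Illustrative worked example (not part of the original source), assuming
   the default PREFETCH_BLOCK of 32: take a group with step 1 containing
   REF == a[i] (delta 0) and BY == a[i + 64] (delta 64).  Since
   step <= PREFETCH_BLOCK, the accesses are sure to meet:
   hit_from == ddown (64, 32) * 32 == 64 and
   prefetch_before == (64 - 0 + 1 - 1) / 1 == 64, so a[i] only needs to be
   prefetched during the first 64 iterations; after that it reuses cache
   lines already touched by a[i + 64] 64 iterations earlier.  */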

/* Prune the prefetch candidate REF using the reuses with the other
   references in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
        {
          before = false;
          continue;
        }

      if (!WRITE_CAN_USE_READ_PREFETCH
          && ref->write_p
          && !prune_by->write_p)
        continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
          && !ref->write_p
          && prune_by->write_p)
        continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
        {
          fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

          if (ref_pruned->prefetch_before == PREFETCH_ALL
              && ref_pruned->prefetch_mod == 1)
            fprintf (dump_file, " no restrictions");
          else if (ref_pruned->prefetch_before == 0)
            fprintf (dump_file, " do not prefetch");
          else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
            fprintf (dump_file, " prefetch once");
          else
            {
              if (ref_pruned->prefetch_before != PREFETCH_ALL)
                {
                  fprintf (dump_file, " prefetch before ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_before);
                }
              if (ref_pruned->prefetch_mod != 1)
                {
                  fprintf (dump_file, " prefetch mod ");
                  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
                           ref_pruned->prefetch_mod);
                }
            }
          fprintf (dump_file, "\n");
        }
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now, do not issue prefetches for references that would only be
     useful in the first few iterations.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}
746
 
747
/* Decide which of the prefetch candidates in GROUPS to prefetch.
748
   AHEAD is the number of iterations to prefetch ahead (which corresponds
749
   to the number of simultaneous instances of one prefetch running at a
750
   time).  UNROLL_FACTOR is the factor by that the loop is going to be
751
   unrolled.  Returns true if there is anything to prefetch.  */
752
 
753
static bool
754
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
755
                     unsigned ahead)
756
{
757
  unsigned max_prefetches, n_prefetches;
758
  struct mem_ref *ref;
759
  bool any = false;
760
 
761
  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
762
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
763
    max_prefetches = SIMULTANEOUS_PREFETCHES;
764
 
765
  if (dump_file && (dump_flags & TDF_DETAILS))
766
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
767
 
768
  if (!max_prefetches)
769
    return false;
770
 
771
  /* For now we just take memory references one by one and issue
772
     prefetches for as many as possible.  The groups are sorted
773
     starting with the largest step, since the references with
774
     large step are more likely to cause many cache misses.  */
775
 
776
  for (; groups; groups = groups->next)
777
    for (ref = groups->refs; ref; ref = ref->next)
778
      {
779
        if (!should_issue_prefetch_p (ref))
780
          continue;
781
 
782
        ref->issue_prefetch_p = true;
783
 
784
        /* If prefetch_mod is less then unroll_factor, we need to insert
785
           several prefetches for the reference.  */
786
        n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
787
                        / ref->prefetch_mod);
788
        if (max_prefetches <= n_prefetches)
789
          return true;
790
 
791
        max_prefetches -= n_prefetches;
792
        any = true;
793
      }
794
 
795
  return any;
796
}
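
/* Illustrative example (not part of the original source): with the default
   SIMULTANEOUS_PREFETCHES of 3, unroll_factor 4 and ahead 2,
   max_prefetches == (3 * 4) / 2 == 6, which is then capped at 3.  A
   reference with prefetch_mod 2 consumes (4 + 2 - 1) / 2 == 2 of those
   slots, leaving room for one more prefetch in the unrolled body.  */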

/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
        return true;

  return false;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
                  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
                          addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
                          tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
                                           params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}
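
/* Illustrative example (not part of the original source): for a reference
   with prefetch_mod 2 and group step 8, in a loop unrolled 4 times with
   ahead 10, n_prefetches == (4 + 2 - 1) / 2 == 2 and the calls emitted
   before the statement are roughly

       __builtin_prefetch (addr_base + (10 + 0 * 2) * 8, write_p);
       __builtin_prefetch (addr_base + (10 + 1 * 2) * 8, write_p);

   i.e. one prefetch per PREFETCH_MOD copies of the reference in the
   unrolled body, each aimed AHEAD iterations into the future.  */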

/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
                  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
        issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
                      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}
892
 
893
/* Determine the coefficient by that unroll LOOP, from the information
894
   contained in the list of memory references REFS.  Description of
895
   umber of iterations of LOOP is stored to DESC.  AHEAD is the number
896
   of iterations ahead that we need to prefetch.  NINSNS is number of
897
   insns of the LOOP.  */
898
 
899
static unsigned
900
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
901
                         unsigned ahead, unsigned ninsns,
902
                         struct tree_niter_desc *desc)
903
{
904
  unsigned upper_bound, size_factor, constraint_factor;
905
  unsigned factor, max_mod_constraint, ahead_factor;
906
  struct mem_ref_group *agp;
907
  struct mem_ref *ref;
908
 
909
  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
910
 
911
  /* First check whether the loop is not too large to unroll.  */
912
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
913
  if (size_factor <= 1)
914
    return 1;
915
 
916
  if (size_factor < upper_bound)
917
    upper_bound = size_factor;
918
 
919
  max_mod_constraint = 1;
920
  for (agp = refs; agp; agp = agp->next)
921
    for (ref = agp->refs; ref; ref = ref->next)
922
      if (should_issue_prefetch_p (ref)
923
          && ref->prefetch_mod > max_mod_constraint)
924
        max_mod_constraint = ref->prefetch_mod;
925
 
926
  /* Set constraint_factor as large as needed to be able to satisfy the
927
     largest modulo constraint.  */
928
  constraint_factor = max_mod_constraint;
929
 
930
  /* If ahead is too large in comparison with the number of available
931
     prefetches, unroll the loop as much as needed to be able to prefetch
932
     at least partially some of the references in the loop.  */
933
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
934
                  / SIMULTANEOUS_PREFETCHES);
935
 
936
  /* Unroll as much as useful, but bound the code size growth.  */
937
  if (constraint_factor < ahead_factor)
938
    factor = ahead_factor;
939
  else
940
    factor = constraint_factor;
941
  if (factor > upper_bound)
942
    factor = upper_bound;
943
 
944
  if (!should_unroll_loop_p (loop, desc, factor))
945
    return 1;
946
 
947
  return factor;
948
}
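
/* Illustrative worked example (not part of the original source); the values
   of PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS below are assumed,
   not the actual defaults: with PARAM_MAX_UNROLL_TIMES == 8,
   PARAM_MAX_UNROLLED_INSNS == 200, a loop of 20 insns, a largest
   prefetch_mod of 4 among the scheduled references and ahead == 10,
   size_factor == 200 / 20 == 10, so upper_bound remains 8;
   constraint_factor == 4, ahead_factor == (10 + 3 - 1) / 3 == 4, and the
   chosen factor is 4, within the bound.  */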

/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the array containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
                                           &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
                        single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}
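
/* Illustrative example (not part of the original source): with the default
   PREFETCH_LATENCY of 200 and a loop body of 60 insns, ahead is computed
   as (200 + 60 - 1) / 60 == 4, so each prefetch is issued roughly four
   iterations before the corresponding access, giving the fetch time to
   complete.  */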

/* Issue prefetch instructions for array references in LOOPS.  */

unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
         -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
         of the processor costs and the i486 does not have prefetch, but
         -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
                                       tree_cons (NULL_TREE,
                                                  const_ptr_type_node,
                                                  NULL_TREE));
      tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
                        BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
                        NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify
     this here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];

      if (loop && dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "Processing loop %d:\n", loop->num);

      if (loop)
        unrolled |= loop_prefetch_arrays (loops, loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
        fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}
