/* Loop unrolling and peeling.
   Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "hard-reg-set.h"
#include "obstack.h"
#include "basic-block.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "params.h"
#include "output.h"
#include "expr.h"
#include "hashtab.h"
#include "recog.h"
#include "target.h"

/* This pass performs loop unrolling and peeling.  We only perform these
   optimizations on innermost loops (with a single exception) because
   the impact on performance is greatest there, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better opportunities for later optimization passes, and in
   some cases from fewer tests of exit conditions.  The main problem is
   code growth, which impacts performance negatively due to its effect
   on caches.

   What we do:

   -- complete peeling of once-rolling loops; this is the above-mentioned
      exception, as it cancels the loop completely and
      does not cause code growth
   -- complete peeling of loops that roll a (small) constant number of times
   -- simple peeling of the first iterations of loops that do not roll much
      (according to profile feedback)
   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of exit condition tests
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code
   For more detailed descriptions of each of those, see the comments at
   the appropriate functions below.

   There are a lot of parameters (defined and described in params.def) that
   control how much we unroll/peel.

   ??? A great problem is that we don't have a good way to determine
   how many times we should unroll a loop; the experiments I have made
   show that this choice may affect performance on the order of several
   percent.  */
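
/* Editor's illustration (not part of the original source): with default
   parameters, a loop such as

     for (i = 0; i < 4; i++)
       a[i] = 0;

   rolls a small constant number of times and is a candidate for complete
   peeling, while

     for (i = 0; i < n; i++)
       a[i] = 0;

   has a trip count known only at run time and is instead a candidate for
   runtime unrolling (see unroll_loop_runtime_iterations below).  */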

/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx insn;             /* The insn in which the induction variable occurs.  */
  rtx base_var;         /* The variable on which the values in the further
                           iterations are based.  */
  rtx step;             /* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
  unsigned n_loc;
  unsigned loc[3];      /* Location where the definition of the induction
                           variable occurs in the insn.  For example if
                           N_LOC is 2, the expression is located at
                           XEXP (XEXP (single_set, loc[0]), loc[1]).  */
};

/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx insn;                        /* The insn in which the variable expansion occurs.  */
  rtx reg;                         /* The accumulator which is expanded.  */
  VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;      /* Next entry in walking order.  */
  enum rtx_code op;                /* The type of the accumulation - addition, subtraction
                                      or multiplication.  */
  int expansion_count;             /* Count the number of expansions generated so far.  */
  int reuse_expansion;             /* The expansion we intend to reuse to expand
                                      the accumulator.  If REUSE_EXPANSION is 0 reuse
                                      the original accumulator.  Else use
                                      var_expansions[REUSE_EXPANSION - 1].  */
  unsigned accum_pos;              /* The position in which the accumulator is placed in
                                      the insn src.  For example in x = x + something
                                      accum_pos is 0 while in x = something + x accum_pos
                                      is 1.  */
};

/* Information about optimizations applied in
   the unrolled loop.  */

struct opt_info
{
  htab_t insns_to_split;           /* A hashtable of insns to split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
                                      to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;        /* The first basic block that was
                                      duplicated.  */
  basic_block loop_exit;           /* The loop exit basic block.  */
  basic_block loop_preheader;      /* The loop preheader basic block.  */
};

static void decide_unrolling_and_peeling (int);
static void peel_loops_completely (int);
static void decide_peel_simple (struct loop *, int);
static void decide_peel_once_rolling (struct loop *, int);
static void decide_peel_completely (struct loop *, int);
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void peel_loop_simple (struct loop *);
static void peel_loop_completely (struct loop *);
static void unroll_loop_stupid (struct loop *);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx);
static void expand_var_during_unrolling (struct var_to_expand *, rtx);
static void insert_var_expansion_initialization (struct var_to_expand *,
                                                 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
                                             basic_block);
static rtx get_expansion (struct var_to_expand *);

/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
void
unroll_and_peel_loops (int flags)
{
  struct loop *loop;
  bool check;
  loop_iterator li;

  /* First perform complete loop peeling (it is almost surely a win,
     and affects parameters for further decisions a lot).  */
  peel_loops_completely (flags);

  /* Now decide the rest of the unrolling and peeling.  */
  decide_unrolling_and_peeling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      check = true;
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
        {
        case LPT_PEEL_COMPLETELY:
          /* Already done.  */
          gcc_unreachable ();
        case LPT_PEEL_SIMPLE:
          peel_loop_simple (loop);
          break;
        case LPT_UNROLL_CONSTANT:
          unroll_loop_constant_iterations (loop);
          break;
        case LPT_UNROLL_RUNTIME:
          unroll_loop_runtime_iterations (loop);
          break;
        case LPT_UNROLL_STUPID:
          unroll_loop_stupid (loop);
          break;
        case LPT_NONE:
          check = false;
          break;
        default:
          gcc_unreachable ();
        }
      if (check)
        {
#ifdef ENABLE_CHECKING
          verify_dominators (CDI_DOMINATORS);
          verify_loop_structure ();
#endif
        }
    }

  iv_analysis_done ();
}

/* Check whether the exit of the LOOP is at the end of the loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx insn;

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    {
      if (INSN_P (insn))
        return false;
    }

  return true;
}

/* Depending on FLAGS, check whether to peel loops completely and do so.  */
static void
peel_loops_completely (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, the inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file,
                 "\n;; *** Considering loop %d for complete peeling ***\n",
                 loop->num);

      loop->ninsns = num_loop_insns (loop);

      decide_peel_once_rolling (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_completely (loop, flags);

      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
        {
          peel_loop_completely (loop);
#ifdef ENABLE_CHECKING
          verify_dominators (CDI_DOMINATORS);
          verify_loop_structure ();
#endif
        }
    }
}

/* Decide whether to unroll or peel loops (depending on FLAGS) and how much.  */
static void
decide_unrolling_and_peeling (int flags)
{
  struct loop *loop;
  loop_iterator li;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;

      if (dump_file)
        fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, cold area\n");
          continue;
        }

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
        {
          if (dump_file)
            fprintf (dump_file,
                     ";; Not considering loop, cannot duplicate\n");
          continue;
        }

      /* Skip non-innermost loops.  */
      if (loop->inner)
        {
          if (dump_file)
            fprintf (dump_file, ";; Not considering loop, is not innermost\n");
          continue;
        }

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
         priority.  */

      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_unroll_stupid (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
        decide_peel_simple (loop, flags);
    }
}

/* Decide whether the LOOP is once rolling and suitable for complete
   peeling.  */
static void
decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");

  /* Is the loop small enough?  */
  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || desc->infinite
      || !desc->const_iter
      || desc->niter != 0)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop rolls exactly once\n");
      return;
    }
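
  /* Editor's note (not in the original source): desc->niter appears to count
     latch traversals, i.e. iterations beyond the first, so niter == 0 means
     the body executes exactly once -- hence "rolls exactly once" above.  */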

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Decide whether the LOOP is suitable for complete peeling.  */
static void
decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (dump_file)
    fprintf (dump_file, "\n;; Considering peeling completely\n");

  /* Skip non-innermost loops.  */
  if (loop->inner)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is not innermost\n");
      return;
    }

  /* Do not peel cold areas.  */
  if (optimize_loop_for_size_p (loop))
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, cold area\n");
      return;
    }

  /* Can the loop be manipulated?  */
  if (!can_duplicate_loop_p (loop))
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Not considering loop, cannot duplicate\n");
      return;
    }

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

  /* Is the loop small enough?  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p
      || desc->assumptions
      || !desc->const_iter
      || desc->infinite)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  if (desc->niter > npeel - 1)
    {
      if (dump_file)
        {
          fprintf (dump_file,
                   ";; Not peeling loop completely, rolls too much (");
          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
          fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
        }
      return;
    }

  /* Success.  */
  if (dump_file)
    fprintf (dump_file, ";; Decided to peel loop completely\n");
  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
}

/* Peel all iterations of LOOP, remove exit edges and cancel the loop
   completely.  The transformation done:

   for (i = 0; i < 4; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   body; i++;
   body; i++;
   */
static void
peel_loop_completely (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned HOST_WIDE_INT npeel;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge ein;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;

  npeel = desc->niter;

  if (npeel)
    {
      bool ok;

      wont_exit = sbitmap_alloc (npeel + 1);
      sbitmap_ones (wont_exit);
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      remove_edges = NULL;

      if (flag_split_ivs_in_unroller)
        opt_info = analyze_insns_in_loop (loop);

      opt_info_start_duplication (opt_info);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          npeel,
                                          wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ
                                          | DLTHE_FLAG_COMPLETTE_PEEL
                                          | (opt_info
                                             ? DLTHE_RECORD_COPY_NUMBER : 0));
      gcc_assert (ok);

      free (wont_exit);

      if (opt_info)
        {
          apply_opt_in_copies (opt_info, npeel, false, true);
          free_opt_info (opt_info);
        }

      /* Remove the exit edges.  */
      FOR_EACH_VEC_ELT (edge, remove_edges, i, ein)
        remove_path (ein);
      VEC_free (edge, heap, remove_edges);
    }

  ein = desc->in_edge;
  free_simple_loop_desc (loop);

  /* Now remove the unreachable part of the last iteration and cancel
     the loop.  */
  remove_path (ein);

  if (dump_file)
    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
}

/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, so just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with constant "
             "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.  */
  if (desc->niter < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few copies of the loop body as possible are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
        n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
               || desc->noloop_assumptions != NULL_RTX)
        n_copies = exit_mod + i + 2;
      else
        n_copies = i + 1;

      if (n_copies < best_copies)
        {
          best_copies = n_copies;
          best_unroll = i;
        }
    }
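
  /* Editor's worked example (not in the original source): with
     desc->niter == 102, nunroll == 8, and the exit not at the end of the
     body, the search above tries i = 18 down to 7.  For i == 9 (unroll
     factor 10) we get exit_mod == 102 % 10 == 2, so n_copies == 2 + 9 + 1
     == 12, the minimum over the range tried; since the scan is descending
     and the comparison strict, best_unroll settles at 9.  */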

  if (dump_file)
    fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
             best_unroll + 1, best_copies, nunroll);

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the constant times rolling loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
   times.  The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  VEC (edge, heap) *remove_edges;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such a loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  sbitmap_ones (wont_exit);

  remove_edges = NULL;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
         in the first copy, so that loops that start with a test
         of the exit condition have a continuous body after unrolling.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition on beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      RESET_BIT (wont_exit, 0);
      if (desc->noloop_assumptions)
        RESET_BIT (wont_exit, 1);

      if (exit_mod)
        {
          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 1
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                   : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 1)
            apply_opt_in_copies (opt_info, exit_mod, false, false);

          desc->noloop_assumptions = NULL_RTX;
          desc->niter -= exit_mod;
          desc->niter_max -= exit_mod;
        }

      SET_BIT (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above,
         if the loop tests the condition at the end of the loop body.  */

      if (dump_file)
        fprintf (dump_file, ";; Condition on end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care
         about the case where we would exit before reaching the loop.  So
         just peel exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
          || desc->noloop_assumptions)
        {
          RESET_BIT (wont_exit, 0);
          if (desc->noloop_assumptions)
            RESET_BIT (wont_exit, 1);

          opt_info_start_duplication (opt_info);
          ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                              exit_mod + 1,
                                              wont_exit, desc->out_edge,
                                              &remove_edges,
                                              DLTHE_FLAG_UPDATE_FREQ
                                              | (opt_info && exit_mod > 0
                                                 ? DLTHE_RECORD_COPY_NUMBER
                                                   : 0));
          gcc_assert (ok);

          if (opt_info && exit_mod > 0)
            apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

          desc->niter -= exit_mod + 1;
          desc->niter_max -= exit_mod + 1;
          desc->noloop_assumptions = NULL_RTX;

          SET_BIT (wont_exit, 0);
          SET_BIT (wont_exit, 1);
        }

      RESET_BIT (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  desc->niter /= max_unroll + 1;
  desc->niter_max /= max_unroll + 1;
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, constant # of iterations %i insns\n",
             max_unroll, num_loop_insns (loop));
}

/* Decide whether to unroll LOOP iterating a runtime-computable number of
   times and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, so just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
             "\n;; Considering unrolling loop with runtime "
             "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file,
                 ";; Unable to prove that the number of iterations "
                 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the runtime computable "
             "times rolling loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it did ever happen,
     the TODO_verify_flow at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

      * If do_compare_rtx_and_jump needs several branches to do a comparison
        in a mode that needs comparison by parts, we cannot analyze the
        number of iterations of the loop, and we never get to unrolling it.

      * The code in expand_divmod that was suspected to cause creation of
        branching code seems to be only accessed for signed division.  The
        divisions used by # of iterations analysis are always unsigned.
        Problems might arise on architectures that emit branching code
        for some operations that may appear in the unroller (especially
        for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}

/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, init_code, branch_code, tmp;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  VEC (basic_block, heap) *dom_bbs;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  VEC (edge, heap) *remove_edges;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  dom_bbs = NULL;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      VEC (basic_block, heap) *ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (basic_block, ldom, j, bb)
        if (!flow_bb_inside_loop_p (loop, bb))
          VEC_safe_push (basic_block, heap, dom_bbs, bb);

      VEC_free (basic_block, heap, ldom);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave the exit in the first copy (for an explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave the exit in the last copy (for an explanation why, see the
         comment in unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get an expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count the modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
                               niter,
                               GEN_INT (max_unroll),
                               NULL_RTX, 0, OPTAB_LIB_WIDEN);
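
  /* Editor's illustration (not in the original source): with max_unroll == 3
     the unroll factor is 4, and niter & 3 equals niter % 4 even if niter
     wrapped around during its computation, since ANDing with (factor - 1)
     computes the modulo exactly for power-of-two factors.  */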

  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  remove_edges = NULL;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of the loop body (almost always we must leave the
     exit test here; the only exception is when we have an extra zero check
     and the number of iterations is reliable).  Also record the place of
     the (possible) extra zero check.  */
  sbitmap_zero (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    SET_BIT (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      1, wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where the switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      sbitmap_zero (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
        SET_BIT (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                          1, wont_exit, desc->out_edge,
                                          &remove_edges,
                                          DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create an item for the switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);

      /* We rely on the fact that the compare and jump cannot be optimized out,
         and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }
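
  /* Editor's note (not in the original source): each modulo value is
     roughly equally likely, so p == REG_BR_PROB_BASE / (i + 2) gives the
     i-th compare the conditional probability of one case out of the i + 2
     not yet ruled out, keeping the switch cases approximately uniform.  */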

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
                                          block_label (preheader), p,
                                          NULL_RTX);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
                     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  sbitmap_ones (wont_exit);
  RESET_BIT (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      max_unroll,
                                      wont_exit, desc->out_edge,
                                      &remove_edges,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
         made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
        {
          desc->out_edge = EDGE_SUCC (exit_block, 0);
          desc->in_edge = EDGE_SUCC (exit_block, 1);
        }
      else
        {
          desc->out_edge = EDGE_SUCC (exit_block, 1);
          desc->in_edge = EDGE_SUCC (exit_block, 0);
        }
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (edge, remove_edges, i, e)
    remove_path (e);
  VEC_free (edge, heap, remove_edges);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
                         GEN_INT (max_unroll + 1));
  desc->niter_max /= max_unroll + 1;
  if (exit_at_end)
    {
      desc->niter_expr =
        simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      desc->niter_max--;
    }

  if (dump_file)
    fprintf (dump_file,
             ";; Unrolled loop %d times, counting # of iterations "
             "in runtime, %i insns\n",
             max_unroll, num_loop_insns (loop));

  VEC_free (basic_block, heap, dom_bbs);
}

/* Decide whether to simply peel LOOP and how much.  */
static void
decide_peel_simple (struct loop *loop, int flags)
{
  unsigned npeel;
  struct niter_desc *desc;

  if (!(flags & UAP_PEEL))
    {
      /* We were not asked to, so just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering simply peeling loop\n");

  /* npeel = number of iterations to peel.  */
  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);

  /* Skip big loops.  */
  if (!npeel)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (desc->simple_p && !desc->assumptions && desc->const_iter)
    {
      if (dump_file)
        fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Do not simply peel loops with branches inside -- it increases the
     number of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

  if (loop->header->count)
    {
      unsigned niter = expected_loop_iterations (loop);
      if (niter + 1 > npeel)
        {
          if (dump_file)
            {
              fprintf (dump_file, ";; Not peeling loop, rolls too much (");
              fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
                       (HOST_WIDEST_INT) (niter + 1));
              fprintf (dump_file, " iterations > %d [maximum peelings])\n",
                       npeel);
            }
          return;
        }
      npeel = niter + 1;
    }
  else
    {
      /* For now we have no good heuristics to decide whether loop peeling
         will be effective, so disable it.  */
      if (dump_file)
        fprintf (dump_file,
                 ";; Not peeling loop, no evidence it will be profitable\n");
      return;
    }

  /* Success.  */
  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
  loop->lpt_decision.times = npeel;

  if (dump_file)
    fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
             loop->lpt_decision.times);
}

/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   if (!cond) goto end;
   body;
   if (!cond) goto end;
   body;
   while (cond)
     body;
   end: ;
   */
static void
peel_loop_simple (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned npeel = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller && npeel > 1)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (npeel + 1);
  sbitmap_zero (wont_exit);

  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
                                      npeel, wont_exit, NULL,
                                      NULL, DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  free (wont_exit);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, npeel, false, false);
      free_opt_info (opt_info);
    }

  if (desc->simple_p)
    {
      if (desc->const_iter)
        {
          desc->niter -= npeel;
          desc->niter_expr = GEN_INT (desc->niter);
          desc->noloop_assumptions = NULL_RTX;
        }
      else
        {
          /* We cannot just update niter_expr, as its value might be clobbered
             inside the loop.  We could handle this by counting the number into
             a temporary just like we do in runtime unrolling, but it does not
             seem worthwhile.  */
          free_simple_loop_desc (loop);
        }
    }
  if (dump_file)
    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
}

/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, so just return silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     the unrolled loop (i.e. if it is 2, we have to duplicate the loop
     body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
        fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* If we have profile feedback, check whether the loop rolls.  */
  if (loop->header->count
      && expected_loop_iterations (loop) < 2 * nunroll)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;

  if (dump_file)
    fprintf (dump_file,
             ";; Decided to unroll the loop stupidly, %d times.\n",
             loop->lpt_decision.times);
}

/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
   while (cond)
     body;

   ==>

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);


  wont_exit = sbitmap_alloc (nunroll + 1);
  sbitmap_zero (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
                                      nunroll, wont_exit,
                                      NULL, NULL,
                                      DLTHE_FLAG_UPDATE_FREQ
                                      | (opt_info
                                         ? DLTHE_RECORD_COPY_NUMBER
                                           : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
         for a loop to be really simple.  We could update the counts, but the
         problem is that we are unable to decide which exit will be taken
         (not really true in case the number of iterations is constant,
         but no one will do anything with this information, so we do not
         worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
             nunroll, num_loop_insns (loop));
}

/* A hash function for information about insns to split.  */

static hashval_t
si_info_hash (const void *ivts)
{
  return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
}

/* An equality function for information about insns to split.  */

static int
si_info_eq (const void *ivts1, const void *ivts2)
{
  const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
  const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;

  return i1->insn == i2->insn;
}

/* Return a hash for VES, which is really a "var_to_expand *".  */

static hashval_t
ve_info_hash (const void *ves)
{
  return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
}

/* Return true if IVTS1 and IVTS2 (which are really both of type
   "var_to_expand *") refer to the same instruction.  */

static int
ve_info_eq (const void *ivts1, const void *ivts2)
{
  const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
  const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;

  return i1->insn == i2->insn;
}

/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
                                  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
        if (!rtx_referenced_p (reg, insn))
          continue;
        else if (DEBUG_INSN_P (insn))
          ++*debug_uses;
        else if (++count_ref > 1)
          break;
    }
  free (body);
  return (count_ref == 1);
}

/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
        if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
          continue;
        else
          {
            validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
                             gen_rtx_UNKNOWN_VAR_LOC (), 0);
            if (!--debug_uses)
              break;
          }
    }
  free (body);
}
1592
 
1593
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += a[i];

   ==>

   sum += a[i]
   ....
   i = i+1;
   sum1 += a[i]
   ....
   i = i+1
   sum2 += a[i];
   ....

   Return NULL if INSN contains no opportunity for expansion of an
   accumulator.  Otherwise, allocate a VAR_TO_EXPAND structure, fill it
   with the relevant information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
        return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for, which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
        return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using (initializing the
         expansions with zero and summing them up at the end of the
         computation) yields wrong results for (x = something - x),
         so avoid using it in that case.  */
      if (code == MINUS)
        return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
          || rtx_referenced_p (dest, XEXP (src, 1)))
        return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know all the expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions = VEC_alloc (rtx, heap, 1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  ves->accum_pos = accum_pos;
  return ves;
}

/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   chains of this type with

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended, for example, by also selecting addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true, iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time-consuming
     according to their author.  */
  if (! ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;
  ivts->n_loc = 1;
  ivts->loc[0] = 1;

  return ivts;
}

/* Determine which insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  PTR *slot1;
  PTR *slot2;
  VEC (edge, heap) *edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
                                              si_info_hash, si_info_eq, free);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (VEC_length (edge, edges) == 1)
    {
      exit = VEC_index (edge, edges, 0);
      if (!(exit->flags & EDGE_COMPLEX))
        {
          opt_info->loop_exit = split_edge (exit);
          can_apply = true;
        }
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
                                                        ve_info_hash,
                                                        ve_info_eq, free);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
        continue;

      FOR_BB_INSNS (bb, insn)
      {
        if (!INSN_P (insn))
          continue;

        if (opt_info->insns_to_split)
          ivts = analyze_iv_to_split_insn (insn);

        if (ivts)
          {
            slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
            gcc_assert (*slot1 == NULL);
            *slot1 = ivts;
            *opt_info->iv_to_split_tail = ivts;
            opt_info->iv_to_split_tail = &ivts->next;
            continue;
          }

        if (opt_info->insns_with_var_to_expand)
          ves = analyze_insn_to_expand_var (loop, insn);

        if (ves)
          {
            slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
            gcc_assert (*slot2 == NULL);
            *slot2 = ves;
            *opt_info->var_to_expand_tail = ves;
            opt_info->var_to_expand_tail = &ves->next;
          }
      }
    }

  VEC_free (edge, heap, edges);
  free (body);
  return opt_info;
}

/* Called just before loop duplication.  Records the start of the
   duplicated area in OPT_INFO.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block;
}

/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
         body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
         number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
        return n_copy - 1;
      else
        return n_copies;
    }
}
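
/* As a worked example (derived from the function above): when unrolling
   four times, the copies numbered 0 (the original body), 1, 2 and 3
   receive deltas 0, 1, 2 and 3, since the base variable is initialized
   in copy 0.  When peeling, copy 1 is where the initialization occurs
   and receives delta 0, copy 2 receives delta 1, and so on; the
   original body (number 0), which runs last, receives delta N_COPIES.  */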

/* Locate in EXPR the expression corresponding to the location recorded
   in IVTS, and return a pointer to the RTX for this location.  */

static rtx *
get_ivts_expr (rtx expr, struct iv_to_split *ivts)
{
  unsigned i;
  rtx *ret = &expr;

  for (i = 0; i < ivts->n_loc; i++)
    ret = &XEXP (*ret, ivts->loc[i]);

  return ret;
}
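
/* For instance, with the location recorded by analyze_iv_to_split_insn
   above (n_loc == 1, loc[0] == 1), applying this to the single_set rtx
   (set (reg i) (plus (reg i) (const_int 1))) descends into operand 1
   of the SET and yields a pointer to the (plus ...) expression, i.e.
   the SET_SRC.  */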

/* Allocate the basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}

/* Insert the initialization of the basic variable of IVTS before INSN,
   taking the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx insn)
{
  rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
  rtx seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
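
/* For example, for the increment i = i + 1 this emits i0 = i + 1 just
   before the insn (it is called only for the copy with delta 0), so
   that the uses rewritten by split_iv below can refer to the base
   variable i0.  */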

/* Replace the use of the induction variable described in IVTS in INSN
   with base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
{
  rtx expr, *loc, seq, incr, var;
  enum machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
                                  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
                                  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = get_ivts_expr (single_set (insn), ivts);

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in INSN
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
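
/* Continuing the example from analyze_iv_to_split_insn: in the copy
   with delta == 2 and step == 1, the SET_SRC of i = i + 1 is replaced
   by i0 + 2, turning the insn into i = i0 + 2 and removing the
   dependence on the increment in the previous copy.  */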

/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);

  if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
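
/* For instance, with two recorded expansions, successive calls return
   VE->reg, the first expansion, the second expansion, and then VE->reg
   again, cycling through all the registers that hold partial results.  */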

/* Given INSN, replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Otherwise reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_change (insn, &SET_DEST (set), new_reg, 1);
  validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);

  if (apply_change_group ())
    if (really_new_expansion)
      {
        VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
        ve->expansion_count++;
      }
}
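
/* For example (assuming the expansion limit has not yet been reached),
   in the second copy of the unrolled body sum += a[i] becomes
   sum1 += a[i]: both the SET_DEST and the accumulator operand within
   the SET_SRC are rewritten to the fresh register sum1.  */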

/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus, to honor signed zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero, the
   final result of SUM should be -zero, thus the expansions SUM1 and SUM2
   should be initialized with -zero as well (otherwise we would get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
                                     basic_block place)
{
  rtx seq, var, zero_init, insn;
  unsigned i;
  enum machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        {
          if (honor_signed_zero_p)
            zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
          else
            zero_init = CONST0_RTX (mode);
          emit_move_insn (var, zero_init);
        }
      break;

    case MULT:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        {
          zero_init = CONST1_RTX (GET_MODE (var));
          emit_move_insn (var, zero_init);
        }
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}

/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, seq, var, insn;
  unsigned i;

  if (VEC_length (rtx, ve->var_expansions) == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (rtx, ve->var_expansions, i, var)
        sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
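
/* Continuing the running example, for a PLUS accumulator with two
   expansions this emits sum = sum2 + (sum1 + sum) at the loop exit,
   so that the original accumulator register again holds the complete
   result.  */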

/* Apply loop optimizations in loop copies using the
   data gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
                     unsigned n_copies, bool unrolling,
                     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx insn, orig_insn, next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds the position in the copy sequence, initialized by
         duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
                                        unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
        {
          next = NEXT_INSN (insn);
          if (!INSN_P (insn)
              || (DEBUG_INSN_P (insn)
                  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
            continue;

          while (!INSN_P (orig_insn)
                 || (DEBUG_INSN_P (orig_insn)
                     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
                         == LABEL_DECL)))
            orig_insn = NEXT_INSN (orig_insn);

          ivts_templ.insn = orig_insn;
          ve_templ.insn = orig_insn;

          /* Apply the IV-splitting optimization.  */
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);

              if (ivts)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));

                  if (!delta)
                    insert_base_initialization (ivts, insn);
                  split_iv (ivts, insn, delta);
                }
            }
          /* Apply the variable-expansion optimization.  */
          if (unrolling && opt_info->insns_with_var_to_expand)
            {
              ves = (struct var_to_expand *)
                htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
              if (ves)
                {
                  gcc_assert (GET_CODE (PATTERN (insn))
                              == GET_CODE (PATTERN (orig_insn)));
                  expand_var_during_unrolling (ves, insn);
                }
            }
          orig_insn = NEXT_INSN (orig_insn);
        }
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Also rewrite the original loop body.  Find its blocks as the originals
     of the blocks in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
    {
      bb = BASIC_BLOCK (i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
        continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
           orig_insn != NEXT_INSN (BB_END (bb));
           orig_insn = next)
        {
          next = NEXT_INSN (orig_insn);

          if (!INSN_P (orig_insn))
            continue;

          ivts_templ.insn = orig_insn;
          if (opt_info->insns_to_split)
            {
              ivts = (struct iv_to_split *)
                htab_find (opt_info->insns_to_split, &ivts_templ);
              if (ivts)
                {
                  if (!delta)
                    insert_base_initialization (ivts, orig_insn);
                  split_iv (ivts, orig_insn, delta);
                  continue;
                }
            }

        }
    }
}

/* Release OPT_INFO.  */

static void
free_opt_info (struct opt_info *opt_info)
{
  if (opt_info->insns_to_split)
    htab_delete (opt_info->insns_to_split);
  if (opt_info->insns_with_var_to_expand)
    {
      struct var_to_expand *ves;

      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
        VEC_free (rtx, heap, ves->var_expansions);
      htab_delete (opt_info->insns_with_var_to_expand);
    }
  free (opt_info);
}