/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it had been manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N];  short b[N]; short c[N];   int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

        The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
        Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFs whose base is an array DECL
   (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
        The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

        During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
        The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

        For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

        Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
        Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different sizes of vectors, for now, will need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

        Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
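
/* For instance, for the V8HI example above, the addition of shorts must be
   supported in V8HImode: if optab_handler (add_optab, V8HImode)->insn_code
   is CODE_FOR_nothing (i.e., the target defines no "addv8hi3" insn
   pattern), the stmt is not vectorizable.  */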

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
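
/* For instance, for the strip-mining example above applied to "short"
   elements with a 16-byte vector size, get_vectype_for_scalar_type returns
   V8HI, TYPE_VECTOR_SUBPARTS is 8, and hence VF becomes 8: each vector
   iteration processes eight shorts.  */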

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had been already set is for stmts
                 that contain a dataref, or for "pattern-stmts" (stmts generated
                 by the vectorizer to represent/replace a certain idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type:  ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
        }
    }

  /* TODO: Analyze cost.  Decide if it is worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with constant step.  */
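
/* E.g., for the IV in "for (i = 3; i < N; i += 2)" the scalar evolution
   of i is the chrec {3, +, 2}_1: INIT is 3 and STEP is the constant 2,
   so the evolution is simple.  A chrec whose evolution part is itself a
   chrec (a polynomial of degree >= 2) is not simple.  */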

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ",  init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross-iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &dummy, &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                    vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                             vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                           vect_reduction_def;
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross-iteration def-use cycles of scalar variables by
   analyzing the loop-header PHIs of scalar variables; classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if it exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */
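
/* Example3: double reduction (detected when the outer loop is considered
   for vectorization: the inner-loop reduction result is itself accumulated
   across the outer-loop iterations):

              loop3:
              for (i=0; i<N; i++)
                for (j=0; j<M; j++)
                   sum += a[i][j];  */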

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop executes.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */
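
/* E.g., for "for (i = 0; i < N; i++) a[i] = 0;" the expression placed in
   NUMBER_OF_ITERATIONS is N (possibly symbolic), and the returned stmt is
   the GIMPLE_COND that tests the loop exit at the end of the loop body.  */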

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
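  /* For instance, "while (i < N) a[i++] = 0;" typically reaches the
     vectorizer rotated into such a guarded do-while: the header holds the
     store, the increment and the exit test, and the empty latch just
     branches back to the header.  */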
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* Inner-loop loop-closed exit phi in outer-loop vectorization
                 (i.e., a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is a double
                 reduction, i.e., this phi is vect_double_reduction_def),
                 because this case requires us to actually do something
                 here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))

            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by the
     SLP instances.  If that unrolling factor is 1, we say that we perform
     pure SLP on the loop - cross-iteration parallelism is not exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
        "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
        vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide if it is worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);
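
  /* E.g., with --param min-vect-loop-bound=2 and VF == 4, the bound just
     computed is 2*4-1 = 7; if the cost model returned min_profitable_iters
     of 10, TH below becomes 10, and a loop whose known iteration count is
     at most 10 is rejected as not profitable.  */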

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}


/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer-references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access patterns.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                            "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}


/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector) or ERROR_MARK if the operation is
      a supported reduction operation, but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
1488
 
1489
static bool
1490
reduction_code_for_scalar_code (enum tree_code code,
1491
                                enum tree_code *reduc_code)
1492
{
1493
  switch (code)
1494
    {
1495
      case MAX_EXPR:
1496
        *reduc_code = REDUC_MAX_EXPR;
1497
        return true;
1498
 
1499
      case MIN_EXPR:
1500
        *reduc_code = REDUC_MIN_EXPR;
1501
        return true;
1502
 
1503
      case PLUS_EXPR:
1504
        *reduc_code = REDUC_PLUS_EXPR;
1505
        return true;
1506
 
1507
      case MULT_EXPR:
1508
      case MINUS_EXPR:
1509
      case BIT_IOR_EXPR:
1510
      case BIT_XOR_EXPR:
1511
      case BIT_AND_EXPR:
1512
        *reduc_code = ERROR_MARK;
1513
        return true;
1514
 
1515
      default:
1516
       return false;
1517
    }
1518
}
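
/* Illustrative note (not part of the original pass): for a max-reduction
   such as

     int m = INT_MIN;
     for (i = 0; i < n; i++)
       m = a[i] > m ? a[i] : m;

   the loop body is vectorized into a vector of partial maxima, and the
   REDUC_MAX_EXPR returned above collapses that vector into a single
   scalar maximum in the epilogue.  For MULT_EXPR and the bitwise codes,
   ERROR_MARK is returned and the epilogue instead falls back to
   whole-vector shifts or element extracts (see
   vect_create_epilog_for_reduction).  */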


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}


/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation. We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >
*/
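
/* For illustration, an instance of pattern (1): the C loop

     s = s0;
     for (i = 0; i < n; i++)
       s = s + a[i];

   appears in SSA form as

     loop_header:
       s_1 = phi <s_0, s_2>
       t_3 = a[i];
       s_2 = t_3 + s_1;

   so a1 corresponds to s_1, a3 to t_3, and a2 to s_2, whose only uses
   are outside the loop.  */
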
gimple
vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
                          bool check_reduction, bool *double_reduc)
{
  struct loop *loop = (gimple_bb (phi))->loop_father;
  struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
  edge latch_e = loop_latch_edge (loop);
  tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
  gimple def_stmt, def1 = NULL, def2 = NULL;
  enum tree_code code;
  tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
  tree type;
  int nloop_uses;
  tree name;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  bool phi_def;

  *double_reduc = false;

  /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
     otherwise, we assume outer loop vectorization.  */
  gcc_assert ((check_reduction && loop == vect_loop)
              || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));

  name = PHI_RESULT (phi);
  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  if (TREE_CODE (loop_arg) != SSA_NAME)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: not ssa_name: ");
          print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
        }
      return NULL;
    }

  def_stmt = SSA_NAME_DEF_STMT (loop_arg);
  if (!def_stmt)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "reduction: no def_stmt.");
      return NULL;
    }

  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
      return NULL;
    }

  if (is_gimple_assign (def_stmt))
    {
      name = gimple_assign_lhs (def_stmt);
      phi_def = false;
    }
  else
    {
      name = PHI_RESULT (def_stmt);
      phi_def = true;
    }

  nloop_uses = 0;
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;
      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
          && vinfo_for_stmt (use_stmt)
          && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
        nloop_uses++;
      if (nloop_uses > 1)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduction used in loop.");
          return NULL;
        }
    }

  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
     defined in the inner loop.  */
  if (phi_def)
    {
      op1 = PHI_ARG_DEF (def_stmt, 0);

      if (gimple_phi_num_args (def_stmt) != 1
          || TREE_CODE (op1) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported phi node definition.");

          return NULL;
        }

      def1 = SSA_NAME_DEF_STMT (op1);
      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
          && loop->inner
          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
          && is_gimple_assign (def1))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected double reduction: ");

          *double_reduc = true;
          return def_stmt;
        }

      return NULL;
    }

  code = gimple_assign_rhs_code (def_stmt);

  if (check_reduction
      && (!commutative_tree_code (code) || !associative_tree_code (code)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: not commutative/associative: ");
      return NULL;
    }

  if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
    {
      if (code != COND_EXPR)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: not binary operation: ");

          return NULL;
        }

      op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
      if (COMPARISON_CLASS_P (op3))
        {
          op4 = TREE_OPERAND (op3, 1);
          op3 = TREE_OPERAND (op3, 0);
        }

      op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
      op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);

      if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }
  else
    {
      op1 = gimple_assign_rhs1 (def_stmt);
      op2 = gimple_assign_rhs2 (def_stmt);

      if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");

          return NULL;
        }
    }

  type = TREE_TYPE (gimple_assign_lhs (def_stmt));
  if ((TREE_CODE (op1) == SSA_NAME
       && !types_compatible_p (type, TREE_TYPE (op1)))
      || (TREE_CODE (op2) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op2)))
      || (op3 && TREE_CODE (op3) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op3)))
      || (op4 && TREE_CODE (op4) == SSA_NAME
          && !types_compatible_p (type, TREE_TYPE (op4))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "reduction: multiple types: operation type: ");
          print_generic_expr (vect_dump, type, TDF_SLIM);
          fprintf (vect_dump, ", operands types: ");
          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
          fprintf (vect_dump, ",");
          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
          if (op3)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
            }

          if (op4)
            {
              fprintf (vect_dump, ",");
              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
            }
        }

      return NULL;
    }

  /* Check that it's ok to change the order of the computation.
     Generally, when vectorizing a reduction we change the order of the
     computation.  This may change the behavior of the program in some
     cases, so we need to check that this is ok.  One exception is when
     vectorizing an outer-loop: the inner-loop is executed sequentially,
     and therefore vectorizing reductions in the inner-loop during
     outer-loop vectorization is safe.  */

  /* CHECKME: check for !flag_finite_math_only too?  */
  if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
      && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
      return NULL;
    }
  else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
           && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
      return NULL;
    }
  else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
    {
      /* Changing the order of operations changes the semantics.  */
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt,
                        "reduction: unsafe fixed-point math optimization: ");
      return NULL;
    }

  /* Reduction is safe. We're dealing with one of the following:
     1) integer arithmetic and no trapv
     2) floating point arithmetic, and special flags permit this optimization
     3) nested cycle (i.e., outer loop vectorization).  */
  if (TREE_CODE (op1) == SSA_NAME)
    def1 = SSA_NAME_DEF_STMT (op1);

  if (TREE_CODE (op2) == SSA_NAME)
    def2 = SSA_NAME_DEF_STMT (op2);

  if (code != COND_EXPR
      && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: no defs for operands: ");
      return NULL;
    }

  /* Check that one def is the reduction def, defined by PHI,
     the other def is either defined in the loop ("vect_internal_def"),
     or it's an induction (defined by a loop-header phi-node).  */

  if (def2 && def2 == phi
      && (code == COND_EXPR
          || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
              && (is_gimple_assign (def1)
                  || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                      == vect_induction_def
                  || (gimple_code (def1) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
                          == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def1)))))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "detected reduction: ");
      return def_stmt;
    }
  else if (def1 && def1 == phi
           && (code == COND_EXPR
               || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
                   && (is_gimple_assign (def2)
                       || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                           == vect_induction_def
                       || (gimple_code (def2) == GIMPLE_PHI
                           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
                               == vect_internal_def
                           && !is_loop_header_bb_p (gimple_bb (def2)))))))
    {
      if (check_reduction)
        {
          /* Swap operands (just for simplicity - so that the rest of the code
             can assume that the reduction variable is always the last (second)
             argument).  */
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt,
                            "detected reduction: need to swap operands: ");

          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
                              gimple_assign_rhs2_ptr (def_stmt));
        }
      else
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            report_vect_op (def_stmt, "detected reduction: ");
        }

      return def_stmt;
    }
  else
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        report_vect_op (def_stmt, "reduction: unknown pattern: ");

      return NULL;
    }
}
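
/* Illustrative note (not from the original source): under the checks
   above, a float accumulation such as

     float s = 0.0f;
     for (i = 0; i < n; i++)
       s += a[i];

   is rejected as a reduction unless -fassociative-math is in effect,
   because the vectorized epilogue reassociates the additions; the same
   loop over an integer type is accepted as long as overflow does not
   trap (no -ftrapv).  */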


/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time, we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop. Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known. Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but the number of scalar loop
             iterations is unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;

        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                        niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence the scalar
     code path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence the run-time scalar cost is incremented by a not-taken
     branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBs differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
          || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model check occurs at prologue generation.  */
          if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only. The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
     SOC = scalar outside cost for run-time cost model check.  */
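
/* Derivation (added for reference): multiplying the condition above by VF
   and collecting the niters terms gives

     niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
              / (SIC * VF - VIC)

   which is the expression computed below; it is well-defined whenever
   SIC * VF > VIC, i.e. whenever one vector iteration is cheaper than VF
   scalar iterations, which is exactly the case guarded by the 'if'.  */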

  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* The vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: the vector iteration cost = %d "
                 "divided by the scalar iteration cost = %d "
                 "is greater than or equal to the vectorization factor = %d.",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}
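
/* Worked example with assumed costs (illustration only): SIC = 4,
   VIC = 6, VF = 4, VOC = 20, SOC = 6, PL_ITERS = EP_ITERS = 2.  The
   division above gives ((20 - 6) * 4 - 6 * 2 - 6 * 2) / (4 * 4 - 6)
   = 32 / 10 = 3; the correction bumps this to 4, since
   4 * 4 * 3 <= 6 * 3 + (20 - 6) * 4; the comparison with VF leaves it
   at 4; and the final decrement yields a threshold of 3, so the
   run-time guard 'niters <= 3' branches to the scalar loop.  */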


/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions. Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  gimple stmt, orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);


  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  stmt = STMT_VINFO_STMT (stmt_info);

  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
                  == ternary_op);
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
      break;
    case GIMPLE_UNARY_RHS:
      reduction_op = gimple_assign_rhs1 (stmt);
      break;
    case GIMPLE_BINARY_RHS:
      reduction_op = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt);

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code != ERROR_MARK)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator. Also
               requires scalar extract.  */
            outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N elements,
               we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}
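
/* Illustrative numbers (assumed, not from the original source): for an
   8-element vector without a matching REDUC_* operation, the shift-based
   epilogue above costs exact_log2 (8) * 2 = 6 vector statements plus one
   vector-to-scalar extract, while the extract-based fallback costs
   8 + 8 - 1 = 15 vector statements; with a REDUC_* operation the
   epilogue is a single vector statement plus one extract.  */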


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}


/* Function get_initial_def_for_induction

   Input:
   STMT - a stmt that performs an induction operation in the loop.
   IV_PHI - the initial value of the induction variable

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable. E.g., for an iv with IV_PHI='X' and
   evolution S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */

static tree
get_initial_def_for_induction (gimple iv_phi)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
  tree vectype;
  int nunits;
  edge pe = loop_preheader_edge (loop);
  struct loop *iv_loop;
  basic_block new_bb;
  tree vec, vec_init, vec_step, t;
  tree access_fn;
  tree new_var;
  tree new_name;
  gimple init_stmt, induction_phi, new_stmt;
  tree induc_def, vec_def, vec_dest;
  tree init_expr, step_expr;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  int i;
  bool ok;
  int ncopies;
  tree expr;
  stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
  bool nested_in_vect_loop = false;
  gimple_seq stmts = NULL;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;
  basic_block bb = gimple_bb (iv_phi);
  tree stepvectype;

  vectype = get_vectype_for_scalar_type (scalar_type);
  gcc_assert (vectype);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  ncopies = vf / nunits;

  gcc_assert (phi_info);
  gcc_assert (ncopies >= 1);

  /* Find the first insertion point in the BB.  */
  si = gsi_after_labels (bb);

  if (INTEGRAL_TYPE_P (scalar_type))
    step_expr = build_int_cst (scalar_type, 0);
  else if (POINTER_TYPE_P (scalar_type))
    step_expr = build_int_cst (sizetype, 0);
  else
    step_expr = build_real (scalar_type, dconst0);

  /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
  if (nested_in_vect_loop_p (loop, iv_phi))
    {
      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);

  latch_e = loop_latch_edge (iv_loop);
  loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);

  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
  gcc_assert (access_fn);
  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
                                    &init_expr, &step_expr);
  gcc_assert (ok);
  pe = loop_preheader_edge (iv_loop);

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; we obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt.  */
      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
                                           loop_preheader_edge (iv_loop));
      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
    }
  else
    {
      /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
      add_referenced_var (new_var);

      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
      if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }

      t = NULL_TREE;
      t = tree_cons (NULL_TREE, init_expr, t);
      for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
          enum tree_code code = POINTER_TYPE_P (scalar_type)
                                ? POINTER_PLUS_EXPR : PLUS_EXPR;
          init_stmt = gimple_build_assign_with_ops (code, new_var,
                                                    new_name, step_expr);
          new_name = make_ssa_name (new_var, init_stmt);
          gimple_assign_set_lhs (init_stmt, new_name);

          new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
          gcc_assert (!new_bb);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "created new init_stmt: ");
              print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
            }
          t = tree_cons (NULL_TREE, new_name, t);
        }
      /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
      vec = build_constructor_from_list (vectype, nreverse (t));
      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
    }


  /* Create the vector that holds the step of the induction.  */
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized. Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else
    {
      /* iv_loop is the loop to be vectorized. Generate:
          vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
    }

  t = NULL_TREE;
  for (i = 0; i < nunits; i++)
    t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
  gcc_assert (CONSTANT_CLASS_P (new_name));
  stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
  gcc_assert (stepvectype);
  vec = build_vector (stepvectype, t);
  vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);


  /* Create the following def-use cycle:
     loop prolog:
         vec_init = ...
         vec_step = ...
     loop:
         vec_iv = PHI <vec_init, vec_loop>
         ...
         STMT
         ...
         vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  add_referenced_var (vec_dest);
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  set_vinfo_for_stmt (induction_phi,
                      new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                           induc_def, vec_step);
  vec_def = make_ssa_name (vec_dest, new_stmt);
  gimple_assign_set_lhs (new_stmt, vec_def);
  gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
  set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
                                                   NULL));

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
               UNKNOWN_LOCATION);


  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmt_vec_info prev_stmt_vinfo;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* Create the vector that holds the step of the induction.  */
      expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
      t = NULL_TREE;
      for (i = 0; i < nunits; i++)
        t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
      gcc_assert (CONSTANT_CLASS_P (new_name));
      vec = build_vector (stepvectype, t);
      vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);

      vec_def = induc_def;
      prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
      for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
          new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
                                                   vec_def, vec_step);
          vec_def = make_ssa_name (vec_dest, new_stmt);
          gimple_assign_set_lhs (new_stmt, vec_def);

          gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
          set_vinfo_for_stmt (new_stmt,
                              new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
          STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
          prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
        }
    }

  if (nested_in_vect_loop)
    {
      /* Find the loop-closed exit-phi of the induction, and record
         the final vector of induction results:  */
      exit_phi = NULL;
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
          if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
            {
              exit_phi = USE_STMT (use_p);
              break;
            }
        }
      if (exit_phi)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          /* FORNOW. Currently not supporting the case that an inner-loop induction
             is not used in the outer-loop (i.e. only outside the outer-loop).  */
          gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));

          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vector of inductions after inner-loop:");
              print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
            }
        }
    }


  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "transform induction: created def-use cycle: ");
      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
      fprintf (vect_dump, "\n");
      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
    }

  STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
  return induc_def;
}
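
/* Concrete instance of the unrolled case above (values assumed for
   illustration): with VF = 8 and nunits = 4, so ncopies = 2, the
   induction phi covers [X, X+S, X+2*S, X+3*S], the second copy adds
   [4*S, 4*S, 4*S, 4*S] to it to cover [X+4*S, ..., X+7*S], and the
   latch update adds [8*S, 8*S, 8*S, 8*S], advancing both copies by a
   full vector iteration.  */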


/* Function get_initial_def_for_reduction

   Input:
   STMT - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that STMT
        performs. This vector will be used as the initial value of the
        vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */
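
/* Two concrete instances (illustration only): for a PLUS_EXPR reduction
   with init_val = 5 and a vector of 4 units, Option1 returns [0,0,0,0]
   and asks the caller to add 5 to the final result, whereas Option2
   returns [5,0,0,0] and needs no adjustment.  Note that for BIT_AND_EXPR
   the neutral element used below is all ones (-1), not 1.  */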
2671
 
2672
tree
2673
get_initial_def_for_reduction (gimple stmt, tree init_val,
2674
                               tree *adjustment_def)
2675
{
2676
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2677
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2678
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2679
  tree scalar_type = TREE_TYPE (init_val);
2680
  tree vectype = get_vectype_for_scalar_type (scalar_type);
2681
  int nunits;
2682
  enum tree_code code = gimple_assign_rhs_code (stmt);
2683
  tree def_for_init;
2684
  tree init_def;
2685
  tree t = NULL_TREE;
2686
  int i;
2687
  bool nested_in_vect_loop = false;
2688
  tree init_value;
2689
  REAL_VALUE_TYPE real_init_val = dconst0;
2690
  int int_init_val = 0;
2691
  gimple def_stmt = NULL;
2692
 
2693
  gcc_assert (vectype);
2694
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
2695
 
2696
  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2697
              || SCALAR_FLOAT_TYPE_P (scalar_type));
2698
 
2699
  if (nested_in_vect_loop_p (loop, stmt))
2700
    nested_in_vect_loop = true;
2701
  else
2702
    gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2703
 
2704
  /* In case of double reduction we only create a vector variable to be put
2705
     in the reduction phi node. The actual statement creation is done in
2706
     vect_create_epilog_for_reduction.  */
2707
  if (adjustment_def && nested_in_vect_loop
2708
      && TREE_CODE (init_val) == SSA_NAME
2709
      && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2710
      && gimple_code (def_stmt) == GIMPLE_PHI
2711
      && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2712
      && vinfo_for_stmt (def_stmt)
2713
      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2714
          == vect_double_reduction_def)
2715
    {
2716
      *adjustment_def = NULL;
2717
      return vect_create_destination_var (init_val, vectype);
2718
    }
2719
 
2720
  if (TREE_CONSTANT (init_val))
2721
    {
2722
      if (SCALAR_FLOAT_TYPE_P (scalar_type))
2723
        init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2724
      else
2725
        init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2726
    }
2727
  else
2728
    init_value = init_val;
2729
 
2730
  switch (code)
2731
    {
2732
      case WIDEN_SUM_EXPR:
2733
      case DOT_PROD_EXPR:
2734
      case PLUS_EXPR:
2735
      case MINUS_EXPR:
2736
      case BIT_IOR_EXPR:
2737
      case BIT_XOR_EXPR:
2738
      case MULT_EXPR:
2739
      case BIT_AND_EXPR:
2740
        /* ADJUSMENT_DEF is NULL when called from
2741
           vect_create_epilog_for_reduction to vectorize double reduction.  */
2742
        if (adjustment_def)
2743
          {
2744
            if (nested_in_vect_loop)
2745
              *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2746
                                                              NULL);
2747
            else
2748
              *adjustment_def = init_val;
2749
          }
2750
 
2751
        if (code == MULT_EXPR)
2752
          {
2753
            real_init_val = dconst1;
2754
            int_init_val = 1;
2755
          }
2756
 
2757
        if (code == BIT_AND_EXPR)
2758
          int_init_val = -1;
2759
 
2760
        if (SCALAR_FLOAT_TYPE_P (scalar_type))
2761
          def_for_init = build_real (scalar_type, real_init_val);
2762
        else
2763
          def_for_init = build_int_cst (scalar_type, int_init_val);
2764
 
2765
        /* Create a vector of '0' or '1' except the first element.  */
2766
        for (i = nunits - 2; i >= 0; --i)
2767
          t = tree_cons (NULL_TREE, def_for_init, t);
2768
 
2769
        /* Option1: the first element is '0' or '1' as well.  */
2770
        if (adjustment_def)
2771
          {
2772
            t = tree_cons (NULL_TREE, def_for_init, t);
2773
            init_def = build_vector (vectype, t);
2774
            break;
2775
          }
2776
 
2777
        /* Option2: the first element is INIT_VAL.  */
2778
        t = tree_cons (NULL_TREE, init_value, t);
2779
        if (TREE_CONSTANT (init_val))
2780
          init_def = build_vector (vectype, t);
2781
        else
2782
          init_def = build_constructor_from_list (vectype, t);
2783
 
2784
        break;
2785
 
2786
      case MIN_EXPR:
2787
      case MAX_EXPR:
2788
      case COND_EXPR:
2789
        if (adjustment_def)
2790
          {
2791
            *adjustment_def = NULL_TREE;
2792
            init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2793
            break;
2794
          }
2795
 
2796
        for (i = nunits - 1; i >= 0; --i)
2797
          t = tree_cons (NULL_TREE, init_value, t);
2798
 
2799
        if (TREE_CONSTANT (init_val))
2800
          init_def = build_vector (vectype, t);
2801
        else
2802
          init_def = build_constructor_from_list (vectype, t);
2803
 
2804
        break;
2805
 
2806
      default:
2807
        gcc_unreachable ();
2808
    }
2809
 
2810
  return init_def;
2811
}
2812
 
2813
 
2814
/* Function vect_create_epilog_for_reduction
2815
 
2816
   Create code at the loop-epilog to finalize the result of a reduction
2817
   computation.
2818
 
2819
   VECT_DEF is a vector of partial results.
2820
   REDUC_CODE is the tree-code for the epilog reduction.
2821
   NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2822
     number of elements that we can fit in a vectype (nunits). In this case
2823
     we have to generate more than one vector stmt - i.e - we need to "unroll"
2824
     the vector stmt by a factor VF/nunits.  For more details see documentation
2825
     in vectorizable_operation.
2826
   STMT is the scalar reduction stmt that is being vectorized.
2827
   REDUCTION_PHI is the phi-node that carries the reduction computation.
2828
   REDUC_INDEX is the index of the operand in the right hand side of the
2829
     statement that is defined by REDUCTION_PHI.
2830
   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2831
 
2832
   This function:
2833
   1. Creates the reduction def-use cycle: sets the arguments for
2834
      REDUCTION_PHI:
2835
      The loop-entry argument is the vectorized initial-value of the reduction.
2836
      The loop-latch argument is VECT_DEF - the vector of partial sums.
2837
   2. "Reduces" the vector of partial results VECT_DEF into a single result,
2838
      by applying the operation specified by REDUC_CODE if available, or by
2839
      other means (whole-vector shifts or a scalar loop).
2840
      The function also creates a new phi node at the loop exit to preserve
2841
      loop-closed form, as illustrated below.
2842
 
2843
     The flow at the entry to this function:
2844
 
2845
        loop:
2846
          vec_def = phi <null, null>            # REDUCTION_PHI
2847
          VECT_DEF = vector_stmt                # vectorized form of STMT
2848
          s_loop = scalar_stmt                  # (scalar) STMT
2849
        loop_exit:
2850
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2851
          use <s_out0>
2852
          use <s_out0>
2853
 
2854
     The above is transformed by this function into:
2855
 
2856
        loop:
2857
          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2858
          VECT_DEF = vector_stmt                # vectorized form of STMT
2859
          s_loop = scalar_stmt                  # (scalar) STMT
2860
        loop_exit:
2861
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2862
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2863
          v_out2 = reduce <v_out1>
2864
          s_out3 = extract_field <v_out2, 0>
2865
          s_out4 = adjust_result <s_out3>
2866
          use <s_out4>
2867
          use <s_out4>
2868
*/
2869
 
2870
static void
2871
vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2872
                                  int ncopies,
2873
                                  enum tree_code reduc_code,
2874
                                  gimple reduction_phi,
2875
                                  int reduc_index,
2876
                                  bool double_reduc)
2877
{
2878
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2879
  stmt_vec_info prev_phi_info;
2880
  tree vectype;
2881
  enum machine_mode mode;
2882
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2883
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2884
  basic_block exit_bb;
2885
  tree scalar_dest;
2886
  tree scalar_type;
2887
  gimple new_phi = NULL, phi;
2888
  gimple_stmt_iterator exit_gsi;
2889
  tree vec_dest;
2890
  tree new_temp = NULL_TREE;
2891
  tree new_name;
2892
  gimple epilog_stmt = NULL;
2893
  tree new_scalar_dest, new_dest;
2894
  gimple exit_phi;
2895
  tree bitsize, bitpos;
2896
  enum tree_code code = gimple_assign_rhs_code (stmt);
2897
  tree adjustment_def;
2898
  tree vec_initial_def, def;
2899
  tree orig_name;
2900
  imm_use_iterator imm_iter;
2901
  use_operand_p use_p;
2902
  bool extract_scalar_result = false;
2903
  tree reduction_op, expr;
2904
  gimple orig_stmt;
2905
  gimple use_stmt;
2906
  bool nested_in_vect_loop = false;
2907
  VEC(gimple,heap) *phis = NULL;
2908
  enum vect_def_type dt = vect_unknown_def_type;
2909
  int j, i;
2910
 
2911
  if (nested_in_vect_loop_p (loop, stmt))
2912
    {
2913
      outer_loop = loop;
2914
      loop = loop->inner;
2915
      nested_in_vect_loop = true;
2916
    }
2917
 
2918
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2919
    {
2920
    case GIMPLE_SINGLE_RHS:
2921
      gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2922
                                       == ternary_op);
2923
      reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2924
      break;
2925
    case GIMPLE_UNARY_RHS:
2926
      reduction_op = gimple_assign_rhs1 (stmt);
2927
      break;
2928
    case GIMPLE_BINARY_RHS:
2929
      reduction_op = reduc_index ?
2930
                     gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2931
      break;
2932
    default:
2933
      gcc_unreachable ();
2934
    }
2935
 
2936
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2937
  gcc_assert (vectype);
2938
  mode = TYPE_MODE (vectype);
2939
 
2940
  /*** 1. Create the reduction def-use cycle  ***/
2941
 
2942
  /* For the case of reduction, vect_get_vec_def_for_operand returns
2943
     the scalar def before the loop, that defines the initial value
2944
     of the reduction variable.  */
2945
  vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2946
                                                  &adjustment_def);
2947
 
2948
  phi = reduction_phi;
2949
  def = vect_def;
2950
  for (j = 0; j < ncopies; j++)
2951
    {
2952
      /* 1.1 set the loop-entry arg of the reduction-phi:  */
2953
      add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2954
                   UNKNOWN_LOCATION);
2955
 
2956
      /* 1.2 set the loop-latch arg for the reduction-phi:  */
2957
      if (j > 0)
2958
        def = vect_get_vec_def_for_stmt_copy (dt, def);
2959
      add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2960
 
2961
      if (vect_print_dump_info (REPORT_DETAILS))
2962
        {
2963
          fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2964
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2965
          fprintf (vect_dump, "\n");
2966
          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2967
        }
2968
 
2969
      phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2970
    }
2971
 
2972
  /*** 2. Create epilog code
2973
          The reduction epilog code operates across the elements of the vector
2974
          of partial results computed by the vectorized loop.
2975
          The reduction epilog code consists of:
2976
          step 1: compute the scalar result in a vector (v_out2)
2977
          step 2: extract the scalar result (s_out3) from the vector (v_out2)
2978
          step 3: adjust the scalar result (s_out3) if needed.
2979
 
2980
          Step 1 can be accomplished using one of the following three schemes:
          (scheme 1) using reduc_code, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                     combined.

          The overall epilog code looks like this:

          s_out0 = phi <s_loop>         # original EXIT_PHI
          v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.

          ***/
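
  /* A concrete illustration (hypothetical values, assuming a little-endian
     target and a V4SI sum-reduction whose initial value 10 was factored out
     into 'adjustment_def'):  if the vector of partial sums is
     v_out1 = {1,2,3,4}, scheme 1 produces

          v_out2 = reduc_expr <v_out1>        # some element holds 1+2+3+4
          s_out3 = extract_field <v_out2, 0>  # s_out3 == 10
          s_out4 = s_out3 + 10                # s_out4 == 20

     Which element holds the result (and hence the bitpos used in step 2)
     is target dependent; see the BYTES_BIG_ENDIAN handling in 2.4.  */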

  /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
        v_out1 = phi <v_loop>  */

  exit_bb = single_exit (loop)->dest;
  def = vect_def;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
      set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
      if (j == 0)
        new_phi = phi;
      else
        {
          def = vect_get_vec_def_for_stmt_copy (dt, def);
          STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
        }
      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
      prev_phi_info = vinfo_for_stmt (phi);
    }

  exit_gsi = gsi_after_labels (exit_bb);

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_code is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (!orig_stmt)
    {
      /* Regular reduction  */
      orig_stmt = stmt;
    }
  else
    {
      /* Reduction pattern  */
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
      gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
    }

  code = gimple_assign_rhs_code (orig_stmt);
  scalar_dest = gimple_assign_lhs (orig_stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
     partial results are added and not subtracted.  */
  if (code == MINUS_EXPR)
    code = PLUS_EXPR;
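
  /* A hypothetical illustration: for the scalar reduction acc -= a[i] with
     initial value 'init', the vectorized loop starts from [init,0,...,0]
     and repeatedly subtracts vectors of a[] elements, so lane k ends up
     holding (k == 0 ? init : 0) minus its share of the elements.  Adding
     the lanes then yields init - (a[0] + ... + a[n-1]), which is why the
     epilog combines the partial results with PLUS_EXPR.  */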

  /* In case this is a reduction in an inner-loop while vectorizing an outer
     loop - we don't need to extract a single scalar result at the end of the
     inner-loop (unless it is double reduction, i.e., the use of reduction is
     outside the outer-loop). The final vector of partial results will be used
     in the vectorized outer-loop, or reduced to a scalar result at the end of
     the outer-loop.  */
  if (nested_in_vect_loop && !double_reduc)
    goto vect_finalize_reduction;

  /* The epilogue is created for the outer-loop, i.e., for the loop being
     vectorized.  */
  if (double_reduc)
    loop = outer_loop;

  /* FORNOW */
  gcc_assert (ncopies == 1);

  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  */

  if (reduc_code != ERROR_MARK)
    {
      tree tmp;

      /*** Case 1:  Create:
           v_out2 = reduc_expr <v_out1>  */

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Reduce using direct vector reduction.");

      vec_dest = vect_create_destination_var (scalar_dest, vectype);
      tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
      epilog_stmt = gimple_build_assign (vec_dest, tmp);
      new_temp = make_ssa_name (vec_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

      extract_scalar_result = true;
    }
  else
    {
      enum tree_code shift_code = ERROR_MARK;
      bool have_whole_vector_shift = true;
      int bit_offset;
      int element_bitsize = tree_low_cst (bitsize, 1);
      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
      tree vec_temp;

      if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
        shift_code = VEC_RSHIFT_EXPR;
      else
        have_whole_vector_shift = false;

      /* Regardless of whether we have a whole vector shift, if we're
         emulating the operation via tree-vect-generic, we don't want
         to use it.  Only the first round of the reduction is likely
         to still be profitable via emulation.  */
      /* ??? It might be better to emit a reduction tree code here, so that
         tree-vect-generic can expand the first round via bit tricks.  */
      if (!VECTOR_MODE_P (mode))
        have_whole_vector_shift = false;
      else
        {
          optab optab = optab_for_tree_code (code, vectype, optab_default);
          if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
            have_whole_vector_shift = false;
        }

      if (have_whole_vector_shift)
        {
          /*** Case 2: Create:
             for (offset = VS/2; offset >= element_size; offset/=2)
                {
                  Create:  va' = vec_shift <va, offset>
                  Create:  va = vop <va, va'>
                }  */
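
          /* A hypothetical trace for a 128-bit V4SI sum with
             va = {a0,a1,a2,a3} (element_size == 32, VS == 128), assuming
             the shift moves high lanes toward lane 0 (the lane the final
             extraction reads on a little-endian target):

                 offset 64:  va' = vec_shift <va, 64>  = {a2,a3,_,_}
                             va  = va + va'            = {a0+a2,a1+a3,_,_}
                 offset 32:  va' = vec_shift <va, 32>  = {a1+a3,_,_,_}
                             va  = va + va'            = {a0+a1+a2+a3,_,_,_}

             i.e. log2(nunits) shift/op rounds; lanes marked '_' hold
             don't-care values and are never read afterwards.  */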

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using vector shifts");

          vec_dest = vect_create_destination_var (scalar_dest, vectype);
          new_temp = PHI_RESULT (new_phi);

          for (bit_offset = vec_size_in_bits/2;
               bit_offset >= element_bitsize;
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);

              epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
                                                          new_temp, bitpos);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (vec_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = true;
        }
      else
        {
          tree rhs;

          /*** Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>
               }  */
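
          /* For the same hypothetical V4SI sum this scheme emits a linear
             chain of nunits - 1 scalar operations instead of log2(nunits)
             vector operations:

                 s  = v_out2[0]                # BIT_FIELD_REF at bit 0
                 s' = v_out2[1];  s = s + s'   # bit 32
                 s' = v_out2[2];  s = s + s'   # bit 64
                 s' = v_out2[3];  s = s + s'   # bit 96

             This requires no vector support at all in the epilog, which is
             why it serves as the fallback scheme.  */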

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "Reduce using scalar code. ");

          vec_temp = PHI_RESULT (new_phi);
          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                         bitsize_zero_node);
          epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          gimple_assign_set_lhs (epilog_stmt, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

          for (bit_offset = element_bitsize;
               bit_offset < vec_size_in_bits;
               bit_offset += element_bitsize)
            {
              tree bitpos = bitsize_int (bit_offset);
              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                                 bitpos);

              epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
              new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_name);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);

              epilog_stmt = gimple_build_assign_with_ops (code,
                                                          new_scalar_dest,
                                                          new_name, new_temp);
              new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
              gimple_assign_set_lhs (epilog_stmt, new_temp);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
            }

          extract_scalar_result = false;
        }
    }

  /* 2.4  Extract the final scalar result.  Create:
         s_out3 = extract_field <v_out2, bitpos>  */

  if (extract_scalar_result)
    {
      tree rhs;

      gcc_assert (!nested_in_vect_loop || double_reduc);
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");

      if (BYTES_BIG_ENDIAN)
        bitpos = size_binop (MULT_EXPR,
                       bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
                       TYPE_SIZE (scalar_type));
      else
        bitpos = bitsize_zero_node;

      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }

vect_finalize_reduction:

  if (double_reduc)
    loop = loop->inner;

  /* 2.5 Adjust the final result by the initial value of the reduction
         variable. (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
         new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      if (nested_in_vect_loop)
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
          expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, vectype);
        }
      else
        {
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          expr = build2 (code, scalar_type, new_temp, adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, scalar_type);
        }

      epilog_stmt = gimple_build_assign (new_dest, expr);
      new_temp = make_ssa_name (new_dest, epilog_stmt);
      gimple_assign_set_lhs (epilog_stmt, new_temp);
      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
    }


  /* 2.6  Handle the loop-exit phi  */

  /* Replace uses of s_out0 with uses of s_out3:
     Find the loop-closed-use at the loop exit of the original scalar result.
     (The reduction result is expected to have two immediate uses - one at the
     latch block, and one at the loop exit).  */
  phis = VEC_alloc (gimple, heap, 10);
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
    {
      if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
        {
          exit_phi = USE_STMT (use_p);
          VEC_quick_push (gimple, phis, exit_phi);
        }
    }

  /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
  gcc_assert (!VEC_empty (gimple, phis));

  for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
    {
      if (nested_in_vect_loop)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
          gimple vect_phi;

          /* FORNOW. Currently not supporting the case that an inner-loop
             reduction is not used in the outer-loop (but only outside the
             outer-loop), unless it is double reduction.  */
          gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);

          epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
          STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
          set_vinfo_for_stmt (epilog_stmt,
                              new_stmt_vec_info (epilog_stmt, loop_vinfo,
                                                 NULL));
          if (adjustment_def)
            STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));

          if (!double_reduc
              || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
            continue;

          /* Handle double reduction:

             stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
             stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
             stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
             stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)

             At that point the regular reduction (stmt2 and stmt3) is already
             vectorized, as well as the exit phi node, stmt4.
             Here we vectorize the phi node of double reduction, stmt1, and
             update all relevant statements.  */

          /* Go through all the uses of s2 to find the double reduction phi
             node,
             i.e., stmt1 above.  */
          orig_name = PHI_RESULT (exit_phi);
          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
            {
              stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
              stmt_vec_info new_phi_vinfo;
              tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
              basic_block bb = gimple_bb (use_stmt);
              gimple use;

              /* Check that USE_STMT is really a double reduction phi
                 node.  */
              if (gimple_code (use_stmt) != GIMPLE_PHI
                  || gimple_phi_num_args (use_stmt) != 2
                  || !use_stmt_vinfo
                  || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
                      != vect_double_reduction_def
                  || bb->loop_father != outer_loop)
                continue;

              /* Create vector phi node for double reduction:
                 vs1 = phi <vs0, vs2>
                 vs1 was created previously in this function by a call to
                 vect_get_vec_def_for_operand and is stored in vec_initial_def;
                 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
                 vs0 is created here.  */

              /* Create vector phi node.  */
              vect_phi = create_phi_node (vec_initial_def, bb);
              new_phi_vinfo = new_stmt_vec_info (vect_phi,
                                    loop_vec_info_for_loop (outer_loop), NULL);
              set_vinfo_for_stmt (vect_phi, new_phi_vinfo);

              /* Create vs0 - initial def of the double reduction phi.  */
              preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
                                             loop_preheader_edge (outer_loop));
              init_def = get_initial_def_for_reduction (stmt, preheader_arg,
                                                        NULL);
              vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
                                                NULL);

              /* Update phi node arguments with vs0 and vs2.  */
              add_phi_arg (vect_phi, vect_phi_init,
                           loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
              add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
                           loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "created double reduction phi node: ");
                  print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
                }

              vect_phi_res = PHI_RESULT (vect_phi);

              /* Replace the use, i.e., set the correct vs1 in the regular
                 reduction phi node. FORNOW, NCOPIES is always 1, so the loop
                 is redundant.  */
              use = reduction_phi;
              for (j = 0; j < ncopies; j++)
                {
                  edge pr_edge = loop_preheader_edge (loop);
                  SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
                  use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
                }
            }
        }

      /* Replace the uses:  */
      orig_name = PHI_RESULT (exit_phi);
      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
        FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
          SET_USE (use_p, new_temp);
    }

  VEC_free (gimple, heap, phis);
}


/* Function vectorizable_reduction.

   Check if STMT performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog. In this case, STMT may be
   of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt (STMT).

   In some cases of reduction patterns, the type of the reduction variable X is
   different than the type of the other arguments of STMT.
   In such cases, the vectype that is used when transforming STMT into a vector
   stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived. This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt. The right vectype for the vectorized
   stmt is obtained from the type of the result X:
        get_vectype_for_scalar_type (TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */

bool
vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                        gimple *vec_stmt)
{
  tree vec_dest;
  tree scalar_dest;
  tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum tree_code code, orig_code, epilog_reduc_code;
  enum machine_mode vec_mode;
  int op_type;
  optab optab, reduc_optab;
  tree new_temp = NULL_TREE;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  gimple new_phi = NULL;
  tree scalar_type;
  bool is_simple_use;
  gimple orig_stmt;
  stmt_vec_info orig_stmt_info;
  tree expr = NULL_TREE;
  int i;
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  int epilog_copies;
  stmt_vec_info prev_stmt_info, prev_phi_info;
  gimple first_phi = NULL;
  bool single_defuse_cycle = false;
  tree reduc_def = NULL_TREE;
  gimple new_stmt = NULL;
  int j;
  tree ops[3];
  bool nested_cycle = false, found_nested_cycle_def = false;
  gimple reduc_def_stmt = NULL;
  /* The default is that the reduction variable is the last operand in
     the statement.  */
  int reduc_index = 2;
  bool double_reduc = false, dummy;
  basic_block def_bb;
  struct loop * def_stmt_loop, *outer_loop = NULL;
  tree def_arg;
  gimple def_arg_stmt;

  if (nested_in_vect_loop_p (loop, stmt))
    {
      outer_loop = loop;
      loop = loop->inner;
      nested_cycle = true;
    }

  gcc_assert (ncopies >= 1);

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
    return false;

  /* Reductions that are not used even in an enclosing outer-loop
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt)
    {
      orig_stmt_info = vinfo_for_stmt (orig_stmt);
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation. The first operands are defined
        inside the loop body. The last operand is the reduction variable,
        which is defined by the loop-header-phi.  */

  gcc_assert (is_gimple_assign (stmt));

  /* Flatten RHS */
  switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
    {
    case GIMPLE_SINGLE_RHS:
      op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
      if (op_type == ternary_op)
        {
          tree rhs = gimple_assign_rhs1 (stmt);
          ops[0] = TREE_OPERAND (rhs, 0);
          ops[1] = TREE_OPERAND (rhs, 1);
          ops[2] = TREE_OPERAND (rhs, 2);
          code = TREE_CODE (rhs);
        }
      else
        return false;
      break;

    case GIMPLE_BINARY_RHS:
      code = gimple_assign_rhs_code (stmt);
      op_type = TREE_CODE_LENGTH (code);
      gcc_assert (op_type == binary_op);
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;

    case GIMPLE_UNARY_RHS:
      return false;

    default:
      gcc_unreachable ();
    }

  scalar_dest = gimple_assign_lhs (stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
      && !SCALAR_FLOAT_TYPE_P (scalar_type))
    return false;

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable. In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  for (i = 0; i < op_type-1; i++)
    {
      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
      if (i == 0 && code == COND_EXPR)
        continue;

      is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
                                          &def, &dt);
      gcc_assert (is_simple_use);
      if (dt != vect_internal_def
          && dt != vect_external_def
          && dt != vect_constant_def
          && dt != vect_induction_def
          && !(dt == vect_nested_cycle && nested_cycle))
        return false;

      if (dt == vect_nested_cycle)
        {
          found_nested_cycle_def = true;
          reduc_def_stmt = def_stmt;
          reduc_index = i;
        }
    }

  is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
                                      &def, &dt);
  gcc_assert (is_simple_use);
  gcc_assert (dt == vect_reduction_def
              || dt == vect_nested_cycle
              || ((dt == vect_internal_def || dt == vect_external_def
                   || dt == vect_constant_def || dt == vect_induction_def)
                   && nested_cycle && found_nested_cycle_def));
  if (!found_nested_cycle_def)
    reduc_def_stmt = def_stmt;

  gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
  if (orig_stmt)
    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
                                                       reduc_def_stmt,
                                                       !nested_cycle,
                                                       &dummy));
  else
    gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
                                                  !nested_cycle, &dummy));

  if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
    return false;

  vec_mode = TYPE_MODE (vectype);

  if (code == COND_EXPR)
    {
      if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "unsupported condition in reduction");

          return false;
        }
    }
  else
    {
      /* 4. Supportable by target?  */

      /* 4.1. check support for the operation in the loop  */
      optab = optab_for_tree_code (code, vectype, optab_default);
      if (!optab)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no optab.");

          return false;
        }

      if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "op not supported by target.");

          if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
              || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  < vect_min_worthwhile_factor (code))
            return false;

          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "proceeding using word mode.");
        }

      /* Worthwhile without SIMD support?  */
      if (!VECTOR_MODE_P (TYPE_MODE (vectype))
          && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "not worthwhile without SIMD support.");

          return false;
        }
    }

  /* 4.2. Check support for the epilog operation.

          If STMT represents a reduction pattern, then the type of the
          reduction variable may be different than the type of the rest
          of the arguments.  For example, consider the case of accumulation
          of shorts into an int accumulator; The original code:
                        S1: int_a = (int) short_a;
          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;

          was replaced with:
                        STMT: int_acc = widen_sum <short_a, int_acc>

          This means that:
          1. The tree-code that is used to create the vector operation in the
             epilog code (that reduces the partial results) is not the
             tree-code of STMT, but is rather the tree-code of the original
             stmt from the pattern that STMT is replacing. I.e, in the example
             above we want to use 'widen_sum' in the loop, but 'plus' in the
             epilog.
          2. The type (mode) we use to check available target support
             for the vector operation to be created in the *epilog*, is
             determined by the type of the reduction variable (in the example
             above we'd check this: plus_optab[vect_int_mode]).
             However the type (mode) we use to check available target support
             for the vector operation to be created *inside the loop*, is
             determined by the type of the other arguments to STMT (in the
             example we'd check this: widen_sum_optab[vect_short_mode]).

          This is contrary to "regular" reductions, in which the types of all
          the arguments are the same as the type of the reduction variable.
          For "regular" reductions we can therefore use the same vector type
          (and also the same tree-code) when generating the epilog code and
          when generating the code inside the loop.  */

  if (orig_stmt)
    {
      /* This is a reduction pattern: get the vectype from the type of the
         reduction variable, and get the tree-code from orig_stmt.  */
      orig_code = gimple_assign_rhs_code (orig_stmt);
      vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
      if (!vectype)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "unsupported data-type ");
              print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
            }
          return false;
        }

      vec_mode = TYPE_MODE (vectype);
    }
  else
    {
      /* Regular reduction: the same vectype and tree-code that are used for
         the vector code inside the loop can also be used for the epilog
         code.  */
      orig_code = code;
    }

  if (nested_cycle)
    {
      def_bb = gimple_bb (reduc_def_stmt);
      def_stmt_loop = def_bb->loop_father;
      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
                                       loop_preheader_edge (def_stmt_loop));
      if (TREE_CODE (def_arg) == SSA_NAME
          && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
          && gimple_code (def_arg_stmt) == GIMPLE_PHI
          && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
          && vinfo_for_stmt (def_arg_stmt)
          && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
              == vect_double_reduction_def)
        double_reduc = true;
    }

  epilog_reduc_code = ERROR_MARK;
  if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
    {
      reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
                                         optab_default);
      if (!reduc_optab)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no optab for reduction.");

          epilog_reduc_code = ERROR_MARK;
        }

      if (reduc_optab
          && optab_handler (reduc_optab, vec_mode)->insn_code
              == CODE_FOR_nothing)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "reduc op not supported by target.");

          epilog_reduc_code = ERROR_MARK;
        }
    }
  else
    {
      if (!nested_cycle || double_reduc)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "no reduc code for scalar code.");

          return false;
        }
    }

  if (double_reduc && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in double reduction");

      return false;
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
        return false;
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform reduction.");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* Create the destination vector  */
  vec_dest = vect_create_destination_var (scalar_dest, vectype);

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
        r0 = phi (init, r0)
        r1 = phi (init, r1)
        r0 = x0 + r0;
        r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
    In this case we have a separate def-use cycle for each copy, and therefore
    for each copy we get the vector def for the reduction variable from the
    respective phi node created for this copy.

    Otherwise (the reduction is unused in the loop nest), we can combine
    together intermediate results, like so (e.g. for ncopies=2):
        r = phi (init, r)
        r = x0 + r;
        r = x1 + r;
   (i.e. we generate VF/2 results in a single register).
   In this case for each copy we get the vector def for the reduction variable
   from the vectorized reduction operation generated in the previous iteration.
  */

  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
    {
      single_defuse_cycle = true;
      epilog_copies = 1;
    }
  else
    epilog_copies = ncopies;

  prev_stmt_info = NULL;
  prev_phi_info = NULL;
  for (j = 0; j < ncopies; j++)
    {
      if (j == 0 || !single_defuse_cycle)
        {
          /* Create the reduction-phi that defines the reduction-operand.  */
          new_phi = create_phi_node (vec_dest, loop->header);
          set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
                                                          NULL));
          /* Get the vector def for the reduction variable from the phi
             node.  */
          reduc_def = PHI_RESULT (new_phi);
        }

      if (code == COND_EXPR)
        {
          first_phi = new_phi;
          vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
          /* Multiple types are not supported for condition.  */
          break;
        }

      /* Handle uses.  */
      if (j == 0)
        {
          loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
                                                        stmt, NULL);
          if (op_type == ternary_op)
            {
              if (reduc_index == 0)
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
                                                              NULL);
              else
                loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
                                                              NULL);
            }

          /* Get the vector def for the reduction variable from the phi
             node.  */
          first_phi = new_phi;
        }
      else
        {
          enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
          loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
          if (op_type == ternary_op)
            loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);

          if (single_defuse_cycle)
            reduc_def = gimple_assign_lhs (new_stmt);
          else
            reduc_def = PHI_RESULT (new_phi);

          STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
        }

      /* Arguments are ready. Create the new vector stmt.  */
      if (op_type == binary_op)
        {
          if (reduc_index == 0)
            expr = build2 (code, vectype, reduc_def, loop_vec_def0);
          else
            expr = build2 (code, vectype, loop_vec_def0, reduc_def);
        }
      else
        {
          if (reduc_index == 0)
            expr = build3 (code, vectype, reduc_def, loop_vec_def0,
                           loop_vec_def1);
          else
            {
              if (reduc_index == 1)
                expr = build3 (code, vectype, loop_vec_def0, reduc_def,
                               loop_vec_def1);
              else
                expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
                               reduc_def);
            }
        }

      new_stmt = gimple_build_assign (vec_dest, expr);
      new_temp = make_ssa_name (vec_dest, new_stmt);
      gimple_assign_set_lhs (new_stmt, new_temp);
      vect_finish_stmt_generation (stmt, new_stmt, gsi);

      if (j == 0)
        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
      else
        STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;

      prev_stmt_info = vinfo_for_stmt (new_stmt);
      prev_phi_info = vinfo_for_stmt (new_phi);
    }

  /* Finalize the reduction-phi (set its arguments) and create the
     epilog reduction code.  */
  if (!single_defuse_cycle || code == COND_EXPR)
    new_temp = gimple_assign_lhs (*vec_stmt);

  vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
                                    epilog_reduc_code, first_phi, reduc_index,
                                    double_reduc);
  return true;
}

/* Function vect_min_worthwhile_factor.

   For a loop where we could vectorize the operation indicated by CODE,
   return the minimum vectorization factor that makes it worthwhile
   to use generic vectors.  */
int
vect_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}
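
/* An illustrative reading of the table above (a heuristic, not an exact
   cost model): emulating PLUS/MINUS/NEGATE on generic word-mode vectors
   carries enough fixup overhead that at least 4 scalar operations must be
   folded into each emulated one before it pays off, whereas the bitwise
   operations map exactly onto full-word operations and already pay off at
   a vectorization factor of 2.  Everything else is assumed never to be
   worthwhile without real SIMD support (INT_MAX).  */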


/* Function vectorizable_induction

   Check if PHI performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */

bool
vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                        gimple *vec_stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (phi);
  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
  int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
  tree vec_def;

  gcc_assert (ncopies >= 1);
  /* FORNOW. This restriction should be relaxed.  */
  if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "multiple types in nested loop.");
      return false;
    }

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* FORNOW: SLP not supported.  */
  if (STMT_SLP_TYPE (stmt_info))
    return false;

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);

  if (gimple_code (phi) != GIMPLE_PHI)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "=== vectorizable_induction ===");
      vect_model_induction_cost (stmt_info, ncopies);
      return true;
    }

  /** Transform.  **/

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "transform induction phi.");

  vec_def = get_initial_def_for_induction (phi);
  *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
  return true;
}

/* Function vectorizable_live_operation.

   STMT computes a value that is used outside the loop. Check if
   it can be supported.  */
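
/* A hypothetical example of what is currently accepted: in

       for (i = 0; i < n; i++)
         { k_1 = c1 + c2;  a[i] = i; }
       ... = k_1;   <-- use after the loop

   with c1 and c2 loop-invariant, k_1 is "live" but all its operands are
   defined outside the loop, so the scalar statement can simply stay in
   place, unvectorized, and the value it computes in the last iteration
   is the one used after the loop.  */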

bool
vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                             gimple *vec_stmt ATTRIBUTE_UNUSED)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int op_type;
  tree op;
  tree def;
  gimple def_stmt;
  enum vect_def_type dt;
  enum tree_code code;
  enum gimple_rhs_class rhs_class;

  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
    return false;

  if (!is_gimple_assign (stmt))
    return false;

  if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
    return false;

  /* FORNOW. CHECKME. */
  if (nested_in_vect_loop_p (loop, stmt))
    return false;

  code = gimple_assign_rhs_code (stmt);
  op_type = TREE_CODE_LENGTH (code);
  rhs_class = get_gimple_rhs_class (code);
  gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
  gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);

  /* FORNOW: support only if all uses are invariant. This means
     that the scalar operations can remain in place, unvectorized.
     The original last scalar value that they compute will be used.  */

  for (i = 0; i < op_type; i++)
    {
      if (rhs_class == GIMPLE_SINGLE_RHS)
        op = TREE_OPERAND (gimple_op (stmt, 1), i);
      else
        op = gimple_op (stmt, i + 1);
      if (op
          && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "use not simple.");
          return false;
        }

      if (dt != vect_external_def && dt != vect_constant_def)
        return false;
    }

  /* No transformation is required for the cases we currently support.  */
  return true;
}

/* Kill any debug uses outside LOOP of SSA names defined in STMT.  */

static void
vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
        {
          basic_block bb;

          if (!is_gimple_debug (ustmt))
            continue;

          bb = gimple_bb (ustmt);

          if (!flow_bb_inside_loop_p (loop, bb))
            {
              if (gimple_debug_bind_p (ustmt))
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "killing debug use");

                  gimple_debug_bind_reset_value (ustmt);
                  update_stmt (ustmt);
                }
              else
                gcc_unreachable ();
            }
        }
    }
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.  */

void
vect_transform_loop (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  int i;
  tree ratio = NULL;
  int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool strided_store;
  bool slp_scheduled = false;
  unsigned int nunits;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  bool do_peeling_for_loop_bound;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vec_transform_loop ===");

  /* Peel the loop if there are data refs with unknown alignment.
     Only one store data-ref with unknown alignment is allowed.  */

  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    vect_do_peeling_for_alignment (loop_vinfo);

  do_peeling_for_loop_bound
    = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));

  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    vect_loop_versioning (loop_vinfo,
                          !do_peeling_for_loop_bound,
                          &cond_expr, &cond_expr_stmt_list);

  /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
     compile time constant), or it is a constant that doesn't divide by the
     vectorization factor, then an epilog loop needs to be created.
     We therefore duplicate the loop: the original loop will be vectorized,
     and will compute the first (n/VF) iterations. The second copy of the loop
     will remain scalar and will compute the remaining (n%VF) iterations.
     (VF is the vectorization factor).  */
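
  /* A hypothetical example: with VF == 4 and n == 103, the vectorized loop
     executes 103/4 == 25 iterations covering 100 elements, and the scalar
     epilog loop executes the remaining 103%4 == 3 iterations.  When n is
     symbolic the same structure is generated, with the ratio n/VF computed
     at runtime.  */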

  if (do_peeling_for_loop_bound)
    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
                                    cond_expr, cond_expr_stmt_list);
  else
    ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch). When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed will need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
4225
    {
4226
      basic_block bb = bbs[i];
4227
      stmt_vec_info stmt_info;
4228
      gimple phi;
4229
 
4230
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4231
        {
4232
          phi = gsi_stmt (si);
4233
          if (vect_print_dump_info (REPORT_DETAILS))
4234
            {
4235
              fprintf (vect_dump, "------>vectorizing phi: ");
4236
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4237
            }
4238
          stmt_info = vinfo_for_stmt (phi);
4239
          if (!stmt_info)
4240
            continue;
4241
 
4242
          if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
4243
            vect_loop_kill_debug_uses (loop, phi);
4244
 
4245
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
4246
              && !STMT_VINFO_LIVE_P (stmt_info))
4247
            continue;
4248
 
4249
          if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4250
                != (unsigned HOST_WIDE_INT) vectorization_factor)
4251
              && vect_print_dump_info (REPORT_DETAILS))
4252
            fprintf (vect_dump, "multiple-types.");
4253
 
4254
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4255
            {
4256
              if (vect_print_dump_info (REPORT_DETAILS))
4257
                fprintf (vect_dump, "transform phi.");
4258
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4259
            }
4260
        }
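
      /* Note that only induction PHIs are transformed above; reduction
         PHIs are vectorized later, when the reduction stmt that uses
         them is transformed.  */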

      for (si = gsi_start_bb (bb); !gsi_end_p (si);)
        {
          gimple stmt = gsi_stmt (si);
          bool is_store;

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "------>vectorizing statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          stmt_info = vinfo_for_stmt (stmt);

          /* Vector stmts created in the outer-loop during vectorization of
             stmts in an inner-loop may not have a stmt_info, and do not
             need to be vectorized.  */
          if (!stmt_info)
            {
              gsi_next (&si);
              continue;
            }

          if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
            vect_loop_kill_debug_uses (loop, stmt);

          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              gsi_next (&si);
              continue;
            }

          gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
          nunits =
            (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
          if (!STMT_SLP_TYPE (stmt_info)
              && nunits != (unsigned int) vectorization_factor
              && vect_print_dump_info (REPORT_DETAILS))
            /* For SLP, VF is set according to the unrolling factor, not the
               vector size, hence for SLP this print is not valid.  */
            fprintf (vect_dump, "multiple-types.");

          /* SLP.  Schedule all the SLP instances when the first SLP stmt is
             reached.  */
          if (STMT_SLP_TYPE (stmt_info))
            {
              if (!slp_scheduled)
                {
                  slp_scheduled = true;

                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "=== scheduling SLP instances ===");

                  vect_schedule_slp (loop_vinfo, NULL);
                }

              /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
              if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
                {
                  gsi_next (&si);
                  continue;
                }
            }
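
          /* A pure SLP stmt was fully handled by vect_schedule_slp above
             and is skipped here; a hybrid SLP stmt also feeds a non-SLP
             (loop-based) computation, so it falls through and is
             vectorized again below.  */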

          /* -------- vectorize statement ------------ */
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "transform statement.");

          strided_store = false;
          is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
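          /* Scalar stores are removed explicitly below: other scalar stmts
             simply become dead after vectorization and are cleaned up by
             later DCE, but a store has a side effect and would otherwise
             execute alongside its vector replacement.  */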
          if (is_store)
            {
              if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
                {
                  /* Interleaving.  If IS_STORE is TRUE, the vectorization of
                     the interleaving chain has been completed - free all the
                     stores in the chain.  */
                  vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
                  gsi_remove (&si, true);
                  continue;
                }
              else
                {
                  /* Free the attached stmt_vec_info and remove the stmt.  */
                  free_stmt_vec_info (stmt);
                  gsi_remove (&si, true);
                  continue;
                }
            }
          gsi_next (&si);
        }                       /* stmts in BB */
    }                           /* BBs in loop */
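  /* Set the loop bound of the vectorized loop: RATIO is n/VF when the
     iteration count is known at compile time, and was computed by
     vect_do_peeling_for_loop_bound above otherwise.  */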
  slpeel_make_loop_iterate_ntimes (loop, ratio);

  /* The memory tags and pointers in vectorized statements need to
     have their SSA forms updated.  FIXME, why can't this be delayed
     until all the loops have been transformed?  */
  update_ssa (TODO_update_ssa);

  if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "LOOP VECTORIZED.");
  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
}
