/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

        modulus = sqrt(x*x + y*y + z*z);
        x = x / modulus;
        y = y / modulus;
        z = z / modulus;

   that can be optimized to

        modulus = sqrt(x*x + y*y + z*z);
        rmodulus = 1.0 / modulus;
        x = x * rmodulus;
        y = y * rmodulus;
        z = z * rmodulus;

   We do this for loop invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments showed that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this a param, and it shall be called N in the remainder of this
   comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, we can instead insert
   divisions elsewhere, in blocks that are common dominators of those
   that have the division).

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has fewer than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */
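
/* Editor's illustration (a sketch, not part of the original sources;
   SSA names are hypothetical): assume N = 2 and a straight-line
   dominator chain bb1 -> bb2 -> bb3, where bb2 and bb3 each contain a
   division by the same SSA name d_1.  Since bb3 post-dominates bb2, the
   post-order walk annotates bb2 with num_divisions = 2, and the
   pre-order walk inserts

       reciptmp_4 = 1.0 / d_1;

   in bb2 (which already contains a division, so the trapping math
   constraint is satisfied); both divisions then become multiplications
   by reciptmp_4.  By contrast, in a diamond whose two arms each divide
   by d_1, neither arm post-dominates the common dominator, every block
   keeps num_divisions = 1, and with N = 2 nothing is inserted: the
   reciprocal would not be guaranteed to pay off on every path.  */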

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "flags.h"
#include "tree.h"
#include "tree-flow.h"
#include "timevar.h"
#include "tree-pass.h"
#include "alloc-pool.h"
#include "basic-block.h"
#include "target.h"
#include "gimple-pretty-print.h"

/* FIXME: RTL headers have to be included here for optabs.  */
#include "rtl.h"                /* Because optabs.h wants enum rtx_code.  */
#include "expr.h"               /* Because optabs.h wants sepops.  */
#include "optabs.h"

/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence" in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};
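
/* Editor's sketch of the data structure (hypothetical block numbers):
   OCC_HEAD points to a forest mirroring a subset of the dominator tree,
   with NEXT linking siblings that share a common dominator and CHILDREN
   descending one level:

       occ_head --> bb2 --next--> bb8
                     |
                 children
                     |
                     v
                    bb3 --next--> bb5

   Here bb2 dominates bb3 and bb5, while bb8 is related to them only
   through a dominator above all the blocks recorded here.  */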

static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of hand-written 32-bit bswaps found.  */
  int found_32bit;

  /* Number of hand-written 64-bit bswaps found.  */
  int found_64bit;
} bswap_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static alloc_pool occ_pool;



/* Allocate and return a new struct occurrence for basic block BB, whose
   children list is headed by CHILDREN.  */
static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}


/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
           struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
        {
          /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
             from its list.  */
          *p_occ = occ->next;
          occ->next = new_occ->children;
          new_occ->children = occ;

          /* Try the next block (it may as well be dominated by BB).  */
        }

      else if (dom == occ_bb)
        {
          /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
          insert_bb (new_occ, dom, &occ->children);
          return;
        }

      else if (dom != idom)
        {
          gcc_assert (!dom->aux);

          /* There is a dominator between IDOM and BB, add it and make
             two children out of NEW_OCC and OCC.  First, remove OCC from
             its list.  */
          *p_occ = occ->next;
          new_occ->next = occ;
          occ->next = NULL;

          /* None of the previous blocks has DOM as a dominator: if we tail
             recursed, we would reexamine them uselessly.  Just switch BB with
             DOM, and go on looking for blocks dominated by DOM.  */
          new_occ = occ_new (dom, new_occ);
        }

      else
        {
          /* Nothing special, go on with the next element.  */
          p_occ = &occ->next;
        }
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}

/* Register that we found a division in BB.  */

static inline void
register_division_in (basic_block bb)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions++;
}


/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
        compute_merit (occ_child);

      if (flag_exceptions)
        bb = single_noncomplex_succ (dom);
      else
        bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
        occ->num_divisions += occ_child->num_divisions;
    }
}


/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
         && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
         && gimple_assign_rhs2 (use_stmt) == def
         /* Do not recognize x / x as valid division, as we are getting
            confused later by replacing all immediate uses x in such
            a stmt.  */
         && gimple_assign_rhs1 (use_stmt) != def;
}

/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_BSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
                    tree def, tree recip_def, int threshold)
{
  tree type;
  gimple new_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      && occ->num_divisions >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = make_rename_temp (type, "reciptmp");
      new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
                                               build_one_cst (type), def);

      if (occ->bb_has_division)
        {
          /* Case 1: insert before an existing division.  */
          gsi = gsi_after_labels (occ->bb);
          while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
            gsi_next (&gsi);

          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
        }
      else if (def_gsi && occ->bb == def_gsi->bb)
        {
          /* Case 2: insert right after the definition.  Note that this will
             never happen if the definition statement can throw, because in
             that case the sole successor of the statement's basic block will
             dominate all the uses as well.  */
          gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
        }
      else
        {
          /* Case 3: insert in a basic block not containing defs/uses.  */
          gsi = gsi_after_labels (occ->bb);
          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
        }

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
}


/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  occ->bb->aux = NULL;
  pool_free (occ_pool, occ);

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
        next = free_bb (next);

      return child;
    }
}


/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */

static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p;
  imm_use_iterator use_iter;
  struct occurrence *occ;
  int count = 0, threshold;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
        {
          register_division_in (gimple_bb (use_stmt));
          count++;
        }
    }

  /* Do the expensive part only if we can hope to optimize something.  */
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
  if (count >= threshold)
    {
      gimple use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
        {
          compute_merit (occ);
          insert_reciprocals (def_gsi, occ, def, NULL, threshold);
        }

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
        {
          if (is_division_by (use_stmt, def))
            {
              FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
                replace_reciprocal (use_p);
            }
        }
    }

  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}
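
/* Editor's example of the net effect (hypothetical GIMPLE, assuming the
   target's min_divisions_for_recip_mul threshold is 2):

       x_3 = a_1 / d_2;                 reciptmp_6 = 1.0 / d_2;
       y_5 = b_4 / d_2;        ==>      x_3 = a_1 * reciptmp_6;
                                        y_5 = b_4 * reciptmp_6;

   One division remains, but each further division by d_2 becomes a
   typically much cheaper multiplication.  */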

static bool
gate_cse_reciprocals (void)
{
  return optimize && flag_reciprocal_math;
}

/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
static unsigned int
execute_cse_reciprocals (void)
{
  basic_block bb;
  tree arg;

  occ_pool = create_alloc_pool ("dominators for recip",
                                sizeof (struct occurrence),
                                n_basic_blocks / 3 + 1);

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

#ifdef ENABLE_CHECKING
  FOR_EACH_BB (bb)
    gcc_assert (!bb->aux);
#endif

  for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
    if (gimple_default_def (cfun, arg)
        && FLOAT_TYPE_P (TREE_TYPE (arg))
        && is_gimple_reg (arg))
      execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;
      gimple phi;
      tree def;

      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          phi = gsi_stmt (gsi);
          def = PHI_RESULT (phi);
          if (FLOAT_TYPE_P (TREE_TYPE (def))
              && is_gimple_reg (def))
            execute_cse_reciprocals_1 (NULL, def);
        }

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);

          if (gimple_has_lhs (stmt)
              && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
              && FLOAT_TYPE_P (TREE_TYPE (def))
              && TREE_CODE (def) == SSA_NAME)
            execute_cse_reciprocals_1 (&gsi, def);
        }

      if (optimize_bb_for_size_p (bb))
        continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree fndecl;

          if (is_gimple_assign (stmt)
              && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
            {
              tree arg1 = gimple_assign_rhs2 (stmt);
              gimple stmt1;

              if (TREE_CODE (arg1) != SSA_NAME)
                continue;

              stmt1 = SSA_NAME_DEF_STMT (arg1);

              if (is_gimple_call (stmt1)
                  && gimple_call_lhs (stmt1)
                  && (fndecl = gimple_call_fndecl (stmt1))
                  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
                      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
                {
                  enum built_in_function code;
                  bool md_code, fail;
                  imm_use_iterator ui;
                  use_operand_p use_p;

                  code = DECL_FUNCTION_CODE (fndecl);
                  md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;

                  fndecl = targetm.builtin_reciprocal (code, md_code, false);
                  if (!fndecl)
                    continue;

                  /* Check that all uses of the SSA name are divisions,
                     otherwise replacing the defining statement will do
                     the wrong thing.  */
                  fail = false;
                  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
                    {
                      gimple stmt2 = USE_STMT (use_p);
                      if (is_gimple_debug (stmt2))
                        continue;
                      if (!is_gimple_assign (stmt2)
                          || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
                          || gimple_assign_rhs1 (stmt2) == arg1
                          || gimple_assign_rhs2 (stmt2) != arg1)
                        {
                          fail = true;
                          break;
                        }
                    }
                  if (fail)
                    continue;

                  gimple_replace_lhs (stmt1, arg1);
                  gimple_call_set_fndecl (stmt1, fndecl);
                  update_stmt (stmt1);
                  reciprocal_stats.rfuncs_inserted++;

                  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
                    {
                      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
                      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
                      fold_stmt_inplace (&gsi);
                      update_stmt (stmt);
                    }
                }
            }
        }
    }

  statistics_counter_event (cfun, "reciprocal divs inserted",
                            reciprocal_stats.rdivs_inserted);
  statistics_counter_event (cfun, "reciprocal functions inserted",
                            reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  free_alloc_pool (occ_pool);
  return 0;
}

struct gimple_opt_pass pass_cse_reciprocals =
{
 {
  GIMPLE_PASS,
  "recip",                              /* name */
  gate_cse_reciprocals,                 /* gate */
  execute_cse_reciprocals,              /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_NONE,                              /* tv_id */
  PROP_ssa,                             /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_update_ssa | TODO_verify_ssa
    | TODO_verify_stmts                 /* todo_flags_finish */
 }
};

/* Records an occurrence at statement USE_STMT in the vector of statements
   STMTS if it is dominated by *TOP_BB or dominates it or this basic block
   is not yet initialized.  Returns true if the occurrence was pushed on
   the vector.  Adjusts *TOP_BB to be the basic block dominating all
   statements in the vector.  */

static bool
maybe_record_sincos (VEC(gimple, heap) **stmts,
                     basic_block *top_bb, gimple use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
          || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    VEC_safe_push (gimple, heap, *stmts, use_stmt);
  else if (!*top_bb
           || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      VEC_safe_push (gimple, heap, *stmts, use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}

/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument collecting
   statements that we can CSE in a vector and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */

static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type;
  gimple def_stmt, use_stmt, stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  VEC(gimple, heap) *stmts = NULL;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  type = TREE_TYPE (name);
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
          || !gimple_call_lhs (use_stmt)
          || !(fndecl = gimple_call_fndecl (use_stmt))
          || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
        continue;

      switch (DECL_FUNCTION_CODE (fndecl))
        {
        CASE_FLT_FN (BUILT_IN_COS):
          seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
          break;

        CASE_FLT_FN (BUILT_IN_SIN):
          seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
          break;

        CASE_FLT_FN (BUILT_IN_CEXPI):
          seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
          break;

        default:;
        }
    }

  if (seen_cos + seen_sin + seen_cexpi <= 1)
    {
      VEC_free(gimple, heap, stmts);
      return false;
    }

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  res = create_tmp_reg (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_ssa_name (res, stmt);
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  update_stmt (stmt);
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
    {
      tree rhs = NULL;
      fndecl = gimple_call_fndecl (use_stmt);

      switch (DECL_FUNCTION_CODE (fndecl))
        {
        CASE_FLT_FN (BUILT_IN_COS):
          rhs = fold_build1 (REALPART_EXPR, type, res);
          break;

        CASE_FLT_FN (BUILT_IN_SIN):
          rhs = fold_build1 (IMAGPART_EXPR, type, res);
          break;

        CASE_FLT_FN (BUILT_IN_CEXPI):
          rhs = res;
          break;

        default:
          gcc_unreachable ();
        }

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
        cfg_changed = true;
    }

  VEC_free(gimple, heap, stmts);

  return cfg_changed;
}
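
/* Editor's example of the transformation (hypothetical GIMPLE): when
   both sin and cos of the same SSA name are recorded,

       s_2 = __builtin_sin (x_1);         sincostmp_4 = __builtin_cexpi (x_1);
       c_3 = __builtin_cos (x_1);   ==>   s_2 = IMAGPART_EXPR <sincostmp_4>;
                                          c_3 = REALPART_EXPR <sincostmp_4>;

   so two libm calls collapse into a single cexpi (or sincos) call.  */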

/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits, of the window used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };
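
/* Editor's worked example of reading the table: powi_table[15] == 9, so
   x**15 is evaluated as x**9 * x**6; recursing, x**9 = x**6 * x**3,
   x**6 = x**3 * x**3 and x**3 = x**2 * x.  Since every power is cached
   after its first computation, this costs 5 multiplications (x**2,
   x**3, x**6, x**9, x**15), one fewer than binary square-and-multiply
   needs for n = 15.  */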


/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
         + powi_lookup_cost (powi_table[n], cache) + 1;
}

/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
        {
          digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
          result += powi_lookup_cost (digit, cache)
                    + POWI_WINDOW_SIZE + 1;
          val >>= POWI_WINDOW_SIZE;
        }
      else
        {
          val >>= 1;
          result++;
        }
    }

  return result + powi_lookup_cost (val, cache);
}

/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
                 HOST_WIDE_INT n, tree *cache, tree target)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gimple mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_ssa_name (target, NULL);

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache, target);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache, target);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache, target);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache, target);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache, target);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
               tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0), target;
  gimple div_stmt;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  target = create_tmp_reg (type, "powmult");
  add_referenced_var (target);

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache, target);

  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_ssa_name (target, NULL);
  div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
                                           build_real (type, dconst1),
                                           result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}
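
/* Editor's example of the statements powi_as_mults emits for n = 5
   (hypothetical SSA names).  powi_table[5] == 3, so x**5 is built as
   x**3 * x**2:

       powmult_1 = x_0 * x_0;                 x**2
       powmult_2 = x_0 * powmult_1;           x**3
       powmult_3 = powmult_1 * powmult_2;     x**5

   Three multiplications in total, matching powi_cost (5).  */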

/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
                            tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
          || (optimize_function_for_speed_p (cfun)
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}

/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name for
   variable VAR.  If VAR is NULL, first allocate it.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
                       tree *var, tree fn, tree arg)
{
  gimple call_stmt;
  tree ssa_target;

  if (!*var)
    {
      *var = create_tmp_reg (TREE_TYPE (arg), "powroot");
      add_referenced_var (*var);
    }

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_ssa_name (*var, NULL);
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name for variable
   TARGET.  Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
                        tree target, enum tree_code code, tree arg0, tree arg1)
{
  tree result = make_ssa_name (target, NULL);
  gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name for variable TARGET.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
                      tree target, enum tree_code code, tree arg0)
{
  tree result = make_ssa_name (target, NULL);
  gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TARGET.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
                       tree target, tree val)
{
  return build_and_insert_binop (gsi, loc, target, CONVERT_EXPR, val, NULL);
}

/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.  If possible, create an equivalent and
   less expensive sequence of statements prior to GSI, and return an
   expression holding the result.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
                           tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
  tree target = NULL_TREE;
  enum machine_mode mode;
  bool hw_sqrt_exists;

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);

  if (real_identical (&c, &cint)
      && ((n >= -1 && n <= 2)
          || (flag_unsafe_math_optimizations
              && optimize_insn_for_speed_p ()
              && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && REAL_VALUES_EQUAL (c, dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);

  /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
     a builtin sqrt instruction is smaller than a call to pow with 0.25,
     so do this optimization even if -Os.  Don't do this optimization
     if we don't have a hardware sqrt insn.  */
  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && REAL_VALUES_EQUAL (c, dconst1_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
     optimizing for space.  Don't do this optimization if we don't have
     a hardware sqrt insn.  */
  real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
  SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && optimize_function_for_speed_p (cfun)
      && REAL_VALUES_EQUAL (c, dconst3_4)
      && hw_sqrt_exists)
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);

      /* sqrt(sqrt(x))  */
      sqrt_sqrt = build_and_insert_call (gsi, loc, &target, sqrtfn, sqrt_arg0);

      /* sqrt(x) * sqrt(sqrt(x))  */
      return build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                     sqrt_arg0, sqrt_sqrt);
    }

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && REAL_VALUES_EQUAL (c, dconst1_3))
    return build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && optimize_function_for_speed_p (cfun)
      && hw_sqrt_exists
      && REAL_VALUES_EQUAL (c, dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, &target, cbrtfn, sqrt_arg0);
    }

  /* Optimize pow(x,c), where n = 2c for some nonzero integer n, into

       sqrt(x) * powi(x, n/2),                n > 0;
       1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.

     Do not calculate the powi factor when n/2 = 0.  */
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && real_identical (&c2, &cint))
    {
      tree powi_x_ndiv2 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         n is 1 or -1, where the result is always 1.  */
      if (absu_hwi (n) != 1)
        {
          powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 2));
          if (!powi_x_ndiv2)
            return NULL_TREE;
        }

      /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
         result of the optimal multiply sequence just calculated.  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, &target, sqrtfn, arg0);

      if (absu_hwi (n) == 1)
        result = sqrt_arg0;
      else
        result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                         sqrt_arg0, powi_x_ndiv2);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
                                         build_real (type, dconst1), result);
      return result;
    }

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.  */
  real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
      && real_identical (&c2, &c)
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
         possible or profitable, give up.  Skip the degenerate case when
         abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
        {
          powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
                                                     abs_hwi (n / 3));
          if (!powi_x_ndiv3)
            return NULL_TREE;
        }

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
         as that creates an unnecessary variable.  Instead, just produce
         either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, &target, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
        powi_cbrt_x = cbrt_x;
      else
        powi_cbrt_x = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                              cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
        result = powi_cbrt_x;
      else
        result = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                         powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
        result = build_and_insert_binop (gsi, loc, target, RDIV_EXPR,
                                         build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}
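
/* Editor's worked example for the half-integer case above: for
   pow (x, 3.5) we get n = 2c = 7, so the expansion is
   sqrt(x) * powi(x, 3), i.e. (hypothetical SSA names)

       powmult_1 = x_0 * x_0;
       powmult_2 = x_0 * powmult_1;
       powroot_3 = __builtin_sqrt (x_0);
       result_4 = powroot_3 * powmult_2;

   trading the libm call for a sqrt instruction and three multiplies,
   valid only under -funsafe-math-optimizations.  */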

/* ARG is the argument to a cabs builtin call in GSI with location info
   LOC.  Create a sequence of statements prior to GSI that calculates
   sqrt(R*R + I*I), where R and I are the real and imaginary components
   of ARG, respectively.  Return an expression holding the result.  */

static tree
gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
{
  tree target, real_part, imag_part, addend1, addend2, sum, result;
  tree type = TREE_TYPE (TREE_TYPE (arg));
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  enum machine_mode mode = TYPE_MODE (type);

  if (!flag_unsafe_math_optimizations
      || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
      || !sqrtfn
      || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
    return NULL_TREE;

  target = create_tmp_reg (type, "cabs");
  add_referenced_var (target);

  real_part = build_and_insert_ref (gsi, loc, type, target,
                                    REALPART_EXPR, arg);
  addend1 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                    real_part, real_part);
  imag_part = build_and_insert_ref (gsi, loc, type, target,
                                    IMAGPART_EXPR, arg);
  addend2 = build_and_insert_binop (gsi, loc, target, MULT_EXPR,
                                    imag_part, imag_part);
  sum = build_and_insert_binop (gsi, loc, target, PLUS_EXPR, addend1, addend2);
  result = build_and_insert_call (gsi, loc, &target, sqrtfn, sum);

  return result;
}
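
/* Editor's note: e.g. cabs (3.0 + 4.0i) expands to
   sqrt (3.0*3.0 + 4.0*4.0) = 5.0.  The naive formula can overflow on
   large components where a careful library cabs would not, which is one
   reason the expansion above is gated on -funsafe-math-optimizations.  */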
1373
 
1374
/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1375
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1376
   an optimal number of multiplies, when n is a constant.  */
1377
 
1378
static unsigned int
1379
execute_cse_sincos (void)
1380
{
1381
  basic_block bb;
1382
  bool cfg_changed = false;

  calculate_dominance_info (CDI_DOMINATORS);
  memset (&sincos_stats, 0, sizeof (sincos_stats));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree fndecl;

          if (is_gimple_call (stmt)
              && gimple_call_lhs (stmt)
              && (fndecl = gimple_call_fndecl (stmt))
              && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
            {
              tree arg, arg0, arg1, result;
              HOST_WIDE_INT n;
              location_t loc;

              switch (DECL_FUNCTION_CODE (fndecl))
                {
                CASE_FLT_FN (BUILT_IN_COS):
                CASE_FLT_FN (BUILT_IN_SIN):
                CASE_FLT_FN (BUILT_IN_CEXPI):
                  /* Make sure we have either sincos or cexp.  */
                  if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
                    break;

                  arg = gimple_call_arg (stmt, 0);
                  if (TREE_CODE (arg) == SSA_NAME)
                    cfg_changed |= execute_cse_sincos_1 (arg);
                  break;

                CASE_FLT_FN (BUILT_IN_POW):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);

                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_POWI):
                  arg0 = gimple_call_arg (stmt, 0);
                  arg1 = gimple_call_arg (stmt, 1);
                  if (!host_integerp (arg1, 0))
                    break;

                  n = TREE_INT_CST_LOW (arg1);
                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                    }
                  break;

                CASE_FLT_FN (BUILT_IN_CABS):
                  arg0 = gimple_call_arg (stmt, 0);
                  loc = gimple_location (stmt);
                  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);

                  if (result)
                    {
                      tree lhs = gimple_get_lhs (stmt);
                      gimple new_stmt = gimple_build_assign (lhs, result);
                      gimple_set_location (new_stmt, loc);
                      unlink_stmt_vdef (stmt);
                      gsi_replace (&gsi, new_stmt, true);
                    }
                  break;

                default:;
                }
            }
        }
    }

  statistics_counter_event (cfun, "sincos statements inserted",
                            sincos_stats.inserted);

  free_dominance_info (CDI_DOMINATORS);
  return cfg_changed ? TODO_cleanup_cfg : 0;
}
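
/* For illustration only (not part of the original sources): on a
   target with sincos or C99 cexp, the CSE above turns

        s = sin (x);                 t = __builtin_cexpi (x);
        c = cos (x);        into     s = IMAGPART_EXPR <t>;
                                     c = REALPART_EXPR <t>;

   and the powi expansion invoked above rewrites a call such as
   __builtin_powi (x, 5) into a logarithmic chain of multiplies,
   roughly t1 = x * x; t2 = t1 * t1; r = t2 * x.  The temporary
   names here are hypothetical; the shape is what the expansion
   emits.  */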

static bool
gate_cse_sincos (void)
{
  /* We no longer require either sincos or cexp, since powi expansion
     piggybacks on this pass.  */
  return optimize;
}

struct gimple_opt_pass pass_cse_sincos =
{
 {
  GIMPLE_PASS,
  "sincos",                             /* name */
  gate_cse_sincos,                      /* gate */
  execute_cse_sincos,                   /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_NONE,                              /* tv_id */
  PROP_ssa,                             /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_update_ssa | TODO_verify_ssa
    | TODO_verify_stmts                 /* todo_flags_finish */
 }
};

/* A symbolic number is used to detect byte permutation and selection
   patterns.  Therefore the field N contains an artificial number
   consisting of byte size markers:

   0       - byte has the value 0
   1..size - byte contains the content of the byte
   number indexed with that value minus one.  */

struct symbolic_number {
  unsigned HOST_WIDEST_INT n;
  int size;
};
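
/* For illustration only (not part of the original sources): for a
   4-byte value the symbolic number starts life as 0x04030201, i.e.
   the lowest order byte carries marker 1 and the highest order byte
   carries marker 4.  A full 32-bit byte swap must turn it into
   0x01020304, which is exactly the value find_bswap compares
   against further down.  */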

/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
   number N.  Return false if the requested operation is not permitted
   on a symbolic number.  */

static inline bool
do_shift_rotate (enum tree_code code,
                 struct symbolic_number *n,
                 int count)
{
  if (count % 8 != 0)
    return false;

  /* Zero out the extra bits of N in order to avoid them being shifted
     into the significant bits.  */
  if (n->size < (int)sizeof (HOST_WIDEST_INT))
    n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;

  switch (code)
    {
    case LSHIFT_EXPR:
      n->n <<= count;
      break;
    case RSHIFT_EXPR:
      n->n >>= count;
      break;
    case LROTATE_EXPR:
      n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
      break;
    case RROTATE_EXPR:
      n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
      break;
    default:
      return false;
    }
  /* Zero unused bits for size.  */
  if (n->size < (int)sizeof (HOST_WIDEST_INT))
    n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
  return true;
}
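
/* For illustration only (not part of the original sources): applying
   LSHIFT_EXPR by 8 to the 4-byte symbolic number 0x04030201 yields
   0x03020100 after the masking above; the low byte becomes the
   marker 0, i.e. "this byte of the result is known to be zero",
   which is how an expression like (x << 8) is tracked.  */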

/* Perform sanity checking for the symbolic number N and the gimple
   statement STMT.  */

static inline bool
verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
{
  tree lhs_type;

  lhs_type = gimple_expr_type (stmt);

  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
    return false;

  if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
    return false;

  return true;
}

/* find_bswap_1 invokes itself recursively with N and tries to perform
   the operation given by the rhs of STMT on the result.  If the
   operation could successfully be executed the function returns the
   tree expression of the source operand and NULL otherwise.  */

static tree
find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
{
  enum tree_code code;
  tree rhs1, rhs2 = NULL;
  gimple rhs1_stmt, rhs2_stmt;
  tree source_expr1;
  enum gimple_rhs_class rhs_class;

  if (!limit || !is_gimple_assign (stmt))
    return NULL_TREE;

  rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (rhs1) != SSA_NAME)
    return NULL_TREE;

  code = gimple_assign_rhs_code (stmt);
  rhs_class = gimple_assign_rhs_class (stmt);
  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);

  if (rhs_class == GIMPLE_BINARY_RHS)
    rhs2 = gimple_assign_rhs2 (stmt);

  /* Handle unary rhs and binary rhs with integer constants as second
     operand.  */

  if (rhs_class == GIMPLE_UNARY_RHS
      || (rhs_class == GIMPLE_BINARY_RHS
          && TREE_CODE (rhs2) == INTEGER_CST))
    {
      if (code != BIT_AND_EXPR
          && code != LSHIFT_EXPR
          && code != RSHIFT_EXPR
          && code != LROTATE_EXPR
          && code != RROTATE_EXPR
          && code != NOP_EXPR
          && code != CONVERT_EXPR)
        return NULL_TREE;

      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);

      /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
         to initialize the symbolic number.  */
      if (!source_expr1)
        {
          /* Set up the symbolic number N by setting each byte to a
             value between 1 and the byte size of rhs1.  The highest
             order byte is set to n->size and the lowest order
             byte to 1.  */
          n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
          if (n->size % BITS_PER_UNIT != 0)
            return NULL_TREE;
          n->size /= BITS_PER_UNIT;
          n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
                  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);

          if (n->size < (int)sizeof (HOST_WIDEST_INT))
            n->n &= ((unsigned HOST_WIDEST_INT)1 <<
                     (n->size * BITS_PER_UNIT)) - 1;

          source_expr1 = rhs1;
        }

      switch (code)
        {
        case BIT_AND_EXPR:
          {
            int i;
            unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
            unsigned HOST_WIDEST_INT tmp = val;

            /* Only constants masking full bytes are allowed.  */
            for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
              if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
                return NULL_TREE;

            n->n &= val;
          }
          break;
        case LSHIFT_EXPR:
        case RSHIFT_EXPR:
        case LROTATE_EXPR:
        case RROTATE_EXPR:
          if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
            return NULL_TREE;
          break;
        CASE_CONVERT:
          {
            int type_size;

            type_size = TYPE_PRECISION (gimple_expr_type (stmt));
            if (type_size % BITS_PER_UNIT != 0)
              return NULL_TREE;

            if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
              {
                /* If STMT casts to a smaller type mask out the bits not
                   belonging to the target type.  */
                n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
              }
            n->size = type_size / BITS_PER_UNIT;
          }
          break;
        default:
          return NULL_TREE;
        };
      return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
    }

  /* Handle binary rhs.  */

  if (rhs_class == GIMPLE_BINARY_RHS)
    {
      struct symbolic_number n1, n2;
      tree source_expr2;

      if (code != BIT_IOR_EXPR)
        return NULL_TREE;

      if (TREE_CODE (rhs2) != SSA_NAME)
        return NULL_TREE;

      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);

      switch (code)
        {
        case BIT_IOR_EXPR:
          source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);

          if (!source_expr1)
            return NULL_TREE;

          source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);

          if (source_expr1 != source_expr2
              || n1.size != n2.size)
            return NULL_TREE;

          n->size = n1.size;
          n->n = n1.n | n2.n;

          if (!verify_symbolic_number_p (n, stmt))
            return NULL_TREE;

          break;
        default:
          return NULL_TREE;
        }
      return source_expr1;
    }
  return NULL_TREE;
}

/* Check if STMT completes a bswap implementation consisting of ORs,
   SHIFTs and ANDs.  Return the source tree expression on which the
   byte swap is performed and NULL if no bswap was found.  */

static tree
find_bswap (gimple stmt)
{
/* The number which the find_bswap result should match in order to
   have a full byte swap.  The number is shifted to the left according
   to the size of the symbolic number before using it.  */
  unsigned HOST_WIDEST_INT cmp =
    sizeof (HOST_WIDEST_INT) < 8 ? 0 :
    (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;

  struct symbolic_number n;
  tree source_expr;
  int limit;

  /* The last parameter determines the depth search limit.  It usually
     correlates directly to the number of bytes to be touched.  We
     increase that number by its ceiling log2 plus one here in order
     to also cover signed -> unsigned conversions of the src operand,
     as can be seen in libgcc, and the initial shift/and operation of
     the src operand.  */
  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
  source_expr = find_bswap_1 (stmt, &n, limit);

  if (!source_expr)
    return NULL_TREE;

  /* Zero out the extra bits of N and CMP.  */
  if (n.size < (int)sizeof (HOST_WIDEST_INT))
    {
      unsigned HOST_WIDEST_INT mask =
        ((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;

      n.n &= mask;
      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
    }

  /* A complete byte swap should make the symbolic number start
     with the largest digit in the highest order byte.  */
  if (cmp != n.n)
    return NULL_TREE;

  return source_expr;
}
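
/* For illustration only (not part of the original sources): a manual
   32-bit swap of the kind find_bswap recognizes is

        uint32_t
        my_bswap32 (uint32_t x)
        {
          return (x << 24)
                 | ((x & 0x0000ff00) << 8)
                 | ((x >> 8) & 0x0000ff00)
                 | (x >> 24);
        }

   Each AND, SHIFT and OR is replayed on the symbolic number; once it
   equals the CMP value above, the whole expression tree can be
   replaced by a single __builtin_bswap32 call (see
   execute_optimize_bswap below).  */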

/* Find manual byte swap implementations and turn them into a bswap
   builtin invocation.  */

static unsigned int
execute_optimize_bswap (void)
{
  basic_block bb;
  bool bswap32_p, bswap64_p;
  bool changed = false;
  tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;

  if (BITS_PER_UNIT != 8)
    return 0;

  if (sizeof (HOST_WIDEST_INT) < 8)
    return 0;

  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
               && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
               && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
                   || (bswap32_p && word_mode == SImode)));

  if (!bswap32_p && !bswap64_p)
    return 0;

  /* Determine the argument type of the builtins.  The code later on
     assumes that the return and argument type are the same.  */
  if (bswap32_p)
    {
      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
    }

  if (bswap64_p)
    {
      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
    }

  memset (&bswap_stats, 0, sizeof (bswap_stats));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;

      /* We do a reverse scan for bswap patterns to make sure we get the
         widest match.  As bswap pattern matching doesn't handle
         previously inserted smaller bswap replacements as sub-
         patterns, the wider variant wouldn't be detected.  */
      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
        {
          gimple stmt = gsi_stmt (gsi);
          tree bswap_src, bswap_type;
          tree bswap_tmp;
          tree fndecl = NULL_TREE;
          int type_size;
          gimple call;

          if (!is_gimple_assign (stmt)
              || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
            continue;

          type_size = TYPE_PRECISION (gimple_expr_type (stmt));

          switch (type_size)
            {
            case 32:
              if (bswap32_p)
                {
                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
                  bswap_type = bswap32_type;
                }
              break;
            case 64:
              if (bswap64_p)
                {
                  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
                  bswap_type = bswap64_type;
                }
              break;
            default:
              continue;
            }

          if (!fndecl)
            continue;

          bswap_src = find_bswap (stmt);

          if (!bswap_src)
            continue;

          changed = true;
          if (type_size == 32)
            bswap_stats.found_32bit++;
          else
            bswap_stats.found_64bit++;

          bswap_tmp = bswap_src;

          /* Convert the src expression if necessary.  */
          if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
            {
              gimple convert_stmt;

              bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
              add_referenced_var (bswap_tmp);
              bswap_tmp = make_ssa_name (bswap_tmp, NULL);

              convert_stmt = gimple_build_assign_with_ops (
                               CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
              gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
            }

          call = gimple_build_call (fndecl, 1, bswap_tmp);

          bswap_tmp = gimple_assign_lhs (stmt);

          /* Convert the result if necessary.  */
          if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
            {
              gimple convert_stmt;

              bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
              add_referenced_var (bswap_tmp);
              bswap_tmp = make_ssa_name (bswap_tmp, NULL);
              convert_stmt = gimple_build_assign_with_ops (
                               CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
              gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
            }

          gimple_call_set_lhs (call, bswap_tmp);

          if (dump_file)
            {
              fprintf (dump_file, "%d bit bswap implementation found at: ",
                       (int)type_size);
              print_gimple_stmt (dump_file, stmt, 0, 0);
            }

          gsi_insert_after (&gsi, call, GSI_SAME_STMT);
          gsi_remove (&gsi, true);
        }
    }

  statistics_counter_event (cfun, "32-bit bswap implementations found",
                            bswap_stats.found_32bit);
  statistics_counter_event (cfun, "64-bit bswap implementations found",
                            bswap_stats.found_64bit);

  return (changed ? TODO_update_ssa | TODO_verify_ssa
          | TODO_verify_stmts : 0);
}

static bool
gate_optimize_bswap (void)
{
  return flag_expensive_optimizations && optimize;
}

struct gimple_opt_pass pass_optimize_bswap =
{
 {
  GIMPLE_PASS,
  "bswap",                              /* name */
  gate_optimize_bswap,                  /* gate */
  execute_optimize_bswap,               /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_NONE,                              /* tv_id */
  PROP_ssa,                             /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  0                                     /* todo_flags_finish */
 }
};

/* Return true if RHS is a suitable operand for a widening multiplication,
   assuming a target type of TYPE.
   There are two cases:

     - RHS makes some value at least twice as wide.  Store that value
       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.

     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
       but leave *TYPE_OUT untouched.  */

static bool
is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
                        tree *new_rhs_out)
{
  gimple stmt;
  tree type1, rhs1;
  enum tree_code rhs_code;

  if (TREE_CODE (rhs) == SSA_NAME)
    {
      stmt = SSA_NAME_DEF_STMT (rhs);
      if (is_gimple_assign (stmt))
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          if (TREE_CODE (type) == INTEGER_TYPE
              ? !CONVERT_EXPR_CODE_P (rhs_code)
              : rhs_code != FIXED_CONVERT_EXPR)
            rhs1 = rhs;
          else
            {
              rhs1 = gimple_assign_rhs1 (stmt);

              if (TREE_CODE (rhs1) == INTEGER_CST)
                {
                  *new_rhs_out = rhs1;
                  *type_out = NULL;
                  return true;
                }
            }
        }
      else
        rhs1 = rhs;

      type1 = TREE_TYPE (rhs1);

      if (TREE_CODE (type1) != TREE_CODE (type)
          || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
        return false;

      *new_rhs_out = rhs1;
      *type_out = type1;
      return true;
    }

  if (TREE_CODE (rhs) == INTEGER_CST)
    {
      *new_rhs_out = rhs;
      *type_out = NULL;
      return true;
    }

  return false;
}

/* Return true if STMT performs a widening multiplication, assuming the
   output type is TYPE.  If so, store the unwidened types of the operands
   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
   and *TYPE2_OUT would give the operands of the multiplication.  */

static bool
is_widening_mult_p (gimple stmt,
                    tree *type1_out, tree *rhs1_out,
                    tree *type2_out, tree *rhs2_out)
{
  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
                               rhs1_out))
    return false;

  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
                               rhs2_out))
    return false;

  if (*type1_out == NULL)
    {
      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
        return false;
      *type1_out = *type2_out;
    }

  if (*type2_out == NULL)
    {
      if (!int_fits_type_p (*rhs2_out, *type1_out))
        return false;
      *type2_out = *type1_out;
    }

  /* Ensure that the larger of the two operands comes first.  */
  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
    {
      tree tmp;
      tmp = *type1_out;
      *type1_out = *type2_out;
      *type2_out = tmp;
      tmp = *rhs1_out;
      *rhs1_out = *rhs2_out;
      *rhs2_out = tmp;
    }

  return true;
}
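
/* For illustration only (not part of the original sources): with
   32-bit int and 64-bit long, the multiplication in

        long
        widen (int a, int b)
        {
          return (long) a * (long) b;
        }

   qualifies: both operands are conversions from a type at most half
   as wide as the result, so is_widening_mult_p returns true with
   *TYPE1_OUT and *TYPE2_OUT both set to int.  */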

/* Process a single gimple statement STMT, which has a MULT_EXPR as
   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
   value is true iff we converted the statement.  */

static bool
convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
{
  tree lhs, rhs1, rhs2, type, type1, type2, tmp = NULL;
  enum insn_code handler;
  enum machine_mode to_mode, from_mode, actual_mode;
  optab op;
  int actual_precision;
  location_t loc = gimple_location (stmt);
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE)
    return false;

  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);

  if (from_unsigned1 && from_unsigned2)
    op = umul_widen_optab;
  else if (!from_unsigned1 && !from_unsigned2)
    op = smul_widen_optab;
  else
    op = usmul_widen_optab;

  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
                                                  0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    {
      if (op != smul_widen_optab)
        {
          /* We can use a signed multiply with unsigned types as long as
             there is a wider mode to use, or it is the smaller of the two
             types that is unsigned.  Note that type1 >= type2, always.  */
          if ((TYPE_UNSIGNED (type1)
               && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
              || (TYPE_UNSIGNED (type2)
                  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
            {
              from_mode = GET_MODE_WIDER_MODE (from_mode);
              if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
                return false;
            }

          op = smul_widen_optab;
          handler = find_widening_optab_handler_and_mode (op, to_mode,
                                                          from_mode, 0,
                                                          &actual_mode);

          if (handler == CODE_FOR_nothing)
            return false;

          from_unsigned1 = from_unsigned2 = false;
        }
      else
        return false;
    }

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    {
      tmp = create_tmp_var (build_nonstandard_integer_type
                                (actual_precision, from_unsigned1),
                            NULL);
      rhs1 = build_and_insert_cast (gsi, loc, tmp, rhs1);
    }
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    {
      /* Reuse the same type info, if possible.  */
      if (!tmp || from_unsigned1 != from_unsigned2)
        tmp = create_tmp_var (build_nonstandard_integer_type
                                (actual_precision, from_unsigned2),
                              NULL);
      rhs2 = build_and_insert_cast (gsi, loc, tmp, rhs2);
    }

  /* Handle constants.  */
  if (TREE_CODE (rhs1) == INTEGER_CST)
    rhs1 = fold_convert (type1, rhs1);
  if (TREE_CODE (rhs2) == INTEGER_CST)
    rhs2 = fold_convert (type2, rhs2);

  gimple_assign_set_rhs1 (stmt, rhs1);
  gimple_assign_set_rhs2 (stmt, rhs2);
  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
  update_stmt (stmt);
  widen_mul_stats.widen_mults_inserted++;
  return true;
}
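
/* For illustration only (not part of the original sources): in the
   GIMPLE dumps the conversion above looks roughly like

        a.0_2 = (long int) a_1;
        b.1_4 = (long int) b_3;           =>    z_5 = a_1 w* b_3;
        z_5 = a.0_2 * b.1_4;

   where "w*" is how WIDEN_MULT_EXPR is printed; the backend can then
   select a widening multiply instruction instead of two extensions
   followed by a full-width multiply.  The SSA names shown are
   hypothetical.  */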

/* Process a single gimple statement STMT, which is found at the
   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
   rhs (given by CODE), and try to convert it into a
   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
   is true iff we converted the statement.  */

static bool
convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
                            enum tree_code code)
{
  gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
  gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
  tree type, type1, type2, optype, tmp = NULL;
  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
  optab this_optab;
  enum tree_code wmult_code;
  enum insn_code handler;
  enum machine_mode to_mode, from_mode, actual_mode;
  location_t loc = gimple_location (stmt);
  int actual_precision;
  bool from_unsigned1, from_unsigned2;

  lhs = gimple_assign_lhs (stmt);
  type = TREE_TYPE (lhs);
  if (TREE_CODE (type) != INTEGER_TYPE
      && TREE_CODE (type) != FIXED_POINT_TYPE)
    return false;

  if (code == MINUS_EXPR)
    wmult_code = WIDEN_MULT_MINUS_EXPR;
  else
    wmult_code = WIDEN_MULT_PLUS_EXPR;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);

  if (TREE_CODE (rhs1) == SSA_NAME)
    {
      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
      if (is_gimple_assign (rhs1_stmt))
        rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
    }

  if (TREE_CODE (rhs2) == SSA_NAME)
    {
      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
      if (is_gimple_assign (rhs2_stmt))
        rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
    }

  /* Allow for one conversion statement between the multiply
     and addition/subtraction statement.  If there is more than
     one conversion then we assume they would invalidate this
     transformation.  If that's not the case then they should have
     been folded before now.  */
  if (CONVERT_EXPR_CODE_P (rhs1_code))
    {
      conv1_stmt = rhs1_stmt;
      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
      if (TREE_CODE (rhs1) == SSA_NAME)
        {
          rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
          if (is_gimple_assign (rhs1_stmt))
            rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
        }
      else
        return false;
    }
  if (CONVERT_EXPR_CODE_P (rhs2_code))
    {
      conv2_stmt = rhs2_stmt;
      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
      if (TREE_CODE (rhs2) == SSA_NAME)
        {
          rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
          if (is_gimple_assign (rhs2_stmt))
            rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
        }
      else
        return false;
    }

  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
     is_widening_mult_p, but we still need the returned rhs values.

     It might also appear that it would be sufficient to use the existing
     operands of the widening multiply, but that would limit the choice of
     multiply-and-accumulate instructions.  */
  if (code == PLUS_EXPR
      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
    {
      if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
                               &type2, &mult_rhs2))
        return false;
      add_rhs = rhs2;
      conv_stmt = conv1_stmt;
    }
  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
    {
      if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
                               &type2, &mult_rhs2))
        return false;
      add_rhs = rhs1;
      conv_stmt = conv2_stmt;
    }
  else
    return false;

  to_mode = TYPE_MODE (type);
  from_mode = TYPE_MODE (type1);
  from_unsigned1 = TYPE_UNSIGNED (type1);
  from_unsigned2 = TYPE_UNSIGNED (type2);
  optype = type1;

  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
  if (from_unsigned1 != from_unsigned2)
    {
      if (!INTEGRAL_TYPE_P (type))
        return false;
      /* We can use a signed multiply with unsigned types as long as
         there is a wider mode to use, or it is the smaller of the two
         types that is unsigned.  Note that type1 >= type2, always.  */
      if ((from_unsigned1
           && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
          || (from_unsigned2
              && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
        {
          from_mode = GET_MODE_WIDER_MODE (from_mode);
          if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
            return false;
        }

      from_unsigned1 = from_unsigned2 = false;
      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
                                               false);
    }

  /* If there was a conversion between the multiply and addition
     then we need to make sure it fits a multiply-and-accumulate.
     There should be a single mode change which does not change the
     value.  */
  if (conv_stmt)
    {
      /* We use the original, unmodified data types for this.  */
      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);

      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
        {
          /* Conversion is a truncate.  */
          if (TYPE_PRECISION (to_type) < data_size)
            return false;
        }
      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
        {
          /* Conversion is an extend.  Check it's the right sort.  */
          if (TYPE_UNSIGNED (from_type) != is_unsigned
              && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
            return false;
        }
      /* else convert is a no-op for our purposes.  */
    }

  /* Verify that the machine can perform a widening multiply
     accumulate in this mode/signedness combination, otherwise
     this transformation is likely to pessimize code.  */
  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
                                                  from_mode, 0, &actual_mode);

  if (handler == CODE_FOR_nothing)
    return false;

  /* Ensure that the inputs to the handler are in the correct precision
     for the opcode.  This will be the full mode size.  */
  actual_precision = GET_MODE_PRECISION (actual_mode);
  if (actual_precision != TYPE_PRECISION (type1)
      || from_unsigned1 != TYPE_UNSIGNED (type1))
    {
      tmp = create_tmp_var (build_nonstandard_integer_type
                                (actual_precision, from_unsigned1),
                            NULL);
      mult_rhs1 = build_and_insert_cast (gsi, loc, tmp, mult_rhs1);
    }
  if (actual_precision != TYPE_PRECISION (type2)
      || from_unsigned2 != TYPE_UNSIGNED (type2))
    {
      if (!tmp || from_unsigned1 != from_unsigned2)
        tmp = create_tmp_var (build_nonstandard_integer_type
                                (actual_precision, from_unsigned2),
                              NULL);
      mult_rhs2 = build_and_insert_cast (gsi, loc, tmp, mult_rhs2);
    }

  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
    add_rhs = build_and_insert_cast (gsi, loc, create_tmp_var (type, NULL),
                                     add_rhs);

  /* Handle constants.  */
  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
    mult_rhs1 = fold_convert (type1, mult_rhs1);
  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
    mult_rhs2 = fold_convert (type2, mult_rhs2);

  gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
                                    add_rhs);
  update_stmt (gsi_stmt (*gsi));
  widen_mul_stats.maccs_inserted++;
  return true;
}
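
/* For illustration only (not part of the original sources): with
   32-bit int and 64-bit long, the statement

        acc = acc + (long) a * (long) b;

   is rewritten into a single WIDEN_MULT_PLUS_EXPR, which targets
   with a multiply-and-accumulate unit can emit without separate
   extend, multiply and add instructions.  */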

/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
   with uses in additions and subtractions to form fused multiply-add
   operations.  Returns true if successful and MUL_STMT should be removed.  */

static bool
convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
{
  tree mul_result = gimple_get_lhs (mul_stmt);
  tree type = TREE_TYPE (mul_result);
  gimple use_stmt, neguse_stmt, fma_stmt;
  use_operand_p use_p;
  imm_use_iterator imm_iter;

  if (FLOAT_TYPE_P (type)
      && flag_fp_contract_mode == FP_CONTRACT_OFF)
    return false;

  /* We don't want to do bitfield reduction ops.  */
  if (INTEGRAL_TYPE_P (type)
      && (TYPE_PRECISION (type)
          != GET_MODE_PRECISION (TYPE_MODE (type))))
    return false;

  /* If the target doesn't support it, don't generate it.  We assume that
     if fma isn't available then fms, fnma or fnms are not either.  */
  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  /* If the multiplication has zero uses, it is kept around probably because
     of -fnon-call-exceptions.  Don't optimize it away in that case,
     that is a job for DCE.  */
  if (has_zero_uses (mul_result))
    return false;

  /* Make sure that the multiplication statement becomes dead after
     the transformation, so that all uses are transformed to FMAs.
     This means we assume that an FMA operation has the same cost
     as an addition.  */
  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
    {
      enum tree_code use_code;
      tree result = mul_result;
      bool negate_p = false;

      use_stmt = USE_STMT (use_p);

      if (is_gimple_debug (use_stmt))
        continue;

      /* For now restrict this operation to single basic blocks.  In theory
         we would want to support sinking the multiplication in
         m = a*b;
         if ()
           ma = m + c;
         else
           d = m;
         to form an fma in the then block and sink the multiplication to the
         else block.  */
      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
        return false;

      if (!is_gimple_assign (use_stmt))
        return false;

      use_code = gimple_assign_rhs_code (use_stmt);

      /* A negate on the multiplication leads to FNMA.  */
      if (use_code == NEGATE_EXPR)
        {
          ssa_op_iter iter;
          use_operand_p usep;

          result = gimple_assign_lhs (use_stmt);

          /* Make sure the negate statement becomes dead with this
             single transformation.  */
          if (!single_imm_use (gimple_assign_lhs (use_stmt),
                               &use_p, &neguse_stmt))
            return false;

          /* Make sure the multiplication isn't also used on that stmt.  */
          FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
            if (USE_FROM_PTR (usep) == mul_result)
              return false;

          /* Re-validate.  */
          use_stmt = neguse_stmt;
          if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
            return false;
          if (!is_gimple_assign (use_stmt))
            return false;

          use_code = gimple_assign_rhs_code (use_stmt);
          negate_p = true;
        }

      switch (use_code)
        {
        case MINUS_EXPR:
          if (gimple_assign_rhs2 (use_stmt) == result)
            negate_p = !negate_p;
          break;
        case PLUS_EXPR:
          break;
        default:
          /* FMA can only be formed from PLUS and MINUS.  */
          return false;
        }

      /* We can't handle a * b + a * b.  */
      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
        return false;

      /* While it is possible to validate whether or not the exact form
         that we've recognized is available in the backend, the assumption
         is that the transformation is never a loss.  For instance, suppose
         the target only has the plain FMA pattern available.  Consider
         a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
         is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
         still have 3 operations, but in the FMA form the two NEGs are
         independent and could be run in parallel.  */
    }

  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      enum tree_code use_code;
      tree addop, mulop1 = op1, result = mul_result;
      bool negate_p = false;

      if (is_gimple_debug (use_stmt))
        continue;

      use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == NEGATE_EXPR)
        {
          result = gimple_assign_lhs (use_stmt);
          single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
          gsi_remove (&gsi, true);
          release_defs (use_stmt);

          use_stmt = neguse_stmt;
          gsi = gsi_for_stmt (use_stmt);
          use_code = gimple_assign_rhs_code (use_stmt);
          negate_p = true;
        }

      if (gimple_assign_rhs1 (use_stmt) == result)
        {
          addop = gimple_assign_rhs2 (use_stmt);
          /* a * b - c -> a * b + (-c)  */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            addop = force_gimple_operand_gsi (&gsi,
                                              build1 (NEGATE_EXPR,
                                                      type, addop),
                                              true, NULL_TREE, true,
                                              GSI_SAME_STMT);
        }
      else
        {
          addop = gimple_assign_rhs1 (use_stmt);
          /* a - b * c -> (-b) * c + a */
          if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
            negate_p = !negate_p;
        }

      if (negate_p)
        mulop1 = force_gimple_operand_gsi (&gsi,
                                           build1 (NEGATE_EXPR,
                                                   type, mulop1),
                                           true, NULL_TREE, true,
                                           GSI_SAME_STMT);

      fma_stmt = gimple_build_assign_with_ops3 (FMA_EXPR,
                                                gimple_assign_lhs (use_stmt),
                                                mulop1, op2,
                                                addop);
      gsi_replace (&gsi, fma_stmt, true);
      widen_mul_stats.fmas_inserted++;
    }

  return true;
}
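
/* For illustration only (not part of the original sources): with
   -ffp-contract=fast on an FMA-capable target,

        d = a * b + c;     becomes     d = FMA_EXPR <a, b, c>;

   and a use such as d = c - a * b is handled by negating the first
   multiplicand, i.e. FMA_EXPR <-a, b, c>, which the backend can
   match to an fnma-style instruction.  */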

/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */

static unsigned int
execute_optimize_widening_mul (void)
{
  basic_block bb;
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));

  FOR_EACH_BB (bb)
    {
      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
        {
          gimple stmt = gsi_stmt (gsi);
          enum tree_code code;

          if (is_gimple_assign (stmt))
            {
              code = gimple_assign_rhs_code (stmt);
              switch (code)
                {
                case MULT_EXPR:
                  if (!convert_mult_to_widen (stmt, &gsi)
                      && convert_mult_to_fma (stmt,
                                              gimple_assign_rhs1 (stmt),
                                              gimple_assign_rhs2 (stmt)))
                    {
                      gsi_remove (&gsi, true);
                      release_defs (stmt);
                      continue;
                    }
                  break;

                case PLUS_EXPR:
                case MINUS_EXPR:
                  convert_plusminus_to_widen (&gsi, stmt, code);
                  break;

                default:;
                }
            }
          else if (is_gimple_call (stmt)
                   && gimple_call_lhs (stmt))
            {
              tree fndecl = gimple_call_fndecl (stmt);
              if (fndecl
                  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
                {
                  switch (DECL_FUNCTION_CODE (fndecl))
                    {
                      case BUILT_IN_POWF:
                      case BUILT_IN_POW:
                      case BUILT_IN_POWL:
                        if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                            && REAL_VALUES_EQUAL
                                 (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                                  dconst2)
                            && convert_mult_to_fma (stmt,
                                                    gimple_call_arg (stmt, 0),
                                                    gimple_call_arg (stmt, 0)))
                          {
                            unlink_stmt_vdef (stmt);
                            gsi_remove (&gsi, true);
                            release_defs (stmt);
                            if (gimple_purge_dead_eh_edges (bb))
                              cfg_changed = true;
                            continue;
                          }
                        break;

                      default:;
                    }
                }
            }
          gsi_next (&gsi);
        }
    }

  statistics_counter_event (cfun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (cfun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (cfun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

static bool
gate_optimize_widening_mul (void)
{
  return flag_expensive_optimizations && optimize;
}

struct gimple_opt_pass pass_optimize_widening_mul =
{
 {
  GIMPLE_PASS,
  "widening_mul",                       /* name */
  gate_optimize_widening_mul,           /* gate */
  execute_optimize_widening_mul,        /* execute */
  NULL,                                 /* sub */
  NULL,                                 /* next */
  0,                                    /* static_pass_number */
  TV_NONE,                              /* tv_id */
  PROP_ssa,                             /* properties_required */
  0,                                    /* properties_provided */
  0,                                    /* properties_destroyed */
  0,                                    /* todo_flags_start */
  TODO_verify_ssa
  | TODO_verify_stmts
  | TODO_update_ssa                     /* todo_flags_finish */
 }
};
