| 1 |
734 |
jeremybenn |
/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
|
| 2 |
|
|
|
| 3 |
|
|
This file is free software; you can redistribute it and/or modify it under
|
| 4 |
|
|
the terms of the GNU General Public License as published by the Free
|
| 5 |
|
|
Software Foundation; either version 3 of the License, or (at your option)
|
| 6 |
|
|
any later version.
|
| 7 |
|
|
|
| 8 |
|
|
This file is distributed in the hope that it will be useful, but WITHOUT
|
| 9 |
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
| 10 |
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
| 11 |
|
|
for more details.
|
| 12 |
|
|
|
| 13 |
|
|
Under Section 7 of GPL version 3, you are granted additional
|
| 14 |
|
|
permissions described in the GCC Runtime Library Exception, version
|
| 15 |
|
|
3.1, as published by the Free Software Foundation.
|
| 16 |
|
|
|
| 17 |
|
|
You should have received a copy of the GNU General Public License and
|
| 18 |
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
| 19 |
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
| 20 |
|
|
<http://www.gnu.org/licenses/>. */
|
| 21 |
|
|
|
| 22 |
|
|
#include <spu_intrinsics.h>
|
| 23 |
|
|
|
| 24 |
|
|
/* Integer type declared with GCC's `mode (TI)` attribute, i.e. an integer
   in TImode (the 16-byte integer machine mode) rather than plain `int`.  */
typedef int TItype __attribute__ ((mode (TI)));
|
| 25 |
|
|
|
| 26 |
|
|
union qword_TItype
|
| 27 |
|
|
{
|
| 28 |
|
|
qword q;
|
| 29 |
|
|
TItype t;
|
| 30 |
|
|
};
|
| 31 |
|
|
|
| 32 |
|
|
inline static qword
|
| 33 |
|
|
si_from_TItype (TItype t)
|
| 34 |
|
|
{
|
| 35 |
|
|
union qword_TItype u;
|
| 36 |
|
|
u.t = t;
|
| 37 |
|
|
return u.q;
|
| 38 |
|
|
}
|
| 39 |
|
|
|
| 40 |
|
|
inline static TItype
|
| 41 |
|
|
si_to_TItype (qword q)
|
| 42 |
|
|
{
|
| 43 |
|
|
union qword_TItype u;
|
| 44 |
|
|
u.q = q;
|
| 45 |
|
|
return u.t;
|
| 46 |
|
|
}
|
| 47 |
|
|
|
| 48 |
|
|
/* A straight forward vectorization and unrolling of
|
| 49 |
|
|
* short l[8], r[8];
|
| 50 |
|
|
* TItype total = 0;
|
| 51 |
|
|
* for (i = 0; i < 8; i++)
|
| 52 |
|
|
* for (j = 0; j < 8; j++)
|
| 53 |
|
|
* total += (TItype)((l[7-i] * r[7-j]) << (16 * (i + j)));
|
| 54 |
|
|
*/
|
| 55 |
|
|
TItype
|
| 56 |
|
|
__multi3 (TItype l, TItype r)
|
| 57 |
|
|
{
|
| 58 |
|
|
qword u = si_from_TItype (l);
|
| 59 |
|
|
qword v = si_from_TItype (r);
|
| 60 |
|
|
qword splat0 = si_shufb (v, v, si_ilh (0x0001));
|
| 61 |
|
|
qword splat1 = si_shufb (v, v, si_ilh (0x0203));
|
| 62 |
|
|
qword splat2 = si_shufb (v, v, si_ilh (0x0405));
|
| 63 |
|
|
qword splat3 = si_shufb (v, v, si_ilh (0x0607));
|
| 64 |
|
|
qword splat4 = si_shufb (v, v, si_ilh (0x0809));
|
| 65 |
|
|
qword splat5 = si_shufb (v, v, si_ilh (0x0a0b));
|
| 66 |
|
|
qword splat6 = si_shufb (v, v, si_ilh (0x0c0d));
|
| 67 |
|
|
qword splat7 = si_shufb (v, v, si_ilh (0x0e0f));
|
| 68 |
|
|
|
| 69 |
|
|
qword part0l = si_shlqbyi (si_mpyu (u, splat0), 14);
|
| 70 |
|
|
qword part1h = si_shlqbyi (si_mpyhhu (u, splat1), 14);
|
| 71 |
|
|
qword part1l = si_shlqbyi (si_mpyu (u, splat1), 12);
|
| 72 |
|
|
qword part2h = si_shlqbyi (si_mpyhhu (u, splat2), 12);
|
| 73 |
|
|
qword part2l = si_shlqbyi (si_mpyu (u, splat2), 10);
|
| 74 |
|
|
qword part3h = si_shlqbyi (si_mpyhhu (u, splat3), 10);
|
| 75 |
|
|
qword part3l = si_shlqbyi (si_mpyu (u, splat3), 8);
|
| 76 |
|
|
qword part4h = si_shlqbyi (si_mpyhhu (u, splat4), 8);
|
| 77 |
|
|
qword part4l = si_shlqbyi (si_mpyu (u, splat4), 6);
|
| 78 |
|
|
qword part5h = si_shlqbyi (si_mpyhhu (u, splat5), 6);
|
| 79 |
|
|
qword part5l = si_shlqbyi (si_mpyu (u, splat5), 4);
|
| 80 |
|
|
qword part6h = si_shlqbyi (si_mpyhhu (u, splat6), 4);
|
| 81 |
|
|
qword part6l = si_shlqbyi (si_mpyu (u, splat6), 2);
|
| 82 |
|
|
qword part7h = si_shlqbyi (si_mpyhhu (u, splat7), 2);
|
| 83 |
|
|
qword part7l = si_mpyu (u, splat7);
|
| 84 |
|
|
|
| 85 |
|
|
qword carry, total0, total1, total2, total3, total4;
|
| 86 |
|
|
qword total5, total6, total7, total8, total9, total10;
|
| 87 |
|
|
qword total;
|
| 88 |
|
|
|
| 89 |
|
|
total0 = si_a (si_a (si_a (part0l, part1h), si_a (part1l, part2h)), part7l);
|
| 90 |
|
|
total1 = si_a (part2l, part3h);
|
| 91 |
|
|
total2 = si_a (part3l, part4h);
|
| 92 |
|
|
total3 = si_a (part4l, part5h);
|
| 93 |
|
|
total4 = si_a (part5l, part6h);
|
| 94 |
|
|
total5 = si_a (part6l, part7h);
|
| 95 |
|
|
total6 = si_a (total0, total1);
|
| 96 |
|
|
total7 = si_a (total2, total3);
|
| 97 |
|
|
total8 = si_a (total4, total5);
|
| 98 |
|
|
total9 = si_a (total6, total7);
|
| 99 |
|
|
total10 = si_a (total8, total9);
|
| 100 |
|
|
|
| 101 |
|
|
carry = si_cg (part2l, part3h);
|
| 102 |
|
|
carry = si_a (carry, si_cg (part3l, part4h));
|
| 103 |
|
|
carry = si_a (carry, si_cg (part4l, part5h));
|
| 104 |
|
|
carry = si_a (carry, si_cg (part5l, part6h));
|
| 105 |
|
|
carry = si_a (carry, si_cg (part6l, part7h));
|
| 106 |
|
|
carry = si_a (carry, si_cg (total0, total1));
|
| 107 |
|
|
carry = si_a (carry, si_cg (total2, total3));
|
| 108 |
|
|
carry = si_a (carry, si_cg (total4, total5));
|
| 109 |
|
|
carry = si_a (carry, si_cg (total6, total7));
|
| 110 |
|
|
carry = si_a (carry, si_cg (total8, total9));
|
| 111 |
|
|
carry = si_shlqbyi (carry, 4);
|
| 112 |
|
|
|
| 113 |
|
|
total = si_cg (total10, carry);
|
| 114 |
|
|
total = si_shlqbyi (total, 4);
|
| 115 |
|
|
total = si_cgx (total10, carry, total);
|
| 116 |
|
|
total = si_shlqbyi (total, 4);
|
| 117 |
|
|
total = si_addx (total10, carry, total);
|
| 118 |
|
|
return si_to_TItype (total);
|
| 119 |
|
|
}
|