1 |
282 |
jeremybenn |
/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
|
2 |
|
|
|
3 |
|
|
This file is free software; you can redistribute it and/or modify it under
|
4 |
|
|
the terms of the GNU General Public License as published by the Free
|
5 |
|
|
Software Foundation; either version 3 of the License, or (at your option)
|
6 |
|
|
any later version.
|
7 |
|
|
|
8 |
|
|
This file is distributed in the hope that it will be useful, but WITHOUT
|
9 |
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
10 |
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
11 |
|
|
for more details.
|
12 |
|
|
|
13 |
|
|
Under Section 7 of GPL version 3, you are granted additional
|
14 |
|
|
permissions described in the GCC Runtime Library Exception, version
|
15 |
|
|
3.1, as published by the Free Software Foundation.
|
16 |
|
|
|
17 |
|
|
You should have received a copy of the GNU General Public License and
|
18 |
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
19 |
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
20 |
|
|
<http://www.gnu.org/licenses/>. */
|
21 |
|
|
|
22 |
|
|
#include <spu_intrinsics.h>
|
23 |
|
|
|
24 |
|
|
/* Integer type in GCC's TImode ("tetra integer", 16 bytes); it is signed
   because the base type named here is int.  */
typedef int TItype __attribute__ ((mode (TI)));
|
25 |
|
|
|
26 |
|
|
/* A straight forward vectorization and unrolling of
|
27 |
|
|
* short l[8], r[8];
|
28 |
|
|
* TItype total = 0;
|
29 |
|
|
* for (i = 0; i < 8; i++)
|
30 |
|
|
* for (j = 0; j < 8; j++)
|
31 |
|
|
* total += (TItype)((l[7-i] * r[7-j]) << (16 * (i + j)));
|
32 |
|
|
*/
|
33 |
|
|
TItype
|
34 |
|
|
__multi3 (TItype l, TItype r)
|
35 |
|
|
{
|
36 |
|
|
qword u = *(qword *) & l;
|
37 |
|
|
qword v = *(qword *) & r;
|
38 |
|
|
qword splat0 = si_shufb (v, v, si_ilh (0x0001));
|
39 |
|
|
qword splat1 = si_shufb (v, v, si_ilh (0x0203));
|
40 |
|
|
qword splat2 = si_shufb (v, v, si_ilh (0x0405));
|
41 |
|
|
qword splat3 = si_shufb (v, v, si_ilh (0x0607));
|
42 |
|
|
qword splat4 = si_shufb (v, v, si_ilh (0x0809));
|
43 |
|
|
qword splat5 = si_shufb (v, v, si_ilh (0x0a0b));
|
44 |
|
|
qword splat6 = si_shufb (v, v, si_ilh (0x0c0d));
|
45 |
|
|
qword splat7 = si_shufb (v, v, si_ilh (0x0e0f));
|
46 |
|
|
|
47 |
|
|
qword part0l = si_shlqbyi (si_mpyu (u, splat0), 14);
|
48 |
|
|
qword part1h = si_shlqbyi (si_mpyhhu (u, splat1), 14);
|
49 |
|
|
qword part1l = si_shlqbyi (si_mpyu (u, splat1), 12);
|
50 |
|
|
qword part2h = si_shlqbyi (si_mpyhhu (u, splat2), 12);
|
51 |
|
|
qword part2l = si_shlqbyi (si_mpyu (u, splat2), 10);
|
52 |
|
|
qword part3h = si_shlqbyi (si_mpyhhu (u, splat3), 10);
|
53 |
|
|
qword part3l = si_shlqbyi (si_mpyu (u, splat3), 8);
|
54 |
|
|
qword part4h = si_shlqbyi (si_mpyhhu (u, splat4), 8);
|
55 |
|
|
qword part4l = si_shlqbyi (si_mpyu (u, splat4), 6);
|
56 |
|
|
qword part5h = si_shlqbyi (si_mpyhhu (u, splat5), 6);
|
57 |
|
|
qword part5l = si_shlqbyi (si_mpyu (u, splat5), 4);
|
58 |
|
|
qword part6h = si_shlqbyi (si_mpyhhu (u, splat6), 4);
|
59 |
|
|
qword part6l = si_shlqbyi (si_mpyu (u, splat6), 2);
|
60 |
|
|
qword part7h = si_shlqbyi (si_mpyhhu (u, splat7), 2);
|
61 |
|
|
qword part7l = si_mpyu (u, splat7);
|
62 |
|
|
|
63 |
|
|
qword carry, total0, total1, total2, total3, total4;
|
64 |
|
|
qword total5, total6, total7, total8, total9, total10;
|
65 |
|
|
qword total;
|
66 |
|
|
|
67 |
|
|
total0 = si_a (si_a (si_a (part0l, part1h), si_a (part1l, part2h)), part7l);
|
68 |
|
|
total1 = si_a (part2l, part3h);
|
69 |
|
|
total2 = si_a (part3l, part4h);
|
70 |
|
|
total3 = si_a (part4l, part5h);
|
71 |
|
|
total4 = si_a (part5l, part6h);
|
72 |
|
|
total5 = si_a (part6l, part7h);
|
73 |
|
|
total6 = si_a (total0, total1);
|
74 |
|
|
total7 = si_a (total2, total3);
|
75 |
|
|
total8 = si_a (total4, total5);
|
76 |
|
|
total9 = si_a (total6, total7);
|
77 |
|
|
total10 = si_a (total8, total9);
|
78 |
|
|
|
79 |
|
|
carry = si_cg (part2l, part3h);
|
80 |
|
|
carry = si_a (carry, si_cg (part3l, part4h));
|
81 |
|
|
carry = si_a (carry, si_cg (part4l, part5h));
|
82 |
|
|
carry = si_a (carry, si_cg (part5l, part6h));
|
83 |
|
|
carry = si_a (carry, si_cg (part6l, part7h));
|
84 |
|
|
carry = si_a (carry, si_cg (total0, total1));
|
85 |
|
|
carry = si_a (carry, si_cg (total2, total3));
|
86 |
|
|
carry = si_a (carry, si_cg (total4, total5));
|
87 |
|
|
carry = si_a (carry, si_cg (total6, total7));
|
88 |
|
|
carry = si_a (carry, si_cg (total8, total9));
|
89 |
|
|
carry = si_shlqbyi (carry, 4);
|
90 |
|
|
|
91 |
|
|
total = si_cg (total10, carry);
|
92 |
|
|
total = si_shlqbyi (total, 4);
|
93 |
|
|
total = si_cgx (total10, carry, total);
|
94 |
|
|
total = si_shlqbyi (total, 4);
|
95 |
|
|
total = si_addx (total10, carry, total);
|
96 |
|
|
return *(TItype *) & total;
|
97 |
|
|
}
|