OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [linux/] [uClibc/] [libc/] [sysdeps/] [linux/] [sparc/] [umul.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1325 phoenix
/*
2
 * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
3
 * upper 32 bits of the 64-bit product).
4
 *
5
 * This code optimizes short (less than 13-bit) multiplies.  Short
6
 * multiplies require 25 instruction cycles, and long ones require
7
 * 45 instruction cycles.
8
 *
9
 * On return, overflow has occurred (%o1 is not zero) if and only if
10
 * the Z condition code is clear, allowing, e.g., the following:
11
 *
12
 *      call    .umul
13
 *      nop
14
 *      bnz     overflow        (or tnz)
15
 */
16
 
17
#include 
/* NOTE(review): the operand of the #include above is missing in this copy
 * (presumably a header name was lost when the page was extracted) --
 * restore it from the repository before assembling. */
18
 
19
 
20
.global   .umul;
21
.align 4;
22
.type  .umul ,@function;
23
 
24
.umul:
        /*
         * Entry point.  Operands arrive in %o0 and %o1.
         * Registers written on this path: %o4 (first the OR of both
         * operands, then the running partial product), %o2 (fix-up term),
         * %y and the integer condition codes.  The result leaves in
         * %o0 (read back from %y) and %o1 (upper half, from %o4).
         *
         * NOTE(review): three instructions separate the write to %y from
         * the first mulscc on either path; writes to %y are believed to be
         * delayed on SPARC V8, so keep that spacing when reordering --
         * confirm against the architecture manual.
         */
25
        or      %o0, %o1, %o4
26
        mov     %o0, %y                 ! multiplier -> Y
27
        andncc  %o4, 0xfff, %g0         ! test bits 12..31 of *both* args
28
        be      .Lmul_shortway  ! if zero, can do it the short way
        /* The andcc below sits in the delay slot of the (non-annulled)
         * branch, so %o4 is zeroed for both the short and the long path. */
29
         andcc  %g0, %g0, %o4           ! zero the partial product; clear N & V
30
 
31
        /*
32
         * Long multiply.  32 steps, followed by a final shift step.
33
         */
        /* Every step uses the same operands; the trailing `! n' comments
         * only count the steps.  The last step passes %g0 instead of %o1. */
34
        mulscc  %o4, %o1, %o4   ! 1
35
        mulscc  %o4, %o1, %o4   ! 2
36
        mulscc  %o4, %o1, %o4   ! 3
37
        mulscc  %o4, %o1, %o4   ! 4
38
        mulscc  %o4, %o1, %o4   ! 5
39
        mulscc  %o4, %o1, %o4   ! 6
40
        mulscc  %o4, %o1, %o4   ! 7
41
        mulscc  %o4, %o1, %o4   ! 8
42
        mulscc  %o4, %o1, %o4   ! 9
43
        mulscc  %o4, %o1, %o4   ! 10
44
        mulscc  %o4, %o1, %o4   ! 11
45
        mulscc  %o4, %o1, %o4   ! 12
46
        mulscc  %o4, %o1, %o4   ! 13
47
        mulscc  %o4, %o1, %o4   ! 14
48
        mulscc  %o4, %o1, %o4   ! 15
49
        mulscc  %o4, %o1, %o4   ! 16
50
        mulscc  %o4, %o1, %o4   ! 17
51
        mulscc  %o4, %o1, %o4   ! 18
52
        mulscc  %o4, %o1, %o4   ! 19
53
        mulscc  %o4, %o1, %o4   ! 20
54
        mulscc  %o4, %o1, %o4   ! 21
55
        mulscc  %o4, %o1, %o4   ! 22
56
        mulscc  %o4, %o1, %o4   ! 23
57
        mulscc  %o4, %o1, %o4   ! 24
58
        mulscc  %o4, %o1, %o4   ! 25
59
        mulscc  %o4, %o1, %o4   ! 26
60
        mulscc  %o4, %o1, %o4   ! 27
61
        mulscc  %o4, %o1, %o4   ! 28
62
        mulscc  %o4, %o1, %o4   ! 29
63
        mulscc  %o4, %o1, %o4   ! 30
64
        mulscc  %o4, %o1, %o4   ! 31
65
        mulscc  %o4, %o1, %o4   ! 32
66
        mulscc  %o4, %g0, %o4   ! final shift
67
 
68
        /*
69
         * Normally, with the shift-and-add approach, if both numbers are
70
         * positive you get the correct result.  With 32-bit two's-complement
71
         * numbers, -x is represented as
72
         *
73
         *                x                 32
74
         *      ( 2  -  ------ ) mod 2  *  2
75
         *                 32
76
         *                2
77
         *
78
         * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
79
         * we can treat this as if the radix point were just to the left
80
         * of the sign bit (multiply by 2^32), and get
81
         *
82
         *      -x  =  (2 - x) mod 2
83
         *
84
         * Then, ignoring the `mod 2's for convenience:
85
         *
86
         *   x *  y     = xy
87
         *  -x *  y     = 2y - xy
88
         *   x * -y     = 2x - xy
89
         *  -x * -y     = 4 - 2x - 2y + xy
90
         *
91
         * For signed multiplies, we subtract (x << 32) from the partial
92
         * product to fix this problem for negative multipliers (see mul.s).
93
         * Because of the way the shift into the partial product is calculated
94
         * (N xor V), this term is automatically removed for the multiplicand,
95
         * so we don't have to adjust.
96
         *
97
         * But for unsigned multiplies, the high order bit wasn't a sign bit,
98
         * and the correction is wrong.  So for unsigned multiplies where the
99
         * high order bit is one, we end up with xy - (y << 32).  To fix it
100
         * we add y << 32.
101
         */
102
#if 0
        /* Disabled variant: the same fix-up done with a branch.  The add in
         * the annulled delay slot runs only when the branch is taken, i.e.
         * when bit 31 of %o1 is set.  Kept for comparison with the
         * branch-free version in the #else arm. */
103
        tst     %o1
104
        bl,a    1f              ! if %o1 < 0 (high order bit = 1),
105
         add    %o4, %o0, %o4   ! %o4 += %o0 (add y to upper half)
106
1:      rd      %y, %o0         ! get lower half of product
107
        retl
108
         addcc  %o4, %g0, %o1   ! put upper half in place and set Z for %o1==0
109
#else
110
        /* Faster code from tege@sics.se.  */
        /* Branch-free fix-up: %o2 becomes %o0 when bit 31 of %o1 is set and
         * 0 otherwise.  It must be computed before %o0 is overwritten with
         * the low half read back from %y. */
111
        sra     %o1, 31, %o2    ! make mask from sign bit
112
        and     %o0, %o2, %o2   ! %o2 = 0 or %o0, depending on sign of %o1
113
        rd      %y, %o0         ! get lower half of product
114
        retl
        /* Delay slot of the return: the addcc writes the upper half to %o1
         * and sets Z from that value, which is what the `bnz overflow'
         * idiom in the file header tests. */
115
         addcc  %o4, %o2, %o1   ! add compensation and put upper half in place
116
#endif
117
 
118
.Lmul_shortway:
        /* Reached from the entry test when neither operand has any of
         * bits 12..31 set.  %o4 was already zeroed, and N and V cleared,
         * by the andcc in the delay slot of the branch that jumps here. */
119
        /*
120
         * Short multiply.  12 steps, followed by a final shift step.
121
         * The resulting bits are off by 12 and (32-12) = 20 bit positions,
122
         * but there is no problem with %o0 being negative (unlike above),
123
         * and overflow is impossible (the answer is at most 24 bits long).
124
         */
125
        mulscc  %o4, %o1, %o4   ! 1
126
        mulscc  %o4, %o1, %o4   ! 2
127
        mulscc  %o4, %o1, %o4   ! 3
128
        mulscc  %o4, %o1, %o4   ! 4
129
        mulscc  %o4, %o1, %o4   ! 5
130
        mulscc  %o4, %o1, %o4   ! 6
131
        mulscc  %o4, %o1, %o4   ! 7
132
        mulscc  %o4, %o1, %o4   ! 8
133
        mulscc  %o4, %o1, %o4   ! 9
134
        mulscc  %o4, %o1, %o4   ! 10
135
        mulscc  %o4, %o1, %o4   ! 11
136
        mulscc  %o4, %o1, %o4   ! 12
137
        mulscc  %o4, %g0, %o4   ! final shift
138
 
139
        /*
140
         * %o4 has 20 of the bits that should be in the result; %y has
141
         * the bottom 12 (as %y's top 12).  That is:
142
         *
143
         *        %o4               %y
144
         * +----------------+----------------+
145
         * | -12- |   -20-  | -12- |   -20-  |
146
         * +------(---------+------)---------+
147
         *         -----result-----
148
         *
149
         * The 12 bits of %o4 left of the `result' area are all zero;
150
         * in fact, all top 20 bits of %o4 are zero.
151
         */
152
 
        /* Recombine the two pieces: %o0 = (%o4 << 12) | (%y >> 20).
         * %o5 is scratch for the copy of %y. */
153
        rd      %y, %o5
154
        sll     %o4, 12, %o0    ! shift middle bits left 12
155
        srl     %o5, 20, %o5    ! shift low bits right 20
156
        or      %o5, %o0, %o0
157
        retl
        /* Delay slot of the return: the upper half is always zero on this
         * path, so %o1 is cleared and Z is set (no overflow reported). */
158
         addcc  %g0, %g0, %o1   ! %o1 = zero, and set Z
159
 
160
/* Symbol size = distance from the .umul label to this point, so it covers
 * both the long and the short path. */
.size  .umul , . -.umul

powered by: WebSVN 2.1.0

© copyright 1999-2025 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.