OpenCores
URL https://opencores.org/ocsvn/or1k/or1k/trunk

Subversion Repositories or1k

[/] [or1k/] [trunk/] [rc203soc/] [sw/] [uClinux/] [arch/] [sparc/] [lib/] [umul.S] - Blame information for rev 1765

Details | Compare with Previous | View Log

Line No. Rev Author Line
1 1624 jcastillo
/* $Id: umul.S,v 1.1 2005-12-20 09:50:47 jcastillo Exp $
2
 * umul.S:      This routine was taken from glibc-1.09 and is covered
3
 *              by the GNU Library General Public License Version 2.
4
 */
5
 
6
 
7
/*
 * .umul -- unsigned 32 x 32 -> 64 bit multiply built from mulscc steps.
 *
 * In:     %o0 = multiplier   (loaded into %y)
 *         %o1 = multiplicand (added into the partial product)
 * Out:    %o0 = low  32 bits of %o0 * %o1
 *         %o1 = high 32 bits of %o0 * %o1
 *         Z   = set if and only if %o1 == 0 on return
 * Clobb:  %o2 (long path), %o5 (short path), %o4, %y and the integer
 *         condition codes.
 *
 * Leaf routine: it returns with retl and touches neither the stack nor
 * the register window.
 *
 * This code optimizes short (less than 13-bit) multiplies.  According to
 * the original glibc comment, short multiplies require 25 instruction
 * cycles, and long ones require 45 instruction cycles.
 *
 * On return, overflow has occurred (%o1 is not zero) if and only if
 * the Z condition code is clear, allowing, e.g., the following:
 *
 *      call    .umul
 *      nop
 *      bnz     overflow        (or tnz)
 */

        .globl .umul
.umul:
        ! Exactly three instructions separate the write to %y from the
        ! first mulscc on both paths.  NOTE(review): presumably deliberate,
        ! since SPARC V8 documents WRY as a delayed write -- keep this
        ! spacing if the entry sequence is ever rearranged.
        or      %o0, %o1, %o4   ! %o4 = every bit that is set in either operand
        mov     %o0, %y         ! multiplier -> Y
        andncc  %o4, 0xfff, %g0 ! test bits 12..31 of *both* args
        be      Lmul_shortway   ! if zero, can do it the short way
        andcc   %g0, %g0, %o4   ! (delay slot, runs on both paths) zero the
                                ! partial product and clear N and V

        /*
         * Long multiply.  32 steps, followed by a final shift step.
         * Each step consumes one bit of the multiplier from %y; the
         * accumulated high half lives in %o4.
         */
        mulscc  %o4, %o1, %o4   ! 1
        mulscc  %o4, %o1, %o4   ! 2
        mulscc  %o4, %o1, %o4   ! 3
        mulscc  %o4, %o1, %o4   ! 4
        mulscc  %o4, %o1, %o4   ! 5
        mulscc  %o4, %o1, %o4   ! 6
        mulscc  %o4, %o1, %o4   ! 7
        mulscc  %o4, %o1, %o4   ! 8
        mulscc  %o4, %o1, %o4   ! 9
        mulscc  %o4, %o1, %o4   ! 10
        mulscc  %o4, %o1, %o4   ! 11
        mulscc  %o4, %o1, %o4   ! 12
        mulscc  %o4, %o1, %o4   ! 13
        mulscc  %o4, %o1, %o4   ! 14
        mulscc  %o4, %o1, %o4   ! 15
        mulscc  %o4, %o1, %o4   ! 16
        mulscc  %o4, %o1, %o4   ! 17
        mulscc  %o4, %o1, %o4   ! 18
        mulscc  %o4, %o1, %o4   ! 19
        mulscc  %o4, %o1, %o4   ! 20
        mulscc  %o4, %o1, %o4   ! 21
        mulscc  %o4, %o1, %o4   ! 22
        mulscc  %o4, %o1, %o4   ! 23
        mulscc  %o4, %o1, %o4   ! 24
        mulscc  %o4, %o1, %o4   ! 25
        mulscc  %o4, %o1, %o4   ! 26
        mulscc  %o4, %o1, %o4   ! 27
        mulscc  %o4, %o1, %o4   ! 28
        mulscc  %o4, %o1, %o4   ! 29
        mulscc  %o4, %o1, %o4   ! 30
        mulscc  %o4, %o1, %o4   ! 31
        mulscc  %o4, %o1, %o4   ! 32
        mulscc  %o4, %g0, %o4   ! final shift


        /*
         * Normally, with the shift-and-add approach, if both numbers are
         * positive you get the correct result.  With 32-bit two's-complement
         * numbers, -x is represented as
         *
         *                x                 32
         *      ( 2  -  ------ ) mod 2  *  2
         *                 32
         *                2
         *
         * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
         * we can treat this as if the radix point were just to the left
         * of the sign bit (multiply by 2^32), and get
         *
         *      -x  =  (2 - x) mod 2
         *
         * Then, ignoring the `mod 2's for convenience:
         *
         *   x *  y     = xy
         *  -x *  y     = 2y - xy
         *   x * -y     = 2x - xy
         *  -x * -y     = 4 - 2x - 2y + xy
         *
         * For signed multiplies, we subtract (x << 32) from the partial
         * product to fix this problem for negative multipliers (see mul.s).
         * Because of the way the shift into the partial product is calculated
         * (N xor V), this term is automatically removed for the multiplicand,
         * so we don't have to adjust.
         *
         * But for unsigned multiplies, the high order bit wasn't a sign bit,
         * and the correction is wrong.  So for unsigned multiplies where the
         * high order bit is one, we end up with xy - (y << 32).  To fix it
         * we add y << 32.
         *
         * In register terms: the operand whose high order bit matters is
         * the multiplicand %o1, and the value added back into the upper
         * half is the multiplier %o0.
         *
         * The branch-free sequence below (faster code from tege@sics.se)
         * replaces the earlier tst / bl,a / add form of the same fix-up.
         */
        sra     %o1, 31, %o2    ! make mask from sign bit
        and     %o0, %o2, %o2   ! %o2 = 0 or %o0, depending on sign of %o1
        rd      %y, %o0         ! get lower half of product; must stay after
                                ! the and above, which still needs the
                                ! original %o0
        retl
        addcc   %o4, %o2, %o1   ! (delay slot) add compensation, put upper
                                ! half in place and set Z for %o1 == 0

Lmul_shortway:
        /*
         * Short multiply.  12 steps, followed by a final shift step.
         *
         * Reached only from the be in .umul, i.e. when bits 12..31 of both
         * operands are zero.  On entry %y holds the multiplier, %o4 is zero
         * and N and V are clear (set up by the entry sequence and its delay
         * slot).
         *
         * The resulting bits are off by 12 and (32-12) = 20 bit positions,
         * but no sign compensation is needed (unlike above) because bit 31
         * of %o1 is clear, and overflow is impossible (the answer is at
         * most 24 bits long).
         */
        mulscc  %o4, %o1, %o4   ! 1
        mulscc  %o4, %o1, %o4   ! 2
        mulscc  %o4, %o1, %o4   ! 3
        mulscc  %o4, %o1, %o4   ! 4
        mulscc  %o4, %o1, %o4   ! 5
        mulscc  %o4, %o1, %o4   ! 6
        mulscc  %o4, %o1, %o4   ! 7
        mulscc  %o4, %o1, %o4   ! 8
        mulscc  %o4, %o1, %o4   ! 9
        mulscc  %o4, %o1, %o4   ! 10
        mulscc  %o4, %o1, %o4   ! 11
        mulscc  %o4, %o1, %o4   ! 12
        mulscc  %o4, %g0, %o4   ! final shift

        /*
         * %o4 has 20 of the bits that should be in the result; %y has
         * the bottom 12 (as %y's top 12).  That is:
         *
         *        %o4               %y
         * +----------------+----------------+
         * | -12- |   -20-  | -12- |   -20-  |
         * +------(---------+------)---------+
         *         -----result-----
         *
         * The 12 bits of %o4 left of the `result' area are all zero;
         * in fact, all top 20 bits of %o4 are zero.
         */

        rd      %y, %o5         ! %o5 = low 12 product bits, held in bits 20..31
        sll     %o4, 12, %o0    ! shift middle bits left 12
        srl     %o5, 20, %o5    ! shift low bits right 20
        or      %o5, %o0, %o0   ! %o0 = complete product (fits in 32 bits)
        retl
        addcc   %g0, %g0, %o1   ! (delay slot) %o1 = zero, and set Z

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.