URL
https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk
Subversion Repositories openrisc_me
[/] [openrisc/] [trunk/] [gnu-src/] [gcc-4.5.1/] [gcc/] [config/] [sparc/] [lb1spc.asm] - Rev 282
Compare with Previous | Blame | View Log
/* This is an assembly language implementation of mulsi3, divsi3, and modsi3
   for the sparc processor.

   These routines are derived from the SPARC Architecture Manual, version 8,
   slightly edited to match the desired calling convention, and also to
   optimize them for our purposes.

   Layout notes:
     - The file is run through the C preprocessor; exactly one of
       L_mulsi3, L_divsi3 or L_modsi3 selects the routine that is emitted.
       The divide and remainder sections reuse the same plain label names
       (divloop, got_result, ...), so they must not be enabled together.
     - Every routine is a leaf: it returns with retl and the instruction
       written after a branch/retl occupies that instruction's delay slot.
     - Comments introduced with `!' deliberately avoid apostrophes so the
       preprocessor does not complain about unterminated character
       constants.  */

#ifdef L_mulsi3

/*
 * .umul -- 32 x 32 multiply built from mulscc steps.
 *
 * Input:	%o0 = multiplier, %o1 = multiplicand
 * Output:	%o0 = low 32 bits of the product (the upper half is not
 *		corrected and is not returned)
 * Uses:	%o4 (partial product), %o5, %y, integer condition codes
 *
 * When neither operand has a bit set above bit 11 the short path runs
 * 12 multiply steps instead of 32 and then reassembles the product from
 * %o4 and %y.
 */
.text
	.align 4
	.global .umul
	.proc 4
.umul:
	or	%o0, %o1, %o4	! logical or of multiplier and multiplicand
	mov	%o0, %y		! multiplier to Y register
	andncc	%o4, 0xfff, %o5	! mask out lower 12 bits
	be	mul_shortway	! can do it the short way
	andcc	%g0, %g0, %o4	! zero the partial product and clear NV cc
	!
	! long multiply
	!
	mulscc	%o4, %o1, %o4	! first iteration of 33
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 32nd iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	! the upper 32 bits of product are wrong, but we do not care
	retl
	rd	%y, %o0		! (delay slot) low word of product is in Y
	!
	! short multiply
	!
mul_shortway:
	mulscc	%o4, %o1, %o4	! first iteration of 13
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4
	mulscc	%o4, %o1, %o4	! 12th iteration
	mulscc	%o4, %g0, %o4	! last iteration only shifts
	rd	%y, %o5
	sll	%o4, 12, %o4	! left shift partial product by 12 bits
	srl	%o5, 20, %o5	! right shift partial product by 20 bits
	retl
	or	%o5, %o4, %o0	! merge for true product
#endif

#ifdef L_divsi3

/*
 * Division and remainder, from Appendix E of the SPARC Version 8
 * Architecture Manual, with fixes from Gordon Irlam.
 */

/*
 * Input: dividend and divisor in %o0 and %o1 respectively.
 * Output: quotient in %o0.
 *
 * Entry points:
 *  .udiv	unsigned divide (forces a non-negative result sign in %g3)
 *  .div	signed divide (%g3 = %o1 xor %o0 carries the result sign;
 *		negative operands are negated before dividing)
 *
 * m4 parameters (this text is the already-expanded result; the values
 * used for this section are the ones named on the left):
 *  .div	name of function to generate
 *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
 *  true	true=true => signed; true=false => unsigned
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Register use in the code below:
 *  %o2		Q	%o3	R	%o5	V
 *  %o4		main-loop iteration counter
 *  %g2		single-bit step counter (large-dividend path only)
 *  %g1		scratch constant 1 << (32 - 4 - 1)
 *  %g3		sign of the result, tested at got_result
 *
 * Division by zero executes `ta 0x2' (ST_DIV0); if the trap returns,
 * the routine returns 0.
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */
	.global .udiv
	.align 4
	.proc 4
	.text
.udiv:
	b	ready_to_divide
	mov	0, %g3		! result is always positive

	.global .div
	.align 4
	.proc 4
	.text
.div:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	ready_to_divide	! no, go do the divide
	xor	%o1, %o0, %g3	! compute sign in any case
	tst	%o1
	bge	1f
	tst	%o0
	! %o1 is definitely negative; %o0 might also be negative
	bge	ready_to_divide	! if %o0 not negative...
	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative

ready_to_divide:

	! Ready to divide.  Compute size of quotient; scale comparand.
	orcc	%o1, %g0, %o5
	bne	1f
	mov	%o0, %o3

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		! ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop
	be	do_single_div
	nop
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop
single_divloop:
	sll	%o2, 1, %o2
	bl	1f
	srl	%o5, 1, %o5
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4
	be	got_result
	sub	%o4, 1, %o4

	tst	%o3	! set up for initial iteration
	! Each pass of divloop develops 4 quotient bits with a fully
	! unrolled non-restoring decision tree; label Ld.k is the node at
	! depth d, and every leaf adds its signed digit to %o2.
divloop:
	sll	%o2, 4, %o2
		! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2

L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2

L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	sub	%o2, 1, %o2	! (annulled delay slot) runs only if %o3 < 0

got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o2, %o2	! (annulled delay slot) negate quotient
1:
	retl
	mov	%o2, %o0
#endif

#ifdef L_modsi3

/* This implementation was taken from glibc:
 *
 * Input: dividend and divisor in %o0 and %o1 respectively.
 * Output: remainder in %o0.
 *
 * Entry points:
 *  .urem	unsigned remainder (forces a non-negative result sign)
 *  .rem	signed remainder (%g3 = original %o0, so the result takes
 *		the sign of the dividend; negative operands are negated
 *		before dividing)
 *
 * Algorithm parameters:
 *  N		how many bits per iteration we try to get (4)
 *  WORDSIZE	total number of bits (32)
 *
 * Derived constants:
 *  TOPBITS	number of bits in the top decade of a number
 *
 * Important variables:
 *  Q		the partial quotient under development (initially 0)
 *  R		the remainder so far, initially the dividend
 *  ITER	number of main division loop iterations required;
 *		equal to ceil(log2(quotient) / N).  Note that this
 *		is the log base (2^N) of the quotient.
 *  V		the current comparand, initially divisor*2^(ITER*N-1)
 *
 * Register use in the code below:
 *  %o2		Q	%o3	R	%o5	V
 *  %o4		main-loop iteration counter
 *  %g2		single-bit step counter (large-dividend path only)
 *  %g1		scratch constant 1 << (32 - 4 - 1)
 *  %g3		sign of the result, tested at got_result
 *
 * Division by zero executes `ta 0x2' (ST_DIV0); if the trap returns,
 * the routine returns 0.
 *
 * Cost:
 *  Current estimate for non-large dividend is
 *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
 *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
 *  different path, as the upper bits of the quotient must be developed
 *  one bit at a time.
 */
.text
	.align 4
	.global .urem
	.proc 4
.urem:
	b	divide
	mov	0, %g3		! result always positive

	.align 4
	.global .rem
	.proc 4
.rem:
	! compute sign of result; if neither is negative, no problem
	orcc	%o1, %o0, %g0	! either negative?
	bge	2f		! no, go do the divide
	mov	%o0, %g3	! sign of remainder matches %o0
	tst	%o1
	bge	1f
	tst	%o0
	! %o1 is definitely negative; %o0 might also be negative
	bge	2f		! if %o0 not negative...
	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
1:	! %o0 is negative, %o1 is nonnegative
	sub	%g0, %o0, %o0	! make %o0 nonnegative
2:

	! Ready to divide.  Compute size of quotient; scale comparand.
divide:
	orcc	%o1, %g0, %o5
	bne	1f
	mov	%o0, %o3

	! Divide by zero trap.  If it returns, return 0 (about as
	! wrong as possible, but that is what SunOS does...).
	ta	0x2		!ST_DIV0
	retl
	clr	%o0

1:
	cmp	%o3, %o5	! if %o1 exceeds %o0, done
	blu	got_result	! (and algorithm fails otherwise)
	clr	%o2
	sethi	%hi(1 << (32 - 4 - 1)), %g1
	cmp	%o3, %g1
	blu	not_really_big
	clr	%o4

	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
	! as our usual N-at-a-shot divide step will cause overflow and havoc.
	! The number of bits in the result here is N*ITER+SC, where SC <= N.
	! Compute ITER in an unorthodox manner: know we need to shift V into
	! the top decade: so do not even bother to compare to R.
1:
	cmp	%o5, %g1
	bgeu	3f
	mov	1, %g2
	sll	%o5, 4, %o5
	b	1b
	add	%o4, 1, %o4

	! Now compute %g2.
2:	addcc	%o5, %o5, %o5
	bcc	not_too_big
	add	%g2, 1, %g2

	! We get here if the %o1 overflowed while shifting.
	! This means that %o3 has the high-order bit set.
	! Restore %o5 and subtract from %o3.
	sll	%g1, 4, %g1	! high order bit
	srl	%o5, 1, %o5	! rest of %o5
	add	%o5, %g1, %o5
	b	do_single_div
	sub	%g2, 1, %g2

not_too_big:
3:	cmp	%o5, %o3
	blu	2b
	nop
	be	do_single_div
	nop
	/* NB: these are commented out in the V8-SPARC manual as well */
	/* (I do not understand this) */
	! %o5 > %o3: went too far: back up 1 step
	!	srl	%o5, 1, %o5
	!	dec	%g2
	! do single-bit divide steps
	!
	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
	! first divide step without thinking.  BUT, the others are conditional,
	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
	! order bit set in the first step, just falling into the regular
	! division loop will mess up the first time around.
	! So we unroll slightly...
do_single_div:
	subcc	%g2, 1, %g2
	bl	end_regular_divide
	nop
	sub	%o3, %o5, %o3
	mov	1, %o2
	b	end_single_divloop
	nop
single_divloop:
	sll	%o2, 1, %o2
	bl	1f
	srl	%o5, 1, %o5
	! %o3 >= 0
	sub	%o3, %o5, %o3
	b	2f
	add	%o2, 1, %o2
1:	! %o3 < 0
	add	%o3, %o5, %o3
	sub	%o2, 1, %o2
2:
end_single_divloop:
	subcc	%g2, 1, %g2
	bge	single_divloop
	tst	%o3
	b,a	end_regular_divide

not_really_big:
1:
	sll	%o5, 4, %o5
	cmp	%o5, %o3
	bleu	1b
	addcc	%o4, 1, %o4
	be	got_result
	sub	%o4, 1, %o4

	tst	%o3	! set up for initial iteration
	! Each pass of divloop runs 4 non-restoring divide steps through a
	! fully unrolled decision tree; label Ld.k is the node at depth d.
	! The quotient digits are still accumulated in %o2 although only
	! the remainder in %o3 is returned.
divloop:
	sll	%o2, 4, %o2
		! depth 1, accumulated bits 0
	bl	L1.16
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 2, accumulated bits 1
	bl	L2.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 3, accumulated bits 3
	bl	L3.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits 7
	bl	L4.23
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2+1), %o2

L4.23:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (7*2-1), %o2

L3.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits 5
	bl	L4.21
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2+1), %o2

L4.21:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (5*2-1), %o2

L2.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 3, accumulated bits 1
	bl	L3.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits 3
	bl	L4.19
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2+1), %o2

L4.19:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (3*2-1), %o2

L3.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits 1
	bl	L4.17
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2+1), %o2

L4.17:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (1*2-1), %o2

L1.16:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 2, accumulated bits -1
	bl	L2.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 3, accumulated bits -1
	bl	L3.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits -1
	bl	L4.15
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2+1), %o2

L4.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-1*2-1), %o2

L3.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits -3
	bl	L4.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2+1), %o2

L4.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-3*2-1), %o2

L2.15:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 3, accumulated bits -3
	bl	L3.13
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
			! depth 4, accumulated bits -5
	bl	L4.11
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2+1), %o2

L4.11:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-5*2-1), %o2

L3.13:
	! remainder is negative
	addcc	%o3,%o5,%o3
			! depth 4, accumulated bits -7
	bl	L4.9
	srl	%o5,1,%o5
	! remainder is positive
	subcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2+1), %o2

L4.9:
	! remainder is negative
	addcc	%o3,%o5,%o3
	b	9f
	add	%o2, (-7*2-1), %o2

9:
end_regular_divide:
	subcc	%o4, 1, %o4
	bge	divloop
	tst	%o3
	bl,a	got_result
	! non-restoring fixup here (one instruction only!)
	add	%o3, %o1, %o3	! (annulled delay slot) runs only if %o3 < 0

got_result:
	! check to see if answer should be < 0
	tst	%g3
	bl,a	1f
	sub	%g0, %o3, %o3	! (annulled delay slot) negate remainder
1:
	retl
	mov	%o3, %o0
#endif
