// Source: GCC 4.5.1, gcc/config/ia64/lib1funcs.asm
// Mirrored by the OpenCores openrisc_me repository, trunk rev 282:
// https://opencores.org/ocsvn/openrisc_me/openrisc_me/trunk
// [openrisc/trunk/gnu-src/gcc-4.5.1/gcc/config/ia64/lib1funcs.asm]
/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	// p7 starts true; it is cleared below whenever p6 is set, so the
	// two predicates are exact complements (hence .pred.rel.mutex).
	cmp.eq p7, p0 = r0, r0
	// frcpa: f10 = approx 1/farg1; p6 set iff refinement is required
	// (p6 clear means f10 already holds the architecturally final result).
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of the reciprocal and the quotient,
	// all under p6; .s1 steps use status field 1, final op uses .s0.
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	// Final rounding step (p6) or frcpa's own result (p7).
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// p7 = !p6, same predicate scheme as __divxf3 above.
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement to double precision.
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.
// farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// p7 starts true and is cleared iff p6 is set by frcpa, so the two
	// predicates are exact complements (hence .pred.rel.mutex).
	cmp.eq p7, p0 = r0, r0
	// f10 = approx 1/farg1; p6 set iff Newton-Raphson refinement is needed.
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	// Final normalization to single precision (p6), or frcpa's own
	// already-final result (p7).
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).
// in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The dividend is kept in f14 so it
	// survives until the final xma.l below.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor (in an integer slot, overlapped with FP work)
	// for the remainder computation below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.
// in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	// fcvt.xuf treats the significands as unsigned, as required here.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).
// in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The dividend is kept in f14 so it
	// survives until the final xma.l below.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate the divisor (integer slot, overlapped with FP work) for the
	// remainder computation below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	// Sign-extend the 32-bit inputs to 64 bits.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	// f11 = 2^-34 (setf.exp, biased exponent 0x0ffdd): correction term
	// added into the single refinement step below so truncation yields
	// the exact 32-bit quotient.
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	// One Newton-Raphson iteration suffices for 32-bit operands.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.
// in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	// Sign-extend the 32-bit inputs to 64 bits.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// r32/r33 are the stacked names for in0/in1.  The dividend is kept in
	// f13 so it survives until the final xma.l below.
	setf.sig f13 = r32
	setf.sig f9 = r33
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate the divisor for the remainder computation below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	// f11 = 2^-34 (setf.exp, biased exponent 0x0ffdd): correction term
	// for the single refinement step.
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// One Newton-Raphson iteration suffices for 32-bit operands.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	// Zero-extend the 32-bit inputs to 64 bits.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// f11 = 2^-34 (setf.exp, biased exponent 0x0ffdd): correction term
	// for the single refinement step.
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	// One Newton-Raphson iteration suffices for 32-bit operands.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.
// in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	// Zero-extend the 32-bit inputs to 64 bits.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// The dividend is kept in f13 so it survives until the final xma.l.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate the divisor for the remainder computation below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	// f11 = 2^-34 (setf.exp, biased exponent 0x0ffdd): correction term
	// for the single refinement step.
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Divisor was zero: trap with a break (SIGFPE on Linux).
(p7)	break 1
	;;
	// One Newton-Raphson iteration suffices for 32-bit operands.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  // save_area[0] = stack_pointer; advance to save_area+24.
	  st8 [in0] = in1, 24
	  // Put the RSE in enforced-lazy mode (clear the mode bits).
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // save_area[3] = ar.pfs; step back to save_area+8.
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  // Prepare the eager-mode value to restore on exit.
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  // save_area[1] = ar.bsp, save_area[2] = ar.rnat.
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  // Restore the saved stack pointer.
	  ld8 r12 = [in1], 8
	  // Return will branch to the target label via .L0.
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  // Load saved ar.bsp.
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  // Load saved ar.rnat.
	  ld8 r17 = [in1], 8
	  // Put the RSE in enforced-lazy mode before writing ar.bspstore.
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // Load saved ar.pfs.
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  // Pass the static chain in r15.
	  mov r15 = in2
	  ;;
	}
.L0:	{
.mibmov ar.rsc = r19mov ar.pfs = r18br.ret.sptk.few rp;;}.endp __ia64_nonlocal_goto#endif#ifdef L__restore_stack_nonlocal// This is mostly the same as nonlocal_goto above.// ??? This has not been tested yet.// void __ia64_restore_stack_nonlocal(void *save_area).text.align 16.global __ia64_restore_stack_nonlocal.proc __ia64_restore_stack_nonlocal__ia64_restore_stack_nonlocal:{ .mmfalloc r20 = ar.pfs, 4, 0, 0, 0ld8 r12 = [in0], 8;;}{ .mmbld8 r16=[in0], 8mov r19 = ar.rsc;;}{ .mmiflushrsld8 r17 = [in0], 8and r19 = 0x1c, r19;;}{ .mmfld8 r18 = [in0]mov ar.rsc = r19;;}{ .mmimov ar.bspstore = r16;;mov ar.rnat = r17or r19 = 0x3, r19;;}{ .mmfloadrsinvala;;}.L0: { .mibmov ar.rsc = r19mov ar.pfs = r18br.ret.sptk.few rp;;}.endp __ia64_restore_stack_nonlocal#endif#ifdef L__trampoline// Implement the nested function trampoline. This is out of line// so that we don't have to bother with flushing the icache, as// well as making the on-stack trampoline smaller.//// The trampoline has the following form://// +-------------------+ >// TRAMP: | __ia64_trampoline | |// +-------------------+ > fake function descriptor// | TRAMP+16 | |// +-------------------+ >// | target descriptor |// +-------------------+// | static link |// +-------------------+.text.align 16.global __ia64_trampoline.proc __ia64_trampoline__ia64_trampoline:{ .mmild8 r2 = [r1], 8;;ld8 r15 = [r1]}{ .mmild8 r3 = [r2], 8;;ld8 r1 = [r2]mov b6 = r3}{ .bbbbr.sptk.many b6;;}.endp __ia64_trampoline#endif#ifdef SHARED// Thunks for backward compatibility.#ifdef L_fixtfdi.text.align 16.global __fixtfti.proc __fixtfti__fixtfti:{ .bbbbr.sptk.many __fixxfti;;}.endp __fixtfti#endif#ifdef L_fixunstfdi.align 16.global __fixunstfti.proc __fixunstfti__fixunstfti:{ .bbbbr.sptk.many __fixunsxfti;;}.endp __fixunstfti#endif#ifdef L_floatditf.align 16.global __floattitf.proc __floattitf__floattitf:{ .bbbbr.sptk.many __floattixf;;}.endp __floattitf#endif#endif
