URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [ia64/] [lib1funcs.S] - Blame information for rev 777

Go to most recent revision | Details | Compare with Previous | View Log


/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson .
 
   This file is part of GCC.
 
   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.
 
   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
 
   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.
 
   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   .  */
 
#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
 
        .text
        .align 16
        .global __divxf3
        .proc __divxf3
__divxf3:
#ifdef SHARED
        .global __divtf3
__divtf3:
#endif
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
        ;;
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
        ;;
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
        ;;
(p6)    fma.s1 f10 = f13, f11, f10
(p6)    fnma.s1 f11 = farg1, f12, farg0
        ;;
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
        ;;
(p6)    fma.s0 fret0 = f12, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        .endp __divxf3
#endif
 
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
 
        .text
        .align 16
        .global __divdf3
        .proc __divdf3
__divdf3:
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
        ;;
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
(p6)    fma.d fret0 = f8, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdf3
#endif
 
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
 
        .text
        .align 16
        .global __divsf3
        .proc __divsf3
__divsf3:
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
(p6)    fma.d.s1 f10 = f9, f8, f8
        ;;
(p6)    fnorm.s.s0 fret0 = f10
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsf3
#endif
 
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __divdi3
        .proc __divdi3
__divdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdi3
#endif
 
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
 
        .text
        .align 16
        .global __moddi3
        .proc __moddi3
__moddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f14
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __moddi3
#endif
 
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __udivdi3
        .proc __udivdi3
__udivdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software-assist faults.
        fcvt.xuf.s1 f8 = f8
        fcvt.xuf.s1 f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivdi3
#endif
 
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
 
        .text
        .align 16
        .global __umoddi3
        .proc __umoddi3
__umoddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software assist faults.
        fcvt.xuf.s1 f8 = f14
        fcvt.xuf.s1 f9 = f9
(p7)    break 1;
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umoddi3
#endif
 
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __divsi3
        .proc __divsi3
__divsi3:
        .regstk 2,0,0,0
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
(p7)    break 1
        ;;
        mov r2 = 0x0ffdd
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsi3
#endif
 
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __modsi3
        .proc __modsi3
__modsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        setf.sig f13 = r32
        setf.sig f9 = r33
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
(p7)    break 1
        ;;
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __modsi3
#endif
 
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __udivsi3
        .proc __udivsi3
__udivsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivsi3
#endif
 
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
 
        .text
        .align 16
        .global __umodsi3
        .proc __umodsi3
__umodsi3:
        .regstk 2,0,0,0
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        setf.sig f13 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
(p7)    break 1;
        ;;
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umodsi3
#endif
 
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".
 
// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
 
        .text
        .align 16
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        { .mmf
          alloc r18 = ar.pfs, 2, 0, 0, 0
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          st8 [in0] = in1, 24
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          st8 [in0] = r18, -16
          mov ar.rsc = r19
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov r16 = ar.bsp
          mov r17 = ar.rnat
          adds r2 = 8, in0
          ;;
        }
        { .mmi
          st8 [in0] = r16
          st8 [r2] = r17
        }
        { .mib
          mov ar.rsc = r19
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_save_stack_nonlocal
#endif
 
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//                           void *static_chain);
 
        .text
        .align 16
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        { .mmi
          alloc r20 = ar.pfs, 3, 0, 0, 0
          ld8 r12 = [in1], 8
          mov.ret.sptk rp = in0, .L0
          ;;
        }
        { .mmf
          ld8 r16 = [in1], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          ld8 r17 = [in1], 8
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          ld8 r18 = [in1]
          mov ar.rsc = r19
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          ;;
        }
        { .mmi
          loadrs
          invala
          mov r15 = in2
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_nonlocal_goto
#endif
 
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.
 
// void __ia64_restore_stack_nonlocal(void *save_area)
 
        .text
        .align 16
        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        { .mmf
          alloc r20 = ar.pfs, 4, 0, 0, 0
          ld8 r12 = [in0], 8
          ;;
        }
        { .mmb
          ld8 r16=[in0], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          ld8 r17 = [in0], 8
          and r19 = 0x1c, r19
          ;;
        }
        { .mmf
          ld8 r18 = [in0]
          mov ar.rsc = r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          or r19 = 0x3, r19
          ;;
        }
        { .mmf
          loadrs
          invala
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_restore_stack_nonlocal
#endif
 
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//              +-------------------+ >
//      TRAMP:  | __ia64_trampoline | |
//              +-------------------+  > fake function descriptor
//              | TRAMP+16          | |
//              +-------------------+ >
//              | target descriptor |
//              +-------------------+
//              | static link       |
//              +-------------------+
 
        .text
        .align 16
        .global __ia64_trampoline
        .proc __ia64_trampoline
__ia64_trampoline:
        { .mmi
          ld8 r2 = [r1], 8
          ;;
          ld8 r15 = [r1]
        }
        { .mmi
          ld8 r3 = [r2], 8
          ;;
          ld8 r1 = [r2]
          mov b6 = r3
        }
        { .bbb
          br.sptk.many b6
          ;;
        }
        .endp __ia64_trampoline
#endif
 
#ifdef SHARED
// Thunks for backward compatibility.
#ifdef L_fixtfdi
        .text
        .align 16
        .global __fixtfti
        .proc __fixtfti
__fixtfti:
        { .bbb
          br.sptk.many __fixxfti
          ;;
        }
        .endp __fixtfti
#endif
#ifdef L_fixunstfdi
        .align 16
        .global __fixunstfti
        .proc __fixunstfti
__fixunstfti:
        { .bbb
          br.sptk.many __fixunsxfti
          ;;
        }
        .endp __fixunstfti
#endif
#ifdef L_floatditf
        .align 16
        .global __floattitf
        .proc __floattitf
__floattitf:
        { .bbb
          br.sptk.many __floattixf
          ;;
        }
        .endp __floattitf
#endif
#endif

Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [ia64/] [lib1funcs.S] - Blame information for rev 777

Line No.	Rev	Author	Line
1	734	jeremybenn	`/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.`
2			`Contributed by James E. Wilson .`
3
4			`This file is part of GCC.`
5
6			`GCC is free software; you can redistribute it and/or modify`
7			`it under the terms of the GNU General Public License as published by`
8			`the Free Software Foundation; either version 3, or (at your option)`
9			`any later version.`
10
11			`GCC is distributed in the hope that it will be useful,`
12			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
13			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
14			`GNU General Public License for more details.`
15
16			`Under Section 7 of GPL version 3, you are granted additional`
17			`permissions described in the GCC Runtime Library Exception, version`
18			`3.1, as published by the Free Software Foundation.`
19
20			`You should have received a copy of the GNU General Public License and`
21			`a copy of the GCC Runtime Library Exception along with this program;`
22			`see the files COPYING3 and COPYING.RUNTIME respectively. If not, see`
23			`. */`
24
25			`#ifdef L__divxf3`
26			`// Compute a 80-bit IEEE double-extended quotient.`
27			`//`
28			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
29			`// alternative.`
30			`//`
31			`// farg0 holds the dividend. farg1 holds the divisor.`
32			`//`
33			`// __divtf3 is an alternate symbol name for backward compatibility.`
34
35			`.text`
36			`.align 16`
37			`.global __divxf3`
38			`.proc __divxf3`
39			`__divxf3:`
40			`#ifdef SHARED`
41			`.global __divtf3`
42			`__divtf3:`
43			`#endif`
44			`cmp.eq p7, p0 = r0, r0`
45			`frcpa.s0 f10, p6 = farg0, farg1`
46			`;;`
47			`(p6) cmp.ne p7, p0 = r0, r0`
48			`.pred.rel.mutex p6, p7`
49			`(p6) fnma.s1 f11 = farg1, f10, f1`
50			`(p6) fma.s1 f12 = farg0, f10, f0`
51			`;;`
52			`(p6) fma.s1 f13 = f11, f11, f0`
53			`(p6) fma.s1 f14 = f11, f11, f11`
54			`;;`
55			`(p6) fma.s1 f11 = f13, f13, f11`
56			`(p6) fma.s1 f13 = f14, f10, f10`
57			`;;`
58			`(p6) fma.s1 f10 = f13, f11, f10`
59			`(p6) fnma.s1 f11 = farg1, f12, farg0`
60			`;;`
61			`(p6) fma.s1 f11 = f11, f10, f12`
62			`(p6) fnma.s1 f12 = farg1, f10, f1`
63			`;;`
64			`(p6) fma.s1 f10 = f12, f10, f10`
65			`(p6) fnma.s1 f12 = farg1, f11, farg0`
66			`;;`
67			`(p6) fma.s0 fret0 = f12, f10, f11`
68			`(p7) mov fret0 = f10`
69			`br.ret.sptk rp`
70			`.endp __divxf3`
71			`#endif`
72
73			`#ifdef L__divdf3`
74			`// Compute a 64-bit IEEE double quotient.`
75			`//`
76			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
77			`// alternative.`
78			`//`
79			`// farg0 holds the dividend. farg1 holds the divisor.`
80
81			`.text`
82			`.align 16`
83			`.global __divdf3`
84			`.proc __divdf3`
85			`__divdf3:`
86			`cmp.eq p7, p0 = r0, r0`
87			`frcpa.s0 f10, p6 = farg0, farg1`
88			`;;`
89			`(p6) cmp.ne p7, p0 = r0, r0`
90			`.pred.rel.mutex p6, p7`
91			`(p6) fmpy.s1 f11 = farg0, f10`
92			`(p6) fnma.s1 f12 = farg1, f10, f1`
93			`;;`
94			`(p6) fma.s1 f11 = f12, f11, f11`
95			`(p6) fmpy.s1 f13 = f12, f12`
96			`;;`
97			`(p6) fma.s1 f10 = f12, f10, f10`
98			`(p6) fma.s1 f11 = f13, f11, f11`
99			`;;`
100			`(p6) fmpy.s1 f12 = f13, f13`
101			`(p6) fma.s1 f10 = f13, f10, f10`
102			`;;`
103			`(p6) fma.d.s1 f11 = f12, f11, f11`
104			`(p6) fma.s1 f10 = f12, f10, f10`
105			`;;`
106			`(p6) fnma.d.s1 f8 = farg1, f11, farg0`
107			`;;`
108			`(p6) fma.d fret0 = f8, f10, f11`
109			`(p7) mov fret0 = f10`
110			`br.ret.sptk rp`
111			`;;`
112			`.endp __divdf3`
113			`#endif`
114
115			`#ifdef L__divsf3`
116			`// Compute a 32-bit IEEE float quotient.`
117			`//`
118			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
119			`// alternative.`
120			`//`
121			`// farg0 holds the dividend. farg1 holds the divisor.`
122
123			`.text`
124			`.align 16`
125			`.global __divsf3`
126			`.proc __divsf3`
127			`__divsf3:`
128			`cmp.eq p7, p0 = r0, r0`
129			`frcpa.s0 f10, p6 = farg0, farg1`
130			`;;`
131			`(p6) cmp.ne p7, p0 = r0, r0`
132			`.pred.rel.mutex p6, p7`
133			`(p6) fmpy.s1 f8 = farg0, f10`
134			`(p6) fnma.s1 f9 = farg1, f10, f1`
135			`;;`
136			`(p6) fma.s1 f8 = f9, f8, f8`
137			`(p6) fmpy.s1 f9 = f9, f9`
138			`;;`
139			`(p6) fma.s1 f8 = f9, f8, f8`
140			`(p6) fmpy.s1 f9 = f9, f9`
141			`;;`
142			`(p6) fma.d.s1 f10 = f9, f8, f8`
143			`;;`
144			`(p6) fnorm.s.s0 fret0 = f10`
145			`(p7) mov fret0 = f10`
146			`br.ret.sptk rp`
147			`;;`
148			`.endp __divsf3`
149			`#endif`
150
151			`#ifdef L__divdi3`
152			`// Compute a 64-bit integer quotient.`
153			`//`
154			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
155			`// alternative.`
156			`//`
157			`// in0 holds the dividend. in1 holds the divisor.`
158
159			`.text`
160			`.align 16`
161			`.global __divdi3`
162			`.proc __divdi3`
163			`__divdi3:`
164			`.regstk 2,0,0,0`
165			`// Transfer inputs to FP registers.`
166			`setf.sig f8 = in0`
167			`setf.sig f9 = in1`
168			`// Check divide by zero.`
169			`cmp.ne.unc p0,p7=0,in1`
170			`;;`
171			`// Convert the inputs to FP, so that they won't be treated as unsigned.`
172			`fcvt.xf f8 = f8`
173			`fcvt.xf f9 = f9`
174			`(p7) break 1`
175			`;;`
176			`// Compute the reciprocal approximation.`
177			`frcpa.s1 f10, p6 = f8, f9`
178			`;;`
179			`// 3 Newton-Raphson iterations.`
180			`(p6) fnma.s1 f11 = f9, f10, f1`
181			`(p6) fmpy.s1 f12 = f8, f10`
182			`;;`
183			`(p6) fmpy.s1 f13 = f11, f11`
184			`(p6) fma.s1 f12 = f11, f12, f12`
185			`;;`
186			`(p6) fma.s1 f10 = f11, f10, f10`
187			`(p6) fma.s1 f11 = f13, f12, f12`
188			`;;`
189			`(p6) fma.s1 f10 = f13, f10, f10`
190			`(p6) fnma.s1 f12 = f9, f11, f8`
191			`;;`
192			`(p6) fma.s1 f10 = f12, f10, f11`
193			`;;`
194			`// Round quotient to an integer.`
195			`fcvt.fx.trunc.s1 f10 = f10`
196			`;;`
197			`// Transfer result to GP registers.`
198			`getf.sig ret0 = f10`
199			`br.ret.sptk rp`
200			`;;`
201			`.endp __divdi3`
202			`#endif`
203
204			`#ifdef L__moddi3`
205			`// Compute a 64-bit integer modulus.`
206			`//`
207			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
208			`// alternative.`
209			`//`
210			`// in0 holds the dividend (a). in1 holds the divisor (b).`
211
212			`.text`
213			`.align 16`
214			`.global __moddi3`
215			`.proc __moddi3`
216			`__moddi3:`
217			`.regstk 2,0,0,0`
218			`// Transfer inputs to FP registers.`
219			`setf.sig f14 = in0`
220			`setf.sig f9 = in1`
221			`// Check divide by zero.`
222			`cmp.ne.unc p0,p7=0,in1`
223			`;;`
224			`// Convert the inputs to FP, so that they won't be treated as unsigned.`
225			`fcvt.xf f8 = f14`
226			`fcvt.xf f9 = f9`
227			`(p7) break 1`
228			`;;`
229			`// Compute the reciprocal approximation.`
230			`frcpa.s1 f10, p6 = f8, f9`
231			`;;`
232			`// 3 Newton-Raphson iterations.`
233			`(p6) fmpy.s1 f12 = f8, f10`
234			`(p6) fnma.s1 f11 = f9, f10, f1`
235			`;;`
236			`(p6) fma.s1 f12 = f11, f12, f12`
237			`(p6) fmpy.s1 f13 = f11, f11`
238			`;;`
239			`(p6) fma.s1 f10 = f11, f10, f10`
240			`(p6) fma.s1 f11 = f13, f12, f12`
241			`;;`
242			`sub in1 = r0, in1`
243			`(p6) fma.s1 f10 = f13, f10, f10`
244			`(p6) fnma.s1 f12 = f9, f11, f8`
245			`;;`
246			`setf.sig f9 = in1`
247			`(p6) fma.s1 f10 = f12, f10, f11`
248			`;;`
249			`fcvt.fx.trunc.s1 f10 = f10`
250			`;;`
251			`// r = q * (-b) + a`
252			`xma.l f10 = f10, f9, f14`
253			`;;`
254			`// Transfer result to GP registers.`
255			`getf.sig ret0 = f10`
256			`br.ret.sptk rp`
257			`;;`
258			`.endp __moddi3`
259			`#endif`
260
261			`#ifdef L__udivdi3`
262			`// Compute a 64-bit unsigned integer quotient.`
263			`//`
264			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
265			`// alternative.`
266			`//`
267			`// in0 holds the dividend. in1 holds the divisor.`
268
269			`.text`
270			`.align 16`
271			`.global __udivdi3`
272			`.proc __udivdi3`
273			`__udivdi3:`
274			`.regstk 2,0,0,0`
275			`// Transfer inputs to FP registers.`
276			`setf.sig f8 = in0`
277			`setf.sig f9 = in1`
278			`// Check divide by zero.`
279			`cmp.ne.unc p0,p7=0,in1`
280			`;;`
281			`// Convert the inputs to FP, to avoid FP software-assist faults.`
282			`fcvt.xuf.s1 f8 = f8`
283			`fcvt.xuf.s1 f9 = f9`
284			`(p7) break 1`
285			`;;`
286			`// Compute the reciprocal approximation.`
287			`frcpa.s1 f10, p6 = f8, f9`
288			`;;`
289			`// 3 Newton-Raphson iterations.`
290			`(p6) fnma.s1 f11 = f9, f10, f1`
291			`(p6) fmpy.s1 f12 = f8, f10`
292			`;;`
293			`(p6) fmpy.s1 f13 = f11, f11`
294			`(p6) fma.s1 f12 = f11, f12, f12`
295			`;;`
296			`(p6) fma.s1 f10 = f11, f10, f10`
297			`(p6) fma.s1 f11 = f13, f12, f12`
298			`;;`
299			`(p6) fma.s1 f10 = f13, f10, f10`
300			`(p6) fnma.s1 f12 = f9, f11, f8`
301			`;;`
302			`(p6) fma.s1 f10 = f12, f10, f11`
303			`;;`
304			`// Round quotient to an unsigned integer.`
305			`fcvt.fxu.trunc.s1 f10 = f10`
306			`;;`
307			`// Transfer result to GP registers.`
308			`getf.sig ret0 = f10`
309			`br.ret.sptk rp`
310			`;;`
311			`.endp __udivdi3`
312			`#endif`
313
314			`#ifdef L__umoddi3`
315			`// Compute a 64-bit unsigned integer modulus.`
316			`//`
317			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
318			`// alternative.`
319			`//`
320			`// in0 holds the dividend (a). in1 holds the divisor (b).`
321
322			`.text`
323			`.align 16`
324			`.global __umoddi3`
325			`.proc __umoddi3`
326			`__umoddi3:`
327			`.regstk 2,0,0,0`
328			`// Transfer inputs to FP registers.`
329			`setf.sig f14 = in0`
330			`setf.sig f9 = in1`
331			`// Check divide by zero.`
332			`cmp.ne.unc p0,p7=0,in1`
333			`;;`
334			`// Convert the inputs to FP, to avoid FP software assist faults.`
335			`fcvt.xuf.s1 f8 = f14`
336			`fcvt.xuf.s1 f9 = f9`
337			`(p7) break 1;`
338			`;;`
339			`// Compute the reciprocal approximation.`
340			`frcpa.s1 f10, p6 = f8, f9`
341			`;;`
342			`// 3 Newton-Raphson iterations.`
343			`(p6) fmpy.s1 f12 = f8, f10`
344			`(p6) fnma.s1 f11 = f9, f10, f1`
345			`;;`
346			`(p6) fma.s1 f12 = f11, f12, f12`
347			`(p6) fmpy.s1 f13 = f11, f11`
348			`;;`
349			`(p6) fma.s1 f10 = f11, f10, f10`
350			`(p6) fma.s1 f11 = f13, f12, f12`
351			`;;`
352			`sub in1 = r0, in1`
353			`(p6) fma.s1 f10 = f13, f10, f10`
354			`(p6) fnma.s1 f12 = f9, f11, f8`
355			`;;`
356			`setf.sig f9 = in1`
357			`(p6) fma.s1 f10 = f12, f10, f11`
358			`;;`
359			`// Round quotient to an unsigned integer.`
360			`fcvt.fxu.trunc.s1 f10 = f10`
361			`;;`
362			`// r = q * (-b) + a`
363			`xma.l f10 = f10, f9, f14`
364			`;;`
365			`// Transfer result to GP registers.`
366			`getf.sig ret0 = f10`
367			`br.ret.sptk rp`
368			`;;`
369			`.endp __umoddi3`
370			`#endif`
371
372			`#ifdef L__divsi3`
373			`// Compute a 32-bit integer quotient.`
374			`//`
375			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
376			`// alternative.`
377			`//`
378			`// in0 holds the dividend. in1 holds the divisor.`
379
380			`.text`
381			`.align 16`
382			`.global __divsi3`
383			`.proc __divsi3`
384			`__divsi3:`
385			`.regstk 2,0,0,0`
386			`// Check divide by zero.`
387			`cmp.ne.unc p0,p7=0,in1`
388			`sxt4 in0 = in0`
389			`sxt4 in1 = in1`
390			`;;`
391			`setf.sig f8 = in0`
392			`setf.sig f9 = in1`
393			`(p7) break 1`
394			`;;`
395			`mov r2 = 0x0ffdd`
396			`fcvt.xf f8 = f8`
397			`fcvt.xf f9 = f9`
398			`;;`
399			`setf.exp f11 = r2`
400			`frcpa.s1 f10, p6 = f8, f9`
401			`;;`
402			`(p6) fmpy.s1 f8 = f8, f10`
403			`(p6) fnma.s1 f9 = f9, f10, f1`
404			`;;`
405			`(p6) fma.s1 f8 = f9, f8, f8`
406			`(p6) fma.s1 f9 = f9, f9, f11`
407			`;;`
408			`(p6) fma.s1 f10 = f9, f8, f8`
409			`;;`
410			`fcvt.fx.trunc.s1 f10 = f10`
411			`;;`
412			`getf.sig ret0 = f10`
413			`br.ret.sptk rp`
414			`;;`
415			`.endp __divsi3`
416			`#endif`
417
418			`#ifdef L__modsi3`
419			`// Compute a 32-bit integer modulus.`
420			`//`
421			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
422			`// alternative.`
423			`//`
424			`// in0 holds the dividend. in1 holds the divisor.`
425
426			`.text`
427			`.align 16`
428			`.global __modsi3`
429			`.proc __modsi3`
430			`__modsi3:`
431			`.regstk 2,0,0,0`
432			`mov r2 = 0x0ffdd`
433			`sxt4 in0 = in0`
434			`sxt4 in1 = in1`
435			`;;`
436			`setf.sig f13 = r32`
437			`setf.sig f9 = r33`
438			`// Check divide by zero.`
439			`cmp.ne.unc p0,p7=0,in1`
440			`;;`
441			`sub in1 = r0, in1`
442			`fcvt.xf f8 = f13`
443			`fcvt.xf f9 = f9`
444			`;;`
445			`setf.exp f11 = r2`
446			`frcpa.s1 f10, p6 = f8, f9`
447			`(p7) break 1`
448			`;;`
449			`(p6) fmpy.s1 f12 = f8, f10`
450			`(p6) fnma.s1 f10 = f9, f10, f1`
451			`;;`
452			`setf.sig f9 = in1`
453			`(p6) fma.s1 f12 = f10, f12, f12`
454			`(p6) fma.s1 f10 = f10, f10, f11`
455			`;;`
456			`(p6) fma.s1 f10 = f10, f12, f12`
457			`;;`
458			`fcvt.fx.trunc.s1 f10 = f10`
459			`;;`
460			`xma.l f10 = f10, f9, f13`
461			`;;`
462			`getf.sig ret0 = f10`
463			`br.ret.sptk rp`
464			`;;`
465			`.endp __modsi3`
466			`#endif`
467
468			`#ifdef L__udivsi3`
469			`// Compute a 32-bit unsigned integer quotient.`
470			`//`
471			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
472			`// alternative.`
473			`//`
474			`// in0 holds the dividend. in1 holds the divisor.`
475
476			`.text`
477			`.align 16`
478			`.global __udivsi3`
479			`.proc __udivsi3`
480			`__udivsi3:`
481			`.regstk 2,0,0,0`
482			`mov r2 = 0x0ffdd`
483			`zxt4 in0 = in0`
484			`zxt4 in1 = in1`
485			`;;`
486			`setf.sig f8 = in0`
487			`setf.sig f9 = in1`
488			`// Check divide by zero.`
489			`cmp.ne.unc p0,p7=0,in1`
490			`;;`
491			`fcvt.xf f8 = f8`
492			`fcvt.xf f9 = f9`
493			`(p7) break 1`
494			`;;`
495			`setf.exp f11 = r2`
496			`frcpa.s1 f10, p6 = f8, f9`
497			`;;`
498			`(p6) fmpy.s1 f8 = f8, f10`
499			`(p6) fnma.s1 f9 = f9, f10, f1`
500			`;;`
501			`(p6) fma.s1 f8 = f9, f8, f8`
502			`(p6) fma.s1 f9 = f9, f9, f11`
503			`;;`
504			`(p6) fma.s1 f10 = f9, f8, f8`
505			`;;`
506			`fcvt.fxu.trunc.s1 f10 = f10`
507			`;;`
508			`getf.sig ret0 = f10`
509			`br.ret.sptk rp`
510			`;;`
511			`.endp __udivsi3`
512			`#endif`
513
514			`#ifdef L__umodsi3`
515			`// Compute a 32-bit unsigned integer modulus.`
516			`//`
517			`// From the Intel IA-64 Optimization Guide, choose the minimum latency`
518			`// alternative.`
519			`//`
520			`// in0 holds the dividend. in1 holds the divisor.`
521
522			`.text`
523			`.align 16`
524			`.global __umodsi3`
525			`.proc __umodsi3`
526			`__umodsi3:`
527			`.regstk 2,0,0,0`
528			`mov r2 = 0x0ffdd`
529			`zxt4 in0 = in0`
530			`zxt4 in1 = in1`
531			`;;`
532			`setf.sig f13 = in0`
533			`setf.sig f9 = in1`
534			`// Check divide by zero.`
535			`cmp.ne.unc p0,p7=0,in1`
536			`;;`
537			`sub in1 = r0, in1`
538			`fcvt.xf f8 = f13`
539			`fcvt.xf f9 = f9`
540			`;;`
541			`setf.exp f11 = r2`
542			`frcpa.s1 f10, p6 = f8, f9`
543			`(p7) break 1;`
544			`;;`
545			`(p6) fmpy.s1 f12 = f8, f10`
546			`(p6) fnma.s1 f10 = f9, f10, f1`
547			`;;`
548			`setf.sig f9 = in1`
549			`(p6) fma.s1 f12 = f10, f12, f12`
550			`(p6) fma.s1 f10 = f10, f10, f11`
551			`;;`
552			`(p6) fma.s1 f10 = f10, f12, f12`
553			`;;`
554			`fcvt.fxu.trunc.s1 f10 = f10`
555			`;;`
556			`xma.l f10 = f10, f9, f13`
557			`;;`
558			`getf.sig ret0 = f10`
559			`br.ret.sptk rp`
560			`;;`
561			`.endp __umodsi3`
562			`#endif`
563
564			`#ifdef L__save_stack_nonlocal`
565			`// Notes on save/restore stack nonlocal: We read ar.bsp but write`
566			`// ar.bspstore. This is because ar.bsp can be read at all times`
567			`// (independent of the RSE mode) but since it's read-only we need to`
568			`// restore the value via ar.bspstore. This is OK because`
569			`// ar.bsp==ar.bspstore after executing "flushrs".`
570
571			`// void __ia64_save_stack_nonlocal(void save_area, void stack_pointer)`
572
573			`.text`
574			`.align 16`
575			`.global __ia64_save_stack_nonlocal`
576			`.proc __ia64_save_stack_nonlocal`
577			`__ia64_save_stack_nonlocal:`
578			`{ .mmf`
579			`alloc r18 = ar.pfs, 2, 0, 0, 0`
580			`mov r19 = ar.rsc`
581			`;;`
582			`}`
583			`{ .mmi`
584			`flushrs`
585			`st8 [in0] = in1, 24`
586			`and r19 = 0x1c, r19`
587			`;;`
588			`}`
589			`{ .mmi`
590			`st8 [in0] = r18, -16`
591			`mov ar.rsc = r19`
592			`or r19 = 0x3, r19`
593			`;;`
594			`}`
595			`{ .mmi`
596			`mov r16 = ar.bsp`
597			`mov r17 = ar.rnat`
598			`adds r2 = 8, in0`
599			`;;`
600			`}`
601			`{ .mmi`
602			`st8 [in0] = r16`
603			`st8 [r2] = r17`
604			`}`
605			`{ .mib`
606			`mov ar.rsc = r19`
607			`br.ret.sptk.few rp`
608			`;;`
609			`}`
610			`.endp __ia64_save_stack_nonlocal`
611			`#endif`
612
613			`#ifdef L__nonlocal_goto`
614			`// void __ia64_nonlocal_goto(void target_label, void save_area,`
615			`// void *static_chain);`
616
617			`.text`
618			`.align 16`
619			`.global __ia64_nonlocal_goto`
620			`.proc __ia64_nonlocal_goto`
621			`__ia64_nonlocal_goto:`
622			`{ .mmi`
623			`alloc r20 = ar.pfs, 3, 0, 0, 0`
624			`ld8 r12 = [in1], 8`
625			`mov.ret.sptk rp = in0, .L0`
626			`;;`
627			`}`
628			`{ .mmf`
629			`ld8 r16 = [in1], 8`
630			`mov r19 = ar.rsc`
631			`;;`
632			`}`
633			`{ .mmi`
634			`flushrs`
635			`ld8 r17 = [in1], 8`
636			`and r19 = 0x1c, r19`
637			`;;`
638			`}`
639			`{ .mmi`
640			`ld8 r18 = [in1]`
641			`mov ar.rsc = r19`
642			`or r19 = 0x3, r19`
643			`;;`
644			`}`
645			`{ .mmi`
646			`mov ar.bspstore = r16`
647			`;;`
648			`mov ar.rnat = r17`
649			`;;`
650			`}`
651			`{ .mmi`
652			`loadrs`
653			`invala`
654			`mov r15 = in2`
655			`;;`
656			`}`
657			`.L0: { .mib`
658			`mov ar.rsc = r19`
659			`mov ar.pfs = r18`
660			`br.ret.sptk.few rp`
661			`;;`
662			`}`
663			`.endp __ia64_nonlocal_goto`
664			`#endif`
665
666			`#ifdef L__restore_stack_nonlocal`
667			`// This is mostly the same as nonlocal_goto above.`
668			`// ??? This has not been tested yet.`
669
670			`// void __ia64_restore_stack_nonlocal(void *save_area)`
671
672			`.text`
673			`.align 16`
674			`.global __ia64_restore_stack_nonlocal`
675			`.proc __ia64_restore_stack_nonlocal`
676			`__ia64_restore_stack_nonlocal:`
677			`{ .mmf`
678			`alloc r20 = ar.pfs, 4, 0, 0, 0`
679			`ld8 r12 = [in0], 8`
680			`;;`
681			`}`
682			`{ .mmb`
683			`ld8 r16=[in0], 8`
684			`mov r19 = ar.rsc`
685			`;;`
686			`}`
687			`{ .mmi`
688			`flushrs`
689			`ld8 r17 = [in0], 8`
690			`and r19 = 0x1c, r19`
691			`;;`
692			`}`
693			`{ .mmf`
694			`ld8 r18 = [in0]`
695			`mov ar.rsc = r19`
696			`;;`
697			`}`
698			`{ .mmi`
699			`mov ar.bspstore = r16`
700			`;;`
701			`mov ar.rnat = r17`
702			`or r19 = 0x3, r19`
703			`;;`
704			`}`
705			`{ .mmf`
706			`loadrs`
707			`invala`
708			`;;`
709			`}`
710			`.L0: { .mib`
711			`mov ar.rsc = r19`
712			`mov ar.pfs = r18`
713			`br.ret.sptk.few rp`
714			`;;`
715			`}`
716			`.endp __ia64_restore_stack_nonlocal`
717			`#endif`
718
719			`#ifdef L__trampoline`
720			`// Implement the nested function trampoline. This is out of line`
721			`// so that we don't have to bother with flushing the icache, as`
722			`// well as making the on-stack trampoline smaller.`
723			`//`
724			`// The trampoline has the following form:`
725			`//`
726			`// +-------------------+ >`
727			`// TRAMP: \| __ia64_trampoline \| \|`
728			`// +-------------------+ > fake function descriptor`
729			`// \| TRAMP+16 \| \|`
730			`// +-------------------+ >`
731			`// \| target descriptor \|`
732			`// +-------------------+`
733			`// \| static link \|`
734			`// +-------------------+`
735
736			`.text`
737			`.align 16`
738			`.global __ia64_trampoline`
739			`.proc __ia64_trampoline`
740			`__ia64_trampoline:`
741			`{ .mmi`
742			`ld8 r2 = [r1], 8`
743			`;;`
744			`ld8 r15 = [r1]`
745			`}`
746			`{ .mmi`
747			`ld8 r3 = [r2], 8`
748			`;;`
749			`ld8 r1 = [r2]`
750			`mov b6 = r3`
751			`}`
752			`{ .bbb`
753			`br.sptk.many b6`
754			`;;`
755			`}`
756			`.endp __ia64_trampoline`
757			`#endif`
758
759			`#ifdef SHARED`
760			`// Thunks for backward compatibility.`
761			`#ifdef L_fixtfdi`
762			`.text`
763			`.align 16`
764			`.global __fixtfti`
765			`.proc __fixtfti`
766			`__fixtfti:`
767			`{ .bbb`
768			`br.sptk.many __fixxfti`
769			`;;`
770			`}`
771			`.endp __fixtfti`
772			`#endif`
773			`#ifdef L_fixunstfdi`
774			`.align 16`
775			`.global __fixunstfti`
776			`.proc __fixunstfti`
777			`__fixunstfti:`
778			`{ .bbb`
779			`br.sptk.many __fixunsxfti`
780			`;;`
781			`}`
782			`.endp __fixunstfti`
783			`#endif`
784			`#ifdef L_floatditf`
785			`.align 16`
786			`.global __floattitf`
787			`.proc __floattitf`
788			`__floattitf:`
789			`{ .bbb`
790			`br.sptk.many __floattixf`
791			`;;`
792			`}`
793			`.endp __floattitf`
794			`#endif`
795			`#endif`