OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [ia64/] [lib1funcs.S] - Rev 734

Compare with Previous | Blame | View Log

/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// The quotient is written to fret0.
//
// __divtf3 is an alternate symbol name for backward compatibility.
// It is only defined when SHARED is set.
//
// Notation used below: a = farg0, b = farg1, f0 = 0.0, f1 = 1.0.
// fma computes x * y + z; fnma computes z - x * y.
// Every refinement step uses status field .s1; only frcpa and the
// final fma use .s0.

        .text
        .align 16
        .global __divxf3
        .proc __divxf3
__divxf3:
#ifdef SHARED
        .global __divtf3
__divtf3:
#endif
        // Start with p7 true; it is cleared below whenever p6 is set.
        cmp.eq p7, p0 = r0, r0
        // f10 = initial reciprocal approximation.  p6 selects the
        // refinement sequence; when p6 is clear f10 is returned as is.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
        // From here on p7 is the complement of p6.
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // f11 = 1 - b * f10 (error of the approximation), f12 = a * f10.
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
        ;;
        // f13 = f11^2, f14 = f11^2 + f11.
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
        ;;
        // f11 = f13^2 + f11, f13 = f14 * f10 + f10.
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
        ;;
        // f10 = refined reciprocal, f11 = a - b * f12 (remainder).
(p6)    fma.s1 f10 = f13, f11, f10
(p6)    fnma.s1 f11 = farg1, f12, farg0
        ;;
        // f11 = corrected quotient, f12 = 1 - b * f10.
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
        // f10 = f12 * f10 + f10, f12 = a - b * f11 (final remainder).
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
        ;;
        // Exactly one of the two writes to fret0 executes (p6/p7 mutex).
(p6)    fma.s0 fret0 = f12, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        .endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// The quotient is written to fret0.
//
// Notation used below: a = farg0, b = farg1, f1 = 1.0.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __divdf3
        .proc __divdf3
__divdf3:
        // Start with p7 true; it is cleared below whenever p6 is set.
        cmp.eq p7, p0 = r0, r0
        // f10 = initial reciprocal approximation.  p6 selects the
        // refinement sequence; when p6 is clear f10 is returned as is.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
        // From here on p7 is the complement of p6.
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // f11 = a * f10 (first quotient), f12 = 1 - b * f10 (error).
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
        // f11 = f12 * f11 + f11, f13 = f12^2.
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
        ;;
        // f10 = f12 * f10 + f10, f11 = f13 * f11 + f11.
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
        ;;
        // f12 = f13^2, f10 = f13 * f10 + f10.
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
        // Quotient in f11 (.d completer), reciprocal in f10.
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // f8 = a - b * f11 (remainder); this is the last read of farg0.
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
        // Exactly one of the two writes to fret0 executes (p6/p7 mutex).
(p6)    fma.d fret0 = f8, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
// The quotient is written to fret0.
//
// Notation used below: a = farg0, b = farg1, f1 = 1.0.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.
// NOTE(review): f8 and f9 are written as temporaries in the same
// instructions that read farg0 and farg1; this relies on those names
// denoting f8 and f9 -- confirm against the assembler register aliases.

        .text
        .align 16
        .global __divsf3
        .proc __divsf3
__divsf3:
        // Start with p7 true; it is cleared below whenever p6 is set.
        cmp.eq p7, p0 = r0, r0
        // f10 = initial reciprocal approximation.  p6 selects the
        // refinement sequence; when p6 is clear f10 is returned as is.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
        // From here on p7 is the complement of p6.
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // f8 = a * f10 (first quotient), f9 = 1 - b * f10 (error).
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
        // f8 = f9 * f8 + f8, f9 = f9^2.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
        // Same step again with the squared error term.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
        // Last correction, computed with the .d completer.
(p6)    fma.d.s1 f10 = f9, f8, f8
        ;;
        // Exactly one of the two writes to fret0 executes (p6/p7 mutex).
        // fnorm.s produces the single-precision result.
(p6)    fnorm.s.s0 fret0 = f10
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// The quotient, truncated by fcvt.fx.trunc, is returned in ret0.
// A zero divisor executes "break 1".
//
// Notation used below: a = f8, b = f9 (both after conversion), f1 = 1.0.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __divdi3
        .proc __divdi3
__divdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
        // f11 = 1 - b * f10 (error), f12 = a * f10 (first quotient).
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
        // f13 = f11^2, f12 = f11 * f12 + f12.
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
        // f10 = f11 * f10 + f10, f11 = f13 * f12 + f12.
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // f10 = f13 * f10 + f10, f12 = a - b * f11 (remainder).
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The remainder a - q * b, with q the truncated quotient, is returned
// in ret0.  A zero divisor executes "break 1".
//
// f14 keeps the unconverted integer a for the final xma.l; f8 and f9
// hold the converted FP values used by the quotient computation.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __moddi3
        .proc __moddi3
__moddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f14
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
        // f12 = a * f10 (first quotient), f11 = 1 - b * f10 (error).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
        // f12 = f11 * f12 + f12, f13 = f11^2.
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
        // f10 = f11 * f10 + f10, f11 = f13 * f12 + f12.
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // in1 = -b, needed for the final multiply-add.
        sub in1 = r0, in1
        // f10 = f13 * f10 + f10, f12 = a - b * f11 (remainder).
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        // The FP divisor in f9 is no longer read; reuse f9 for integer -b.
        setf.sig f9 = in1
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// The quotient, truncated by fcvt.fxu.trunc, is returned in ret0.
// A zero divisor executes "break 1".
//
// Notation used below: a = f8, b = f9 (both after conversion), f1 = 1.0.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __udivdi3
        .proc __udivdi3
__udivdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software-assist faults.
        fcvt.xuf.s1 f8 = f8
        fcvt.xuf.s1 f9 = f9
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
        // f11 = 1 - b * f10 (error), f12 = a * f10 (first quotient).
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
        // f13 = f11^2, f12 = f11 * f12 + f12.
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
        // f10 = f11 * f10 + f10, f11 = f13 * f12 + f12.
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // f10 = f13 * f10 + f10, f12 = a - b * f11 (remainder).
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
// The remainder a - q * b, with q the truncated quotient, is returned
// in ret0.  A zero divisor executes "break 1".
//
// f14 keeps the unconverted integer a for the final xma.l; f8 and f9
// hold the converted FP values used by the quotient computation.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __umoddi3
        .proc __umoddi3
__umoddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software assist faults.
        fcvt.xuf.s1 f8 = f14
        fcvt.xuf.s1 f9 = f9
(p7)    break 1;
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
        // f12 = a * f10 (first quotient), f11 = 1 - b * f10 (error).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
        // f12 = f11 * f12 + f12, f13 = f11^2.
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
        // f10 = f11 * f10 + f10, f11 = f13 * f12 + f12.
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // in1 = -b, needed for the final multiply-add.
        sub in1 = r0, in1
        // f10 = f13 * f10 + f10, f12 = a - b * f11 (remainder).
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        // The FP divisor in f9 is no longer read; reuse f9 for integer -b.
        setf.sig f9 = in1
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// Only the low 32 bits of in0 and in1 are significant; both are
// sign-extended here.  The quotient, truncated by fcvt.fx.trunc, is
// returned in ret0.  A zero divisor executes "break 1".
//
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __divsi3
        .proc __divsi3
__divsi3:
        .regstk 2,0,0,0
        // Check divide by zero.  This compare shares an instruction group
        // with the sxt4 of in1, so it still reads the incoming register
        // whose upper 32 bits have not been sign-extended yet.  Use the
        // 32-bit compare so that only the significant low half decides
        // whether p7 (divisor == 0) is set.
        cmp4.ne.unc p0,p7=0,in1
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        // Transfer the sign-extended inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
(p7)    break 1
        ;;
        // r2 = biased exponent 0xffdd, i.e. 2^-34 with the 0xffff bias.
        mov r2 = 0x0ffdd
        // Convert the inputs to FP.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        // f11 = 2^-34, a small positive term added to the squared error.
        setf.exp f11 = r2
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // f8 = a * f10 (first quotient), f9 = 1 - b * f10 (error).
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // f8 = f9 * f8 + f8, f9 = f9^2 + 2^-34.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// Only the low 32 bits of in0 and in1 are significant; both are
// sign-extended here.  The remainder a - q * b, with q the truncated
// quotient, is returned in ret0.  A zero divisor executes "break 1".
//
// f13 keeps the integer dividend for the final xma.l.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __modsi3
        .proc __modsi3
__modsi3:
        .regstk 2,0,0,0
        // r2 = biased exponent 0xffdd, i.e. 2^-34 with the 0xffff bias.
        mov r2 = 0x0ffdd
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        // Transfer inputs to FP registers.  in0/in1 are the r32/r33
        // declared by .regstk above; the aliases are used throughout.
        setf.sig f13 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // in1 = -b, needed for the final multiply-add.
        sub in1 = r0, in1
        // Convert the inputs to FP.
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        // f11 = 2^-34, a small positive term added to the squared error.
        setf.exp f11 = r2
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
(p7)    break 1
        ;;
        // f12 = a * f10 (first quotient), f10 = 1 - b * f10 (error).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        // The FP divisor in f9 is no longer read; reuse f9 for integer -b.
        setf.sig f9 = in1
        // f12 = f10 * f12 + f12, f10 = f10^2 + 2^-34.
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f13
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// Only the low 32 bits of in0 and in1 are significant; both are
// zero-extended here.  The quotient, truncated by fcvt.fxu.trunc, is
// returned in ret0.  A zero divisor executes "break 1".
//
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __udivsi3
        .proc __udivsi3
__udivsi3:
        .regstk 2,0,0,0
        // r2 = biased exponent 0xffdd, i.e. 2^-34 with the 0xffff bias.
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP.  After zxt4 the values are
        // non-negative as 64-bit integers, so the signed conversion is
        // used here.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
(p7)    break 1
        ;;
        // f11 = 2^-34, a small positive term added to the squared error.
        setf.exp f11 = r2
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // f8 = a * f10 (first quotient), f9 = 1 - b * f10 (error).
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // f8 = f9 * f8 + f8, f9 = f9^2 + 2^-34.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
// Only the low 32 bits of in0 and in1 are significant; both are
// zero-extended here.  The remainder a - q * b, with q the truncated
// quotient, is returned in ret0.  A zero divisor executes "break 1".
//
// f13 keeps the integer dividend for the final xma.l.
// fma computes x * y + z; fnma computes z - x * y; fmpy computes x * y.

        .text
        .align 16
        .global __umodsi3
        .proc __umodsi3
__umodsi3:
        .regstk 2,0,0,0
        // r2 = biased exponent 0xffdd, i.e. 2^-34 with the 0xffff bias.
        mov r2 = 0x0ffdd
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        // Transfer inputs to FP registers.
        setf.sig f13 = in0
        setf.sig f9 = in1
        // Check divide by zero.  p7 is set when in1 == 0.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // in1 = -b, needed for the final multiply-add.
        sub in1 = r0, in1
        // Convert the inputs to FP.  After zxt4 the values are
        // non-negative as 64-bit integers, so the signed conversion is
        // used here.
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        // f11 = 2^-34, a small positive term added to the squared error.
        setf.exp f11 = r2
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
(p7)    break 1;
        ;;
        // f12 = a * f10 (first quotient), f10 = 1 - b * f10 (error).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        // The FP divisor in f9 is no longer read; reuse f9 for integer -b.
        setf.sig f9 = in1
        // f12 = f10 * f12 + f12, f10 = f10^2 + 2^-34.
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
        // f10 = final quotient estimate.
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f13
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it is read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Layout written to save_area (four 8-byte words):
//    +0   stack_pointer (in1)
//    +8   ar.bsp
//   +16   ar.rnat
//   +24   ar.pfs as returned by alloc
//
// r2 and r16-r19 are used as scratch registers.

        .text
        .align 16
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        { .mmf
          // r18 = ar.pfs, r19 = current ar.rsc.
          alloc r18 = ar.pfs, 2, 0, 0, 0
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          // save_area[0] = stack_pointer; in0 now points at +24.
          st8 [in0] = in1, 24
          // Keep only bits 2-4 of ar.rsc, so the two low mode bits are 0.
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          // save_area[24] = ar.pfs; in0 now points at +8.
          st8 [in0] = r18, -16
          // Switch the RSE to mode 0 before reading ar.rnat.
          mov ar.rsc = r19
          // Value written back to ar.rsc on exit: mode bits set to 3.
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov r16 = ar.bsp
          mov r17 = ar.rnat
          // r2 = save_area + 16.
          adds r2 = 8, in0
          ;;
        }
        { .mmi
          // save_area[8] = ar.bsp, save_area[16] = ar.rnat.
          st8 [in0] = r16
          st8 [r2] = r17
        }
        { .mib
          // NOTE(review): the mode bits are set to 3 rather than restored
          // from the value read on entry -- confirm this is intended.
          mov ar.rsc = r19
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//                           void *static_chain);
//
// Reads four 8-byte words from save_area, in order: the stack pointer
// (into r12), the value for ar.bspstore, the value for ar.rnat and the
// value for ar.pfs.  Then returns to target_label with static_chain
// in r15.
//
// r16-r20 are used as scratch registers.

        .text
        .align 16
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        { .mmi
          alloc r20 = ar.pfs, 3, 0, 0, 0
          // r12 = saved stack pointer; in1 advances to the next word.
          ld8 r12 = [in1], 8
          // rp = target_label, so the br.ret at .L0 transfers there.
          mov.ret.sptk rp = in0, .L0
          ;;
        }
        { .mmf
          // r16 = saved backing store pointer, r19 = current ar.rsc.
          ld8 r16 = [in1], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          // r17 = saved ar.rnat.
          ld8 r17 = [in1], 8
          // Keep only bits 2-4 of ar.rsc, so the two low mode bits are 0.
          and r19 = 0x1c, r19
          ;;
        }
        { .mmi
          // r18 = saved ar.pfs.
          ld8 r18 = [in1]
          // Switch the RSE to mode 0 before writing ar.bspstore/ar.rnat.
          mov ar.rsc = r19
          // Value written back to ar.rsc on exit: mode bits set to 3.
          or r19 = 0x3, r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          ;;
        }
        { .mmi
          loadrs
          invala
          // Pass the static chain to the target in r15.
          mov r15 = in2
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Reads four 8-byte words from save_area, in order: the stack pointer
// (into r12), the value for ar.bspstore, the value for ar.rnat and the
// value for ar.pfs.  Unlike nonlocal_goto, rp and r15 are not modified,
// so the routine returns to its caller.
//
// r16-r20 are used as scratch registers.
//
// NOTE(review): alloc declares 4 input registers although only in0 is
// read, and the .L0 label is not referenced inside this routine.

        .text
        .align 16
        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        { .mmf
          alloc r20 = ar.pfs, 4, 0, 0, 0
          // r12 = saved stack pointer; in0 advances to the next word.
          ld8 r12 = [in0], 8
          ;;
        }
        { .mmb
          // r16 = saved backing store pointer, r19 = current ar.rsc.
          ld8 r16=[in0], 8
          mov r19 = ar.rsc
          ;;
        }
        { .mmi
          flushrs
          // r17 = saved ar.rnat.
          ld8 r17 = [in0], 8
          // Keep only bits 2-4 of ar.rsc, so the two low mode bits are 0.
          and r19 = 0x1c, r19
          ;;
        }
        { .mmf
          // r18 = saved ar.pfs.
          ld8 r18 = [in0]
          // Switch the RSE to mode 0 before writing ar.bspstore/ar.rnat.
          mov ar.rsc = r19
          ;;
        }
        { .mmi
          mov ar.bspstore = r16
          ;;
          mov ar.rnat = r17
          // Value written back to ar.rsc on exit: mode bits set to 3.
          or r19 = 0x3, r19
          ;;
        }
        { .mmf
          loadrs
          invala
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//              +-------------------+ >
//      TRAMP:  | __ia64_trampoline | |
//              +-------------------+  > fake function descriptor
//              | TRAMP+16          | |
//              +-------------------+ >
//              | target descriptor |
//              +-------------------+
//              | static link       |
//              +-------------------+
//
// Per the diagram, r1 is expected to hold TRAMP+16 on entry (the second
// word of the fake descriptor), i.e. it points at the "target
// descriptor" slot.  The code loads the static link into r15, then the
// two words of the target descriptor into b6 and r1, and branches to b6.
// r2 and r3 are used as scratch registers.

        .text
        .align 16
        .global __ia64_trampoline
        .proc __ia64_trampoline
__ia64_trampoline:
        { .mmi
          // r2 = address of the target descriptor; r1 advances 8 bytes.
          ld8 r2 = [r1], 8
          ;;
          // r15 = static link.
          ld8 r15 = [r1]
        }
        { .mmi
          // r3 = first word of the target descriptor; r2 advances 8 bytes.
          ld8 r3 = [r2], 8
          ;;
          // r1 = second word of the target descriptor.
          ld8 r1 = [r2]
          mov b6 = r3
        }
        { .bbb
          br.sptk.many b6
          ;;
        }
        .endp __ia64_trampoline
#endif

#ifdef SHARED
// Thunks for backward compatibility.
#ifdef L_fixtfdi
// __fixtfti: backward-compatibility entry point that branches straight
// to __fixxfti.  No registers are modified, so arguments and the return
// address reach __fixxfti unchanged.
        .text
        .align 16
        .global __fixtfti
        .proc __fixtfti
__fixtfti:
        { .bbb
          br.sptk.many __fixxfti
          ;;
        }
        .endp __fixtfti
#endif
#ifdef L_fixunstfdi
// __fixunstfti: backward-compatibility entry point that branches
// straight to __fixunsxfti.  No registers are modified, so arguments
// and the return address reach __fixunsxfti unchanged.
        // Select the code section explicitly, as every other routine
        // in this file does.
        .text
        .align 16
        .global __fixunstfti
        .proc __fixunstfti
__fixunstfti:
        { .bbb
          br.sptk.many __fixunsxfti
          ;;
        }
        .endp __fixunstfti
#endif
#ifdef L_floatditf
// __floattitf: backward-compatibility entry point that branches
// straight to __floattixf.  No registers are modified, so arguments
// and the return address reach __floattixf unchanged.
        // Select the code section explicitly, as every other routine
        // in this file does.
        .text
        .align 16
        .global __floattitf
        .proc __floattitf
__floattitf:
        { .bbb
          br.sptk.many __floattixf
          ;;
        }
        .endp __floattitf
#endif
#endif

Compare with Previous | Blame | View Log

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.