/* URL https://opencores.org/ocsvn/openrisc/openrisc/trunk
   Subversion Repositories openrisc

   [/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [sh/] [lib1funcs.S] - Rev 847

   Go to most recent revision | Compare with Previous | Blame | View Log  */
/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
   2004, 2005, 2006, 2009
   Free Software Foundation, Inc.

This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */


!! libgcc routines for the Renesas / SuperH SH CPUs.
!! Contributed by Steve Chamberlain.
!! sac@cygnus.com

!! ashiftrt_r4_x, ___ashrsi3, ___ashlsi3, ___lshrsi3 routines
!! recoded in assembly by Toshiyasu Morita
!! tm@netcom.com

#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",%progbits
.previous
#endif

/* SH2 optimizations for ___ashrsi3, ___ashlsi3, ___lshrsi3 and
   ELF local label prefixes by J"orn Rennecke
   amylaar@cygnus.com  */

#include "lib1funcs.h"

/* t-vxworks needs to build both PIC and non-PIC versions of libgcc,
   so it is more convenient to define NO_FPSCR_VALUES here than to
   define it on the command line.  */
#if defined __vxworks && defined __PIC__
#define NO_FPSCR_VALUES
#endif
        
#if ! __SH5__
#ifdef L_ashiftrt
!
! GLOBAL(ashiftrt_r4_N), N = 0 .. 32
!
! One entry point per constant shift count.  Each one shifts r4 right
! arithmetically by N bits, in place.
!
! Entry:
!
! r4: Value to shift
!
! Exit:
!
! r4: Value shifted right arithmetically by N bits
!
! Destroys:
!
! T bit (written by shar, rotcl and subc)
!
! The entry points are laid out as fall-through ladders: entering at
! label N executes one shar per step until a tail that finishes the
! shift and returns.  The last instruction of every tail sits in the
! delay slot of its rts.
!
        .global GLOBAL(ashiftrt_r4_0)
        .global GLOBAL(ashiftrt_r4_1)
        .global GLOBAL(ashiftrt_r4_2)
        .global GLOBAL(ashiftrt_r4_3)
        .global GLOBAL(ashiftrt_r4_4)
        .global GLOBAL(ashiftrt_r4_5)
        .global GLOBAL(ashiftrt_r4_6)
        .global GLOBAL(ashiftrt_r4_7)
        .global GLOBAL(ashiftrt_r4_8)
        .global GLOBAL(ashiftrt_r4_9)
        .global GLOBAL(ashiftrt_r4_10)
        .global GLOBAL(ashiftrt_r4_11)
        .global GLOBAL(ashiftrt_r4_12)
        .global GLOBAL(ashiftrt_r4_13)
        .global GLOBAL(ashiftrt_r4_14)
        .global GLOBAL(ashiftrt_r4_15)
        .global GLOBAL(ashiftrt_r4_16)
        .global GLOBAL(ashiftrt_r4_17)
        .global GLOBAL(ashiftrt_r4_18)
        .global GLOBAL(ashiftrt_r4_19)
        .global GLOBAL(ashiftrt_r4_20)
        .global GLOBAL(ashiftrt_r4_21)
        .global GLOBAL(ashiftrt_r4_22)
        .global GLOBAL(ashiftrt_r4_23)
        .global GLOBAL(ashiftrt_r4_24)
        .global GLOBAL(ashiftrt_r4_25)
        .global GLOBAL(ashiftrt_r4_26)
        .global GLOBAL(ashiftrt_r4_27)
        .global GLOBAL(ashiftrt_r4_28)
        .global GLOBAL(ashiftrt_r4_29)
        .global GLOBAL(ashiftrt_r4_30)
        .global GLOBAL(ashiftrt_r4_31)
        .global GLOBAL(ashiftrt_r4_32)

        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31))
        HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32))

        .align  1
! Shifts by 31 and 32 share one body: the result depends only on the
! sign bit of r4 and is either 0 or -1.
GLOBAL(ashiftrt_r4_32):
GLOBAL(ashiftrt_r4_31):
        rotcl   r4              ! T = old bit 31 of r4
        rts
        subc    r4,r4           ! delay slot: r4 = r4 - r4 - T = -T

! Shifts by 25 .. 30: one shar per extra bit, then the shift-by-24 tail.
GLOBAL(ashiftrt_r4_30):
        shar    r4
GLOBAL(ashiftrt_r4_29):
        shar    r4
GLOBAL(ashiftrt_r4_28):
        shar    r4
GLOBAL(ashiftrt_r4_27):
        shar    r4
GLOBAL(ashiftrt_r4_26):
        shar    r4
GLOBAL(ashiftrt_r4_25):
        shar    r4
! Shift by 24: move bits 31..24 down to bits 7..0 with logical shifts,
! then rebuild the sign by sign-extending the low byte.
GLOBAL(ashiftrt_r4_24):
        shlr16  r4
        shlr8   r4
        rts
        exts.b  r4,r4           ! delay slot: sign-extend bits 7..0

! Shifts by 17 .. 23: one shar per extra bit, then the shift-by-16 tail.
GLOBAL(ashiftrt_r4_23):
        shar    r4
GLOBAL(ashiftrt_r4_22):
        shar    r4
GLOBAL(ashiftrt_r4_21):
        shar    r4
GLOBAL(ashiftrt_r4_20):
        shar    r4
GLOBAL(ashiftrt_r4_19):
        shar    r4
GLOBAL(ashiftrt_r4_18):
        shar    r4
GLOBAL(ashiftrt_r4_17):
        shar    r4
! Shift by 16: logical shift by 16, then sign-extend the low word.
GLOBAL(ashiftrt_r4_16):
        shlr16  r4
        rts
        exts.w  r4,r4           ! delay slot: sign-extend bits 15..0

! Shifts by 1 .. 15: entering at label N runs N-1 shar instructions
! here plus the final one in the delay slot of the rts.
GLOBAL(ashiftrt_r4_15):
        shar    r4
GLOBAL(ashiftrt_r4_14):
        shar    r4
GLOBAL(ashiftrt_r4_13):
        shar    r4
GLOBAL(ashiftrt_r4_12):
        shar    r4
GLOBAL(ashiftrt_r4_11):
        shar    r4
GLOBAL(ashiftrt_r4_10):
        shar    r4
GLOBAL(ashiftrt_r4_9):
        shar    r4
GLOBAL(ashiftrt_r4_8):
        shar    r4
GLOBAL(ashiftrt_r4_7):
        shar    r4
GLOBAL(ashiftrt_r4_6):
        shar    r4
GLOBAL(ashiftrt_r4_5):
        shar    r4
GLOBAL(ashiftrt_r4_4):
        shar    r4
GLOBAL(ashiftrt_r4_3):
        shar    r4
GLOBAL(ashiftrt_r4_2):
        shar    r4
GLOBAL(ashiftrt_r4_1):
        rts
        shar    r4              ! delay slot: the last single-bit shift

! Shift by 0: r4 is returned unchanged.
GLOBAL(ashiftrt_r4_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashiftrt_r4_0))
        ENDFUNC(GLOBAL(ashiftrt_r4_1))
        ENDFUNC(GLOBAL(ashiftrt_r4_2))
        ENDFUNC(GLOBAL(ashiftrt_r4_3))
        ENDFUNC(GLOBAL(ashiftrt_r4_4))
        ENDFUNC(GLOBAL(ashiftrt_r4_5))
        ENDFUNC(GLOBAL(ashiftrt_r4_6))
        ENDFUNC(GLOBAL(ashiftrt_r4_7))
        ENDFUNC(GLOBAL(ashiftrt_r4_8))
        ENDFUNC(GLOBAL(ashiftrt_r4_9))
        ENDFUNC(GLOBAL(ashiftrt_r4_10))
        ENDFUNC(GLOBAL(ashiftrt_r4_11))
        ENDFUNC(GLOBAL(ashiftrt_r4_12))
        ENDFUNC(GLOBAL(ashiftrt_r4_13))
        ENDFUNC(GLOBAL(ashiftrt_r4_14))
        ENDFUNC(GLOBAL(ashiftrt_r4_15))
        ENDFUNC(GLOBAL(ashiftrt_r4_16))
        ENDFUNC(GLOBAL(ashiftrt_r4_17))
        ENDFUNC(GLOBAL(ashiftrt_r4_18))
        ENDFUNC(GLOBAL(ashiftrt_r4_19))
        ENDFUNC(GLOBAL(ashiftrt_r4_20))
        ENDFUNC(GLOBAL(ashiftrt_r4_21))
        ENDFUNC(GLOBAL(ashiftrt_r4_22))
        ENDFUNC(GLOBAL(ashiftrt_r4_23))
        ENDFUNC(GLOBAL(ashiftrt_r4_24))
        ENDFUNC(GLOBAL(ashiftrt_r4_25))
        ENDFUNC(GLOBAL(ashiftrt_r4_26))
        ENDFUNC(GLOBAL(ashiftrt_r4_27))
        ENDFUNC(GLOBAL(ashiftrt_r4_28))
        ENDFUNC(GLOBAL(ashiftrt_r4_29))
        ENDFUNC(GLOBAL(ashiftrt_r4_30))
        ENDFUNC(GLOBAL(ashiftrt_r4_31))
        ENDFUNC(GLOBAL(ashiftrt_r4_32))
#endif

#ifdef L_ashiftrt_n

!
! GLOBAL(ashrsi3)
!
! Arithmetic right shift by a variable count.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count (only the low five bits are used, i.e. 0 .. 31)
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count, then with the table offset)
! T bit (written by shar, rotcl and subc)
!

        .global GLOBAL(ashrsi3)
        HIDDEN_FUNC(GLOBAL(ashrsi3))
        .align  2
GLOBAL(ashrsi3):
        mov     #31,r0
        and     r0,r5                           ! r5 = count & 31
        mova    LOCAL(ashrsi3_table),r0         ! r0 = address of the offset table
        mov.b   @(r0,r5),r5                     ! r5 = table entry for this count
#ifdef __sh1__
        add     r5,r0                           ! r0 = table address + offset = handler
        jmp     @r0
#else
        braf    r5                              ! PC-relative branch by the table offset
#endif
        mov     r4,r0                           ! delay slot: the result starts as the input

! One byte per shift count: handler address minus table address.  In
! the braf variant the offset is applied relative to the address just
! after the delay slot, so the table has to start exactly there.
        .align  2
LOCAL(ashrsi3_table):
        .byte           LOCAL(ashrsi3_0)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_1)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_2)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_3)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_4)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_5)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_6)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_7)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_8)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_9)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_10)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_11)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_12)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_13)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_14)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_15)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_16)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_17)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_18)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_19)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_20)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_21)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_22)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_23)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_24)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_25)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_26)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_27)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_28)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_29)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_30)-LOCAL(ashrsi3_table)
        .byte           LOCAL(ashrsi3_31)-LOCAL(ashrsi3_table)

! Shift by 31: the result is 0 or -1 depending only on the sign bit.
LOCAL(ashrsi3_31):
        rotcl   r0              ! T = old bit 31 of r0
        rts
        subc    r0,r0           ! delay slot: r0 = r0 - r0 - T = -T

! Shifts by 25 .. 30: one shar per extra bit, then the shift-by-24 tail.
LOCAL(ashrsi3_30):
        shar    r0
LOCAL(ashrsi3_29):
        shar    r0
LOCAL(ashrsi3_28):
        shar    r0
LOCAL(ashrsi3_27):
        shar    r0
LOCAL(ashrsi3_26):
        shar    r0
LOCAL(ashrsi3_25):
        shar    r0
! Shift by 24: logical shift by 24, then sign-extend the low byte.
LOCAL(ashrsi3_24):
        shlr16  r0
        shlr8   r0
        rts
        exts.b  r0,r0           ! delay slot: sign-extend bits 7..0

! Shifts by 17 .. 23: one shar per extra bit, then the shift-by-16 tail.
LOCAL(ashrsi3_23):
        shar    r0
LOCAL(ashrsi3_22):
        shar    r0
LOCAL(ashrsi3_21):
        shar    r0
LOCAL(ashrsi3_20):
        shar    r0
LOCAL(ashrsi3_19):
        shar    r0
LOCAL(ashrsi3_18):
        shar    r0
LOCAL(ashrsi3_17):
        shar    r0
! Shift by 16: logical shift by 16, then sign-extend the low word.
LOCAL(ashrsi3_16):
        shlr16  r0
        rts
        exts.w  r0,r0           ! delay slot: sign-extend bits 15..0

! Shifts by 1 .. 15: entering at label N runs N-1 shar instructions
! here plus the final one in the delay slot of the rts.
LOCAL(ashrsi3_15):
        shar    r0
LOCAL(ashrsi3_14):
        shar    r0
LOCAL(ashrsi3_13):
        shar    r0
LOCAL(ashrsi3_12):
        shar    r0
LOCAL(ashrsi3_11):
        shar    r0
LOCAL(ashrsi3_10):
        shar    r0
LOCAL(ashrsi3_9):
        shar    r0
LOCAL(ashrsi3_8):
        shar    r0
LOCAL(ashrsi3_7):
        shar    r0
LOCAL(ashrsi3_6):
        shar    r0
LOCAL(ashrsi3_5):
        shar    r0
LOCAL(ashrsi3_4):
        shar    r0
LOCAL(ashrsi3_3):
        shar    r0
LOCAL(ashrsi3_2):
        shar    r0
LOCAL(ashrsi3_1):
        rts
        shar    r0              ! delay slot: the last single-bit shift

! Shift by 0: r0 already holds the unshifted value.
LOCAL(ashrsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashrsi3))
#endif

#ifdef L_ashiftlt

!
! GLOBAL(ashlsi3)
!
! Left shift by a variable count.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count (only the low five bits are used, i.e. 0 .. 31)
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count, then with the table offset)
! T bit (written by the single-bit shll used for odd counts)
!
        .global GLOBAL(ashlsi3)
        HIDDEN_FUNC(GLOBAL(ashlsi3))
        .align  2
GLOBAL(ashlsi3):
        mov     #31,r0
        and     r0,r5                           ! r5 = count & 31
        mova    LOCAL(ashlsi3_table),r0         ! r0 = address of the offset table
        mov.b   @(r0,r5),r5                     ! r5 = table entry for this count
#ifdef __sh1__
        add     r5,r0                           ! r0 = table address + offset = handler
        jmp     @r0
#else
        braf    r5                              ! PC-relative branch by the table offset
#endif
        mov     r4,r0                           ! delay slot: the result starts as the input

! One byte per shift count: handler address minus table address.  In
! the braf variant the offset is applied relative to the address just
! after the delay slot, so the table has to start exactly there.
        .align  2
LOCAL(ashlsi3_table):
        .byte           LOCAL(ashlsi3_0)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_1)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_2)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_3)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_4)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_5)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_6)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_7)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_8)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_9)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_10)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_11)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_12)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_13)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_14)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_15)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_16)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_17)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_18)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_19)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_20)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_21)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_22)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_23)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_24)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_25)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_26)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_27)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_28)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_29)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_30)-LOCAL(ashlsi3_table)
        .byte           LOCAL(ashlsi3_31)-LOCAL(ashlsi3_table)

! Each ladder below steps by two bits per shll2 and ends in a tail
! whose last shift sits in the delay slot of the rts.

! Counts 2, 4, 6: shll2 steps only.
LOCAL(ashlsi3_6):
        shll2   r0
LOCAL(ashlsi3_4):
        shll2   r0
LOCAL(ashlsi3_2):
        rts
        shll2   r0

! Counts 1, 3, 5, 7: shll2 steps plus a final single-bit shll.
LOCAL(ashlsi3_7):
        shll2   r0
LOCAL(ashlsi3_5):
        shll2   r0
LOCAL(ashlsi3_3):
        shll2   r0
LOCAL(ashlsi3_1):
        rts
        shll    r0

! Counts 8, 10, 12, 14: shll2 steps plus shll8.
LOCAL(ashlsi3_14):
        shll2   r0
LOCAL(ashlsi3_12):
        shll2   r0
LOCAL(ashlsi3_10):
        shll2   r0
LOCAL(ashlsi3_8):
        rts
        shll8   r0

! Counts 9, 11, 13, 15: shll2 steps plus shll8 and shll.
LOCAL(ashlsi3_15):
        shll2   r0
LOCAL(ashlsi3_13):
        shll2   r0
LOCAL(ashlsi3_11):
        shll2   r0
LOCAL(ashlsi3_9):
        shll8   r0
        rts
        shll    r0

! Counts 16, 18, 20, 22: shll2 steps plus shll16.
LOCAL(ashlsi3_22):
        shll2   r0
LOCAL(ashlsi3_20):
        shll2   r0
LOCAL(ashlsi3_18):
        shll2   r0
LOCAL(ashlsi3_16):
        rts
        shll16  r0

! Counts 17, 19, 21, 23: shll2 steps plus shll16 and shll.
LOCAL(ashlsi3_23):
        shll2   r0
LOCAL(ashlsi3_21):
        shll2   r0
LOCAL(ashlsi3_19):
        shll2   r0
LOCAL(ashlsi3_17):
        shll16  r0
        rts
        shll    r0

! Counts 24, 26, 28, 30: shll2 steps plus shll16 and shll8.
LOCAL(ashlsi3_30):
        shll2   r0
LOCAL(ashlsi3_28):
        shll2   r0
LOCAL(ashlsi3_26):
        shll2   r0
LOCAL(ashlsi3_24):
        shll16  r0
        rts
        shll8   r0

! Counts 25, 27, 29, 31: shll2 steps plus shll16, shll8 and shll.
LOCAL(ashlsi3_31):
        shll2   r0
LOCAL(ashlsi3_29):
        shll2   r0
LOCAL(ashlsi3_27):
        shll2   r0
LOCAL(ashlsi3_25):
        shll16  r0
        shll8   r0
        rts
        shll    r0

! Count 0: r0 already holds the unshifted value.
LOCAL(ashlsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(ashlsi3))
#endif

#ifdef L_lshiftrt

!
! GLOBAL(lshrsi3)
!
! Logical right shift by a variable count.
!
! Entry:
!
! r4: Value to shift
! r5: Shift count (only the low five bits are used, i.e. 0 .. 31)
!
! Exit:
!
! r0: Result
!
! Destroys:
!
! r5 (overwritten with the masked count, then with the table offset)
! T bit (written by the single-bit shlr used for odd counts)
!
        .global GLOBAL(lshrsi3)
        HIDDEN_FUNC(GLOBAL(lshrsi3))
        .align  2
GLOBAL(lshrsi3):
        mov     #31,r0
        and     r0,r5                           ! r5 = count & 31
        mova    LOCAL(lshrsi3_table),r0         ! r0 = address of the offset table
        mov.b   @(r0,r5),r5                     ! r5 = table entry for this count
#ifdef __sh1__
        add     r5,r0                           ! r0 = table address + offset = handler
        jmp     @r0
#else
        braf    r5                              ! PC-relative branch by the table offset
#endif
        mov     r4,r0                           ! delay slot: the result starts as the input

! One byte per shift count: handler address minus table address.  In
! the braf variant the offset is applied relative to the address just
! after the delay slot, so the table has to start exactly there.
        .align  2
LOCAL(lshrsi3_table):
        .byte           LOCAL(lshrsi3_0)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_1)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_2)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_3)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_4)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_5)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_6)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_7)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_8)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_9)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_10)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_11)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_12)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_13)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_14)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_15)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_16)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_17)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_18)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_19)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_20)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_21)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_22)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_23)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_24)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_25)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_26)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_27)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_28)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_29)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_30)-LOCAL(lshrsi3_table)
        .byte           LOCAL(lshrsi3_31)-LOCAL(lshrsi3_table)

! Each ladder below steps by two bits per shlr2 and ends in a tail
! whose last shift sits in the delay slot of the rts.

! Counts 2, 4, 6: shlr2 steps only.
LOCAL(lshrsi3_6):
        shlr2   r0
LOCAL(lshrsi3_4):
        shlr2   r0
LOCAL(lshrsi3_2):
        rts
        shlr2   r0

! Counts 1, 3, 5, 7: shlr2 steps plus a final single-bit shlr.
LOCAL(lshrsi3_7):
        shlr2   r0
LOCAL(lshrsi3_5):
        shlr2   r0
LOCAL(lshrsi3_3):
        shlr2   r0
LOCAL(lshrsi3_1):
        rts
        shlr    r0

! Counts 8, 10, 12, 14: shlr2 steps plus shlr8.
LOCAL(lshrsi3_14):
        shlr2   r0
LOCAL(lshrsi3_12):
        shlr2   r0
LOCAL(lshrsi3_10):
        shlr2   r0
LOCAL(lshrsi3_8):
        rts
        shlr8   r0

! Counts 9, 11, 13, 15: shlr2 steps plus shlr8 and shlr.
LOCAL(lshrsi3_15):
        shlr2   r0
LOCAL(lshrsi3_13):
        shlr2   r0
LOCAL(lshrsi3_11):
        shlr2   r0
LOCAL(lshrsi3_9):
        shlr8   r0
        rts
        shlr    r0

! Counts 16, 18, 20, 22: shlr2 steps plus shlr16.
LOCAL(lshrsi3_22):
        shlr2   r0
LOCAL(lshrsi3_20):
        shlr2   r0
LOCAL(lshrsi3_18):
        shlr2   r0
LOCAL(lshrsi3_16):
        rts
        shlr16  r0

! Counts 17, 19, 21, 23: shlr2 steps plus shlr16 and shlr.
LOCAL(lshrsi3_23):
        shlr2   r0
LOCAL(lshrsi3_21):
        shlr2   r0
LOCAL(lshrsi3_19):
        shlr2   r0
LOCAL(lshrsi3_17):
        shlr16  r0
        rts
        shlr    r0

! Counts 24, 26, 28, 30: shlr2 steps plus shlr16 and shlr8.
LOCAL(lshrsi3_30):
        shlr2   r0
LOCAL(lshrsi3_28):
        shlr2   r0
LOCAL(lshrsi3_26):
        shlr2   r0
LOCAL(lshrsi3_24):
        shlr16  r0
        rts
        shlr8   r0

! Counts 25, 27, 29, 31: shlr2 steps plus shlr16, shlr8 and shlr.
LOCAL(lshrsi3_31):
        shlr2   r0
LOCAL(lshrsi3_29):
        shlr2   r0
LOCAL(lshrsi3_27):
        shlr2   r0
LOCAL(lshrsi3_25):
        shlr16  r0
        shlr8   r0
        rts
        shlr    r0

! Count 0: r0 already holds the unshifted value.
LOCAL(lshrsi3_0):
        rts
        nop

        ENDFUNC(GLOBAL(lshrsi3))
#endif

#ifdef L_movmem
        .text
        .balign 4
        .global GLOBAL(movmem)
        HIDDEN_FUNC(GLOBAL(movmem))
        HIDDEN_ALIAS(movstr,movmem)
        /* Block copy of 32-bit words from the address in r5 to the address
           in r4, using r0 as the transfer register.

           r6 carries the length in an encoded form.  It is multiplied by
           four on entry and reduced by 64 for every 64-byte group copied.
           Reaching exactly zero ends the copy; a positive value requests
           another 64-byte group; a negative value -4*k means k more words
           remain, and they are copied by jumping into the movmemSI ladder
           below.  The first 64-byte group is copied unconditionally.

           pr is saved on the stack because the ladder is entered with bsr
           and its final rts is used to come back into the loop.
           r4, r5 and r6 are modified; r0 and the T bit are clobbered.  */
        /* This would be a lot simpler if r6 contained the byte count
           minus 64, and we wouldn't be called here for a byte count of 64.  */
GLOBAL(movmem):
        sts.l   pr,@-r15                ! save the return address of the caller
        shll2   r6                      ! scale the encoded count by 4
        bsr     GLOBAL(movmemSI52+2)    ! copy words 48 .. 0; enter after the ladder load of word 48
        mov.l   @(48,r5),r0             ! delay slot: that load is done here instead
        .balign 4
! The rts that ends the ladder resumes here: pr was set by the bsr above
! and is not changed by the plain branch back into the ladder below.
LOCAL(movmem_loop): /* Reached with rts */
        mov.l   @(60,r5),r0
        add     #-64,r6                 ! account for one 64-byte group
        mov.l   r0,@(60,r4)
        tst     r6,r6                   ! T = (r6 == 0)
        mov.l   @(56,r5),r0
        bt      LOCAL(movmem_done)      ! nothing left after this group
        mov.l   r0,@(56,r4)
        cmp/pl  r6                      ! T = (r6 > 0)
        mov.l   @(52,r5),r0
        add     #64,r5                  ! advance the source to the next group
        mov.l   r0,@(52,r4)             ! finish this group before r4 moves on
        add     #64,r4                  ! advance the destination
        bt      GLOBAL(movmemSI52)      ! r6 > 0: copy words 48 .. 0 of the next group
! done all the large groups, do the remainder
! jump into the ladder at movmemSI4 + 4 + r6 (r6 is negative here)
        mova    GLOBAL(movmemSI4)+4,r0
        add     r6,r0
        jmp     @r0
LOCAL(movmem_done): ! share slot insn, works out aligned.
        lds.l   @r15+,pr                ! restore pr; also the delay slot of the jmp above
        mov.l   r0,@(56,r4)
        mov.l   @(52,r5),r0
        rts
        mov.l   r0,@(52,r4)             ! delay slot: last word of the final group
        .balign 4
! ??? We need aliases movstr* for movmem* for the older libraries.  These
! aliases will be removed at some point in the future.
!
! Ladder of fixed-size copies.  Entry point movmemSIn copies n bytes
! from r5 to r4: it moves the word at offset n-4 through r0 and falls
! through to the next lower entry.  Every step is two instructions, which
! is what the computed jump in movmem relies on.
        .global GLOBAL(movmemSI64)
        HIDDEN_FUNC(GLOBAL(movmemSI64))
        HIDDEN_ALIAS(movstrSI64,movmemSI64)
GLOBAL(movmemSI64):
        mov.l   @(60,r5),r0
        mov.l   r0,@(60,r4)
        .global GLOBAL(movmemSI60)
        HIDDEN_FUNC(GLOBAL(movmemSI60))
        HIDDEN_ALIAS(movstrSI60,movmemSI60)
GLOBAL(movmemSI60):
        mov.l   @(56,r5),r0
        mov.l   r0,@(56,r4)
        .global GLOBAL(movmemSI56)
        HIDDEN_FUNC(GLOBAL(movmemSI56))
        HIDDEN_ALIAS(movstrSI56,movmemSI56)
GLOBAL(movmemSI56):
        mov.l   @(52,r5),r0
        mov.l   r0,@(52,r4)
        .global GLOBAL(movmemSI52)
        HIDDEN_FUNC(GLOBAL(movmemSI52))
        HIDDEN_ALIAS(movstrSI52,movmemSI52)
GLOBAL(movmemSI52):
        mov.l   @(48,r5),r0
        mov.l   r0,@(48,r4)             ! movmem enters here (movmemSI52+2) on its first pass
        .global GLOBAL(movmemSI48)
        HIDDEN_FUNC(GLOBAL(movmemSI48))
        HIDDEN_ALIAS(movstrSI48,movmemSI48)
GLOBAL(movmemSI48):
        mov.l   @(44,r5),r0
        mov.l   r0,@(44,r4)
        .global GLOBAL(movmemSI44)
        HIDDEN_FUNC(GLOBAL(movmemSI44))
        HIDDEN_ALIAS(movstrSI44,movmemSI44)
GLOBAL(movmemSI44):
        mov.l   @(40,r5),r0
        mov.l   r0,@(40,r4)
        .global GLOBAL(movmemSI40)
        HIDDEN_FUNC(GLOBAL(movmemSI40))
        HIDDEN_ALIAS(movstrSI40,movmemSI40)
GLOBAL(movmemSI40):
        mov.l   @(36,r5),r0
        mov.l   r0,@(36,r4)
        .global GLOBAL(movmemSI36)
        HIDDEN_FUNC(GLOBAL(movmemSI36))
        HIDDEN_ALIAS(movstrSI36,movmemSI36)
GLOBAL(movmemSI36):
        mov.l   @(32,r5),r0
        mov.l   r0,@(32,r4)
        .global GLOBAL(movmemSI32)
        HIDDEN_FUNC(GLOBAL(movmemSI32))
        HIDDEN_ALIAS(movstrSI32,movmemSI32)
GLOBAL(movmemSI32):
        mov.l   @(28,r5),r0
        mov.l   r0,@(28,r4)
        .global GLOBAL(movmemSI28)
        HIDDEN_FUNC(GLOBAL(movmemSI28))
        HIDDEN_ALIAS(movstrSI28,movmemSI28)
GLOBAL(movmemSI28):
        mov.l   @(24,r5),r0
        mov.l   r0,@(24,r4)
        .global GLOBAL(movmemSI24)
        HIDDEN_FUNC(GLOBAL(movmemSI24))
        HIDDEN_ALIAS(movstrSI24,movmemSI24)
GLOBAL(movmemSI24):
        mov.l   @(20,r5),r0
        mov.l   r0,@(20,r4)
        .global GLOBAL(movmemSI20)
        HIDDEN_FUNC(GLOBAL(movmemSI20))
        HIDDEN_ALIAS(movstrSI20,movmemSI20)
GLOBAL(movmemSI20):
        mov.l   @(16,r5),r0
        mov.l   r0,@(16,r4)
        .global GLOBAL(movmemSI16)
        HIDDEN_FUNC(GLOBAL(movmemSI16))
        HIDDEN_ALIAS(movstrSI16,movmemSI16)
GLOBAL(movmemSI16):
        mov.l   @(12,r5),r0
        mov.l   r0,@(12,r4)
        .global GLOBAL(movmemSI12)
        HIDDEN_FUNC(GLOBAL(movmemSI12))
        HIDDEN_ALIAS(movstrSI12,movmemSI12)
GLOBAL(movmemSI12):
        mov.l   @(8,r5),r0
        mov.l   r0,@(8,r4)
        .global GLOBAL(movmemSI8)
        HIDDEN_FUNC(GLOBAL(movmemSI8))
        HIDDEN_ALIAS(movstrSI8,movmemSI8)
GLOBAL(movmemSI8):
        mov.l   @(4,r5),r0
        mov.l   r0,@(4,r4)
        .global GLOBAL(movmemSI4)
        HIDDEN_FUNC(GLOBAL(movmemSI4))
        HIDDEN_ALIAS(movstrSI4,movmemSI4)
GLOBAL(movmemSI4):
        mov.l   @(0,r5),r0
        rts
        mov.l   r0,@(0,r4)              ! delay slot: store the lowest word

        ENDFUNC(GLOBAL(movmemSI64))
        ENDFUNC(GLOBAL(movmemSI60))
        ENDFUNC(GLOBAL(movmemSI56))
        ENDFUNC(GLOBAL(movmemSI52))
        ENDFUNC(GLOBAL(movmemSI48))
        ENDFUNC(GLOBAL(movmemSI44))
        ENDFUNC(GLOBAL(movmemSI40))
        ENDFUNC(GLOBAL(movmemSI36))
        ENDFUNC(GLOBAL(movmemSI32))
        ENDFUNC(GLOBAL(movmemSI28))
        ENDFUNC(GLOBAL(movmemSI24))
        ENDFUNC(GLOBAL(movmemSI20))
        ENDFUNC(GLOBAL(movmemSI16))
        ENDFUNC(GLOBAL(movmemSI12))
        ENDFUNC(GLOBAL(movmemSI8))
        ENDFUNC(GLOBAL(movmemSI4))
        ENDFUNC(GLOBAL(movmem))
#endif

#ifdef L_movmem_i4
        .text
        .global GLOBAL(movmem_i4_even)
        .global GLOBAL(movmem_i4_odd)
        .global GLOBAL(movmemSI12_i4)

        HIDDEN_FUNC(GLOBAL(movmem_i4_even))
        HIDDEN_FUNC(GLOBAL(movmem_i4_odd))
        HIDDEN_FUNC(GLOBAL(movmemSI12_i4))

        HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even)
        HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd)
        HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4)

        /* movmem_i4_even / movmem_i4_odd: word copy from the address in r5
           (read with post-increment) to the address in r4, pipelined
           through r0 .. r3.  r6 is a down-counter stepped with dt at two
           places in each 16-byte pass; the copy ends at whichever dt
           brings it to zero.  r4, r5 and r6 are modified; r0 .. r3 and the
           T bit are clobbered.  */

        .p2align        5
! Exit taken from the dt inside the loop: r0 and r1 were already loaded,
! and r4 has not been advanced yet, so they go to offsets 16 and 20.
L_movmem_2mod4_end:
        mov.l   r0,@(16,r4)
        rts
        mov.l   r1,@(20,r4)     ! delay slot

        .p2align        2

! Even entry: preload two words, then join the loop at its second half.
GLOBAL(movmem_i4_even):
        mov.l   @r5+,r0
        bra     L_movmem_start_even
        mov.l   @r5+,r1         ! delay slot

! Odd entry: preload three words.  r4 is biased by -4 so that the loop
! offsets 4, 8 and 12 address the first three destination words.
GLOBAL(movmem_i4_odd):
        mov.l   @r5+,r1
        add     #-4,r4
        mov.l   @r5+,r2
        mov.l   @r5+,r3
        mov.l   r1,@(4,r4)
        mov.l   r2,@(8,r4)

L_movmem_loop:
        mov.l   r3,@(12,r4)
        dt      r6              ! decrement r6, T = (r6 == 0)
        mov.l   @r5+,r0
        bt/s    L_movmem_2mod4_end      ! counter exhausted: store r0, r1 and return
        mov.l   @r5+,r1         ! delay slot: executed on both paths
        add     #16,r4          ! advance the destination by four words
L_movmem_start_even:
        mov.l   @r5+,r2
        mov.l   @r5+,r3
        mov.l   r0,@r4
        dt      r6              ! decrement r6, T = (r6 == 0)
        mov.l   r1,@(4,r4)
        bf/s    L_movmem_loop   ! counter not yet zero: keep going
        mov.l   r2,@(8,r4)      ! delay slot: executed on both paths
        rts
        mov.l   r3,@(12,r4)     ! delay slot: last word of this pass

        ENDFUNC(GLOBAL(movmem_i4_even))
        ENDFUNC(GLOBAL(movmem_i4_odd))

! movmemSI12_i4: copy three words (12 bytes) from the address in r5 to
! the address in r4.  All loads are issued before the first store.
! Clobbers r0, r1 and r2.
        .p2align        4
GLOBAL(movmemSI12_i4):
        mov.l   @r5,r0
        mov.l   @(4,r5),r1
        mov.l   @(8,r5),r2
        mov.l   r0,@r4
        mov.l   r1,@(4,r4)
        rts
        mov.l   r2,@(8,r4)      ! delay slot

        ENDFUNC(GLOBAL(movmemSI12_i4))
#endif

#ifdef L_mulsi3


        .global GLOBAL(mulsi3)
        HIDDEN_FUNC(GLOBAL(mulsi3))

!
! GLOBAL(mulsi3)
!
! 32-bit multiply built from 16x16-bit unsigned partial products.
!
! Entry:
!
! r4 =       aabb
! r5 =       ccdd
!
! Exit:
!
! r0 = low 32 bits of aabb*ccdd
!
! Destroys:
!
! r2, r3, macl, T bit; additionally r1 when either upper half is non-zero
!
! if aa == 0 and cc == 0
! r0 = bb*dd
!
! else
! r0 = bb*dd + (aa*dd*65536) + (cc*bb*65536)
!
! The aa*cc term is never computed: it would be scaled by 2^32 and so
! cannot contribute to the low 32 bits of the product.
!

GLOBAL(mulsi3):
        mulu.w  r4,r5           ! multiply the lsws  macl=bb*dd
        mov     r5,r3           ! r3 = ccdd
        swap.w  r4,r2           ! r2 = bbaa
        xtrct   r2,r3           ! r3 = aacc
        tst     r3,r3           ! msws zero ?
        bf      LOCAL(mulsi3_hiset)
        rts                     ! yes - then we have the answer
        sts     macl,r0

LOCAL(mulsi3_hiset):
        sts     macl,r0         ! r0 = bb*dd
        mulu.w  r2,r5           ! brewing macl = aa*dd
        sts     macl,r1
        mulu.w  r3,r4           ! brewing macl = cc*bb
        sts     macl,r2
        add     r1,r2           ! r2 = aa*dd + cc*bb
        shll16  r2              ! scale the cross terms by 65536
        rts
        add     r2,r0           ! delay slot: add the cross terms to bb*dd

        ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
#ifdef L_sdivsi3_i4
        .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
!! Both variants below compute r4 / r5 through the FPU: each operand is
!! moved to fpul and converted to double with float, the doubles are
!! divided with fdiv, and ftrc converts the quotient back to an integer
!! in fpul.
#ifdef __SH4__
!! args in r4 and r5, result in fpul, clobber dr0, dr2
!! NOTE(review): this variant does not touch fpscr, so it presumably
!! relies on the caller already running in double precision mode - confirm.

        .global GLOBAL(sdivsi3_i4)
        HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
        lds r4,fpul
        float fpul,dr0          ! dr0 = (double) dividend
        lds r5,fpul
        float fpul,dr2          ! dr2 = (double) divisor
        fdiv dr2,dr0            ! dr0 = dr0 / dr2
        rts
        ftrc dr0,fpul           ! delay slot: fpul = integer quotient

        ENDFUNC(GLOBAL(sdivsi3_i4))
#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
!! This variant saves fpscr on the stack, loads it with 0x00080000 for
!! the duration of the division (the PR, double precision, bit per the
!! SH-4 manual), and restores the saved value on return.

#if ! __SH5__ || __SH5__ == 32
#if __SH5__
        .mode   SHcompact
#endif
        .global GLOBAL(sdivsi3_i4)
        HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
        sts.l fpscr,@-r15       ! save the fpscr of the caller
        mov #8,r2
        swap.w r2,r2            ! r2 = 8 << 16 = 0x00080000
        lds r2,fpscr
        lds r4,fpul
        float fpul,dr0          ! dr0 = (double) dividend
        lds r5,fpul
        float fpul,dr2          ! dr2 = (double) divisor
        fdiv dr2,dr0            ! dr0 = dr0 / dr2
        ftrc dr0,fpul           ! fpul = integer quotient
        rts
        lds.l @r15+,fpscr       ! delay slot: restore the fpscr of the caller

        ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ */
#endif

#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
   sh2e/sh3e code.  */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
!!
!! Steve Chamberlain
!! sac@cygnus.com
!!
!!

!! args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit

        .global GLOBAL(sdivsi3)
#if __SHMEDIA__
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align  2
#if 0
/* The assembly code that follows is a hand-optimized version of the C
   code that follows.  Note that the registers that are modified are
   exactly those listed as clobbered in the patterns divsi3_i1 and
   divsi3_i1_media.
        
int __sdivsi3 (i, j)
     int i, j;
{
  register unsigned long long r18 asm ("r18");
  register unsigned long long r19 asm ("r19");
  register unsigned long long r0 asm ("r0") = 0;
  register unsigned long long r1 asm ("r1") = 1;
  register int r2 asm ("r2") = i >> 31;
  register int r3 asm ("r3") = j >> 31;

  r2 = r2 ? r2 : r1;
  r3 = r3 ? r3 : r1;
  r18 = i * r2;
  r19 = j * r3;
  r2 *= r3;
  
  r19 <<= 31;
  r1 <<= 31;
  do
    if (r18 >= r19)
      r0 |= r1, r18 -= r19;
  while (r19 >>= 1, r1 >>= 1);

  return r2 * (int)r0;
}
*/
GLOBAL(sdivsi3):
        pt/l    LOCAL(sdivsi3_dontadd), tr2
        pt/l    LOCAL(sdivsi3_loop), tr1
        ptabs/l r18, tr0
        movi    0, r0
        movi    1, r1
        shari.l r4, 31, r2
        shari.l r5, 31, r3
        cmveq   r2, r1, r2
        cmveq   r3, r1, r3
        muls.l  r4, r2, r18
        muls.l  r5, r3, r19
        muls.l  r2, r3, r2
        shlli   r19, 31, r19
        shlli   r1, 31, r1
LOCAL(sdivsi3_loop):
        bgtu    r19, r18, tr2
        or      r0, r1, r0
        sub     r18, r19, r18
LOCAL(sdivsi3_dontadd):
        shlri   r1, 1, r1
        shlri   r19, 1, r19
        bnei    r1, 0, tr1
        muls.l  r0, r2, r0
        add.l   r0, r63, r0
        blink   tr0, r63
#elif 0 /* ! 0 */
 // inputs: r4,r5
 // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0
 // result in r0
GLOBAL(sdivsi3):
 // can create absolute value without extra latency,
 // but dependent on proper sign extension of inputs:
 // shari.l r5,31,r2
 // xor r5,r2,r20
 // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended.
 shari.l r5,31,r2
 ori r2,1,r2
 muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended.
 movi 0xffffffffffffbb0c,r19 // shift count eqiv 76
 shari.l r4,31,r3
 nsb r20,r0
 shlld r20,r0,r25
 shlri r25,48,r25
 sub r19,r25,r1
 mmulfx.w r1,r1,r2
 mshflo.w r1,r63,r1
 // If r4 was to be used in-place instead of r21, could use this sequence
 // to compute absolute:
 // sub r63,r4,r19 // compute absolute value of r4
 // shlri r4,32,r3 // into lower 32 bit of r4, keeping
 // mcmv r19,r3,r4 // the sign in the upper 32 bits intact.
 ori r3,1,r3
 mmulfx.w r25,r2,r2
 sub r19,r0,r0
 muls.l r4,r3,r21
 msub.w r1,r2,r2
 addi r2,-2,r1
 mulu.l r21,r1,r19
 mmulfx.w r2,r2,r2
 shlli r1,15,r1
 shlrd r19,r0,r19
 mulu.l r19,r20,r3
 mmacnfx.wl r25,r2,r1
 ptabs r18,tr0
 sub r21,r3,r25

 mulu.l r25,r1,r2
 addi r0,14,r0
 xor r4,r5,r18
 shlrd r2,r0,r2
 mulu.l r2,r20,r3
 add r19,r2,r19
 shari.l r18,31,r18
 sub r25,r3,r25

 mulu.l r25,r1,r2
 sub r25,r20,r25
 add r19,r18,r19
 shlrd r2,r0,r2
 mulu.l r2,r20,r3
 addi r25,1,r25
 add r19,r2,r19

 cmpgt r25,r3,r25
 add.l r19,r25,r0
 xor r0,r18,r0
 blink tr0,r63
#else /* ! 0 && ! 0 */

 /* Table-driven signed 32-bit divide for SHmedia.
    sdivsi3_2 expects the address of the division table in r20.  In
    non-PIC builds the extra entry points sdivsi3 / sdivsi3_1 first load the
    address of div_table_internal into r20 and then fall through into
    sdivsi3_2.  The return address is taken from r18.  */
 // inputs: r4,r5
 // clobbered: r1,r18,r19,r20,r21,r25,tr0
 // result in r0
        HIDDEN_FUNC(GLOBAL(sdivsi3_2))
#ifndef __pic__
        FUNC(GLOBAL(sdivsi3))
GLOBAL(sdivsi3): /* this is the shcompact entry point */
 // The special SHmedia entry point sdivsi3_1 prevents accidental linking
 // with the SHcompact implementation, which clobbers tr1 / tr2.
 .global GLOBAL(sdivsi3_1)
GLOBAL(sdivsi3_1):
 .global GLOBAL(div_table_internal)
 // Build the absolute address of div_table_internal in r20, 16 bits at a time.
 movi (GLOBAL(div_table_internal) >> 16) & 65535, r20
 shori GLOBAL(div_table_internal) & 65535, r20
#endif
 .global GLOBAL(sdivsi3_2)
 // div_table in r20
 // clobbered: r1,r18,r19,r21,r25,tr0
GLOBAL(sdivsi3_2):
 nsb r5, r1
 shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
 shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
 ldx.ub r20, r21, r19 // u0.8
 shari r25, 32, r25   // normalize to s2.30
 shlli r21, 1, r21
 muls.l r25, r19, r19 // s2.38
 ldx.w r20, r21, r21  // s2.14
  ptabs r18, tr0      // capture the return address before r18 is reused below
 shari r19, 24, r19   // truncate to s2.14
 sub r21, r19, r19    // some 11 bit inverse in s1.14
 muls.l r19, r19, r21 // u0.28
  sub r63, r1, r1
  addi r1, 92, r1     // r1 = 92 - normalization shift; final shift count
 muls.l r25, r21, r18 // s2.58
 shlli r19, 45, r19   // multiply by two and convert to s2.58
  /* bubble */
 sub r19, r18, r18
 shari r18, 28, r18   // some 22 bit inverse in s1.30
 muls.l r18, r25, r0  // s2.60
  muls.l r18, r4, r25 // s32.30
  /* bubble */
 shari r0, 16, r19   // s-16.44
 muls.l r19, r18, r19 // s-16.74
  shari r25, 63, r0   // r0 = 0 or -1: sign of the product in r25
  shari r4, 14, r18   // s19.-14
 shari r19, 30, r19   // s-16.44
 muls.l r19, r18, r19 // s15.30
  xor r21, r0, r21    // You could also use the constant 1 << 27.
  add r21, r25, r21
 sub r21, r19, r21
 shard r21, r1, r21
 sub r21, r0, r0
 blink tr0, r63       // return through tr0
#ifndef __pic__
        ENDFUNC(GLOBAL(sdivsi3))
#endif
        ENDFUNC(GLOBAL(sdivsi3_2))
#endif
#elif defined __SHMEDIA__
/* m5compact-nofpu */
/* Signed 32-bit divide as a 32-iteration shift-and-subtract loop.
   Inputs are read from r4 (dividend) and r5 (divisor), the quotient is
   written to r0, and the return address is taken from r18.  */
 // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        FUNC(GLOBAL(sdivsi3))
GLOBAL(sdivsi3):
        pt/l LOCAL(sdivsi3_dontsub), tr0
        pt/l LOCAL(sdivsi3_loop), tr1
        ptabs/l r18,tr2                 // tr2 = return address
        shari.l r4,31,r18               // r18 = sign mask of the dividend
        shari.l r5,31,r19               // r19 = sign mask of the divisor
        xor r4,r18,r20
        xor r5,r19,r21
        sub.l r20,r18,r20               // r20 = absolute value of the dividend
        sub.l r21,r19,r21               // r21 = absolute value of the divisor
        xor r18,r19,r19                 // r19 = sign mask of the quotient
        shlli r21,32,r25                // r25 = divisor in the upper 32 bits
        addi r25,-1,r21                 // r21 = (divisor << 32) - 1
        addz.l r20,r63,r20              // zero-extend the dividend to 64 bits
LOCAL(sdivsi3_loop):
        shlli r20,1,r20
        bgeu/u r21,r20,tr0              // skip the subtract if r21 >= r20 (unsigned)
        sub r20,r21,r20                 // subtracts the divisor from the upper half
                                        // and adds 1 to bit 0 (the quotient bit)
LOCAL(sdivsi3_dontsub):
        addi.l r25,-1,r25               // low 32 bits of r25 start at 0 and count down
        bnei r25,-32,tr1                // loop until the counter reaches -32
        xor r20,r19,r20                 // apply the quotient sign:
        sub.l r20,r19,r0                // r0 = (r20 ^ sign) - sign, 32-bit result
        blink tr2,r63
        ENDFUNC(GLOBAL(sdivsi3))
#else /* ! __SHMEDIA__ */
        /* Signed 32-bit divide for SHcompact using the div0s/div1 step
           instructions.  Reads the dividend from r4 and the divisor from r5,
           returns the quotient in r0, and uses r1, r2 and r3 as scratch.
           A zero divisor returns 0 instead of trapping.  */
        FUNC(GLOBAL(sdivsi3))
GLOBAL(sdivsi3):
        mov     r4,r1                   /* r1 = dividend, becomes the quotient */
        mov     r5,r0                   /* r0 = divisor */

        tst     r0,r0
        bt      div0                    /* divisor is zero: return 0 */
        mov     #0,r2                   /* r2 = 0, used as a constant below */
        div0s   r2,r1                   /* T = sign bit of the dividend */
        subc    r3,r3                   /* r3 = -T: sign extension of the dividend */
        subc    r2,r1                   /* r1 -= T: adjust a negative dividend */
        div0s   r0,r3                   /* set up the division flags from divisor and r3 */
        rotcl   r1
        /* 32 div1 steps; each rotcl shifts the T bit of the previous step
           into r1 and moves the next dividend bit into T.  */
        /* steps 1-8 */
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        /* steps 9-16 */
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        /* steps 17-24 */
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        /* steps 25-32 */
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        div1    r0,r3
        rotcl   r1
        addc    r2,r1                   /* r1 += T (r2 is 0): final correction */
        rts
        mov     r1,r0                   /* delay slot: move the quotient to r0 */


div0:   rts
        mov     #0,r0                   /* delay slot: result 0 for a zero divisor */

        ENDFUNC(GLOBAL(sdivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* ! __SH4__ */
#endif
#ifdef L_udivsi3_i4

        .title "SH DIVIDE"
!! 4 byte integer Divide code for the Renesas SH
#ifdef __SH4__
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
!! and t bit
/* Unsigned 32-bit divide done in double precision floating point.
   Each operand has its top bit flipped, is converted as a signed integer,
   and then has 2^31 (the constant at L1) added back, which yields the
   unsigned value as a double.  Divisors 0 and 1 take the "trivial" path
   and return the dividend unchanged.  */

        .global GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
        mov #1,r1
        cmp/hi r1,r5            /* T = (divisor > 1), unsigned */
        bf trivial
        rotr r1                 /* r1 = 0x80000000 */
        xor r1,r4               /* flip the top bit of the dividend */
        lds r4,fpul
        mova L1,r0              /* r0 = address of the 2^31 constant */
#ifdef FMOVD_WORKS
        fmov.d @r0+,dr4
#else
        fmov.s @r0+,DR40
        fmov.s @r0,DR41
#endif
        float fpul,dr0
        xor r1,r5               /* flip the top bit of the divisor */
        lds r5,fpul
        float fpul,dr2
        fadd dr4,dr0            /* dr0 = dividend as an unsigned value */
        fadd dr4,dr2            /* dr2 = divisor as an unsigned value */
        fdiv dr2,dr0
        rts
        ftrc dr0,fpul           /* delay slot: truncate the quotient into fpul */

trivial:
        rts
        lds r4,fpul             /* delay slot: result is the dividend itself */

        .align 2
#ifdef FMOVD_WORKS
        .align 3        ! make double below 8 byte aligned.
#endif
L1:
        .double 2147483648

        ENDFUNC(GLOBAL(udivsi3_i4))
#elif defined (__SH5__) && ! defined (__SH4_NOFPU__)
#if ! __SH5__ || __SH5__ == 32
!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
/* SHmedia version: zero-extends both operands to 64 bits, converts them to
   double, divides, and truncates back to a 64-bit integer.  The code leaves
   the low result word in fr32.
   NOTE(review): presumably fr32 is the register that backs fpul for the
   caller - confirm against the SH-5 ABI.  */
        .mode   SHmedia
        .global GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
        addz.l  r4,r63,r20      // r20 = zero-extended dividend
        addz.l  r5,r63,r21      // r21 = zero-extended divisor
        fmov.qd r20,dr0
        fmov.qd r21,dr32
        ptabs   r18,tr0         // tr0 = return address
        float.qd dr0,dr0
        float.qd dr32,dr32
        fdiv.d  dr0,dr32,dr0
        ftrc.dq dr0,dr32        // truncate the quotient to a 64-bit integer
        fmov.s fr33,fr32        // copy one half of the result pair into fr32
        blink tr0,r63

        ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
/* Same algorithm as the SH4 version, but the caller may be running with a
   different FPSCR setting.  The current fpscr is pushed on the stack, a new
   value is loaded from the first word at L1, the divide is done in double
   precision, and the saved fpscr is restored in the delay slot of the
   return.  */

        .global GLOBAL(udivsi3_i4)
        HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
        mov #1,r1
        cmp/hi r1,r5            /* T = (divisor > 1), unsigned */
        bf trivial
        sts.l fpscr,@-r15       /* save the caller fpscr on the stack */
        mova L1,r0
        lds.l @r0+,fpscr        /* load the fpscr word at L1; r0 now points at the double */
        rotr r1                 /* r1 = 0x80000000 */
        xor r1,r4               /* flip the top bit of the dividend */
        lds r4,fpul
#ifdef FMOVD_WORKS
        fmov.d @r0+,dr4
#else
        fmov.s @r0+,DR40
        fmov.s @r0,DR41
#endif
        float fpul,dr0
        xor r1,r5               /* flip the top bit of the divisor */
        lds r5,fpul
        float fpul,dr2
        fadd dr4,dr0            /* add 2^31 back: dividend as an unsigned value */
        fadd dr4,dr2            /* add 2^31 back: divisor as an unsigned value */
        fdiv dr2,dr0
        ftrc dr0,fpul           /* truncate the quotient into fpul */
        rts
        lds.l @r15+,fpscr       /* delay slot: restore the caller fpscr */

/* Aligning "trivial" to 8 bytes places L1 (after its two 2-byte instructions)
   at 8n+4, so the .double that follows the 4-byte fpscr word is 8-byte
   aligned.  */
#ifdef FMOVD_WORKS
        .align 3        ! make double below 8 byte aligned.
#endif
trivial:
        rts
        lds r4,fpul             /* delay slot: result is the dividend itself */

        .align 2
L1:
/* fpscr value used during the divide: bit 19 set, plus bit 20 when fmov.d
   is used to load the constant.  */
#ifndef FMOVD_WORKS
        .long 0x80000
#else
        .long 0x180000
#endif
        .double 2147483648

        ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
#endif

#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
   sh2e/sh3e code.  */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)

!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit
        .global GLOBAL(udivsi3)
        HIDDEN_FUNC(GLOBAL(udivsi3))

#if __SHMEDIA__
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align  2
#if 0
/* The assembly code that follows is a hand-optimized version of the C
   code that follows.  Note that the registers that are modified are
   exactly those listed as clobbered in the patterns udivsi3_i1 and
   udivsi3_i1_media.
        
unsigned 
__udivsi3 (i, j)
    unsigned i, j; 
{
  register unsigned long long r0 asm ("r0") = 0;
  register unsigned long long r18 asm ("r18") = 1;
  register unsigned long long r4 asm ("r4") = i;
  register unsigned long long r19 asm ("r19") = j;

  r19 <<= 31;
  r18 <<= 31;
  do
    if (r4 >= r19)
      r0 |= r18, r4 -= r19;
  while (r19 >>= 1, r18 >>= 1);

  return r0;
}
*/
/* This bit-at-a-time reference loop sits under "#if 0" and is never
   assembled; the variant in the "#else" branch is the one that is built.  */
GLOBAL(udivsi3):
        pt/l    LOCAL(udivsi3_dontadd), tr2
        pt/l    LOCAL(udivsi3_loop), tr1
        ptabs/l r18, tr0                // tr0 = return address, taken before r18 is reused
        movi    0, r0                   // r0 = quotient accumulator
        movi    1, r18                  // r18 = current quotient bit
        addz.l  r5, r63, r19            // r19 = zero-extended divisor
        addz.l  r4, r63, r4             // r4 = zero-extended dividend
        shlli   r19, 31, r19
        shlli   r18, 31, r18
LOCAL(udivsi3_loop):
        bgtu    r19, r4, tr2            // shifted divisor too large: skip this bit
        or      r0, r18, r0
        sub     r4, r19, r4
LOCAL(udivsi3_dontadd):
        shlri   r18, 1, r18
        shlri   r19, 1, r19
        bnei    r18, 0, tr1             // loop until the bit mask has shifted out
        blink   tr0, r63
#else
/* Unsigned 32-bit divide for SHmedia, multiply-based.
   Reads the dividend from r4 and the divisor from r5, returns the quotient
   in r0, and takes the return address from r18.  The symbol type is declared
   by the HIDDEN_FUNC before the variant selection; the ENDFUNC at the end of
   this variant supplies the matching size information.  */
GLOBAL(udivsi3):
 // inputs: r4,r5
 // clobbered: r18,r19,r20,r21,r22,r25,tr0
 // result in r0.
 addz.l r5,r63,r22 // r22 = zero-extended divisor
 nsb r22,r0 // r0 = shift count used to normalize r22 below
 shlld r22,r0,r25
 shlri r25,48,r25
 movi 0xffffffffffffbb0c,r20 // shift count equiv 76
 sub r20,r25,r21
 mmulfx.w r21,r21,r19
 mshflo.w r21,r63,r21
 ptabs r18,tr0 // capture the return address before r18 is reused below
 mmulfx.w r25,r19,r19
 sub r20,r0,r0
 /* bubble */
 msub.w r21,r19,r19
 addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21
                    before the msub.w, but we need a different value for
                    r19 to keep errors under control.  */
 mulu.l r4,r21,r18
 mmulfx.w r19,r19,r19
 shlli r21,15,r21
 shlrd r18,r0,r18 // r18 = first partial quotient
 mulu.l r18,r22,r20
 mmacnfx.wl r25,r19,r21
 /* bubble */
 sub r4,r20,r25 // r25 = dividend minus partial quotient times divisor

 mulu.l r25,r21,r19
 addi r0,14,r0
 /* bubble */
 shlrd r19,r0,r19
 mulu.l r19,r22,r20
 add r18,r19,r18 // accumulate the second partial quotient
 /* bubble */
 sub.l r25,r20,r25

 mulu.l r25,r21,r19
 addz.l r25,r63,r25
 sub r25,r22,r25
 shlrd r19,r0,r19
 mulu.l r19,r22,r20
 addi r25,1,r25
 add r18,r19,r18 // accumulate the third partial quotient

 cmpgt r25,r20,r25 // r25 = 1 when one more divisor still fits, else 0
 add.l r18,r25,r0
 blink tr0,r63
        ENDFUNC(GLOBAL(udivsi3))
#endif
#elif defined (__SHMEDIA__)
/* m5compact-nofpu - more emphasis on code size than on speed, but don't
   ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
   So use a short shmedia loop.  */
/* Unsigned 32-bit divide as a 32-iteration shift-and-subtract loop.
   Reads the dividend from r4 and the divisor from r5, returns the quotient
   in r0, and takes the return address from r18.  The ENDFUNC at the end
   pairs with the HIDDEN_FUNC emitted before the variant selection so that
   the symbol gets size information in this variant as well.  */
 // clobbered: r20,r21,r25,tr0,tr1,tr2
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
GLOBAL(udivsi3):
 pt/l LOCAL(udivsi3_dontsub), tr0
 pt/l LOCAL(udivsi3_loop), tr1
 ptabs/l r18,tr2 // tr2 = return address
 shlli r5,32,r25 // r25 = divisor in the upper 32 bits
 addi r25,-1,r21 // r21 = (divisor << 32) - 1
 addz.l r4,r63,r20 // r20 = zero-extended dividend
LOCAL(udivsi3_loop):
 shlli r20,1,r20
 bgeu/u r21,r20,tr0 // skip the subtract if r21 >= r20 (unsigned)
 sub r20,r21,r20 // subtracts the divisor from the upper half and adds 1 to
                 // bit 0 (the quotient bit)
LOCAL(udivsi3_dontsub):
 addi.l r25,-1,r25 // low 32 bits of r25 start at 0 and count down
 bnei r25,-32,tr1 // loop until the counter reaches -32
 add.l r20,r63,r0 // quotient = low 32 bits of r20
 blink tr2,r63
        ENDFUNC(GLOBAL(udivsi3))
#else /* ! defined (__SHMEDIA__) */
/* Unsigned 32-bit divide for SHcompact built from div1 steps.
   Reads the dividend from r4 and the divisor from r5 and returns the
   quotient in r0.  pr is saved on the stack because the helpers below are
   called with bsr.  A divisor that fits in 16 bits takes the path that
   shifts the divisor up by 16 and divides in two halves; any other divisor
   goes to large_divisor.  */
/* div8 / div7: run 8 respectively 7 div1 steps, the last one in the delay
   slot of the rts.  */
LOCAL(div8):
 div1 r5,r4
LOCAL(div7):
 div1 r5,r4; div1 r5,r4; div1 r5,r4
 div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4

/* divx4: four div1 steps with the quotient bits rotated into r0 in between;
   the last div1 sits in the delay slot of the rts.  */
LOCAL(divx4):
 div1 r5,r4; rotcl r0
 div1 r5,r4; rotcl r0
 div1 r5,r4; rotcl r0
 rts; div1 r5,r4

GLOBAL(udivsi3):
 sts.l pr,@-r15
 extu.w r5,r0
 cmp/eq r5,r0
 /* T is set when the divisor equals its zero-extended low 16 bits.  The sh1
    build uses a plain bf, so the div0u below runs only on the fall-through
    path and large_divisor repeats it; otherwise bf/s runs it in the delay
    slot on both paths.  */
#ifdef __sh1__
 bf LOCAL(large_divisor)
#else
 bf/s LOCAL(large_divisor)
#endif
 div0u
 swap.w r4,r0
 shlr16 r4
 bsr LOCAL(div8)
 shll16 r5
 bsr LOCAL(div7)
 div1 r5,r4
 xtrct r4,r0
 xtrct r0,r4
 bsr LOCAL(div8)
 swap.w r4,r4
 bsr LOCAL(div7)
 div1 r5,r4
 lds.l @r15+,pr
 xtrct r4,r0
 swap.w r0,r0
 rotcl r0
 rts
 /* delay slot: undo the shll16 applied to r5 above */
 shlr16 r5

LOCAL(large_divisor):
#ifdef __sh1__
 div0u
#endif
 mov #0,r0
 xtrct r4,r0
 xtrct r0,r4
 /* Four groups of divx4; each rotcl in a bsr delay slot and the one in the
    final rts delay slot shifts a quotient bit into r0.  */
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 bsr LOCAL(divx4)
 rotcl r0
 lds.l @r15+,pr
 rts
 rotcl r0

        ENDFUNC(GLOBAL(udivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* __SH4__ */
#endif /* L_udivsi3 */

#ifdef L_udivdi3
#ifdef __SHMEDIA__
        /* Unsigned 64-bit divide for SHmedia.
           Reads the dividend from r2 and the divisor from r3, returns the
           quotient in r2, and takes the return address from r18.  The
           hidden alias udivdi3_internal is the entry used by divdi3.  The
           code builds a reciprocal approximation of the normalized divisor
           and then takes one of two paths depending on the normalization
           shift (see the branch to large_divisor).  */
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        .global GLOBAL(udivdi3)
        FUNC(GLOBAL(udivdi3))
GLOBAL(udivdi3):
        HIDDEN_ALIAS(udivdi3_internal,udivdi3)
        shlri r3,1,r4
        nsb r4,r22              // r22 = normalization shift for the divisor
        shlld r3,r22,r6         // r6 = normalized divisor
        shlri r6,49,r5
        movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
        sub r21,r5,r1
        mmulfx.w r1,r1,r4
        mshflo.w r1,r63,r1
        sub r63,r22,r20 // r63 == 64 % 64
        mmulfx.w r5,r4,r4
        pta LOCAL(large_divisor),tr0
        addi r20,32,r9          // r9 = 32 - r22
        msub.w r1,r4,r1
        madd.w r1,r1,r1
        mmulfx.w r1,r1,r4
        shlri r6,32,r7          // r7 = upper 32 bits of the normalized divisor
        bgt/u r9,r63,tr0 // large_divisor
        mmulfx.w r5,r4,r4
        shlri r2,32+14,r19
        addi r22,-31,r0
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r19,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        mulu.l r5,r3,r8
        mshalds.l r1,r21,r1
        shari r4,26,r4
        shlld r8,r0,r8
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r2,r8,r2
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

        shlri r2,22,r21
        mulu.l r21,r1,r21
        shlld r5,r0,r8          // r8 = first partial quotient, shifted into place
        addi r20,30-22,r0
        shlrd r21,r0,r21
        mulu.l r21,r3,r5
        add r8,r21,r8           // accumulate the second partial quotient
        mcmpgt.l r21,r63,r21 // See Note 1
        addi r20,30,r0
        mshfhi.l r63,r21,r21
        sub r2,r5,r2
        andc r2,r21,r2

        /* small divisor: need a third divide step */
        mulu.l r2,r1,r7
        ptabs r18,tr0           // tr0 = return address
        addi r2,1,r2
        shlrd r7,r0,r7
        mulu.l r7,r3,r5
        add r8,r7,r8            // accumulate the third partial quotient
        sub r2,r3,r2
        cmpgt r2,r5,r5          // r5 = 1 when one more divisor still fits, else 0
        add r8,r5,r2            // final quotient in r2
        /* could test r3 here to check for divide by zero.  */
        blink tr0,r63

LOCAL(large_divisor):
        mmulfx.w r5,r4,r4
        shlrd r2,r9,r25
        shlri r25,32,r8
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r8,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        shlri r5,14-1,r8
        mulu.l r8,r7,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r25,r5,r25
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

        shlri r25,22,r21
        mulu.l r21,r1,r21
        pta LOCAL(no_lo_adj),tr0
        addi r22,32,r0
        shlri r21,40,r21
        mulu.l r21,r7,r5
        add r8,r21,r8
        shlld r2,r0,r2
        sub r25,r5,r25
        bgtu/u r7,r25,tr0 // no_lo_adj
        addi r8,1,r8            // one more divisor fits: bump the quotient
        sub r25,r7,r25
LOCAL(no_lo_adj):
        mextr4 r2,r25,r2

        /* large_divisor: only needs a few adjustments.  */
        mulu.l r8,r6,r5
        ptabs r18,tr0           // tr0 = return address
        /* bubble */
        cmpgtu r5,r2,r5         // r5 = 1 when the quotient estimate is one too high
        sub r8,r5,r2            // final quotient in r2
        blink tr0,r63
        ENDFUNC(GLOBAL(udivdi3))
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_udivdi3 */

#ifdef L_divdi3
#ifdef __SHMEDIA__
        /* Signed 64-bit divide for SHmedia, built on udivdi3_internal.
           Takes the absolute value of r2 and r3.  When both operands have
           the same sign it jumps straight to the unsigned routine, which
           then returns to the original caller.  Otherwise it calls the
           unsigned routine and negates the quotient in r2 before
           returning.  */
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        .global GLOBAL(divdi3)
        FUNC(GLOBAL(divdi3))
GLOBAL(divdi3):
        pta GLOBAL(udivdi3_internal),tr0
        shari r2,63,r22         // r22 = 0 or -1: sign mask of the dividend
        shari r3,63,r23         // r23 = 0 or -1: sign mask of the divisor
        xor r2,r22,r2
        xor r3,r23,r3
        sub r2,r22,r2           // r2 = absolute value of the dividend
        sub r3,r23,r3           // r3 = absolute value of the divisor
        beq/u r22,r23,tr0       // same signs: tail-jump, r18 still holds the caller
        ptabs r18,tr1           // keep the original return address in tr1
        blink tr0,r18           // call the unsigned divide, linking through r18
        sub r63,r2,r2           // signs differ: negate the quotient
        blink tr1,r63
        ENDFUNC(GLOBAL(divdi3))
#endif /* __SHMEDIA__ */
#endif /* L_divdi3 */

#ifdef L_umoddi3
#ifdef __SHMEDIA__
        /* Unsigned 64-bit remainder for SHmedia.
           Reads the dividend from r2 and the divisor from r3, returns the
           remainder in r2, and takes the return address from r18.  The
           hidden alias umoddi3_internal is the entry used by moddi3.  The
           instruction sequence mirrors udivdi3 but tracks the remainder
           instead of accumulating the quotient.  */
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        .global GLOBAL(umoddi3)
        FUNC(GLOBAL(umoddi3))
GLOBAL(umoddi3):
        HIDDEN_ALIAS(umoddi3_internal,umoddi3)
        shlri r3,1,r4
        nsb r4,r22              // r22 = normalization shift for the divisor
        shlld r3,r22,r6         // r6 = normalized divisor
        shlri r6,49,r5
        movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
        sub r21,r5,r1
        mmulfx.w r1,r1,r4
        mshflo.w r1,r63,r1
        sub r63,r22,r20 // r63 == 64 % 64
        mmulfx.w r5,r4,r4
        pta LOCAL(large_divisor),tr0
        addi r20,32,r9          // r9 = 32 - r22
        msub.w r1,r4,r1
        madd.w r1,r1,r1
        mmulfx.w r1,r1,r4
        shlri r6,32,r7          // r7 = upper 32 bits of the normalized divisor
        bgt/u r9,r63,tr0 // large_divisor
        mmulfx.w r5,r4,r4
        shlri r2,32+14,r19
        addi r22,-31,r0
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r19,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        mulu.l r5,r3,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        shlld r5,r0,r5
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r2,r5,r2
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */

        shlri r2,22,r21
        mulu.l r21,r1,r21
        addi r20,30-22,r0
        /* bubble */ /* could test r3 here to check for divide by zero.  */
        shlrd r21,r0,r21
        mulu.l r21,r3,r5
        mcmpgt.l r21,r63,r21 // See Note 1
        addi r20,30,r0
        mshfhi.l r63,r21,r21
        sub r2,r5,r2
        andc r2,r21,r2

        /* small divisor: need a third divide step */
        mulu.l r2,r1,r7
        ptabs r18,tr0           // tr0 = return address
        sub r2,r3,r8 /* re-use r8 here for rest - r3 */
        shlrd r7,r0,r7
        mulu.l r7,r3,r5
        /* bubble */
        addi r8,1,r7
        cmpgt r7,r5,r7          // r7 = 1 when one more divisor still fits
        cmvne r7,r8,r2          // if so, continue from rest - r3
        sub r2,r5,r2            // final remainder in r2
        blink tr0,r63

LOCAL(large_divisor):
        mmulfx.w r5,r4,r4
        shlrd r2,r9,r25
        shlri r25,32,r8
        msub.w r1,r4,r1

        mulu.l r1,r7,r4
        addi r1,-3,r5
        mulu.l r5,r8,r5
        sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
        shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
                         the case may be, %0000000000000000 000.11111111111, still */
        muls.l r1,r4,r4 /* leaving at least one sign bit.  */
        shlri r5,14-1,r8
        mulu.l r8,r7,r5
        mshalds.l r1,r21,r1
        shari r4,26,r4
        add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
        sub r25,r5,r25
        /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */

        shlri r25,22,r21
        mulu.l r21,r1,r21
        pta LOCAL(no_lo_adj),tr0
        addi r22,32,r0
        shlri r21,40,r21
        mulu.l r21,r7,r5
        add r8,r21,r8
        shlld r2,r0,r2
        sub r25,r5,r25
        bgtu/u r7,r25,tr0 // no_lo_adj
        addi r8,1,r8
        sub r25,r7,r25
LOCAL(no_lo_adj):
        mextr4 r2,r25,r2

        /* large_divisor: only needs a few adjustments.  */
        mulu.l r8,r6,r5
        ptabs r18,tr0           // tr0 = return address
        add r2,r6,r7            // r7 = rest plus one normalized divisor
        cmpgtu r5,r2,r8         // r8 = 1 when the quotient estimate was one too high
        cmvne r8,r7,r2          // if so, continue from the corrected rest
        sub r2,r5,r2
        shlrd r2,r22,r2         // shift right by the normalization count r22
        blink tr0,r63
        ENDFUNC(GLOBAL(umoddi3))
/* Note 1: To shift the result of the second divide stage so that the result
   always fits into 32 bits, yet we still reduce the rest sufficiently
   would require a lot of instructions to do the shifts just right.  Using
   the full 64 bit shift result to multiply with the divisor would require
   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
   Fortunately, if the upper 32 bits of the shift result are nonzero, we
   know that the rest after taking this partial result into account will
   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
   upper 32 bits of the partial result are nonzero.  */
#endif /* __SHMEDIA__ */
#endif /* L_umoddi3 */

#ifdef L_moddi3
#ifdef __SHMEDIA__
        /* Signed 64-bit remainder for SHmedia, built on umoddi3_internal.
           Takes the absolute value of r2 and r3.  When the dividend is
           non-negative it jumps straight to the unsigned routine, which
           then returns to the original caller.  Otherwise it calls the
           unsigned routine and negates the remainder in r2, so the result
           carries the sign of the dividend.  */
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        .global GLOBAL(moddi3)
        FUNC(GLOBAL(moddi3))
GLOBAL(moddi3):
        pta GLOBAL(umoddi3_internal),tr0
        shari r2,63,r22         // r22 = 0 or -1: sign mask of the dividend
        shari r3,63,r23         // r23 = 0 or -1: sign mask of the divisor
        xor r2,r22,r2
        xor r3,r23,r3
        sub r2,r22,r2           // r2 = absolute value of the dividend
        sub r3,r23,r3           // r3 = absolute value of the divisor
        beq/u r22,r63,tr0       // dividend not negative: tail-jump
        ptabs r18,tr1           // keep the original return address in tr1
        blink tr0,r18           // call the unsigned remainder, linking through r18
        sub r63,r2,r2           // negative dividend: negate the remainder
        blink tr1,r63
        ENDFUNC(GLOBAL(moddi3))
#endif /* __SHMEDIA__ */
#endif /* L_moddi3 */

#ifdef L_set_fpscr
#if !defined (__SH2A_NOFPU__)
#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
/* set_fpscr: load fpscr from r4 and refresh the two-word fpscr_values
   array.  Both stored words are derived from r4: one has bit 19 set and the
   other has it cleared, with bit 20 handled according to FMOVD_WORKS.  Which
   array slot receives which word depends on the target, see the conditionals
   below.  In PIC builds the array address is fetched through the GOT, with
   r12 saved and restored around the lookup.  Unless NO_FPSCR_VALUES is
   defined, the array itself is emitted here as a common symbol.  */
#ifdef __SH5__
        .mode   SHcompact
#endif
        .global GLOBAL(set_fpscr)
        HIDDEN_FUNC(GLOBAL(set_fpscr))
GLOBAL(set_fpscr):
        lds r4,fpscr
#ifdef __PIC__
        mov.l   r12,@-r15               /* save r12; it holds the GOT base below */
#ifdef __vxworks
        mov.l   LOCAL(set_fpscr_L0_base),r12
        mov.l   LOCAL(set_fpscr_L0_index),r0
        mov.l   @r12,r12
        mov.l   @(r0,r12),r12
#else
        mova    LOCAL(set_fpscr_L0),r0
        mov.l   LOCAL(set_fpscr_L0),r12
        add     r0,r12
#endif
        mov.l   LOCAL(set_fpscr_L1),r0
        mov.l   @(r0,r12),r1            /* r1 = address of fpscr_values, from the GOT */
        mov.l   @r15+,r12
#else
        mov.l LOCAL(set_fpscr_L1),r1
#endif
        /* Swap the halves so that fpscr bits 19 and 20 become bits 3 and 4
           of r0, within reach of the 8-bit immediates used below.  */
        swap.w r4,r0
        or #24,r0                       /* set bits 19 and 20 */
#ifndef FMOVD_WORKS
        xor #16,r0                      /* clear bit 20 again */
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
        swap.w r0,r3
        mov.l r3,@(4,r1)                /* value with bit 19 set goes to the second slot */
#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
        swap.w r0,r2
        mov.l r2,@r1                    /* value with bit 19 set goes to the first slot */
#endif
#ifndef FMOVD_WORKS
        xor #8,r0                       /* clear bit 19 */
#else
        xor #24,r0                      /* clear bits 19 and 20 */
#endif
#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
        swap.w r0,r2
        rts
        mov.l r2,@r1                    /* delay slot: store the first slot */
#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
        swap.w r0,r3
        rts
        mov.l r3,@(4,r1)                /* delay slot: store the second slot */
#endif
        .align 2
        /* Literal pool for the address lookups above.  */
#ifdef __PIC__
#ifdef __vxworks
LOCAL(set_fpscr_L0_base):
        .long ___GOTT_BASE__
LOCAL(set_fpscr_L0_index):
        .long ___GOTT_INDEX__
#else
LOCAL(set_fpscr_L0):
        .long _GLOBAL_OFFSET_TABLE_
#endif
LOCAL(set_fpscr_L1):
        .long GLOBAL(fpscr_values@GOT)
#else
LOCAL(set_fpscr_L1):
        .long GLOBAL(fpscr_values)
#endif

        ENDFUNC(GLOBAL(set_fpscr))
#ifndef NO_FPSCR_VALUES
#ifdef __ELF__
        .comm   GLOBAL(fpscr_values),8,4
#else
        .comm   GLOBAL(fpscr_values),8
#endif /* ELF */
#endif /* NO_FPSCR_VALUES */
#endif /* SH2E / SH3E / SH4 */
#endif /* __SH2A_NOFPU__ */
#endif /* L_set_fpscr */
#ifdef L_ic_invalidate
#if __SH5__ == 32
        /* init_trampoline: fill in a 16-byte trampoline at the address in
           r0.  A fixed 64-bit pattern goes to offset 0, r2 to offset 8 and
           r3 to offset 12.  There is no return at the end of
           init_trampoline: execution falls through into ic_invalidate,
           which writes back and invalidates the cache block at r0 and then
           returns through r18.
           NOTE(review): the four 16-bit constants look like SHcompact
           opcodes (two pc-relative loads, a jump and a nop) - confirm
           against the trampoline layout expected by the compiler.  */
        .mode   SHmedia
        .section        .text..SHmedia32,"ax"
        .align  2
        .global GLOBAL(init_trampoline)
        HIDDEN_FUNC(GLOBAL(init_trampoline))
GLOBAL(init_trampoline):
        st.l    r0,8,r2
        /* Assemble the 64-bit pattern so that the same 16-bit units end up
           in the same memory order for either endianness.  */
#ifdef __LITTLE_ENDIAN__
        movi    9,r20
        shori   0x402b,r20
        shori   0xd101,r20
        shori   0xd002,r20
#else
        movi    0xffffffffffffd002,r20
        shori   0xd101,r20
        shori   0x402b,r20
        shori   9,r20
#endif
        st.q    r0,0,r20
        st.l    r0,12,r3
        ENDFUNC(GLOBAL(init_trampoline))
        .global GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
GLOBAL(ic_invalidate):
        ocbwb   r0,0            // write back the data cache block at r0
        synco
        icbi    r0, 0           // invalidate the instruction cache block at r0
        ptabs   r18, tr0        // tr0 = return address
        synci
        blink   tr0, r63
        ENDFUNC(GLOBAL(ic_invalidate))
#elif defined(__SH4A__)
        /* ic_invalidate for SH4A: write back the data cache block at the
           address in r4, synchronize, and invalidate the matching
           instruction cache block using the dedicated icbi instruction.  */
        .global GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
GLOBAL(ic_invalidate):
        ocbwb   @r4
        synco
        icbi    @r4
        rts
          nop
        ENDFUNC(GLOBAL(ic_invalidate))
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
        /* For system code, we use ic_invalidate_line_i, but user code
           needs a different mechanism.  A kernel call is generally not
           available, and it would also be slow.  Different SH4 variants use
           different sizes and associativities of the Icache.  We use a small
           bit of dispatch code that can be put hidden in every shared object,
           which calls the actual processor-specific invalidation code in a
           separate module.
           Or if you have operating system support, the OS could mmap the
           processor-specific code from a single page, since it is highly
           repetitive.  */
        /* Dispatcher: r4 holds the address to invalidate.  The address of
           ic_invalidate_array is loaded into r1 (through the GOT in PIC
           builds).  The word at offset 8 of the array is used as a mask on
           (r4 - r1) to pick an entry inside the array, and control jumps
           there with the word at offset 4 of the array loaded into r0 and
           r4 holding (r4 - r1).  The array code performs the return.
           ENDFUNC is emitted for PIC and non-PIC builds alike so that the
           symbol always carries size information.  */
        .global GLOBAL(ic_invalidate)
        HIDDEN_FUNC(GLOBAL(ic_invalidate))
GLOBAL(ic_invalidate):
#ifdef __pic__
#ifdef __vxworks
        mov.l   1f,r1
        mov.l   2f,r0
        mov.l   @r1,r1
        mov.l   0f,r2
        mov.l   @(r0,r1),r0
#else
        mov.l   1f,r1
        mova    1f,r0
        mov.l   0f,r2
        add     r1,r0                   /* r0 = GOT base */
#endif
        mov.l   @(r0,r2),r1             /* r1 = address of the array, from the GOT */
#else
        mov.l   0f,r1                   /* r1 = address of the array */
#endif
        ocbwb   @r4                     /* write back the data cache block first */
        mov.l   @(8,r1),r0              /* r0 = mask word stored in the array */
        sub     r1,r4                   /* r4 = offset of the target from the array */
        and     r4,r0
        add     r1,r0                   /* r0 = entry inside the array */
        jmp     @r0
        mov.l   @(4,r1),r0              /* delay slot: r0 = word at offset 4 of the array */
        .align  2
#ifndef __pic__
0:      .long   GLOBAL(ic_invalidate_array)
#else /* __pic__ */
        .global GLOBAL(ic_invalidate_array)
0:      .long   GLOBAL(ic_invalidate_array)@GOT
#ifdef __vxworks
1:      .long   ___GOTT_BASE__
2:      .long   ___GOTT_INDEX__
#else
1:      .long   _GLOBAL_OFFSET_TABLE_
#endif
#endif /* __pic__ */
        ENDFUNC(GLOBAL(ic_invalidate))
#endif /* SH4 */
#endif /* L_ic_invalidate */

#ifdef L_ic_invalidate_array
#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))))
        /* ic_invalidate_array for SH4A.  It is entered from the SH4
           ic_invalidate dispatcher, which has already subtracted r1 from
           r4; the add below restores the original address before the
           instruction cache block is invalidated with icbi.
           NOTE(review): the dispatcher reads its mask from offset 8 of this
           array.  With five 2-byte instructions ahead of the .align, the
           zero word lands at offset 12 and offset 8 holds the delay-slot
           nop plus padding - confirm that this is intended.  */
        /* This is needed when an SH4 dso with trampolines is used on SH4A.  */
        .global GLOBAL(ic_invalidate_array)
        FUNC(GLOBAL(ic_invalidate_array))
GLOBAL(ic_invalidate_array):
        add     r1,r4
        synco
        icbi    @r4
        rts
          nop
        .align 2
        .long   0
        ENDFUNC(GLOBAL(ic_invalidate_array))
#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
        /* ic_invalidate_array for SH4: a block of repeated 32-byte entries
           that the ic_invalidate dispatcher jumps into.  WAYS and WAY_SIZE
           describe the instruction cache and default to 4 and 0x4000; define
           WAYS (together with WAY_SIZE) before this point to override them.
           Every entry is exactly 32 bytes in each of the three layouts
           below.  The longs at offsets 4 and 8 of an entry are the words
           that the dispatcher loads from the start of the array.  */
        .global GLOBAL(ic_invalidate_array)
        .p2align 5
        FUNC(GLOBAL(ic_invalidate_array))
/* This must be aligned to the beginning of a cache line.  */
GLOBAL(ic_invalidate_array):
#ifndef WAYS
#define WAYS 4
#define WAY_SIZE 0x4000
#endif
#if WAYS == 1
        /* One way: each entry returns immediately; 4 bytes of code plus
           seven data words.  */
        .rept   WAY_SIZE * WAYS / 32
        rts
        nop
        .rept   7
        .long   WAY_SIZE - 32
        .endr
        .endr
#elif WAYS <= 6
        /* Two to six ways: each entry branches by r0, stepping r0 by -8 in
           the delay slot of the first branch; the remaining slots hold
           further braf/nop pairs followed by rts/nop pairs.  */
        .rept   WAY_SIZE * WAYS / 32
        braf    r0
        add     #-8,r0
        .long   WAY_SIZE + 8
        .long   WAY_SIZE - 32
        .rept   WAYS-2
        braf    r0
        nop
        .endr
        .rept   7 - WAYS
        rts
        nop
        .endr
        .endr
#else /* WAYS > 6 */
        /* This variant needs two different pages for mmap-ing.  */
        /* WAYS-1 ways made of braf entries, then a final way of entries
           that just return.  */
        .rept   WAYS-1
        .rept   WAY_SIZE / 32
        braf    r0
        nop
        .long   WAY_SIZE
        .rept 6
        .long   WAY_SIZE - 32
        .endr
        .endr
        .endr
        .rept   WAY_SIZE / 32
        rts
        .rept   15
        nop
        .endr
        .endr
#endif /* WAYS */
        ENDFUNC(GLOBAL(ic_invalidate_array))
#endif /* SH4 */
#endif /* L_ic_invalidate_array */

#if defined (__SH5__) && __SH5__ == 32
#ifdef L_shcompact_call_trampoline
        /* Dispatch table for GCC_shcompact_call_trampoline.

           Each entry is a 16-bit offset from ct_main_label to one of the
           trampoline's handlers.  The trampoline loads the address of
           this table minus 31 entries into r0 and indexes it with twice
           the result of "nsb r1", so the table holds 33 entries: the
           first corresponds to an nsb result of 31 and the last to a
           result of 63.  Judging by the masks the handlers clear, entry k
           is selected when the highest set bit of the cookie is bit
           31 - k, and the final entry when the cookie is zero.  */
        .section        .rodata
        .align  1
LOCAL(ct_main_table):
.word   LOCAL(ct_r2_fp) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r2_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r2_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r3_fp) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r3_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r3_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r4_fp) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r4_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r4_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r5_fp) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r5_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r5_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r6_fph) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r6_fpl) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r6_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r6_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r7_fph) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r7_fpl) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r7_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r7_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r8_fph) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r8_fpl) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r8_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r8_pop) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r9_fph) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r9_fpl) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r9_ld) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
/* Two entries share ct_pop_seq: it pops a run of registers ending in r9.  */
.word   LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_pop_seq) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_r9_pop) - datalabel LOCAL(ct_main_label)
/* Last two entries: call and widen the return value, or plain tail call.  */
.word   LOCAL(ct_ret_wide) - datalabel LOCAL(ct_main_label)
.word   LOCAL(ct_call_func) - datalabel LOCAL(ct_main_label)
        .mode   SHmedia
        .section        .text..SHmedia32, "ax"
        .align  2
        
     /* This function loads 64-bit general-purpose registers from the
        stack, from a memory address contained in them or from an FP
        register, according to a cookie passed in r1.  Its execution
        time is linear on the number of registers that actually have
        to be copied.  See sh.h for details on the actual bit pattern.

        The function to be called is passed in r0.  If a 32-bit return
        value is expected, the actual function will be tail-called,
        otherwise the return address will be stored in r10 (that the
        caller should expect to be clobbered) and the return value
        will be expanded into r2/r3 upon return.  */
        
        /* Entry point and dispatch loop.
           In:  r0 = address of the function to call, r1 = cookie.
           Register use established here and relied on by the handlers:
             tr0 = target function
             tr1 = ct_loop (handlers branch back here when done)
             r0  = address of ct_main_table minus 31 two-byte entries
             r1  = cookie, zero-extended from its low 32 bits
           r28-r30 are scratch for the dispatch.  */
        .global GLOBAL(GCC_shcompact_call_trampoline)
        FUNC(GLOBAL(GCC_shcompact_call_trampoline))
GLOBAL(GCC_shcompact_call_trampoline):
        ptabs/l r0, tr0 /* Prepare to call the actual function.  */
        /* Build the 32-bit biased table address in r0: high half with
           movi, low half with shori below.  */
        movi    ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
        pt/l    LOCAL(ct_loop), tr1
        addz.l  r1, r63, r1
        shori   ((datalabel LOCAL(ct_main_table) - 31 * 2)) & 65535, r0
LOCAL(ct_loop):
        /* Index the table with 2 * nsb(cookie); the loaded halfword is
           the handler's offset from ct_main_label.  */
        nsb     r1, r28
        shlli   r28, 1, r29
        ldx.w   r0, r29, r30
LOCAL(ct_main_label):
        ptrel/l r30, tr2
        blink   tr2, r63
/* FP-register copy handlers.  Each one moves a double-precision FP
   register into the general register named in its label, clears that
   register's field in the cookie (r1) with andc, and branches back to
   ct_loop through tr1.  r30-r34 are used as scratch.

   Handlers that have to choose among several FP registers use a small
   jump table: the selector bits of the cookie are scaled to a multiple of
   8 and added to the offset of the "..._fp_copy" label, where every
   choice occupies exactly 8 bytes (one fmov.dq plus one blink).  The
   layout of those copy sequences must therefore not be changed.  */
LOCAL(ct_r2_fp):        /* Copy r2 from an FP register.  */
        /* It must be dr0, so just do it.  */
        fmov.dq dr0, r2
        movi    7, r30
        shlli   r30, 29, r31
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r3_fp):        /* Copy r3 from an FP register.  */
        /* It is either dr0 or dr2.  */
        movi    7, r30
        shlri   r1, 26, r32
        shlli   r30, 26, r31
        andc    r1, r31, r1
        fmov.dq dr0, r3
        /* r32 holds the cookie bits from bit 26 upwards; the value 4
           keeps dr0, anything else takes dr2.  */
        beqi/l  r32, 4, tr1
        fmov.dq dr2, r3
        blink   tr1, r63
LOCAL(ct_r4_fp):        /* Copy r4 from an FP register.  */
        /* r33 = 8 * (cookie bits 24..23) selects the copy slot.  */
        shlri   r1, 23 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r4_fp_copy) - datalabel LOCAL(ct_r4_fp_base), r32
LOCAL(ct_r4_fp_base):
        ptrel/l r32, tr2
        movi    7, r30
        shlli   r30, 23, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r4_fp_copy):
        fmov.dq dr0, r4
        blink   tr1, r63
        fmov.dq dr2, r4
        blink   tr1, r63
        fmov.dq dr4, r4
        blink   tr1, r63
LOCAL(ct_r5_fp):        /* Copy r5 from an FP register.  */
        /* r33 = 8 * (cookie bits 21..20) selects the copy slot.  */
        shlri   r1, 20 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r5_fp_copy) - datalabel LOCAL(ct_r5_fp_base), r32
LOCAL(ct_r5_fp_base):
        ptrel/l r32, tr2
        movi    7, r30
        shlli   r30, 20, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r5_fp_copy):
        fmov.dq dr0, r5
        blink   tr1, r63
        fmov.dq dr2, r5
        blink   tr1, r63
        fmov.dq dr4, r5
        blink   tr1, r63
        fmov.dq dr6, r5
        blink   tr1, r63
LOCAL(ct_r6_fph):       /* Copy r6 from a high FP register.  */
        /* It must be dr8.  */
        fmov.dq dr8, r6
        movi    15, r30
        shlli   r30, 16, r31
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r6_fpl):       /* Copy r6 from a low FP register.  */
        /* r33 = 8 * (cookie bits 17..16) selects the copy slot.  */
        shlri   r1, 16 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r6_fp_copy) - datalabel LOCAL(ct_r6_fp_base), r32
LOCAL(ct_r6_fp_base):
        ptrel/l r32, tr2
        movi    7, r30
        shlli   r30, 16, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r6_fp_copy):
        fmov.dq dr0, r6
        blink   tr1, r63
        fmov.dq dr2, r6
        blink   tr1, r63
        fmov.dq dr4, r6
        blink   tr1, r63
        fmov.dq dr6, r6
        blink   tr1, r63
LOCAL(ct_r7_fph):       /* Copy r7 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi    15 << 12, r31
        shlri   r1, 12, r32
        andc    r1, r31, r1
        fmov.dq dr8, r7
        /* r32 holds the cookie bits from bit 12 upwards; the value 8
           keeps dr8, anything else takes dr10.  */
        beqi/l  r32, 8, tr1
        fmov.dq dr10, r7
        blink   tr1, r63
LOCAL(ct_r7_fpl):       /* Copy r7 from a low FP register.  */
        /* r33 = 8 * (cookie bits 13..12) selects the copy slot.  */
        shlri   r1, 12 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r7_fp_copy) - datalabel LOCAL(ct_r7_fp_base), r32
LOCAL(ct_r7_fp_base):
        ptrel/l r32, tr2
        movi    7 << 12, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r7_fp_copy):
        fmov.dq dr0, r7
        blink   tr1, r63
        fmov.dq dr2, r7
        blink   tr1, r63
        fmov.dq dr4, r7
        blink   tr1, r63
        fmov.dq dr6, r7
        blink   tr1, r63
LOCAL(ct_r8_fph):       /* Copy r8 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi    15 << 8, r31
        andi    r1, 1 << 8, r32
        andc    r1, r31, r1
        fmov.dq dr8, r8
        /* Cookie bit 8 clear keeps dr8; set takes dr10.  */
        beq/l   r32, r63, tr1
        fmov.dq dr10, r8
        blink   tr1, r63
LOCAL(ct_r8_fpl):       /* Copy r8 from a low FP register.  */
        /* r33 = 8 * (cookie bits 9..8) selects the copy slot.  */
        shlri   r1, 8 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r8_fp_copy) - datalabel LOCAL(ct_r8_fp_base), r32
LOCAL(ct_r8_fp_base):
        ptrel/l r32, tr2
        movi    7 << 8, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r8_fp_copy):
        fmov.dq dr0, r8
        blink   tr1, r63
        fmov.dq dr2, r8
        blink   tr1, r63
        fmov.dq dr4, r8
        blink   tr1, r63
        fmov.dq dr6, r8
        blink   tr1, r63
LOCAL(ct_r9_fph):       /* Copy r9 from a high FP register.  */
        /* It is either dr8 or dr10.  */
        movi    15 << 4, r31
        andi    r1, 1 << 4, r32
        andc    r1, r31, r1
        fmov.dq dr8, r9
        /* Cookie bit 4 clear keeps dr8; set takes dr10.  */
        beq/l   r32, r63, tr1
        fmov.dq dr10, r9
        blink   tr1, r63
LOCAL(ct_r9_fpl):       /* Copy r9 from a low FP register.  */
        /* r33 = 8 * (cookie bits 5..4) selects the copy slot.  */
        shlri   r1, 4 - 3, r34
        andi    r34, 3 << 3, r33
        addi    r33, LOCAL(ct_r9_fp_copy) - datalabel LOCAL(ct_r9_fp_base), r32
LOCAL(ct_r9_fp_base):
        ptrel/l r32, tr2
        movi    7 << 4, r31
        andc    r1, r31, r1
        blink   tr2, r63
LOCAL(ct_r9_fp_copy):
        fmov.dq dr0, r9
        blink   tr1, r63
        fmov.dq dr2, r9
        blink   tr1, r63
        fmov.dq dr4, r9
        blink   tr1, r63
        fmov.dq dr6, r9
        blink   tr1, r63
/* Memory-load handlers.  ct_rN_ld replaces rN with the 64-bit value
   stored at the address rN currently holds.

   Each handler masks a two-bit field of the cookie (r31 = mask, r32 =
   field) and clears it.  If both bits were set it branches to the short
   ct_rN_load stub, which loads rN and returns to ct_loop.  Otherwise it
   sets r(N+1) to rN + 8, loads rN, and FALLS THROUGH into the handler for
   the next register, so a run of consecutive quadwords is loaded into
   consecutive registers.  The order of these handlers is therefore
   significant.  ct_r9_ld ends the chain and continues at
   ct_check_tramp.  */
LOCAL(ct_r2_ld):        /* Copy r2 from a memory address.  */
        pt/l    LOCAL(ct_r2_load), tr2
        movi    3, r30
        shlli   r30, 29, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r2, 8, r3
        ldx.q   r2, r63, r2
        /* Fall through.  */
LOCAL(ct_r3_ld):        /* Copy r3 from a memory address.  */
        pt/l    LOCAL(ct_r3_load), tr2
        movi    3, r30
        shlli   r30, 26, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r3, 8, r4
        ldx.q   r3, r63, r3
        /* Fall through.  */
LOCAL(ct_r4_ld):        /* Copy r4 from a memory address.  */
        pt/l    LOCAL(ct_r4_load), tr2
        movi    3, r30
        shlli   r30, 23, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r4, 8, r5
        ldx.q   r4, r63, r4
        /* Fall through.  */
LOCAL(ct_r5_ld):        /* Copy r5 from a memory address.  */
        pt/l    LOCAL(ct_r5_load), tr2
        movi    3, r30
        shlli   r30, 20, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r5, 8, r6
        ldx.q   r5, r63, r5
        /* Fall through.  */
LOCAL(ct_r6_ld):        /* Copy r6 from a memory address.  */
        pt/l    LOCAL(ct_r6_load), tr2
        movi    3 << 16, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r6, 8, r7
        ldx.q   r6, r63, r6
        /* Fall through.  */
LOCAL(ct_r7_ld):        /* Copy r7 from a memory address.  */
        pt/l    LOCAL(ct_r7_load), tr2
        movi    3 << 12, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r7, 8, r8
        ldx.q   r7, r63, r7
        /* Fall through.  */
LOCAL(ct_r8_ld):        /* Copy r8 from a memory address.  */
        pt/l    LOCAL(ct_r8_load), tr2
        movi    3 << 8, r31
        and     r1, r31, r32
        andc    r1, r31, r1
        beq/l   r31, r32, tr2
        addi.l  r8, 8, r9
        ldx.q   r8, r63, r8
        /* Fall through.  */
LOCAL(ct_r9_ld):        /* Copy r9 from a memory address.  */
        /* Last register: no cookie update, go straight to the final
           check instead of back to ct_loop.  */
        pt/l    LOCAL(ct_check_tramp), tr2
        ldx.q   r9, r63, r9
        blink   tr2, r63
/* Single-register load stubs: load rN from [rN], then back to ct_loop.  */
LOCAL(ct_r2_load):
        ldx.q   r2, r63, r2
        blink   tr1, r63
LOCAL(ct_r3_load):
        ldx.q   r3, r63, r3
        blink   tr1, r63
LOCAL(ct_r4_load):
        ldx.q   r4, r63, r4
        blink   tr1, r63
LOCAL(ct_r5_load):
        ldx.q   r5, r63, r5
        blink   tr1, r63
LOCAL(ct_r6_load):
        ldx.q   r6, r63, r6
        blink   tr1, r63
LOCAL(ct_r7_load):
        ldx.q   r7, r63, r7
        blink   tr1, r63
LOCAL(ct_r8_load):
        ldx.q   r8, r63, r8
        blink   tr1, r63
/* Stack-pop handlers.  ct_rN_pop loads rN from the quadword at the stack
   pointer (r15), advances r15 by 8, clears the single cookie bit that
   selected the handler, and returns to ct_loop through tr1.  r30/r31 are
   scratch for building the bit mask.  */
LOCAL(ct_r2_pop):       /* Pop r2 from the stack.  */
        movi    1, r30
        ldx.q   r15, r63, r2
        shlli   r30, 29, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r3_pop):       /* Pop r3 from the stack.  */
        movi    1, r30
        ldx.q   r15, r63, r3
        shlli   r30, 26, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r4_pop):       /* Pop r4 from the stack.  */
        movi    1, r30
        ldx.q   r15, r63, r4
        shlli   r30, 23, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r5_pop):       /* Pop r5 from the stack.  */
        movi    1, r30
        ldx.q   r15, r63, r5
        shlli   r30, 20, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r6_pop):       /* Pop r6 from the stack.  */
        movi    1, r30
        ldx.q   r15, r63, r6
        shlli   r30, 16, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r7_pop):       /* Pop r7 from the stack.  */
        ldx.q   r15, r63, r7
        movi    1 << 12, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_r8_pop):       /* Pop r8 from the stack.  */
        ldx.q   r15, r63, r8
        movi    1 << 8, r31
        addi.l  r15, 8, r15
        andc    r1, r31, r1
        blink   tr1, r63
LOCAL(ct_pop_seq):      /* Pop a sequence of registers off the stack.  */
        /* r30 = 2 * n, where n is the value of cookie bits 3..1, so
           r31 = 8 * n.  Every pop below is exactly 8 bytes (ldx.q plus
           addi.l), so branching to ct_end_of_pop_seq - 8 * n executes the
           last n pops, i.e. the n registers ending with r9.  The cookie
           is not modified; execution runs on into ct_check_tramp.  */
        andi    r1, 7 << 1, r30
        movi    (LOCAL(ct_end_of_pop_seq) >> 16) & 65535, r32
        shlli   r30, 2, r31
        shori   LOCAL(ct_end_of_pop_seq) & 65535, r32
        sub.l   r32, r31, r33
        ptabs/l r33, tr2
        blink   tr2, r63
LOCAL(ct_start_of_pop_seq):     /* Beginning of pop sequence.  */
        ldx.q   r15, r63, r3
        addi.l  r15, 8, r15
        ldx.q   r15, r63, r4
        addi.l  r15, 8, r15
        ldx.q   r15, r63, r5
        addi.l  r15, 8, r15
        ldx.q   r15, r63, r6
        addi.l  r15, 8, r15
        ldx.q   r15, r63, r7
        addi.l  r15, 8, r15
        ldx.q   r15, r63, r8
        addi.l  r15, 8, r15
LOCAL(ct_r9_pop):       /* Pop r9 from the stack.  */
        /* Also the tail of the pop sequence; falls through to the final
           check without touching the cookie.  */
        ldx.q   r15, r63, r9
        addi.l  r15, 8, r15
/* Final stage of GCC_shcompact_call_trampoline: either tail-call the
   target in tr0, or call it and split its 64-bit result into r2/r3.  */
LOCAL(ct_end_of_pop_seq): /* Label used to compute first pop instruction.  */
LOCAL(ct_check_tramp):  /* Check whether we need a trampoline.  */
        /* Only bit 0 of the cookie matters here: set means the 64-bit
           return value has to be unpacked after the call.  */
        pt/u    LOCAL(ct_ret_wide), tr2
        andi    r1, 1, r1
        bne/u   r1, r63, tr2
LOCAL(ct_call_func):    /* Just branch to the function.  */
        blink   tr0, r63
LOCAL(ct_ret_wide):     /* Call the function, so that we can unpack its
                           64-bit return value.  */
        /* Keep our own return address in r10, because the call below
           overwrites r18 with its link address.  */
        add.l   r18, r63, r10
        blink   tr0, r18
        ptabs   r10, tr0
        /* Split the 64-bit result in r2 into two 32-bit halves.  */
#if __LITTLE_ENDIAN__
        /* r3 = upper half, r2 = lower half.  */
        shari   r2, 32, r3
        add.l   r2, r63, r2
#else
        /* r3 = lower half, r2 = upper half.  */
        add.l   r2, r63, r3
        shari   r2, 32, r2
#endif
        blink   tr0, r63

        ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
#endif /* L_shcompact_call_trampoline */

#ifdef L_shcompact_return_trampoline
     /* This function does the converse of the code in `ret_wide'
        above.  It is tail-called by SHcompact functions returning
        64-bit non-floating-point values, to pack the 32-bit values in
        r2 and r3 into r2.  */

        .mode   SHmedia
        .section        .text..SHmedia32, "ax"
        .align  2
        .global GLOBAL(GCC_shcompact_return_trampoline)
        HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
GLOBAL(GCC_shcompact_return_trampoline):
        /* The return address is taken from r18.  */
        ptabs/l r18, tr0
        /* Shift the register that holds the upper half into bits 63..32
           and zero-extend the register that holds the lower half; which
           of r2/r3 is which depends on the endianness.  */
#if !__LITTLE_ENDIAN__
        shlli   r2, 32, r2
        addz.l  r63, r3, r3
#else
        shlli   r3, 32, r3
        addz.l  r63, r2, r2
#endif
        /* Merge both halves into the 64-bit result in r2 and return.  */
        or      r2, r3, r2
        blink   tr0, r63

        ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
#endif /* L_shcompact_return_trampoline */

#ifdef L_shcompact_incoming_args
        /* Dispatch table for GCC_shcompact_incoming_args.

           Each entry is a 16-bit offset from ia_main_label to a handler.
           The routine loads the address of this table minus 31 entries
           and indexes it with twice the result of "nsb" on the cookie, so
           the table holds 33 entries covering nsb results 31 through 63.
           Entries marked invalid hold the value 1 instead of a handler
           offset.  */
        .section        .rodata
        .align  1
LOCAL(ia_main_table):
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r2_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r2_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r3_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r3_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r4_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r4_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r5_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r5_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r6_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r6_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r7_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r7_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r8_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r8_push) - datalabel LOCAL(ia_main_label)
.word   1 /* Invalid, just loop */
.word   1 /* Invalid, just loop */
.word   LOCAL(ia_r9_ld) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
/* Two entries share ia_push_seq: it stores a run of registers ending in r9.  */
.word   LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_push_seq) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_r9_push) - datalabel LOCAL(ia_main_label)
/* Last two entries: nothing left to store, return to the caller.  */
.word   LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
.word   LOCAL(ia_return) - datalabel LOCAL(ia_main_label)
        .mode   SHmedia
        .section        .text..SHmedia32, "ax"
        .align  2
        
     /* This function stores 64-bit general-purpose registers back in
        the stack, and loads the address in which each register
        was stored into itself.  The lower 32 bits of r17 hold the address
        to begin storing, and the upper 32 bits of r17 hold the cookie.
        Its execution time is linear on the
        number of registers that actually have to be copied, and it is
        optimized for structures larger than 64 bits, as opposed to
        individual `long long' arguments.  See sh.h for details on the
        actual bit pattern.  */
        
        /* Entry point and dispatch loop.
           In:  r17 = cookie in the upper 32 bits, store address in the
                lower 32 bits; r18 = return address.
           Register use established here and relied on by the handlers:
             tr0 = return address
             tr1 = ia_loop (handlers branch back here when done)
             r0  = cookie
             r17 = address of the next quadword to store
             r43 = address of ia_main_table minus 31 two-byte entries
           r36-r41 are scratch.  */
        .global GLOBAL(GCC_shcompact_incoming_args)
        FUNC(GLOBAL(GCC_shcompact_incoming_args))
GLOBAL(GCC_shcompact_incoming_args):
        ptabs/l r18, tr0        /* Prepare to return.  */
        shlri   r17, 32, r0     /* Load the cookie.  */
        /* Build the 32-bit biased table address in r43: high half with
           movi, low half with shori below.  */
        movi    ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
        pt/l    LOCAL(ia_loop), tr1
        /* Keep only the 32-bit store address in r17.  */
        add.l   r17, r63, r17
        shori   ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
LOCAL(ia_loop):
        /* Index the table with 2 * nsb(cookie); the loaded halfword is
           the handler's offset from ia_main_label.  */
        nsb     r0, r36
        shlli   r36, 1, r37
        ldx.w   r43, r37, r38
LOCAL(ia_main_label):
        ptrel/l r38, tr2
        blink   tr2, r63
/* "Store and load address" handlers.  ia_rN_ld stores rN at the address
   in r17, replaces rN with that address, and advances r17 by 8.

   Each handler masks a two-bit field of the cookie (r39 = mask, r40 =
   field) and clears it.  If both bits were set it returns to ia_loop
   through tr1; otherwise it FALLS THROUGH into the handler for the next
   register, so the order of these handlers is significant.  ia_r9_ld ends
   the chain and returns to the caller through tr0.  */
LOCAL(ia_r2_ld):        /* Store r2 and load its address.  */
        movi    3, r38
        shlli   r38, 29, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r2
        add.l   r17, r63, r2
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r3_ld):        /* Store r3 and load its address.  */
        movi    3, r38
        shlli   r38, 26, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r3
        add.l   r17, r63, r3
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r4_ld):        /* Store r4 and load its address.  */
        movi    3, r38
        shlli   r38, 23, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r4
        add.l   r17, r63, r4
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r5_ld):        /* Store r5 and load its address.  */
        movi    3, r38
        shlli   r38, 20, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r5
        add.l   r17, r63, r5
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r6_ld):        /* Store r6 and load its address.  */
        movi    3, r38
        shlli   r38, 16, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r6
        add.l   r17, r63, r6
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r7_ld):        /* Store r7 and load its address.  */
        movi    3 << 12, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r7
        add.l   r17, r63, r7
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r8_ld):        /* Store r8 and load its address.  */
        movi    3 << 8, r39
        and     r0, r39, r40
        andc    r0, r39, r0
        stx.q   r17, r63, r8
        add.l   r17, r63, r8
        addi.l  r17, 8, r17
        beq/u   r39, r40, tr1
        /* Fall through.  */
LOCAL(ia_r9_ld):        /* Store r9 and load its address.  */
        /* Last register: no cookie update, return to the caller.  */
        stx.q   r17, r63, r9
        add.l   r17, r63, r9
        blink   tr0, r63
/* Push handlers.  ia_rN_push clears the single cookie bit that selected
   it (mask built in r39), stores rN at the address in r17, advances r17
   by 8, and returns to ia_loop through tr1.  Unlike the ia_rN_ld
   handlers, rN itself is left unchanged.  */
LOCAL(ia_r2_push):      /* Push r2 onto the stack.  */
        movi    1, r38
        shlli   r38, 29, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r2
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r3_push):      /* Push r3 onto the stack.  */
        movi    1, r38
        shlli   r38, 26, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r3
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r4_push):      /* Push r4 onto the stack.  */
        movi    1, r38
        shlli   r38, 23, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r4
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r5_push):      /* Push r5 onto the stack.  */
        movi    1, r38
        shlli   r38, 20, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r5
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r6_push):      /* Push r6 onto the stack.  */
        movi    1, r38
        shlli   r38, 16, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r6
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r7_push):      /* Push r7 onto the stack.  */
        movi    1 << 12, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r7
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_r8_push):      /* Push r8 onto the stack.  */
        movi    1 << 8, r39
        andc    r0, r39, r0
        stx.q   r17, r63, r8
        addi.l  r17, 8, r17
        blink   tr1, r63
LOCAL(ia_push_seq):     /* Push a sequence of registers onto the stack.  */
        /* r38 = 2 * n, where n is the value of cookie bits 3..1, so
           r39 = 8 * n.  The code below is laid out in 8-byte slots (a
           stx.q/addi.l pair for r3..r8, and stx.q r9 plus the returning
           blink as the last slot), so branching to
           ia_end_of_push_seq - 8 * n stores the n registers ending with
           r9 and then returns.  The slot layout must not be changed.  */
        andi    r0, 7 << 1, r38
        movi    (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
        shlli   r38, 2, r39
        shori   LOCAL(ia_end_of_push_seq) & 65535, r40
        sub.l   r40, r39, r41
        ptabs/l r41, tr2
        blink   tr2, r63
LOCAL(ia_stack_of_push_seq):     /* Beginning of push sequence.  */
        stx.q   r17, r63, r3
        addi.l  r17, 8, r17
        stx.q   r17, r63, r4
        addi.l  r17, 8, r17
        stx.q   r17, r63, r5
        addi.l  r17, 8, r17
        stx.q   r17, r63, r6
        addi.l  r17, 8, r17
        stx.q   r17, r63, r7
        addi.l  r17, 8, r17
        stx.q   r17, r63, r8
        addi.l  r17, 8, r17
LOCAL(ia_r9_push):      /* Push r9 onto the stack.  */
        /* r17 is not advanced here; execution falls into the return.  */
        stx.q   r17, r63, r9
LOCAL(ia_return):       /* Return.  */
        blink   tr0, r63
LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction.  */
        ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
#endif /* L_shcompact_incoming_args */
#endif
#if __SH5__
#ifdef L_nested_trampoline
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
        .align  3 /* It is copied in units of 8 bytes in SHmedia mode.  */
        /* Template for nested-function trampolines (SHmedia code).

           The code obtains its own address (ptrel/u with a zero offset,
           then gettr) and uses it as the base for two loads: the word at
           offset 24 becomes the branch target in tr1, and the word at
           offset 28 (32-bit ABI) or 32 (64-bit ABI) is left in r1 when
           control is transferred.  Those data words are not part of this
           template.
           NOTE(review): presumably they are filled in with the target
           function and its static chain when the template is copied -
           confirm against the trampoline initialization code.  Because
           the offsets are relative to the first instruction, the number
           and order of instructions here must not change.  */
        .global GLOBAL(GCC_nested_trampoline)
        HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
GLOBAL(GCC_nested_trampoline):
        .mode   SHmedia
        /* r0 = address of this trampoline copy.  */
        ptrel/u r63, tr0
        gettr   tr0, r0
        /* First data word: branch target.  */
#if __SH5__ == 64
        ld.q    r0, 24, r1
#else
        ld.l    r0, 24, r1
#endif
        ptabs/l r1, tr1
        /* Second data word: value handed over in r1.  */
#if __SH5__ == 64
        ld.q    r0, 32, r1
#else
        ld.l    r0, 28, r1
#endif
        blink   tr1, r63

        ENDFUNC(GLOBAL(GCC_nested_trampoline))
#endif /* L_nested_trampoline */
#endif /* __SH5__ */
#if __SH5__ == 32
#ifdef L_push_pop_shmedia_regs
        .section        .text..SHmedia32,"ax"
        .mode   SHmedia
        .align  2
/* GCC_push_shmedia_regs / GCC_push_shmedia_regs_nofpu.

   Saves a block of SHmedia registers on the stack (r15) and returns to
   the address in r18.

   When __SH4_NOFPU__ is not defined, the function first pushes the 14
   double-precision registers dr36..dr62 (14 quadwords) and then continues
   with the common part.  When it is defined, only the common part is
   assembled, under the _nofpu name.

   The common part pushes 27 quadwords: tr7, tr6 and tr5 (copied through
   r62, r61 and r60, whose previous contents are therefore not saved),
   followed by r59..r44 and r35..r28.  Clobbers tr0, r60, r61 and r62.  */
#ifndef __SH4_NOFPU__   
        .global GLOBAL(GCC_push_shmedia_regs)
        FUNC(GLOBAL(GCC_push_shmedia_regs))
GLOBAL(GCC_push_shmedia_regs):
        addi.l  r15, -14*8, r15
        fst.d   r15, 13*8, dr62
        fst.d   r15, 12*8, dr60
        fst.d   r15, 11*8, dr58
        fst.d   r15, 10*8, dr56
        fst.d   r15,  9*8, dr54
        fst.d   r15,  8*8, dr52
        fst.d   r15,  7*8, dr50
        fst.d   r15,  6*8, dr48
        fst.d   r15,  5*8, dr46
        fst.d   r15,  4*8, dr44
        fst.d   r15,  3*8, dr42
        fst.d   r15,  2*8, dr40
        fst.d   r15,  1*8, dr38
        fst.d   r15,  0*8, dr36
        /* Falls through into the general-register part below.  */
#else /* ! __SH4_NOFPU__ */
        .global GLOBAL(GCC_push_shmedia_regs_nofpu)
        FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
GLOBAL(GCC_push_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__ */
        ptabs/l r18, tr0
        addi.l  r15, -27*8, r15
        /* Target registers cannot be stored directly; go through r60-r62.  */
        gettr   tr7, r62
        gettr   tr6, r61
        gettr   tr5, r60
        st.q    r15, 26*8, r62
        st.q    r15, 25*8, r61
        st.q    r15, 24*8, r60
        st.q    r15, 23*8, r59
        st.q    r15, 22*8, r58
        st.q    r15, 21*8, r57
        st.q    r15, 20*8, r56
        st.q    r15, 19*8, r55
        st.q    r15, 18*8, r54
        st.q    r15, 17*8, r53
        st.q    r15, 16*8, r52
        st.q    r15, 15*8, r51
        st.q    r15, 14*8, r50
        st.q    r15, 13*8, r49
        st.q    r15, 12*8, r48
        st.q    r15, 11*8, r47
        st.q    r15, 10*8, r46
        st.q    r15,  9*8, r45
        st.q    r15,  8*8, r44
        st.q    r15,  7*8, r35
        st.q    r15,  6*8, r34
        st.q    r15,  5*8, r33
        st.q    r15,  4*8, r32
        st.q    r15,  3*8, r31
        st.q    r15,  2*8, r30
        st.q    r15,  1*8, r29
        st.q    r15,  0*8, r28
        blink   tr0, r63
#ifndef __SH4_NOFPU__   
        ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
#else
        ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
#endif
/* GCC_pop_shmedia_regs / GCC_pop_shmedia_regs_nofpu.

   Restores the registers from the layout written by the matching push
   routine, releases the stack space, and returns to the address in r18.

   r0 holds the number of bytes to release: 41*8 when the FP registers
   are restored as well (entry through GCC_pop_shmedia_regs, which then
   jumps to .L0 and so skips the "movi 27*8, r0"), or 27*8 for the
   general/target registers alone.  tr7, tr6 and tr5 are reloaded through
   r62, r61 and r60.  Clobbers r0, tr0 and, in the FP variant, tr1.  */
#ifndef __SH4_NOFPU__   
        .global GLOBAL(GCC_pop_shmedia_regs)
        FUNC(GLOBAL(GCC_pop_shmedia_regs))
GLOBAL(GCC_pop_shmedia_regs):
        pt      .L0, tr1
        movi    41*8, r0
        /* The FP save area lies above the 27 general-register slots.  */
        fld.d   r15, 40*8, dr62
        fld.d   r15, 39*8, dr60
        fld.d   r15, 38*8, dr58
        fld.d   r15, 37*8, dr56
        fld.d   r15, 36*8, dr54
        fld.d   r15, 35*8, dr52
        fld.d   r15, 34*8, dr50
        fld.d   r15, 33*8, dr48
        fld.d   r15, 32*8, dr46
        fld.d   r15, 31*8, dr44
        fld.d   r15, 30*8, dr42
        fld.d   r15, 29*8, dr40
        fld.d   r15, 28*8, dr38
        fld.d   r15, 27*8, dr36
        blink   tr1, r63
#else /* ! __SH4_NOFPU__        */
        .global GLOBAL(GCC_pop_shmedia_regs_nofpu)
        FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
GLOBAL(GCC_pop_shmedia_regs_nofpu):
#endif /* ! __SH4_NOFPU__       */
        movi    27*8, r0
.L0:
        ptabs   r18, tr0
        ld.q    r15, 26*8, r62
        ld.q    r15, 25*8, r61
        ld.q    r15, 24*8, r60
        ptabs   r62, tr7
        ptabs   r61, tr6
        ptabs   r60, tr5
        ld.q    r15, 23*8, r59
        ld.q    r15, 22*8, r58
        ld.q    r15, 21*8, r57
        ld.q    r15, 20*8, r56
        ld.q    r15, 19*8, r55
        ld.q    r15, 18*8, r54
        ld.q    r15, 17*8, r53
        ld.q    r15, 16*8, r52
        ld.q    r15, 15*8, r51
        ld.q    r15, 14*8, r50
        ld.q    r15, 13*8, r49
        ld.q    r15, 12*8, r48
        ld.q    r15, 11*8, r47
        ld.q    r15, 10*8, r46
        ld.q    r15,  9*8, r45
        ld.q    r15,  8*8, r44
        ld.q    r15,  7*8, r35
        ld.q    r15,  6*8, r34
        ld.q    r15,  5*8, r33
        ld.q    r15,  4*8, r32
        ld.q    r15,  3*8, r31
        ld.q    r15,  2*8, r30
        ld.q    r15,  1*8, r29
        ld.q    r15,  0*8, r28
        /* Release the whole save area (size chosen at entry).  */
        add.l   r15, r0, r15
        blink   tr0, r63

#ifndef __SH4_NOFPU__
        ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
#else
        ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
#endif
#endif /* L_push_pop_shmedia_regs */
#endif /* __SH5__ == 32 */

#ifdef L_div_table
#if __SH5__
#if defined(__pic__) && defined(__SHMEDIA__)
        /* Signed 32-bit division for PIC SHmedia code, computed with a
           table-seeded reciprocal that is refined twice in fixed point.

           The table is addressed through its middle label
           (div_table_internal, or Local_div_table under TEXT_DATA_BUG):
           a signed index taken from the top bits of the normalized
           divisor selects a byte factor at base + index and a halfword
           constant at base + 2 * index.  The "bubble" remarks and the
           extra indentation of some instructions are scheduling notes
           from the original author; the instruction order should be
           preserved.
           NOTE(review): .global for sdivsi3 is emitted twice in this
           block (here and just before the label).  */
        .global GLOBAL(sdivsi3)
        FUNC(GLOBAL(sdivsi3))
#if __SH5__ == 32
        .section        .text..SHmedia32,"ax"
#else
        .text
#endif
#if 0
/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
   in a text section does not work (at least for shared libraries):
   the linker sets the LSB of the address as if this was SHmedia code.  */
#define TEXT_DATA_BUG
#endif
        .align  2
 // inputs: r4,r5
 // clobbered: r1,r18,r19,r20,r21,r25,tr0
 // result in r0
 .global GLOBAL(sdivsi3)
GLOBAL(sdivsi3):
#ifdef TEXT_DATA_BUG
 ptb datalabel Local_div_table,tr0
#else
 ptb GLOBAL(div_table_internal),tr0
#endif
 nsb r5, r1
 shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
 shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
 /* bubble */
 gettr tr0,r20
 ldx.ub r20, r21, r19 // u0.8
 shari r25, 32, r25   // normalize to s2.30
 shlli r21, 1, r21
 muls.l r25, r19, r19 // s2.38
 ldx.w r20, r21, r21  // s2.14
  /* r18 is read here for the return before it is reused as scratch.  */
  ptabs r18, tr0
 shari r19, 24, r19   // truncate to s2.14
 sub r21, r19, r19    // some 11 bit inverse in s1.14
 muls.l r19, r19, r21 // u0.28
  /* r1 = 92 - nsb(divisor): shift count for the final shard.  */
  sub r63, r1, r1
  addi r1, 92, r1
 muls.l r25, r21, r18 // s2.58
 shlli r19, 45, r19   // multiply by two and convert to s2.58
  /* bubble */
 sub r19, r18, r18
 shari r18, 28, r18   // some 22 bit inverse in s1.30
 muls.l r18, r25, r0  // s2.60
  muls.l r18, r4, r25 // s32.30
  /* bubble */
 shari r0, 16, r19   // s-16.44
 muls.l r19, r18, r19 // s-16.74
  shari r25, 63, r0
  shari r4, 14, r18   // s19.-14
 shari r19, 30, r19   // s-16.44
 muls.l r19, r18, r19 // s15.30
  xor r21, r0, r21    // You could also use the constant 1 << 27.
  add r21, r25, r21
 sub r21, r19, r21
 shard r21, r1, r21
 sub r21, r0, r0
 blink tr0, r63
        ENDFUNC(GLOBAL(sdivsi3))
/* This table has been generated by divtab.c .
Defects for bias -330:
   Max defect: 6.081536e-07 at -1.000000e+00
   Min defect: 2.849516e-08 at 1.030651e+00
   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
   Defect at 1: 1.238659e-07
   Defect at -2: 1.061708e-07 */
#else /* ! __pic__ || ! __SHMEDIA__ */
        .section        .rodata
#endif /* __pic__ */
#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__)
/* Private copy of the division table, placed in the text section when
   TEXT_DATA_BUG is in effect.  It is 128 bytes long and is addressed
   through the label in its middle: 16 halfword constants, 16 byte
   factors and 16 bytes of padding precede the label, and the mirror
   image (padding, factors, constants) follows it.  */
        .balign 2
        .type   Local_div_table,@object
        .size   Local_div_table,128
/* negative division constants */
        .word   -16638, -17135, -17737, -18433, -19103, -19751, -20583, -21383
        .word   -22343, -23353, -24407, -25582, -26863, -28382, -29965, -31800
/* negative division factors */
        .byte   66, 70, 75, 81, 87, 93, 101, 109
        .byte   119, 130, 142, 156, 172, 192, 214, 241
        .skip 16
Local_div_table:
        .skip 16
/* positive division factors */
        .byte   241, 214, 192, 172, 156, 142, 130, 119
        .byte   109, 101, 93, 87, 81, 75, 70, 66
/* positive division constants */
        .word   31801, 29966, 28383, 26864, 25583, 24408, 23354, 22344
        .word   21384, 20584, 19752, 19104, 18434, 17738, 17136, 16639
        .section        .rodata
#endif /* TEXT_DATA_BUG */
        /* 128-byte division table exported as GLOBAL(div_table), with a
           hidden alias div_table_internal.  The symbol is placed at the
           midpoint of the object, so the negative-side entries lie at
           negative offsets from it and the positive-side entries at
           positive offsets.  Each half is 16 halfword constants, 16 byte
           factors and a 16-byte gap next to the label.  */
        .balign 2
        .type   GLOBAL(div_table),@object
        .size   GLOBAL(div_table),128
/* negative division constants (16 halfwords) */
        .word   -16638, -17135, -17737, -18433, -19103, -19751, -20583, -21383
        .word   -22343, -23353, -24407, -25582, -26863, -28382, -29965, -31800
/* negative division factors (16 bytes) */
        .byte   66, 70, 75, 81, 87, 93, 101, 109
        .byte   119, 130, 142, 156, 172, 192, 214, 241
/* 16-byte gap on either side of the label */
        .skip 16
        .global GLOBAL(div_table)
GLOBAL(div_table):
        HIDDEN_ALIAS(div_table_internal,div_table)
        .skip 16
/* positive division factors (16 bytes, mirror image of the negative ones) */
        .byte   241, 214, 192, 172, 156, 142, 130, 119
        .byte   109, 101, 93, 87, 81, 75, 70, 66
/* positive division constants (16 halfwords) */
        .word   31801, 29966, 28383, 26864, 25583, 24408, 23354, 22344
        .word   21384, 20584, 19752, 19104, 18434, 17738, 17136, 16639

#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
/* This code uses shld, thus is not suitable for SH1 / SH2.  */

/* Signed / unsigned division without use of FPU, optimized for SH4.
   Uses a lookup table for divisors in the range -128 .. +128, and
   div1 with case distinction for larger divisors in three more ranges.
   The code is lumped together with the table to allow the use of mova.  */
/* Register usage as shown by the code below: r4 holds the dividend and
   r5 the divisor; both are pushed on the stack and popped again on every
   return path.  The quotient is left in r0 and r1 is used as scratch.
   udivsi3_i4i and sdivsi3_i4i share several code paths, so labels of one
   routine are branch targets of the other.  */
/* Byte offsets, within a 32-bit word in memory, of
     L_LSB    : bits 0..7
     L_LSWMSB : bits 8..15  (high byte of the low 16-bit word)
     L_MSWLSB : bits 16..23 (low byte of the high 16-bit word)
   L_LSWMSB and L_MSWLSB are used with mov.b to park partial quotient
   bytes in a scratch word on the stack.  */
#ifdef __LITTLE_ENDIAN__
#define L_LSB 0
#define L_LSWMSB 1
#define L_MSWLSB 2
#else
#define L_LSB 3
#define L_LSWMSB 2
#define L_MSWLSB 1
#endif

        .balign 4
        .global GLOBAL(udivsi3_i4i)
        FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
        /* Classify the divisor.  r1 = 128, r0 = dividend >> 8 (start value
           for the div1 paths), T = divisor > 128 (unsigned).  */
        mov.w LOCAL(c128_w), r1
        div0u
        mov r4,r0
        shlr8 r0
        cmp/hi r1,r5
        extu.w r5,r1
        bf LOCAL(udiv_le128)
        /* T = (divisor == its own low 16 bits), i.e. divisor < 0x10000.  */
        cmp/eq r5,r1
        bf LOCAL(udiv_ge64k)
        /* Divisor 129 .. 0xffff.  r0 becomes dividend >> 9; shlr leaves the
           bit it shifts out in T.  The divisor is moved to the upper half
           of r5 after a copy of the original was taken in r1.  The original
           r4 and r5 (from r1) are pushed in between the first div1 steps,
           then the code joins sdivsi3_i4i at udiv_25 with the fourth div1
           in the branch delay slot.  */
        shlr r0
        mov r5,r1
        shll16 r5
        mov.l r4,@-r15
        div1 r5,r0
        mov.l r1,@-r15
        div1 r5,r0
        div1 r5,r0
        bra LOCAL(udiv_25)
        div1 r5,r0

        /* Entry from sdivsi3_i4i for a divisor magnitude <= 128; r4 and r5
           are already on the stack there.  The delay slot loads
           r1 = div_table_ix[r5].  */
LOCAL(div_le128):
        mova LOCAL(div_table_ix),r0
        bra LOCAL(div_le128_2)
        mov.b @(r0,r5),r1
        /* Unsigned entry for divisor <= 128: save r4 and r5 and load
           r1 = div_table_ix[r5].  */
LOCAL(udiv_le128):
        mov.l r4,@-r15
        mova LOCAL(div_table_ix),r0
        mov.b @(r0,r5),r1
        mov.l r5,@-r15
LOCAL(div_le128_2):
        /* r1 = 32-bit inverse found at byte offset r1 from div_table_inv.  */
        mova LOCAL(div_table_inv),r0
        mov.l @(r0,r1),r1
        /* T = ((divisor & 0xfe) == 0).  */
        mov r5,r0
        tst #0xfe,r0
        mova LOCAL(div_table_clz),r0
        /* MACH:MACL = inverse * r4 (unsigned 64-bit product);
           r1 = div_table_clz[r5], the shift count used below.  */
        dmulu.l r1,r4
        mov.b @(r0,r5),r1
        /* T set: take the divide-by-1 exit with r0 = r4 from the delay
           slot.  */
        bt/s LOCAL(div_by_1)
        mov r4,r0
        /* r0 = MACH + r4; adding r4 accounts for the implicit leading 1 of
           the inverse.  T is clear here because the bt/s above was not
           taken, which is why no clrt is needed before addc.  rotcr moves
           the carry of that addition into bit 31 while shifting right by
           one, and shld in the rts delay slot applies the shift count from
           div_table_clz.  r5 and r4 are restored on the way.  */
        mov.l @r15+,r5
        sts mach,r0
        /* clrt */
        addc r4,r0
        mov.l @r15+,r4
        rotcr r0
        rts
        shld r1,r0

        /* Divide-by-1 exit for a negative result: r0 = -r4.  */
LOCAL(div_by_1_neg):
        neg r4,r0
        /* Common divide-by-1 exit: restore r5, and r4 in the rts delay
           slot.  */
LOCAL(div_by_1):
        mov.l @r15+,r5
        rts
        mov.l @r15+,r4

        /* Entry from sdivsi3_i4i for a divisor magnitude >= 0x10000.  The
           delay slot of the branch that got here set
           T = r5 > (r4 >> 8); bt/s tests that T before div0u in its own
           delay slot clears it.  */
LOCAL(div_ge64k):
        bt/s LOCAL(div_r8)
        div0u
        shll8 r5
        bra LOCAL(div_ge64k_2)
        div1 r5,r0
        /* Unsigned entry for divisor >= 0x10000.  If the divisor exceeds
           dividend >> 8, use the udiv_r8 path.  Otherwise shift the divisor
           left by 8 and save the original r4 and r5 (copy in r1).  */
LOCAL(udiv_ge64k):
        cmp/hi r0,r5
        mov r5,r1
        bt LOCAL(udiv_r8)
        shll8 r5
        mov.l r4,@-r15
        div1 r5,r0
        mov.l r1,@-r15
LOCAL(div_ge64k_2):
        div1 r5,r0
        mov.l LOCAL(zero_l),r1
        .rept 4
        div1 r5,r0
        .endr
        /* Push a zero word; it serves as scratch space for a partial
           quotient byte.  */
        mov.l r1,@-r15
        div1 r5,r0
        /* r1 = constant loaded from m256_w (mov.w sign-extends it).  */
        mov.w LOCAL(m256_w),r1
        div1 r5,r0
        /* Store the low byte of r0 into bits 8..15 of the scratch word,
           then compute r0 = ((r0 ^ r4) & r1) ^ r4, which keeps the bits of
           r0 selected by r1 and takes the remaining bits from r4.  */
        mov.b r0,@(L_LSWMSB,r15)
        xor r4,r0
        and r1,r0
        bra LOCAL(div_ge64k_end)
        xor r4,r0
        
        /* Entry from sdivsi3_i4i: r4 <<= 24 (the second shift sits in the
           delay slot); r4 and r5 are already saved.  */
LOCAL(div_r8):
        shll16 r4
        bra LOCAL(div_r8_2)
        shll8 r4
        /* Unsigned entry: save r4 and r5, clear T and shift r4 left by 24.  */
LOCAL(udiv_r8):
        mov.l r4,@-r15
        shll16 r4
        clrt
        shll8 r4
        mov.l r5,@-r15
LOCAL(div_r8_2):
        /* Eight div1 steps with r1 (initialised from r0) as the
           accumulator.  The rotcl instructions pass bits through T
           between the shifted dividend (r4, then its copy in r0) and
           div1.  */
        rotcl r4
        mov r0,r1
        div1 r5,r1
        mov r4,r0
        rotcl r0
        mov r5,r4
        div1 r5,r1
        .rept 5
        rotcl r0; div1 r5,r1
        .endr
        rotcl r0
        /* r5 is restored before the last step, so that step uses the copy
           of the divisor kept in r4; r4 is restored afterwards and the last
           rotcl runs in the rts delay slot.  */
        mov.l @r15+,r5
        div1 r4,r1
        mov.l @r15+,r4
        rts
        rotcl r0

        ENDFUNC(GLOBAL(udivsi3_i4i))

        .global GLOBAL(sdivsi3_i4i)
        FUNC(GLOBAL(sdivsi3_i4i))
        /* This is link-compatible with a GLOBAL(sdivsi3) call,
           but we effectively clobber only r1.  */
        /* Signed division: r4 is the dividend, r5 the divisor, the
           quotient is left in r0.  r4 and r5 are saved on the stack and
           restored on every return path.  Both operands are negated as
           needed so the division itself runs on non-negative values; the
           neg_result paths negate the quotient at the end.  */
GLOBAL(sdivsi3_i4i):
        /* Save r4; branch on the sign of the divisor.  The delay slot sets
           T = (r4 >= 0) for the code at either destination.  r1 = 128.  */
        mov.l r4,@-r15
        cmp/pz r5
        mov.w LOCAL(c128_w), r1
        bt/s LOCAL(pos_divisor)
        cmp/pz r4
        /* Negative divisor: save it and negate r5.  A non-negative
           dividend means a negative result.  The delay slot sets
           T = r5 > 128 (unsigned) whether or not the branch is taken.
           Otherwise both operands were negative, so negate r4 as well and
           fall through.  */
        mov.l r5,@-r15
        neg r5,r5
        bt/s LOCAL(neg_result)
        cmp/hi r1,r5
        neg r4,r4
        /* Result is not negated.  On entry T = r5 > 128.  */
LOCAL(pos_result):
        extu.w r5,r0
        bf LOCAL(div_le128)
        /* T = (r5 fits in 16 bits); r0 = r4 >> 8.  For a larger divisor go
           to div_ge64k with T = r5 > (r4 >> 8) set in the delay slot.  */
        cmp/eq r5,r0
        mov r4,r0
        shlr8 r0
        bf/s LOCAL(div_ge64k)
        cmp/hi r0,r5
        /* Divisor 129 .. 0xffff: move it to the upper half of r5 and start
           the div1 sequence.  */
        div0u
        shll16 r5
        div1 r5,r0
        div1 r5,r0
        div1 r5,r0
        /* udivsi3_i4i joins here after its own setup and first div1
           steps.  */
LOCAL(udiv_25):
        mov.l LOCAL(zero_l),r1
        div1 r5,r0
        div1 r5,r0
        /* Push a zero word as scratch space for partial quotient bytes.  */
        mov.l r1,@-r15
        .rept 3
        div1 r5,r0
        .endr
        /* Park the low byte of r0 in bits 16..23 of the scratch word, then
           replace the low 16 bits of r0 with the low 16 bits of r4
           (xtrct followed by swap.w).  */
        mov.b r0,@(L_MSWLSB,r15)
        xtrct r4,r0
        swap.w r0,r0
        .rept 8
        div1 r5,r0
        .endr
        /* Park the next byte in bits 8..15 of the scratch word.  */
        mov.b r0,@(L_LSWMSB,r15)
        /* Shared tail, also reached from div_ge64k_2: eight more div1
           steps, then r0 = (r0 & 0xff) | scratch word.  The scratch word is
           popped into r4 before r5 and r4 are restored, and the final rotcl
           runs in the rts delay slot.  */
LOCAL(div_ge64k_end):
        .rept 8
        div1 r5,r0
        .endr
        mov.l @r15+,r4 ! zero-extension and swap using LS unit.
        extu.b r0,r0
        mov.l @r15+,r5
        or r4,r0
        mov.l @r15+,r4
        rts
        rotcl r0

        /* Table-based path for a divisor magnitude <= 128 with a negated
           result.  On entry r0 holds the zero-extended low 16 bits of r5
           (set at neg_result), so tst yields T = ((r5 & 0xfe) == 0).  */
LOCAL(div_le128_neg):
        tst #0xfe,r0
        mova LOCAL(div_table_ix),r0
        mov.b @(r0,r5),r1
        mova LOCAL(div_table_inv),r0
        /* T set: divide-by-1 exit.  The delay slot loads the inverse from
           byte offset r1 relative to div_table_inv either way.  */
        bt/s LOCAL(div_by_1_neg)
        mov.l @(r0,r1),r1
        mova LOCAL(div_table_clz),r0
        /* MACH:MACL = inverse * r4; r1 = div_table_clz[r5].  */
        dmulu.l r1,r4
        mov.b @(r0,r5),r1
        /* r0 = MACH + r4 with T known to be clear (bt/s not taken), carry
           rotated into bit 31 by rotcr, shifted by r1, and negated in the
           rts delay slot.  */
        mov.l @r15+,r5
        sts mach,r0
        /* clrt */
        addc r4,r0
        mov.l @r15+,r4
        rotcr r0
        shld r1,r0
        rts
        neg r0,r0

        /* Non-negative divisor: save r5.  On entry T = (r4 >= 0); the
           delay slot then sets T = r5 > 128.  A negative dividend is
           negated and falls through to the negated-result code.  */
LOCAL(pos_divisor):
        mov.l r5,@-r15
        bt/s LOCAL(pos_result)
        cmp/hi r1,r5
        neg r4,r4
        /* Result is negated at the end.  On entry T = r5 > 128.  The
           structure mirrors pos_result / udiv_25, but the quotient is
           assembled in r1.  */
LOCAL(neg_result):
        extu.w r5,r0
        bf LOCAL(div_le128_neg)
        cmp/eq r5,r0
        mov r4,r0
        shlr8 r0
        bf/s LOCAL(div_ge64k_neg)
        cmp/hi r0,r5
        div0u
        mov.l LOCAL(zero_l),r1
        shll16 r5
        div1 r5,r0
        /* Push a zero word as scratch space for partial quotient bytes.  */
        mov.l r1,@-r15
        .rept 7
        div1 r5,r0
        .endr
        /* Same byte parking and low-half replacement as at udiv_25.  */
        mov.b r0,@(L_MSWLSB,r15)
        xtrct r4,r0
        swap.w r0,r0
        .rept 8
        div1 r5,r0
        .endr
        mov.b r0,@(L_LSWMSB,r15)
        /* Shared tail, also reached from div_ge64k_neg:
           r1 = (r0 & 0xff) | scratch word.  */
LOCAL(div_ge64k_neg_end):
        .rept 8
        div1 r5,r0
        .endr
        mov.l @r15+,r4 ! zero-extension and swap using LS unit.
        extu.b r0,r1
        mov.l @r15+,r5
        or r4,r1
        /* Common exit for negated results: restore r4, shift T into r1 and
           return r0 = -r1 (negation in the rts delay slot).  */
LOCAL(div_r8_neg_end):
        mov.l @r15+,r4
        rotcl r1
        rts
        neg r1,r0

        /* Divisor magnitude >= 0x10000, negated result.  On entry
           T = r5 > (r4 >> 8), set in the delay slot of the branch that got
           here; bt/s tests it before div0u clears it.  */
LOCAL(div_ge64k_neg):
        bt/s LOCAL(div_r8_neg)
        div0u
        shll8 r5
        mov.l LOCAL(zero_l),r1
        .rept 6
        div1 r5,r0
        .endr
        /* Push a zero word as scratch space.  */
        mov.l r1,@-r15
        div1 r5,r0
        mov.w LOCAL(m256_w),r1
        div1 r5,r0
        /* Park the low byte of r0 in bits 8..15 of the scratch word, then
           r0 = ((r0 ^ r4) & r1) ^ r4: bits selected by r1 stay, the rest
           comes from r4.  */
        mov.b r0,@(L_LSWMSB,r15)
        xor r4,r0
        and r1,r0
        bra LOCAL(div_ge64k_neg_end)
        xor r4,r0

        /* 16-bit literal 128, loaded with mov.w by both entry points.
           NOTE(review): presumably placed between code fragments to stay
           within PC-relative mov.w range - confirm.  */
LOCAL(c128_w):
        .word 128

        /* Negated-result counterpart of div_r8_2: r1 = r4 << 24 supplies
           bits through T via rotcl, r0 is the div1 accumulator, and r4
           keeps a copy of the divisor so that r5 can be restored before
           the last div1 (in the bra delay slot).  */
LOCAL(div_r8_neg):
        clrt
        shll16 r4
        mov r4,r1
        shll8 r1
        mov r5,r4
        .rept 7
        rotcl r1; div1 r5,r0
        .endr
        mov.l @r15+,r5
        rotcl r1
        bra LOCAL(div_r8_neg_end)
        div1 r4,r0

        /* 16-bit literal 0xff00, loaded with mov.w as the mask for the
           xor / and / xor byte merge above.  */
LOCAL(m256_w):
        .word 0xff00
/* This table has been generated by divtab-sh4.c.
   Entry N is the signed byte that the division code loads for divisor N
   (mov.b @(r0,r5),r1 with r0 = div_table_clz) and passes to shld as the
   shift count; a negative count makes shld shift right.  The table holds
   entries 0 .. 127; the byte for N = 128 is the first byte of
   div_table_ix, which follows immediately.  */
        .balign 4
LOCAL(div_table_clz):
        .byte   0, 1, 0                                 /* N =  0 ..   2 */
        .byte   -1, -1                                  /* N =  3 ..   4 */
        .byte   -2, -2, -2, -2                          /* N =  5 ..   8 */
        .byte   -3, -3, -3, -3, -3, -3, -3, -3          /* N =  9 ..  16 */
        .rept 16                                        /* N = 17 ..  32 */
        .byte   -4
        .endr
        .rept 32                                        /* N = 33 ..  64 */
        .byte   -5
        .endr
        .rept 63                                        /* N = 65 .. 127 */
        .byte   -6
        .endr
/* Lookup table translating a positive divisor N (0 .. 128) into a signed
   byte offset, relative to div_table_inv, of the normalized inverse to
   use.  N.B. entry 0 doubles as the last entry (N = 128) of the preceding
   div_table_clz table; its value is not a multiple of 4, so a division
   by zero ends in an unaligned access.  */
LOCAL(div_table_ix):
        .byte   -6                                      /* N = 0 */
        .byte   -128                                    /* N = 1 */
        .byte   -128, 0                                 /* N = 2 .. 3 */
        .byte   -128, -64, 0, 64                        /* N = 4 .. 7 */
        .byte   -128, -96, -64, -32, 0, 32, 64, 96      /* N = 8 .. 15 */
        /* N = 16 .. 31 */
        .byte   -128, -112, -96, -80, -64, -48, -32, -16
        .byte   0, 16, 32, 48, 64, 80, 96, 112
        /* N = 32 .. 63 */
        .byte   -128, -120, -112, -104, -96, -88, -80, -72
        .byte   -64, -56, -48, -40, -32, -24, -16, -8
        .byte   0, 8, 16, 24, 32, 40, 48, 56
        .byte   64, 72, 80, 88, 96, 104, 112, 120
        /* N = 64 .. 127 */
        .byte   -128, -124, -120, -116, -112, -108, -104, -100
        .byte   -96, -92, -88, -84, -80, -76, -72, -68
        .byte   -64, -60, -56, -52, -48, -44, -40, -36
        .byte   -32, -28, -24, -20, -16, -12, -8, -4
        .byte   0, 4, 8, 12, 16, 20, 24, 28
        .byte   32, 36, 40, 44, 48, 52, 56, 60
        .byte   64, 68, 72, 76, 80, 84, 88, 92
        .byte   96, 100, 104, 108, 112, 116, 120, 124
        .byte   -128                                    /* N = 128 */
/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.
   64 words in total: 32 in front of the div_table_inv label and 32 behind
   it, matching the signed byte offsets -128 .. +124 stored in
   div_table_ix.  The very first word is zero and is also used, under the
   name zero_l, as the source of a 32-bit zero by the division code.  */
        .balign 4
LOCAL(zero_l):
        .long   0x00000000, 0xF81F81F9, 0xF07C1F08, 0xE9131AC0
        .long   0xE1E1E1E2, 0xDAE6076C, 0xD41D41D5, 0xCD856891
        .long   0xC71C71C8, 0xC0E07039, 0xBACF914D, 0xB4E81B4F
        .long   0xAF286BCB, 0xA98EF607, 0xA41A41A5, 0x9EC8E952
        .long   0x9999999A, 0x948B0FCE, 0x8F9C18FA, 0x8ACB90F7
        .long   0x86186187, 0x81818182, 0x7D05F418, 0x78A4C818
        .long   0x745D1746, 0x702E05C1, 0x6C16C16D, 0x68168169
        .long   0x642C8591, 0x60581606, 0x5C9882BA, 0x58ED2309
LOCAL(div_table_inv):
        .long   0x55555556, 0x51D07EAF, 0x4E5E0A73, 0x4AFD6A06
        .long   0x47AE147B, 0x446F8657, 0x41414142, 0x3E22CBCF
        .long   0x3B13B13C, 0x38138139, 0x3521CFB3, 0x323E34A3
        .long   0x2F684BDB, 0x2C9FB4D9, 0x29E4129F, 0x27350B89
        .long   0x24924925, 0x21FB7813, 0x1F7047DD, 0x1CF06ADB
        .long   0x1A7B9612, 0x18118119, 0x15B1E5F8, 0x135C8114
        .long   0x11111112, 0x0ECF56BF, 0x0C9714FC, 0x0A6810A7
        .long   0x08421085, 0x0624DD30, 0x04104105, 0x02040811
        /* maximum error: 0.987342 scaled: 0.921875*/

        ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* SH3 / SH4 */

#endif /* L_div_table */

#ifdef L_udiv_qrnnd_16
#if !__SHMEDIA__
        HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
        /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
        /* n1 < d, but n1 might be larger than d1.  */
        /* Reading of the register notes above: on entry r0 = n1, r4 = n0,
           r5 = d and r6 = d1; on return r0 = rn and r1 = qn; r2 is scratch.
           NOTE(review): the names look like a udiv_qrnnd-style
           quotient/remainder step (qn = quotient, rn = remainder); confirm
           against the caller, which is not visible here.  */
        .global GLOBAL(udiv_qrnnd_16)
        .balign 8
GLOBAL(udiv_qrnnd_16):
        /* n1 > d1 (unsigned) is handled separately at .Lots.  */
        div0u
        cmp/hi r6,r0
        bt .Lots
        /* Sixteen div1 steps of r0 by r6.  */
        .rept 16
        div1 r6,r0 
        .endr
        /* r1 = low 16 bits of r0.  If T from the last div1 step is clear,
           add r6 back to r0.  Then shift T into r1.  */
        extu.w r0,r1
        bt 0f
        add r6,r0
0:      rotcl r1
        /* MACL = (r1 & 0xffff) * (r5 & 0xffff).  Meanwhile xtrct + swap.w
           replace the low 16 bits of r0 with the low 16 bits of r4.  */
        mulu.w r1,r5
        xtrct r4,r0
        swap.w r0,r0
        sts macl,r2
        /* r0 -= r2.  If there was no borrow (r0 >= r2, unsigned) return as
           is.  */
        cmp/hs r2,r0
        sub r2,r0
        bt 0f
        /* Borrow: add r5 back once and decrement r1 (T is clear here, so
           addc is a plain add that records the carry in T).  A carry out
           means one correction was enough.  */
        addc r5,r0
        add #-1,r1
        bt 0f
        /* Second correction: r1 -= 1, and r0 += r5 in the rts delay slot.  */
1:      add #-1,r1
        rts
        add r5,r0
        .balign 8
.Lots:
        /* n1 > d1: r0 -= r5, then r0 = (r0 << 16 | (r4 & 0xffff)) + r5 with
           the carry of that addition left in T.  r1 is built up as 0xffff
           (mov #-1 followed by shlr16).  */
        sub r5,r0
        swap.w r4,r1
        xtrct r0,r1
        clrt
        mov r1,r0
        addc r5,r0
        mov #-1,r1
        /* NOTE(review): SL1 is a macro defined outside this chunk;
           presumably it emits a bf to label 1 above combined with the
           shlr16, so that a clear carry goes through the second correction
           - confirm.  */
        SL1(bf, 1b,
        shlr16 r1)
0:      rts
        nop
        ENDFUNC(GLOBAL(udiv_qrnnd_16))
#endif /* !__SHMEDIA__ */
#endif /* L_udiv_qrnnd_16 */
Go to most recent revision | Compare with Previous | Blame | View Log
Browse

Tools

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgcc/] [config/] [sh/] [lib1funcs.S] - Rev 847