or1k_soc_on_altera_embedded_dev_kit (OpenCores Subversion), trunk/linux-2.6/linux-2.6.24/arch/mips/lib/csum_partial.S, rev 3
https://opencores.org/ocsvn/or1k_soc_on_altera_embedded_dev_kit/or1k_soc_on_altera_embedded_dev_kit/trunk

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#ifdef CONFIG_64BIT
/*
 * As we share the code base with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define ADD    daddu
#define NBYTES 8

#else

#define LOAD   lw
#define ADD    addu
#define NBYTES 4

#endif /* USE_DOUBLE */

#define UNIT(unit)  ((unit)*NBYTES)

#define ADDC(sum,reg)                                           \
        ADD     sum, reg;                                       \
        sltu    v1, sum, reg;                                   \
        ADD     sum, v1
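/*
 * ADDC() above adds reg into sum and then adds the carry-out back in
 * (sltu yields 1 when the addition wrapped), i.e. an end-around-carry
 * add: the ones' complement addition the Internet checksum is built on.
 */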

#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
        LOAD    _t0, (offset + UNIT(0))(src);                   \
        LOAD    _t1, (offset + UNIT(1))(src);                   \
        LOAD    _t2, (offset + UNIT(2))(src);                   \
        LOAD    _t3, (offset + UNIT(3))(src);                   \
        ADDC(sum, _t0);                                         \
        ADDC(sum, _t1);                                         \
        ADDC(sum, _t2);                                         \
        ADDC(sum, _t3)

#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
        CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
        CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
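
/*
 * In either configuration CSUM_BIGCHUNK() consumes 0x20 bytes of source:
 * four 8-byte loads when USE_DOUBLE is defined, otherwise two
 * CSUM_BIGCHUNK1 groups of four 4-byte loads.
 */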

/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
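
/*
 * The 32-bit partial checksum is returned in v0.  For large enough
 * buffers the code first brings src up to a 32-byte boundary
 * (hword_align through oword_align), then checksums 128-, 64- and
 * 32-byte blocks, then the remaining whole words, and finally the
 * sub-word tail in small_csumcpy.
 */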

#define src a0
#define sum v0

        .text
        .set    noreorder
        .align  5
LEAF(csum_partial)
        move    sum, zero
        move    t7, zero

        sltiu   t8, a1, 0x8
        bnez    t8, small_csumcpy               /* < 8 bytes to copy */
         move   t2, a1

        andi    t7, src, 0x1                    /* odd buffer? */

hword_align:
        beqz    t7, word_align
         andi   t8, src, 0x2

        lbu     t0, (src)
        LONG_SUBU       a1, a1, 0x1
#ifdef __MIPSEL__
        sll     t0, t0, 8
#endif
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x1
        andi    t8, src, 0x2

word_align:
        beqz    t8, dword_align
         sltiu  t8, a1, 56

        lhu     t0, (src)
        LONG_SUBU       a1, a1, 0x2
        ADDC(sum, t0)
        sltiu   t8, a1, 56
        PTR_ADDU        src, src, 0x2

dword_align:
        bnez    t8, do_end_words
         move   t8, a1

        andi    t8, src, 0x4
        beqz    t8, qword_align
         andi   t8, src, 0x8

        lw      t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x4
        ADDC(sum, t0)
        PTR_ADDU        src, src, 0x4
        andi    t8, src, 0x8

qword_align:
        beqz    t8, oword_align
         andi   t8, src, 0x10

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
#else
        lw      t0, 0x00(src)
        lw      t1, 0x04(src)
        LONG_SUBU       a1, a1, 0x8
        ADDC(sum, t0)
        ADDC(sum, t1)
#endif
        PTR_ADDU        src, src, 0x8
        andi    t8, src, 0x10

oword_align:
        beqz    t8, begin_movement
         LONG_SRL       t8, a1, 0x7

#ifdef USE_DOUBLE
        ld      t0, 0x00(src)
        ld      t1, 0x08(src)
        ADDC(sum, t0)
        ADDC(sum, t1)
#else
        CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
#endif
        LONG_SUBU       a1, a1, 0x10
        PTR_ADDU        src, src, 0x10
        LONG_SRL        t8, a1, 0x7

begin_movement:
        beqz    t8, 1f
         andi   t2, a1, 0x40

move_128bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
        LONG_SUBU       t8, t8, 0x01
        bnez    t8, move_128bytes
         PTR_ADDU       src, src, 0x80

1:
        beqz    t2, 1f
         andi   t2, a1, 0x20

move_64bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
        PTR_ADDU        src, src, 0x40

1:
        beqz    t2, do_end_words
         andi   t8, a1, 0x1c

move_32bytes:
        CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
        andi    t8, a1, 0x1c
        PTR_ADDU        src, src, 0x20

do_end_words:
        beqz    t8, small_csumcpy
         andi   t2, a1, 0x3
        LONG_SRL        t8, t8, 0x2

end_words:
        lw      t0, (src)
        LONG_SUBU       t8, t8, 0x1
        ADDC(sum, t0)
        bnez    t8, end_words
         PTR_ADDU       src, src, 0x4

/* unknown src alignment and < 8 bytes to go  */
small_csumcpy:
        move    a1, t2
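        /*
         * a1 now holds the number of bytes left (0..7); bits 2, 1 and 0
         * of it select a trailing word, halfword and byte.  The word and
         * halfword are fetched with ulw/ulhu since src may be unaligned.
         */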

        andi    t0, a1, 4
        beqz    t0, 1f
         andi   t0, a1, 2

        /* Still a full word to go  */
        ulw     t1, (src)
        PTR_ADDIU       src, 4
        ADDC(sum, t1)

1:      move    t1, zero
        beqz    t0, 1f
         andi   t0, a1, 1

        /* Still a halfword to go  */
        ulhu    t1, (src)
        PTR_ADDIU       src, 2

1:      beqz    t0, 1f
         sll    t1, t1, 16

        lbu     t2, (src)
         nop

#ifdef __MIPSEB__
        sll     t2, t2, 8
#endif
        or      t1, t2

1:      ADDC(sum, t1)

        /* fold checksum */
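        /*
         * On 64-bit the two 32-bit halves of sum are first added together
         * (dsll32/dsra32) with the carry folded back in.  The 32-bit value
         * is then reduced the same way: add the low 16 bits into the high
         * half, keep the carry, and take the top half, leaving a sum that
         * fits in 16 bits.
         */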
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif
        sll     v1, sum, 16
        addu    sum, v1
        sltu    v1, sum, v1
        srl     sum, sum, 16
        addu    sum, v1

        /* odd buffer alignment? */
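        /*
         * If the buffer started on an odd address, every byte was summed
         * one position off within its 16-bit word, so the two bytes of
         * the result are swapped here.  Ones' complement addition
         * commutes with this byte rotation, which is why the final swap
         * is all that is needed.
         */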
        beqz    t7, 1f
         nop
        sll     v1, sum, 8
        srl     sum, sum, 8
        or      sum, v1
        andi    sum, 0xffff
1:
        .set    reorder
        /* Add the passed partial csum.  */
        ADDC(sum, a2)
        jr      ra
        .set    noreorder
        END(csum_partial)


/*
 * checksum and copy routines based on memcpy.S
 *
 *      csum_partial_copy_nocheck(src, dst, len, sum)
 *      __csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */

#define src a0
#define dst a1
#define len a2
#define psum a3
#define sum v0
#define odd t8
#define errptr t9

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *      not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */

#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
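
/*
 * Each EXC() expansion tags its instruction with the local label 9: and
 * records a (9b, handler) pair in the __ex_table section; if the access
 * faults, the kernel's exception-table fixup transfers control to the
 * named handler instead of taking an unrecoverable fault.
 */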

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#define SHIFT_DISCARD_REVERT SRLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#define SHIFT_DISCARD_REVERT SLLV
#endif
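
/*
 * LDFIRST/LDREST pair lwl/lwr (ldl/ldr with USE_DOUBLE) so that one
 * unaligned NBYTES-wide load takes two instructions; which of the pair
 * touches the low-address part depends on endianness, hence the two
 * mappings above.  STFIRST/STREST do the same for stores, and
 * SHIFT_DISCARD/SHIFT_DISCARD_REVERT shift the highest-addressed bytes
 * out of (and back into) a register regardless of endianness.
 */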

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)

#define ADDRMASK (NBYTES-1)

        .set    noat

LEAF(__csum_partial_copy_user)
        PTR_ADDU        AT, src, len    /* See (1) above. */
#ifdef CONFIG_64BIT
        move    errptr, a4
#else
        lw      errptr, 16(sp)
#endif
FEXPORT(csum_partial_copy_nocheck)
        move    sum, zero
        move    odd, zero
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        bnez    t2, copy_bytes_checklen
         and    t0, src, ADDRMASK
        andi    odd, dst, 0x1                   /* odd buffer? */
        bnez    t1, dst_unaligned
         nop
        bnez    t0, src_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
both_aligned:
         SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
        beqz    t0, cleanup_both_aligned # len < 8*NBYTES
         nop
        SUB     len, 8*NBYTES           # subtract here for bgez loop
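        /*
         * Main loop: 8*NBYTES bytes per iteration.  The whole block is
         * loaded first, then each word is stored and folded into sum.
         * len was pre-decremented above so the bgez at the bottom exits
         * once fewer than 8*NBYTES bytes remain.
         */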
        .align  4
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t5, UNIT(5)(src),       l_exc_copy)
EXC(    LOAD    t6, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t7, UNIT(7)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
        ADD     src, src, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
        ADDC(sum, t3)
EXC(    STORE   t4, UNIT(4)(dst),       s_exc)
        ADDC(sum, t4)
EXC(    STORE   t5, UNIT(5)(dst),       s_exc)
        ADDC(sum, t5)
EXC(    STORE   t6, UNIT(6)(dst),       s_exc)
        ADDC(sum, t6)
EXC(    STORE   t7, UNIT(7)(dst),       s_exc)
        ADDC(sum, t7)
        bgez    len, 1b
         ADD    dst, dst, 8*NBYTES
        ADD     len, 8*NBYTES           # revert len (see above)

        /*
         * len == the number of bytes left to copy < 8*NBYTES
         */
cleanup_both_aligned:
#define rem t7
        beqz    len, done
         sltu   t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         and    rem, len, (NBYTES-1)    # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
        ADDC(sum, t3)
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
less_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LOAD    t0, 0(src),             l_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc)
        ADDC(sum, t0)
        bne     rem, len, 1b
         ADD    dst, dst, NBYTES

        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
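        /*
         * SHIFT_DISCARD pushes the bytes beyond len out of t0 so that
         * STREST writes exactly the len remaining bytes ending at t1-1;
         * SHIFT_DISCARD_REVERT then moves the kept bytes back to their
         * original positions (the discarded ones become zero), so ADDC
         * only sums the bytes that were actually stored.
         */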
#define bits t2
        beqz    len, done
         ADD    t1, dst, len    # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3     # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     bits, bits, rem # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             s_exc)
        SHIFT_DISCARD_REVERT t0, t0, bits
        .set reorder
        ADDC(sum, t0)
        b       done
        .set noreorder
dst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; T1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
        SUB     t2, t2, t1      # t2 = number of bytes copied
        xor     match, t0, t1
EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
        SLL     t4, t1, 3               # t4 = number of bits to discard
        SHIFT_DISCARD t3, t3, t4
        /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
        ADDC(sum, t3)
        beq     len, t2, done
         SUB    len, len, t2
        ADD     dst, dst, t2
        beqz    match, both_aligned
         ADD    src, src, t2

src_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
        ADDC(sum, t0)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
        ADDC(sum, t1)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
        ADDC(sum, t2)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
        ADDC(sum, t3)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1  # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc)
        ADDC(sum, t0)
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES  */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
        move    t2, zero        # partial word
        li      t3, SHIFT_START # shift
/* use l_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)                    \
EXC(    lbu     t0, N(src), l_exc_copy);        \
        SUB     len, len, 1;            \
EXC(    sb      t0, N(dst), s_exc);     \
        SLLV    t0, t0, t3;             \
        addu    t3, SHIFT_INC;          \
        beqz    len, copy_bytes_done;   \
         or     t2, t0
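
/*
 * COPY_BYTE(N) copies one byte to dst and shifts it into its slot (t3,
 * starting at SHIFT_START and stepping by SHIFT_INC so the byte lands
 * where a native-endian word load would have put it) within the partial
 * word t2, exiting to copy_bytes_done once len reaches zero.  The
 * assembled word is added to sum there.
 */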

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lbu     t0, NBYTES-2(src), l_exc_copy)
        SUB     len, len, 1
EXC(    sb      t0, NBYTES-2(dst), s_exc)
        SLLV    t0, t0, t3
        or      t2, t0
copy_bytes_done:
        ADDC(sum, t2)
done:
        /* fold checksum */
#ifdef USE_DOUBLE
        dsll32  v1, sum, 0
        daddu   sum, v1
        sltu    v1, sum, v1
        dsra32  sum, sum, 0
        addu    sum, v1
#endif
        sll     v1, sum, 16
        addu    sum, v1
        sltu    v1, sum, v1
        srl     sum, sum, 16
        addu    sum, v1

        /* odd buffer alignment? */
        beqz    odd, 1f
         nop
        sll     v1, sum, 8
        srl     sum, sum, 8
        or      sum, v1
        andi    sum, 0xffff
1:
        .set reorder
        ADDC(sum, psum)
        jr      ra
        .set noreorder

l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
         li     t2, SHIFT_START
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lbu     t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        SLLV    t1, t1, t2
        addu    t2, SHIFT_INC
        ADDC(sum, t1)
        bne     src, t0, 1b
         ADD    dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
         nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
         nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         *   See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in a1
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        beqz    len, done
         SUB    src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
         SUB    src, src, 1
        li      v1, -EFAULT
        b       done
         sw     v1, (errptr)

s_exc:
        li      v0, -1 /* invalid checksum */
        li      v1, -EFAULT
        jr      ra
         sw     v1, (errptr)
END(__csum_partial_copy_user)