URL
https://opencores.org/ocsvn/or1k/or1k/trunk
Subversion Repositories or1k
[/] [or1k/] [trunk/] [linux/] [linux-2.4/] [arch/] [sh64/] [lib/] [copy_user_memcpy.S] - Rev 1765
Compare with Previous | Blame | View Log
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.

/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
   __copy_user function in the general case, so it has to be a distinct
   function from intra-kernel memcpy to allow for exception fix-ups in the
   event that the user pointer is bad somewhere in the copy (e.g. due to
   running off the end of the vma).

   Note, this algorithm will be slightly wasteful in the case where the source
   and destination pointers are equally aligned, because the stlo/sthi pairs
   could then be merged back into single stores.
If there are a lot of cache misses, this is probably offset by the stall
lengths on the preloads.
*/

/*
 * copy_user_memcpy
 *
 * Register roles as read from the code below:
 *   r2   base address of all stores at the start of the buffer
 *        (destination, ARG0 in the file header); never written here.
 *   r3   base address of all loads at the start of the buffer
 *        (source, ARG1 in the file header).
 *   r4   byte count (ARG2 in the file header).
 *   r5   r2 + r4, i.e. one past the last destination byte; on the Large
 *        path it is rebased to r2 + r4 - 16.
 *   r6   r3 + r4 (one past the last source byte) on the small paths;
 *        r3 - r2 (source minus destination) on the Large path.
 *   r18  address every exit path branches to (ptabs r18,tr1 followed by
 *        blink tr1,r63); presumably the caller's return address - confirm
 *        against the SH-5 ABI.
 *   r63  used as the destination of loads whose value is never read;
 *        presumably the always-zero register, so those loads only touch
 *        memory - confirm against the SHmedia manual.
 *
 * NOTE(review): the file header states that only r0-r7 are trashed, but
 * this code also writes r8, r9, r19, r20, r21, r22, r23, r24, r25, r27
 * and r36.  Confirm that all of them are caller-saved.
 *
 * copy_user_memcpy and copy_user_memcpy_end are both exported; presumably
 * the pair brackets this routine for the exception fix-up mentioned in the
 * comment above - verify against the fault handling code.
 */
	.section .text..SHmedia32,"ax"
	.little
	.balign 32
	.global copy_user_memcpy
	.global copy_user_memcpy_end
copy_user_memcpy:

/*
 * Unaligned access helpers: each expands to a lo/hi pair covering the
 * quadword (Q, 8 bytes) or longword (L, 4 bytes) at P+O.  Each expansion is
 * two instructions, which matters for the slot sizes counted below.
 * Only LDUAQ and LDUAL are invoked in this file; STUAQ and STUAL are
 * defined but unused here.
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	ld.b r3,0,r63			/* touch the first source byte; result discarded */
	pta/l Large,tr0
	movi 25,r0
	bgeu/u r4,r0,tr0		/* count >= 25 (unsigned): take the Large path */

	/*
	 * Small copies (count 0..24) go through a computed branch.
	 * r0 = nsb(r4) << 5 is subtracted from L1-L0+63*32+1 and the result is
	 * used by ptrel at L0, so the handler is selected in steps of 32 bytes
	 * starting at L1.  Per the labels below the steps are: 0 bytes, 1 byte,
	 * 2 or 3, 4..7, 8..15, 16..24.
	 * (Assumes nsb yields 63 minus the bit length of the count and that
	 * the "+ 1" marks the target as SHmedia code - confirm against the
	 * SHmedia manual.)
	 *
	 * Every slot from L1 onwards therefore holds exactly eight
	 * instructions.  Do not add or remove instructions in any slot without
	 * re-balancing it, otherwise the dispatch lands in the wrong place.
	 */
	nsb r4,r0
	shlli r0,5,r0
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	sub r1, r0, r0
L0:	ptrel r0,tr0
	add r2,r4,r5			/* r5 = one past the last destination byte */
	ptabs r18,tr1			/* tr1 = exit target taken from r18 */
	add r3,r4,r6			/* r6 = one past the last source byte */
	blink tr0,r63			/* jump to the selected handler */

/* Rearranged to make cut2 safe */
	.balign 8
	/*
	 * Second half of the 4..7 byte handler.  It sits ahead of L1, outside
	 * the 32-byte slot table, and is reached through pta L4_7.
	 * On entry r0 holds the first four source bytes and r6/r7 hold the two
	 * halves of the last four source bytes.
	 */
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

	.balign 8
	/* Slot 0 (L1 + 0): six instructions here plus the two at L2_3. */
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

	/* Fills the rest of slot 0; stores the last byte (held in r6) at r5-1. */
L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* Slot 1 (L1 + 32): three instructions here plus the five at L8_15. */
	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

	/*
	 * Fills the rest of slot 1.  On entry r0 holds the first eight source
	 * bytes and r6/r7 hold the two halves of the last eight source bytes.
	 */
L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/*
	 * Slot 2 (L1 + 64): copies byte 0, byte 1 and the last byte; for a
	 * two byte copy the last byte is byte 1 again.
	 */
	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	ld.b r2,0,r63			/* touch the first destination byte; result discarded */
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6			/* last source byte; overwrites the end pointer in r6 */
	st.b r2,1,r1
	blink tr0, r63

	/*
	 * Slot 3 (L1 + 96): copies the first four and the last four bytes;
	 * the two ranges overlap when the count is below 8.
	 */
	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/*
	 * Slot 4 (L1 + 128): same scheme as above with quadwords - first
	 * eight and last eight bytes, overlapping when the count is below 16.
	 */
	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/*
	 * Slot 5 (L1 + 160), the last slot, so its length is not constrained
	 * by the dispatch.  Copies bytes 0..15 via r0 and r8, then the last
	 * eight bytes via r6.
	 */
	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/*
	 * Copies of 25 bytes or more.
	 *
	 * r22 is the running destination pointer.  It starts at
	 * r2 + 8 - (r3 & 7), so that r22 + r6 (the matching source address)
	 * is 8-byte aligned: loads are aligned ldx.q, stores are unaligned
	 * sthi.q/stlo.q pairs.  r0 always carries the quadword that has been
	 * loaded but not yet stored.
	 */
Large:
	ld.b r2, 0, r63			/* touch the first destination byte; result discarded */
	pta/l Loop_ua, tr1
	ori r3, -8, r7			/* r7 = (r3 & 7) - 8, in the range -8..-1 */
	sub r2, r7, r22			/* r22 = r2 + 8 - (r3 & 7) */
	sub r3, r2, r6			/* r6 = source minus destination */
	add r2, r4, r5
	ldlo.q r3, 0, r0		/* leading source bytes up to the next 8-byte boundary */
	addi r5, -16, r5		/* r5 = destination end - 16: bound for Loop_ua */
	movi 64+8, r27 ! could subtract r7 from that.
	stlo.q r2, 0, r0		/* store the leading bytes; later stores at r22 */
	sthi.q r2, 7, r0		/* overwrite whatever lies beyond them */
	ldx.q r22, r6, r0		/* first aligned source quadword */
	bgtu/l r27, r4, tr1		/* count < 72: skip the 32-byte loop */

	addi r5, -48, r27		/* r27 = destination end - 64: bound for Loop_line */
	pta/l Loop_line, tr0
	addi r6, 64, r36		/* source offset 64 bytes ahead of r22 */
	addi r6, -24, r19		/* source offsets for the three quadwords */
	addi r6, -16, r20		/* behind r22 once r22 has been advanced */
	addi r6, -8, r21		/* by 32 */

	/*
	 * Main loop: 32 bytes per iteration.  Stores the pending r0 plus
	 * three freshly loaded quadwords (r23, r24, r25) and loads the next
	 * pending quadword into r0.  Loops while r27 >= r22 (unsigned).
	 */
Loop_line:
	ldx.q r22, r36, r63		/* load 64 bytes ahead, result discarded (preload) */
	alloco r22, 32			/* NOTE(review): presumably allocates the destination cache line at r22+32 without fetching it - confirm */
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	bgeu r27, r22, tr0

	/*
	 * Remainder loop: 8 bytes per iteration, same pending-quadword scheme.
	 * tr1 still points here (set at Large).  Loops while r5 > r22
	 * (unsigned).
	 */
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	bgtu/l r5, r22, tr1

	/*
	 * Tail: store the pending quadword at r22, then copy the last eight
	 * source bytes (unaligned load ending at r3 + r4) to the last eight
	 * destination bytes (r5 + 8 .. r5 + 15, with r5 = end - 16).
	 */
	add r3, r4, r7			/* r7 = one past the last source byte */
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	ptabs r18,tr1			/* tr1 was the loop target; reload the exit target from r18 */
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63
copy_user_memcpy_end:
	nop
