URL https://opencores.org/ocsvn/openrisc_2011-10-31/openrisc_2011-10-31/trunk

Subversion Repositories openrisc_2011-10-31

[/] [openrisc/] [tags/] [gnu-src/] [newlib-1.18.0/] [newlib-1.18.0-or32-1.0rc1/] [newlib/] [libc/] [machine/] [sh/] [strncpy.S] - Diff between revs 207 and 345

Only display areas with differences | Details | Blame | View Log

Rev 207	Rev 345
`/* Copyright 2003 SuperH Ltd. */`	`/* Copyright 2003 SuperH Ltd. */`

`#include "asm.h"`	`#include "asm.h"`

`#ifdef __SH5__`	`#ifdef __SH5__`
`#if __SHMEDIA__`	`#if __SHMEDIA__`

/* ZPAD_MASK(src, dst): compute dst = src - 1.
   Little endian uses a single addi.  Big endian byte-reverses the value,
   decrements it, and byte-reverses the result back (presumably so that the
   borrow runs across the bytes in memory order on both endiannesses).
   Every use in this file passes the result of an mcmpeq.b against r63 and
   then ANDs the output with the data quadword, which the call sites describe
   as masking out the non-zero bytes after the first zero byte.  */
`#ifdef __LITTLE_ENDIAN__`	`#ifdef __LITTLE_ENDIAN__`
`#define ZPAD_MASK(src, dst) addi src, -1, dst`	`#define ZPAD_MASK(src, dst) addi src, -1, dst`
`#else`	`#else`
`#define ZPAD_MASK(src, dst) \`	`#define ZPAD_MASK(src, dst) \`
`byterev src, dst; addi dst, -1, dst; byterev dst, dst`	`byterev src, dst; addi dst, -1, dst; byterev dst, dst`
`#endif`	`#endif`


`/* We assume that the destination is not in the first 16 bytes of memory.`	`/* We assume that the destination is not in the first 16 bytes of memory.`
`A typical linker script will put the text section first, and as`	`A typical linker script will put the text section first, and as`
`this code is longer than 16 bytes, you have to go out of your way`	`this code is longer than 16 bytes, you have to go out of your way`
`to put data there. */`	`to put data there. */`
/* strncpy, SHmedia version: entry, main copy loop (L_scan0) and the
   no-terminator tail (L_end).
   Register roles, taken from the comments further down in this file and from
   the instructions below:
     r2   store (destination) address; never written by this routine
     r3   read (source) address
     r4   size
     r20  store end address (r2 + r4)
     r5   store end address minus 8
     r36  r5 minus 8, the loop bound for r22
     r22  running store address
     r6   r3 - r2; ldx.q adds it to r22 to form the matching read address
     r0   quadword of string data currently being processed
     r1   result of mcmpeq.b of r0 against r63
     r18  address returned to (moved to a target register by ptabs, then blink)
   NOTE(review): SHHI is not defined in this chunk; presumably it is a macro
   from asm.h -- confirm its shift direction for each endianness there.  */
`ENTRY(strncpy)`	`ENTRY(strncpy)`
`pt L_small, tr2`	`pt L_small, tr2`
`ldlo.q r3, 0, r0`	`ldlo.q r3, 0, r0`
`shlli r3, 3, r19`	`shlli r3, 3, r19`
`mcmpeq.b r0, r63, r1`	`mcmpeq.b r0, r63, r1`
`SHHI r1, r19, r7`	`SHHI r1, r19, r7`
`add r2, r4, r20`	`add r2, r4, r20`
`addi r20, -8, r5`	`addi r20, -8, r5`
`/* If the size is greater than 8, we know we can read beyond the first`	`/* If the size is greater than 8, we know we can read beyond the first`
`(possibly partial) quadword, and write out a full first and last`	`(possibly partial) quadword, and write out a full first and last`
`(possibly unaligned and/or overlapping) quadword. */`	`(possibly unaligned and/or overlapping) quadword. */`
`bge/u r2, r5, tr2 // L_small`	`bge/u r2, r5, tr2 // L_small`
`pt L_found0, tr0`	`pt L_found0, tr0`
`addi r2, 8, r22`	`addi r2, 8, r22`
/* r7 != 0 means the first load already holds a relevant zero byte; L_found0
   is then entered with r22 = r2 + 8.  */
`bnei/u r7, 0, tr0 // L_found0`	`bnei/u r7, 0, tr0 // L_found0`
/* r3 | -8 equals minus the number of bytes from r3 up to the next 8-byte
   boundary, so r22 becomes r2 plus that byte count.  */
`ori r3, -8, r38`	`ori r3, -8, r38`
`pt L_end_early, tr1`	`pt L_end_early, tr1`
`sub r2, r38, r22`	`sub r2, r38, r22`
/* Write the first quadword at r2 as a low/high pair (the destination may be
   unaligned).  */
`stlo.q r2, 0, r0`	`stlo.q r2, 0, r0`
`sthi.q r2, 7, r0`	`sthi.q r2, 7, r0`
`sub r3, r2, r6`	`sub r3, r2, r6`
`ldx.q r22, r6, r0`	`ldx.q r22, r6, r0`
`/* Before each iteration, check that we can store in full the next quad we`	`/* Before each iteration, check that we can store in full the next quad we`
`are about to fetch. */`	`are about to fetch. */`
`addi r5, -8, r36`	`addi r5, -8, r36`
`bgtu/u r22, r36, tr1 // L_end_early`	`bgtu/u r22, r36, tr1 // L_end_early`
`pt L_scan0, tr1`	`pt L_scan0, tr1`
/* Main loop: advance r22 by 8, test r0 for a zero byte, store r0 at r22 - 8,
   fetch the next quad, and repeat while r36 >= r22 (unsigned).  */
`L_scan0:`	`L_scan0:`
`addi r22, 8, r22`	`addi r22, 8, r22`
`mcmpeq.b r0, r63, r1`	`mcmpeq.b r0, r63, r1`
`stlo.q r22, -8, r0`	`stlo.q r22, -8, r0`
`bnei/u r1, 0, tr0 // L_found0`	`bnei/u r1, 0, tr0 // L_found0`
`sthi.q r22, -1, r0`	`sthi.q r22, -1, r0`
`ldx.q r22, r6, r0`	`ldx.q r22, r6, r0`
`bgeu/l r36, r22, tr1 // L_scan0`	`bgeu/l r36, r22, tr1 // L_scan0`
`L_end:`	`L_end:`
`// At end; we might re-read a few bytes when we fetch the last quad.`	`// At end; we might re-read a few bytes when we fetch the last quad.`
`// branch mispredict, so load is ready now.`	`// branch mispredict, so load is ready now.`
`mcmpeq.b r0, r63, r1`	`mcmpeq.b r0, r63, r1`
`addi r22, 8, r22`	`addi r22, 8, r22`
`bnei/u r1, 0, tr0 // L_found0`	`bnei/u r1, 0, tr0 // L_found0`
/* Fetch the last 8 source bytes (those ending at r3 + r4) as a low/high pair
   and combine the two halves with or.  */
`add r3, r4, r7`	`add r3, r4, r7`
`ldlo.q r7, -8, r1`	`ldlo.q r7, -8, r1`
`ldhi.q r7, -1, r7`	`ldhi.q r7, -1, r7`
`ptabs r18, tr0`	`ptabs r18, tr0`
`stlo.q r22, -8, r0`	`stlo.q r22, -8, r0`
`or r1, r7, r1`	`or r1, r7, r1`
`mcmpeq.b r1, r63, r7`	`mcmpeq.b r1, r63, r7`
`sthi.q r22, -1, r0`	`sthi.q r22, -1, r0`
`ZPAD_MASK (r7, r7)`	`ZPAD_MASK (r7, r7)`
`and r1, r7, r1 // mask out non-zero bytes after first zero byte`	`and r1, r7, r1 // mask out non-zero bytes after first zero byte`
/* Store the masked last quad as the final 8 destination bytes (ending at
   r20), then branch to the address taken from r18.  */
`stlo.q r20, -8, r1`	`stlo.q r20, -8, r1`
`sthi.q r20, -1, r1`	`sthi.q r20, -1, r1`
`blink tr0, r63`	`blink tr0, r63`

/* L_end_early: reached from the check ahead of L_scan0 when r22 > r5 - 8
   (unsigned), i.e. the quad following the one held in r0 cannot be stored in
   full.  If r5 > r22 the current quad still fits and L_end handles it;
   otherwise only the last 8 bytes (source bytes ending at r3 + r4, stored so
   that they end at r20) are copied here.  */
`L_end_early:`	`L_end_early:`
`/* Check if we can store the current quad in full. */`	`/* Check if we can store the current quad in full. */`
`pt L_end, tr1`	`pt L_end, tr1`
`add r3, r4, r7`	`add r3, r4, r7`
`bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.`	`bgtu/u r5, r22, tr1 // L_end // Not really unlikely, but gap is short.`
`/* If not, that means we can just proceed to process the last quad.`	`/* If not, that means we can just proceed to process the last quad.`
`Two pipeline stalls are unavoidable, as we don't have enough ILP. */`	`Two pipeline stalls are unavoidable, as we don't have enough ILP. */`
`ldlo.q r7, -8, r1`	`ldlo.q r7, -8, r1`
`ldhi.q r7, -1, r7`	`ldhi.q r7, -1, r7`
`ptabs r18, tr0`	`ptabs r18, tr0`
`or r1, r7, r1`	`or r1, r7, r1`
`mcmpeq.b r1, r63, r7`	`mcmpeq.b r1, r63, r7`
`ZPAD_MASK (r7, r7)`	`ZPAD_MASK (r7, r7)`
`and r1, r7, r1 // mask out non-zero bytes after first zero byte`	`and r1, r7, r1 // mask out non-zero bytes after first zero byte`
`stlo.q r20, -8, r1`	`stlo.q r20, -8, r1`
`sthi.q r20, -1, r1`	`sthi.q r20, -1, r1`
`blink tr0, r63`	`blink tr0, r63`

/* L_found0: a zero byte was detected in r0.  The bytes after it are masked
   off, the quad is stored at r22 - 8, and the zero padding up to r20 is
   written either here (when it fits in one quad word) or by
   L_write0_multiquad.
   NOTE(review): SHLO is not defined in this chunk; presumably it is a macro
   from asm.h -- confirm its shift direction for each endianness there.  */
`L_found0:`	`L_found0:`
`// r0: string to store, not yet zero-padding normalized.`	`// r0: string to store, not yet zero-padding normalized.`
`// r1: result of mcmpeq.b r0, r63, r1.`	`// r1: result of mcmpeq.b r0, r63, r1.`
`// r22: store address plus 8. I.e. address where zero padding beyond the`	`// r22: store address plus 8. I.e. address where zero padding beyond the`
`// string in r0 goes.`	`// string in r0 goes.`
`// r20: store end address.`	`// r20: store end address.`
`// r5: store end address minus 8.`	`// r5: store end address minus 8.`
`pt L_write0_multiquad, tr0`	`pt L_write0_multiquad, tr0`
`ZPAD_MASK (r1, r1)`	`ZPAD_MASK (r1, r1)`
`and r0, r1, r0 // mask out non-zero bytes after first zero byte`	`and r0, r1, r0 // mask out non-zero bytes after first zero byte`
`stlo.q r22, -8, r0`	`stlo.q r22, -8, r0`
`sthi.q r22, -1, r0`	`sthi.q r22, -1, r0`
`andi r22, -8, r1 // Check if zeros to write fit in one quad word.`	`andi r22, -8, r1 // Check if zeros to write fit in one quad word.`
`bgtu/l r5, r1, tr0 // L_write0_multiquad`	`bgtu/l r5, r1, tr0 // L_write0_multiquad`
`ptabs r18, tr1`	`ptabs r18, tr1`
/* Single-quad case: r1 = (r20 - r22) * 4, and r0 is shifted by r1 twice,
   giving a total shift of (r20 - r22) * 8 bits, before the final sthi.q
   at r20 - 1.  */
`sub r20, r22, r1`	`sub r20, r22, r1`
`shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is`	`shlli r1, 2, r1 // Do shift in two steps so that 64 bit case is`
`SHLO r0, r1, r0 // handled correctly.`	`SHLO r0, r1, r0 // handled correctly.`
`SHLO r0, r1, r0`	`SHLO r0, r1, r0`
`sthi.q r20, -1, r0`	`sthi.q r20, -1, r0`
`blink tr1, r63`	`blink tr1, r63`

/* L_write0_multiquad: the zero padding does not fit in one quad word.
   On entry r1 = r22 & -8 (computed in L_found0).  All stores here write r63,
   the register this file compares against to detect zero bytes.
   A stlo.q at r22 and a sthi.q at r20 - 1 cover the two ends of the padding;
   L_write0_loop then stores whole quads at r1, stepping by 8, for as long as
   r5 >= r1 (unsigned).  */
`L_write0_multiquad:`	`L_write0_multiquad:`
`pt L_write0_loop, tr0`	`pt L_write0_loop, tr0`
`ptabs r18, tr1`	`ptabs r18, tr1`
`stlo.q r22, 0, r63`	`stlo.q r22, 0, r63`
`sthi.q r20, -1, r63`	`sthi.q r20, -1, r63`
`addi r1, 8, r1`	`addi r1, 8, r1`
`bgeu/l r5, r1, tr0 // L_write0_loop`	`bgeu/l r5, r1, tr0 // L_write0_loop`
`blink tr1, r63`	`blink tr1, r63`

`L_write0_loop:`	`L_write0_loop:`
`st.q r1, 0 ,r63`	`st.q r1, 0 ,r63`
`addi r1, 8, r1`	`addi r1, 8, r1`
`bgeu/l r5, r1, tr0 // L_write0_loop`	`bgeu/l r5, r1, tr0 // L_write0_loop`
`blink tr1, r63`	`blink tr1, r63`

/* L_small: taken from the entry code when r2 >= r5, which the register
   comments below describe as a size of at most 8.  The data is completed
   with the high part of the source quad when needed, everything after the
   first zero byte is masked off, and the result is stored either byte by
   byte (size below 4) or by L_small_storelong (size 4 or more).  */
`L_small:`	`L_small:`
`// r0: string to store, not yet zero-padding normalized.`	`// r0: string to store, not yet zero-padding normalized.`
`// r1: result of mcmpeq.b r0, r63, r1.`	`// r1: result of mcmpeq.b r0, r63, r1.`
`// r7: nonzero indicates relevant zero found in r0.`	`// r7: nonzero indicates relevant zero found in r0.`
`// r2: store address.`	`// r2: store address.`
`// r3: read address.`	`// r3: read address.`
`// r4: size, max 8`	`// r4: size, max 8`
`// r20: store end address.`	`// r20: store end address.`
`// r5: store end address minus 8.`	`// r5: store end address minus 8.`
`pt L_nohi, tr0`	`pt L_nohi, tr0`
`pt L_small_storelong, tr1`	`pt L_small_storelong, tr1`
`ptabs r18, tr2`	`ptabs r18, tr2`
/* r23 = r63 - r4, i.e. minus the size (r63 is used as zero throughout this
   file).  */
`sub r63, r4, r23`	`sub r63, r4, r23`
`bnei/u r7, 0, tr0 // L_nohi`	`bnei/u r7, 0, tr0 // L_nohi`
/* r3 | -8 is minus the number of bytes from r3 up to the next 8-byte
   boundary.  If -size >= that value, the low part already loaded into r0
   covers all size bytes and the high part is not fetched.  */
`ori r3, -8, r7`	`ori r3, -8, r7`
`bge/l r23, r7, tr0 // L_nohi`	`bge/l r23, r7, tr0 // L_nohi`
`ldhi.q r3, 7, r1`	`ldhi.q r3, 7, r1`
`or r0, r1, r0`	`or r0, r1, r0`
`mcmpeq.b r0, r63, r1`	`mcmpeq.b r0, r63, r1`
`L_nohi:`	`L_nohi:`
`ZPAD_MASK (r1, r1)`	`ZPAD_MASK (r1, r1)`
`and r0, r1, r0`	`and r0, r1, r0`
`movi 4, r19`	`movi 4, r19`
`bge/u r4, r19, tr1 // L_small_storelong`	`bge/u r4, r19, tr1 // L_small_storelong`

/* Size is below 4 here: store up to three bytes one at a time, shifting r0
   right by 8 bits between stores.  Big endian byte-reverses r0 first.  */
`pt L_small_end, tr0`	`pt L_small_end, tr0`
`#ifndef __LITTLE_ENDIAN__`	`#ifndef __LITTLE_ENDIAN__`
`byterev r0, r0`	`byterev r0, r0`
`#endif`	`#endif`
`beqi/u r4, 0, tr0 // L_small_end`	`beqi/u r4, 0, tr0 // L_small_end`
`st.b r2, 0, r0`	`st.b r2, 0, r0`
`beqi/u r4, 1, tr0 // L_small_end`	`beqi/u r4, 1, tr0 // L_small_end`
`shlri r0, 8, r0`	`shlri r0, 8, r0`
`st.b r2, 1, r0`	`st.b r2, 1, r0`
`beqi/u r4, 2, tr0 // L_small_end`	`beqi/u r4, 2, tr0 // L_small_end`
`shlri r0, 8, r0`	`shlri r0, 8, r0`
`st.b r2, 2, r0`	`st.b r2, 2, r0`
`L_small_end:`	`L_small_end:`
`blink tr2, r63`	`blink tr2, r63`

/* L_small_storelong: reached from L_nohi when the size is at least 4 (and,
   per the L_small register comments, at most 8).
   On entry r0 holds the masked string data, r23 = -size, and tr2 holds the
   address taken from r18.
   Two longwords are written as low/high pairs: r0 to the four bytes starting
   at r2, and r1 to the four bytes ending at r20.  The two ranges overlap when
   the size is below 8.
   r1 is r0 shifted by r23 * 8 (via SHHI); the endian-specific shlri by 32 then
   adjusts r1 on little endian or r0 on big endian before the stores.
   NOTE(review): SHHI is not defined in this chunk; presumably it is a macro
   from asm.h -- confirm its shift direction for each endianness there.  */
`L_small_storelong:`	`L_small_storelong:`
`shlli r23, 3, r7`	`shlli r23, 3, r7`
`SHHI r0, r7, r1`	`SHHI r0, r7, r1`
`#ifdef __LITTLE_ENDIAN__`	`#ifdef __LITTLE_ENDIAN__`
`shlri r1, 32, r1`	`shlri r1, 32, r1`
`#else`	`#else`
`shlri r0, 32, r0`	`shlri r0, 32, r0`
`#endif`	`#endif`
`stlo.l r2, 0, r0`	`stlo.l r2, 0, r0`
`sthi.l r2, 3, r0`	`sthi.l r2, 3, r0`
`stlo.l r20, -4, r1`	`stlo.l r20, -4, r1`
`sthi.l r20, -1, r1`	`sthi.l r20, -1, r1`
`blink tr2, r63`	`blink tr2, r63`

`#else /* SHcompact */`	`#else /* SHcompact */`

`/* This code is optimized for size. Instruction selection is SH5 specific.`	`/* This code is optimized for size. Instruction selection is SH5 specific.`
`SH4 should use a different version. */`	`SH4 should use a different version. */`
/* strncpy, SHcompact version: a byte-at-a-time copy loop.
   Registers as used below:
     r2  start of the store area; never written by this routine
     r3  read pointer, post-incremented by each load
     r4  size on entry; becomes r2 - 1 + size, the address of the last byte
         to store
     r5  store pointer, starts at r2 - 1 and is incremented before each store
     r6  constant 0, compared against the size and against each byte
     r1  byte most recently loaded
   NOTE(review): that r2/r3/r4 are destination/source/size is inferred from
   the register comments of the SHmedia version -- confirm against the ABI.
   NOTE(review): the comments below assume the instruction after bt/s, bf/s
   and rts executes as a delay slot -- confirm against the SH manual.  */
`ENTRY(strncpy)`	`ENTRY(strncpy)`
`mov #0, r6`	`mov #0, r6`
`cmp/eq r4, r6`	`cmp/eq r4, r6`
/* Size 0: nothing to store.  */
`bt return`	`bt return`
`mov r2, r5`	`mov r2, r5`
`add #-1, r5`	`add #-1, r5`
`add r5, r4`	`add r5, r4`
/* At the top of the loop T is the result of the latest cmp/eq against r6:
   clear on the first pass (the size was non-zero), and afterwards set once
   the byte just stored was zero.  When T is set the load is skipped, so r1
   keeps its zero value and zeros continue to be stored until r5 reaches r4.  */
`loop:`	`loop:`
`bt/s found0`	`bt/s found0`
`add #1, r5`	`add #1, r5`
`mov.b @r3+, r1`	`mov.b @r3+, r1`
`found0:`	`found0:`
`cmp/eq r5,r4`	`cmp/eq r5,r4`
`mov.b r1, @r5`	`mov.b r1, @r5`
`bf/s loop`	`bf/s loop`
`cmp/eq r1, r6`	`cmp/eq r1, r6`
`return:`	`return:`
`rts`	`rts`
`nop`	`nop`

`#endif /* SHcompact */`	`#endif /* SHcompact */`
`#endif /* __SH5__ */`	`#endif /* __SH5__ */`

Browse

Tools

Subversion Repositories openrisc_2011-10-31

[/] [openrisc/] [tags/] [gnu-src/] [newlib-1.18.0/] [newlib-1.18.0-or32-1.0rc1/] [newlib/] [libc/] [machine/] [sh/] [strncpy.S] - Diff between revs 207 and 345