12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- /* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
- /* Modified by SuperH, Inc. September 2003 */
- !
- ! Fast SH memset
- !
- ! by Toshiyasu Morita (tm@netcom.com)
- !
- ! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
- ! Copyright 2002 SuperH Ltd.
- !
- #include <features.h>
- #include <endian.h>
- #if __BYTE_ORDER == __LITTLE_ENDIAN
- #define SHHI shlld
- #define SHLO shlrd
- #else
- #define SHHI shlrd
- #define SHLO shlld
- #endif
- .section .text..SHmedia32,"ax"
- .globl __memset
- .hidden __memset
- .type __memset, @function
- .align 5
- __memset:
- pta/l multiquad, tr0
- andi r2, 7, r22
- ptabs r18, tr2
- mshflo.b r3,r3,r3
- add r4, r22, r23
- mperm.w r3, r63, r3 // Fill pattern now in every byte of r3
- movi 8, r9
- bgtu/u r23, r9, tr0 // multiquad
- beqi/u r4, 0, tr2 // Return with size 0 - ensures no mem accesses
- ldlo.q r2, 0, r7
- shlli r4, 2, r4
- movi -1, r8
- SHHI r8, r4, r8
- SHHI r8, r4, r8
- mcmv r7, r8, r3
- stlo.q r2, 0, r3
- blink tr2, r63
- multiquad:
- pta/l lastquad, tr0
- stlo.q r2, 0, r3
- shlri r23, 3, r24
- add r2, r4, r5
- beqi/u r24, 1, tr0 // lastquad
- pta/l loop, tr1
- sub r2, r22, r25
- andi r5, -8, r20 // calculate end address and
- addi r20, -7*8, r8 // loop end address; This might overflow, so we need
- // to use a different test before we start the loop
- bge/u r24, r9, tr1 // loop
- st.q r25, 8, r3
- st.q r20, -8, r3
- shlri r24, 1, r24
- beqi/u r24, 1, tr0 // lastquad
- st.q r25, 16, r3
- st.q r20, -16, r3
- beqi/u r24, 2, tr0 // lastquad
- st.q r25, 24, r3
- st.q r20, -24, r3
- lastquad:
- sthi.q r5, -1, r3
- blink tr2,r63
- loop:
- !!! alloco r25, 32 // QQQ comment out for short-term fix to SHUK #3895.
- // QQQ commenting out is locically correct, but sub-optimal
- // QQQ Sean McGoogan - 4th April 2003.
- st.q r25, 8, r3
- st.q r25, 16, r3
- st.q r25, 24, r3
- st.q r25, 32, r3
- addi r25, 32, r25
- bgeu/l r8, r25, tr1 // loop
- st.q r20, -40, r3
- st.q r20, -32, r3
- st.q r20, -24, r3
- st.q r20, -16, r3
- st.q r20, -8, r3
- sthi.q r5, -1, r3
- blink tr2,r63
- .size __memset,.-__memset
- strong_alias(__memset,memset)
|