memset.S 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. /* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
  2. *
  3. * "memset" implementation of SuperH
  4. *
  5. * Copyright (C) 1999 Niibe Yutaka
  6. *
  7. * Copyright (c) 2009 STMicroelectronics Ltd
  8. * Optimised using 64bit data transfer via FPU
  9. * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
  10. *
  11. * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
  12. */
/*
 * void *memset(void *s, int c, size_t n);
 */

#include <sysdep.h>

#ifdef __LITTLE_ENDIAN__
#define MEMSET_USES_FPU
/* Use paired single-precision load/store mode for 64-bit transfers.
 * Setting FPSCR.PR=0, SZ=1 (and restoring the previous FPSCR afterwards)
 * is well defined on both SH4-200 and SH4-300.
 * Currently this path has only been implemented and tested for
 * little-endian mode. */

	/* Switch the FPU into paired single-precision (64-bit move) mode.
	 * Saves the caller's FPSCR in r3 so RESTORE_FPSCR can undo it.
	 * Clobbers: r0, r3. */
	.macro FPU_SET_PAIRED_PREC
	sts	fpscr, r3
	mov	#0x10, r0	! PR=0 SZ=1
	shll16	r0		! r0 = 0x00100000 = FPSCR.SZ bit
	lds	r0, fpscr
	.endm

	/* Restore the FPSCR saved in r3 by FPU_SET_PAIRED_PREC. */
	.macro RESTORE_FPSCR
	lds	r3, fpscr
	.endm
#endif
/*-----------------------------------------------------------------------
 * void *memset(void *s, int c, size_t n)
 *
 * In:   r4 = s, r5 = c, r6 = n          (SH C calling convention)
 * Out:  r0 = s
 * Note: the buffer is filled BACKWARD: r4 is advanced to s + n and all
 *       stores use predecrement (@-r4), so r4 ends back at s.
 *       Every bt/s / bf/s / rts delay-slot instruction executes before
 *       the branch takes effect.
 *-----------------------------------------------------------------------*/
ENTRY(memset)
	tst	r6,r6
	bt/s	5f		! if n=0, do nothing
	 add	r6,r4		! (slot) r4 = s + n, one past the end
	mov	#12,r0
	cmp/gt	r6,r0
	bt/s	4f		! if it's too small (n < 13), set a byte at once
	 mov	r4,r0		! (slot) r0 = end pointer, for alignment test
	and	#3,r0		! r0 = end-pointer misalignment (0..3)
	cmp/eq	#0,r0
	bt/s	2f		! It's aligned
	 sub	r0,r6		! (slot) those bytes come off the word-loop count
1:	! store single bytes until r4 is 4-byte aligned
	dt	r0
	bf/s	1b
	 mov.b	r5,@-r4		! (slot)
2:	! make VVVV: replicate the fill byte across all 4 bytes of r5
	extu.b	r5,r5
	swap.b	r5,r0		! V0
	or	r0,r5		! VV
	swap.w	r5,r0		! VV00
	or	r0,r5		! VVVV
	! Enough bytes need to be copied
	mov	#0x40, r0	! (MT)
	cmp/gt	r6,r0		! (MT) 64 > len => slow loop
	bt/s	22f
	 mov	r6,r0		! (slot) r0 = remaining count for loop 22
	! align the dst to the cache block size (32 bytes) if necessary
	mov	r4, r3
	mov	#~(0x1f), r1
	and	r3, r1		! r1 = r4 rounded down to a 32-byte boundary
	cmp/eq	r3, r1
	bt/s	11f		! dst is already aligned
	 sub	r1, r3		! (slot) r3 = bytes above the boundary (mult. of 4)
	shlr2	r3		! number of 4-byte stores to reach the boundary
10:	mov.l	r5,@-r4
	dt	r3
	bf/s	10b
	 add	#-4, r6		! (slot) account for each word stored
11:	! dst is 32byte aligned
	mov	r6,r2
	mov	#-5,r0
	shld	r0,r2		! r2 = r6 >> 5 = number of 32-byte iterations
#ifdef MEMSET_USES_FPU
	! Fast path: dr0 = 'VVVVVVVV'; each fmov stores 8 bytes in SZ=1 mode.
	lds	r5, fpul	! (CO)
	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
	fsts	fpul, fr1
	FPU_SET_PAIRED_PREC	! saves FPSCR in r3, clobbers r0
12:
	add	#-0x20, r6	! (MT) 32 bytes per iteration
	fmov	dr0, @-r4
	fmov	dr0, @-r4
	fmov	dr0, @-r4
	dt	r2
	bf/s	12b		! (BR)
	 fmov	dr0, @-r4	! (slot) 4th 8-byte store
	RESTORE_FPSCR
#else
12:	! integer path: 8 x 4-byte stores = 32 bytes per iteration
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	mov.l	r5,@-r4
	add	#-0x20, r6
	mov.l	r5,@-r4
	dt	r2
	bf/s	12b
	 mov.l	r5,@-r4		! (slot) 8th store
#endif
	tst	r6,r6
	bt/s	5f		! done if no tail bytes remain
	 mov	#8, r0		! (slot)
	cmp/ge	r0, r6
	bf/s	4f		! fewer than 8 left: finish with the byte loop
	 mov	r6,r0		! (slot) r0 = remaining count
22:	! medium loop: two word stores (8 bytes) per iteration
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3 = number of 8-byte chunks
3:
	dt	r0
	mov.l	r5,@-r4		! set 8 bytes at once (with the slot store below)
	bf/s	3b
	 mov.l	r5,@-r4		! (slot)
	!
	mov	#7,r0
	and	r0,r6		! r6 = tail bytes not covered by loop 3 (0..7)
	tst	r6,r6
	bt	5f
	! fill bytes
4:
	dt	r6
	bf/s	4b
	 mov.b	r5,@-r4		! (slot)
5:
	rts
	 mov	r4,r0		! (slot) r4 has walked back to s; return it
END(memset)
libc_hidden_def (memset)