memset.S

/* $Id: memset.S,v 1.1 2000/04/14 16:49:01 mjd Exp $
 *
 * "memset" implementation for SuperH
 *
 * Copyright (C) 1999 Niibe Yutaka
 *
 * Copyright (c) 2009 STMicroelectronics Ltd
 * Optimised using 64-bit data transfers (via the FPU) and the movca.l
 * instruction.
 * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
 */

/*
 * void *memset(void *s, int c, size_t n);
 */
#include <sysdep.h>

#if defined (__LITTLE_ENDIAN__) && defined (__SH_FPU_ANY__)
#define MEMSET_USES_FPU
/* Use paired single-precision load/store mode for 64-bit transfers.
 * FPSCR.SZ=1 (with PR=0) is well defined on both SH4-200 and SH4-300.
 * Currently this has only been implemented and tested for little-endian
 * mode. */
.macro FPU_SET_PAIRED_PREC
	sts	fpscr, r3	! save the caller's FPSCR in r3
	mov	#0x10, r1	! PR=0 SZ=1
	shll16	r1		! r1 = 0x00100000 (the SZ bit)
	lds	r1, fpscr
.endm
.macro RESTORE_FPSCR
	lds	r3, fpscr
.endm
#endif
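
/* On entry (SH C ABI): r4 = s (dst), r5 = c (fill value), r6 = n (length).
 * r4 is immediately advanced to s + n and the buffer is filled downwards,
 * so by the time we return r4 has walked back to s. */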
ENTRY(memset)
	mov	#12, r0
	add	r6, r4		! r4 = s + n (one past the end)
	cmp/gt	r6, r0
	bt/s	40f		! if n is too small, set one byte at a time
	mov	r4, r0
	and	#3, r0		! r0 = misalignment of the end pointer
	cmp/eq	#0, r0
	bt/s	2f		! already longword-aligned
	sub	r0, r6
1:
	dt	r0
	bf/s	1b
	mov.b	r5, @-r4	! store trailing bytes until r4 is aligned
2:	! make VVVV: replicate the fill byte into all four byte lanes
	extu.b	r5, r5
	swap.b	r5, r0		! V0
	or	r0, r5		! VV
	swap.w	r5, r0		! VV00
	or	r0, r5		! VVVV
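	/* Roughly equivalent C for the replication above (illustrative only):
	 *	unsigned v = (unsigned char)c;	// 000000VV
	 *	v |= v << 8;			// 0000VVVV
	 *	v |= v << 16;			// VVVVVVVV
	 */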
	! Check if enough bytes remain to be worth the big loop
	mov	#0x40, r0	! (MT)
	cmp/gt	r6, r0		! (MT) 64 > len => slow loop
	bt/s	22f
	mov	r6, r0
	! align the dst to the cache block size if necessary
	mov	r4, r3
	mov	#~(0x1f), r1
	and	r3, r1
	cmp/eq	r3, r1
	bt/s	11f		! dst is already aligned
	sub	r1, r3		! r3 = r4 & 0x1f
	shlr2	r3		! number of longword stores needed
10:	mov.l	r5, @-r4
	dt	r3
	bf/s	10b
	add	#-4, r6
11:	! dst is now 32-byte aligned
	mov	r6, r2
	mov	#-5, r0
	shld	r0, r2		! r2 = len / 32 = number of cache-line loops
	add	#-32, r4	! bias r4 down to the base of the top block
	mov	r5, r0
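	/* movca.l r0,@Rn stores a longword while allocating the destination
	 * cache line without first fetching its old contents from memory,
	 * which is why dst was aligned to the 32-byte cache block above. */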
#ifdef MEMSET_USES_FPU
	lds	r5, fpul	! (CO)
	fsts	fpul, fr0	! dr0 (= fr0:fr1) will be 'VVVVVVVV'
	fsts	fpul, fr1
	FPU_SET_PAIRED_PREC
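	! With FPSCR.SZ=1 each fmov below moves the full 64-bit dr0 pair, so
	! one movca.l + one mov.l + three fmovs fill a whole 32-byte line.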
12:
	movca.l	r0, @r4		! bytes 0..3, allocates the cache line
	mov.l	r5, @(4, r4)	! bytes 4..7
	add	#32, r4
	fmov	dr0, @-r4	! bytes 24..31
	fmov	dr0, @-r4	! bytes 16..23
	add	#-0x20, r6
	fmov	dr0, @-r4	! bytes 8..15
	dt	r2
	bf/s	12b
	add	#-40, r4	! step r4 down to the next 32-byte block
	RESTORE_FPSCR
  92. #else
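	! Integer fallback: eight longword stores per 32-byte line; movca.l
	! both stores bytes 0..3 and allocates the cache line.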
12:
	movca.l	r0, @r4
	mov.l	r5, @(4, r4)
	mov.l	r5, @(8, r4)
	mov.l	r5, @(12, r4)
	mov.l	r5, @(16, r4)
	mov.l	r5, @(20, r4)
	add	#-0x20, r6
	mov.l	r5, @(24, r4)
	dt	r2
	mov.l	r5, @(28, r4)
	bf/s	12b
	add	#-32, r4
#endif
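	! Both variants exit with r6 = len & 0x1f (fewer than 32 bytes left)
	! and r4 biased 32 bytes below the lowest byte written so far.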
	add	#32, r4		! undo the -32 bias
	mov	#8, r0
	cmp/ge	r0, r6		! 8 or more bytes still to set?
	bf	40f
	mov	r6, r0
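	! At least 8 bytes remain here: loop 3 below stores two longwords
	! (8 bytes) per iteration.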
22:
	shlr2	r0
	shlr	r0		! r0 = r6 >> 3 = number of 8-byte chunks
3:
	dt	r0
	mov.l	r5, @-r4	! store 8 bytes per iteration...
	bf/s	3b
	mov.l	r5, @-r4	! ...two longwords at a time
	!
	mov	#7, r0
	and	r0, r6		! r6 = leftover byte count (0..7)
	! fill remaining bytes one at a time (count may be zero)
40:	tst	r6, r6
	bt	5f
4:
	dt	r6
	bf/s	4b
	mov.b	r5, @-r4
5:
	rts
	mov	r4, r0		! return the original dst pointer
END(memset)

libc_hidden_def (memset)