memset.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. /* Copyright (C) 2001, 2003 Free Software Foundation, Inc.
  2. Copyright (C) 1999, 2000 Axis Communications AB.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Library General Public License as
  6. published by the Free Software Foundation; either version 2 of the
  7. License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Library General Public License for more details.
  12. You should have received a copy of the GNU Library General Public
  13. License along with the GNU C Library; see the file COPYING.LIB. If not,
  14. write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  15. Boston, MA 02111-1307, USA. */
  16. /*#************************************************************************#*/
  17. /*#-------------------------------------------------------------------------*/
  18. /*# */
  19. /*# FUNCTION NAME: memset() */
  20. /*# */
  21. /*# PARAMETERS: void* dst; Destination address. */
  22. /*# int c; Value of byte to write. */
  23. /*# unsigned long len; Number of bytes to write. */
  24. /*# */
  25. /*# RETURNS: dst. */
  26. /*# */
  27. /*# DESCRIPTION: Sets the memory dst of length len bytes to c, as standard. */
  28. /*# Framework taken from memcpy. This routine is */
  29. /*# very sensitive to compiler changes in register allocation. */
  30. /*# Should really be rewritten to avoid this problem. */
  31. /*# */
  32. /*#-------------------------------------------------------------------------*/
  33. /*# */
  34. /*# HISTORY */
  35. /*# */
  36. /*# DATE NAME CHANGES */
  37. /*# ---- ---- ------- */
  38. /*# 990713 HP Tired of watching this function (or */
  39. /*# really, the nonoptimized generic */
  40. /*# implementation) take up 90% of simulator */
  41. /*# output. Measurements needed. */
  42. /*# */
  43. /*#-------------------------------------------------------------------------*/
  44. /* No, there's no macro saying 12*4, since it is "hard" to get it into
  45. the asm in a good way. Thus better to expose the problem everywhere.
  46. */
  47. /* Assuming 1 cycle per dword written or read (ok, not really true), and
  48. one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
  49. so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
  50. #define ZERO_BLOCK_SIZE (1*12*4)
  51. void *memset(void *, int, unsigned long);
  52. libc_hidden_proto(memset)
  53. void *memset(void *pdst,
  54. int c,
  55. unsigned long plen)
  56. {
  57. /* Ok. Now we want the parameters put in special registers.
  58. Make sure the compiler is able to make something useful of this. */
  59. register char *return_dst __asm__ ("r10") = pdst;
  60. register long n __asm__ ("r12") = plen;
  61. register int lc __asm__ ("r11") = c;
  62. /* Most apps use memset sanely. Only those memsetting about 3..4
  63. bytes or less get penalized compared to the generic implementation
  64. - and that's not really sane use. */
  65. /* Ugh. This is fragile at best. Check with newer GCC releases, if
  66. they compile cascaded "x |= x << 8" sanely! */
  67. __asm__("movu.b %0,$r13 \n\
  68. lslq 8,$r13 \n\
  69. move.b %0,$r13 \n\
  70. move.d $r13,%0 \n\
  71. lslq 16,$r13 \n\
  72. or.d $r13,%0"
  73. : "=r" (lc) : "0" (lc) : "r13");
  74. {
  75. register char *dst __asm__ ("r13") = pdst;
  76. if (((unsigned long) pdst & 3) != 0
  77. /* Oops! n=0 must be a legal call, regardless of alignment. */
  78. && n >= 3)
  79. {
  80. if ((unsigned long)dst & 1)
  81. {
  82. *dst = (char) lc;
  83. n--;
  84. dst++;
  85. }
  86. if ((unsigned long)dst & 2)
  87. {
  88. *(short *)dst = lc;
  89. n -= 2;
  90. dst += 2;
  91. }
  92. }
  93. /* Now the fun part. For the threshold value of this, check the equation
  94. above. */
  95. /* Decide which copying method to use. */
  96. if (n >= ZERO_BLOCK_SIZE)
  97. {
  98. /* For large copies we use 'movem' */
  99. /* It is not optimal to tell the compiler about clobbering any
  100. registers; that will move the saving/restoring of those registers
  101. to the function prologue/epilogue, and make non-movem sizes
  102. suboptimal.
  103. This method is not foolproof; it assumes that the "asm reg"
  104. declarations at the beginning of the function really are used
  105. here (beware: they may be moved to temporary registers).
  106. This way, we do not have to save/move the registers around into
  107. temporaries; we can safely use them straight away. */
  108. __asm__ __volatile__ (" \n\
  109. .syntax no_register_prefix \n\
  110. \n\
  111. ;; Check that the register asm declaration got right. \n\
  112. ;; The GCC manual explicitly says there's no warranty for that (too). \n\
  113. .ifnc %0-%1-%4,$r13-$r12-$r11 \n\
  114. .err \n\
  115. .endif \n\
  116. \n\
  117. ;; Save the registers we'll clobber in the movem process \n\
  118. ;; on the stack. Don't mention them to gcc, it will only be \n\
  119. ;; upset. \n\
  120. subq 11*4,sp \n\
  121. movem r10,[sp] \n\
  122. \n\
  123. move.d r11,r0 \n\
  124. move.d r11,r1 \n\
  125. move.d r11,r2 \n\
  126. move.d r11,r3 \n\
  127. move.d r11,r4 \n\
  128. move.d r11,r5 \n\
  129. move.d r11,r6 \n\
  130. move.d r11,r7 \n\
  131. move.d r11,r8 \n\
  132. move.d r11,r9 \n\
  133. move.d r11,r10 \n\
  134. \n\
  135. ;; Now we've got this: \n\
  136. ;; r13 - dst \n\
  137. ;; r12 - n \n\
  138. \n\
  139. ;; Update n for the first loop \n\
  140. subq 12*4,r12 \n\
  141. 0: \n\
  142. subq 12*4,r12 \n\
  143. bge 0b \n\
  144. movem r11,[r13+] \n\
  145. \n\
  146. addq 12*4,r12 ;; compensate for last loop underflowing n \n\
  147. \n\
  148. ;; Restore registers from stack \n\
  149. movem [sp+],r10"
  150. /* Outputs */ : "=r" (dst), "=r" (n)
  151. /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
  152. }
  153. /* Either we directly starts copying, using dword copying
  154. in a loop, or we copy as much as possible with 'movem'
  155. and then the last block (<44 bytes) is copied here.
  156. This will work since 'movem' will have updated src,dst,n. */
  157. while ( n >= 16 )
  158. {
  159. *((long*)dst)++ = lc;
  160. *((long*)dst)++ = lc;
  161. *((long*)dst)++ = lc;
  162. *((long*)dst)++ = lc;
  163. n -= 16;
  164. }
  165. /* A switch() is definitely the fastest although it takes a LOT of code.
  166. * Particularly if you inline code this.
  167. */
  168. switch (n)
  169. {
  170. case 0:
  171. break;
  172. case 1:
  173. *(char*)dst = (char) lc;
  174. break;
  175. case 2:
  176. *(short*)dst = (short) lc;
  177. break;
  178. case 3:
  179. *((short*)dst)++ = (short) lc;
  180. *(char*)dst = (char) lc;
  181. break;
  182. case 4:
  183. *((long*)dst)++ = lc;
  184. break;
  185. case 5:
  186. *((long*)dst)++ = lc;
  187. *(char*)dst = (char) lc;
  188. break;
  189. case 6:
  190. *((long*)dst)++ = lc;
  191. *(short*)dst = (short) lc;
  192. break;
  193. case 7:
  194. *((long*)dst)++ = lc;
  195. *((short*)dst)++ = (short) lc;
  196. *(char*)dst = (char) lc;
  197. break;
  198. case 8:
  199. *((long*)dst)++ = lc;
  200. *((long*)dst)++ = lc;
  201. break;
  202. case 9:
  203. *((long*)dst)++ = lc;
  204. *((long*)dst)++ = lc;
  205. *(char*)dst = (char) lc;
  206. break;
  207. case 10:
  208. *((long*)dst)++ = lc;
  209. *((long*)dst)++ = lc;
  210. *(short*)dst = (short) lc;
  211. break;
  212. case 11:
  213. *((long*)dst)++ = lc;
  214. *((long*)dst)++ = lc;
  215. *((short*)dst)++ = (short) lc;
  216. *(char*)dst = (char) lc;
  217. break;
  218. case 12:
  219. *((long*)dst)++ = lc;
  220. *((long*)dst)++ = lc;
  221. *((long*)dst)++ = lc;
  222. break;
  223. case 13:
  224. *((long*)dst)++ = lc;
  225. *((long*)dst)++ = lc;
  226. *((long*)dst)++ = lc;
  227. *(char*)dst = (char) lc;
  228. break;
  229. case 14:
  230. *((long*)dst)++ = lc;
  231. *((long*)dst)++ = lc;
  232. *((long*)dst)++ = lc;
  233. *(short*)dst = (short) lc;
  234. break;
  235. case 15:
  236. *((long*)dst)++ = lc;
  237. *((long*)dst)++ = lc;
  238. *((long*)dst)++ = lc;
  239. *((short*)dst)++ = (short) lc;
  240. *(char*)dst = (char) lc;
  241. break;
  242. }
  243. }
  244. return return_dst; /* destination pointer. */
  245. } /* memset() */
  246. libc_hidden_def(memset)