/*
 * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
 */

#if !defined _STRING_H
#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
#endif

#ifndef _LIBC_STRING_i386_H
#define _LIBC_STRING_i386_H 1

static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	int ecx, edi;

	if (count == 0)
		return s;

	/* Very small (2 stores or less) are best done with direct
	 * mov <const>,<mem> instructions (they do not clobber registers) */
	if (count == 1) {
		*(char *)(s + 0) = eax;
		return s;
	}

	/* You wonder why & 0xff is needed? Try memset(p, '\xff', size).
	 * If char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */
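	/* Example: memset(p, 'A', n) turns eax into 0x41414141, so each of the
	 * 2- and 4-byte stores below writes the repeated byte pattern, not just
	 * 'A' in the low byte. */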
	if (count == 2) {
		*(short *)(s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)(s + 0) = eax;
		*(char *) (s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)(s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) (s + 0) = eax;
		*(char *)(s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *)  (s + 0) = eax;
		*(short *)(s + 4) = eax;
		return s;
	}

	/* Small string stores: don't clobber ecx
	 * (clobbers only eax and edi) */
#define small_store(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&D" (edi) \
		: "a" (eax), "0" (s) \
		: "memory" \
	); \
	return s; \
}
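	/* small_store("stosl; stosw; stosb"), for instance, expands to an asm
	 * block running those three stores with %edi = s and %eax holding the
	 * replicated byte pattern, then returns s: 4 + 2 + 1 = 7 bytes stored. */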
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		((int *)s)[0] = eax;
		((int *)s)[1] = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_store

	/* Not small, but multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	__asm__ __volatile__(
		" rep; stosl\n"
		: "=&c" (ecx), "=&D" (edi)
		: "a" (eax), "0" (count / 4), "1" (s)
		: "memory"
	);
	return s;
}

#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
	)
#endif
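
/*
 * Usage sketch: with the override above, a memset() call whose fill byte and
 * count are both compile-time constants, and whose count is either <= 24 or
 * a multiple of 4, is routed to the inline version; everything else still
 * calls the real memset(). A hypothetical caller (illustrative only, never
 * compiled):
 */
#if 0
static void example_memset_dispatch(char *buf, size_t n)
{
	memset(buf, 0, 16);   /* constant, 16 <= 24: four inline stosl */
	memset(buf, 'x', 64); /* constant, 64 % 4 == 0: inline "rep; stosl" */
	memset(buf, 0, 27);   /* 27 > 24 and not a multiple of 4: real memset() */
	memset(buf, 0, n);    /* non-constant count: real memset() */
}
#endif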

static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	int ecx;
	char *esi, *edi;

	if (count == 0)
		return d;

	if (count == 1) {
		*(char *)d = *(char *)s;
		return d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return d + 2;
	}

	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
#define small_move(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&S" (esi), "=&D" (edi) \
		: "0" (s), "1" (d) \
		: "memory" \
	); \
	return edi; \
}
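	/* After the moves, %edi points one byte past the last byte written,
	 * which is exactly the mempcpy() return value, so small_move can
	 * return edi directly. */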
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_move

	/* Not small, but multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	__asm__ __volatile__(
		" rep; movsl\n"
		: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
		: "0" (count / 4), "1" (s), "2" (d)
		: "memory"
	);
	return edi;
}

static __always_inline
void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
{
	inlined_mempcpy_const_count4(d, s, count);
	return d;
}

#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
	)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
	)
#endif
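
/*
 * Usage sketch: these overrides dispatch like the memset one above, but on
 * the count alone. A hypothetical caller (illustrative only, never compiled):
 */
#if 0
static void example_memcpy_dispatch(char *dst, const char *src, size_t n)
{
	char *end;

	memcpy(dst, src, 12);       /* constant 12: three inline movsl */
	end = mempcpy(dst, src, 5); /* constant 5: "movsl; movsb", end == dst + 5 */
	memcpy(dst, src, n);        /* non-constant count: real memcpy() */
}
#endif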

static __always_inline
size_t inlined_strlen(const char *s)
{
	int edi;
	int ecx;
	__asm__ __volatile__(
		" repne; scasb\n"
		/* " notl %0\n" */
		/* " decl %0\n" */
		: "=c" (ecx), "=&D" (edi)
		: "1" (s), "a" (0), "0" (0xffffffffu)
		/* : no clobbers */
	);
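	/* repne scasb decrements %ecx once per byte examined, including the
	 * terminating NUL, so the scan leaves ecx == -1 - (strlen(s) + 1);
	 * e.g. for "abc" it ends at -5. */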
	return -ecx - 2; /* == ~ecx - 1, i.e. the commented-out notl + decl */
}

#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
#define strlen(s) inlined_strlen(s)
#endif

static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
		"1: lodsb\n"
		" stosb\n"
		" testb %%al, %%al\n"
		" jnz 1b\n"
		: "=&S" (esi), "=&D" (edi), "=&a" (eax)
		: "0" (src), "1" (dest)
		: "memory"
	);
	return edi - 1;
}

static __always_inline
char *inlined_strcpy(char *dest, const char *src)
{
	inlined_stpcpy(dest, src);
	return dest;
}

#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif
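
/*
 * The copy loop above stores the NUL as well, leaving %edi one byte past it,
 * so inlined_stpcpy returns edi - 1: the address of the copied NUL, matching
 * stpcpy() semantics. For example (illustrative only):
 *
 *	char buf[8];
 *	char *end = inlined_stpcpy(buf, "hi");	// end == buf + 2, *end == '\0'
 */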

static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	void *edi;
	int ecx;
	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
	__asm__ __volatile__(
		" jecxz 1f\n"
		" repne; scasb\n"
		" leal -1(%%edi), %%edi\n"
		" je 2f\n"
		"1:\n"
		" xorl %%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx)
		: "a" (c), "0" (s), "1" (count)
		/* : no clobbers */
	);
	return edi;
}

static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
#if defined __OPTIMIZE__
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		" jecxz 1f\n"
		" movb %4, %%al\n" /* const c to %%al */
		" repne; scasb\n"
		" leal -1(%%edi), %%edi\n"
		" je 2f\n"
		"1:\n"
		" xorl %%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "i" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#else
	/* With -O0, gcc can't figure out how to encode CONST c
	 * as an immediate operand. Generating slightly bigger code
	 * (usually "movl CONST,%eax", 3 bytes bigger than needed):
	 */
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		" jecxz 1f\n"
		" repne; scasb\n"
		" leal -1(%%edi), %%edi\n"
		" je 2f\n"
		"1:\n"
		" xorl %%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "2" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#endif
}

#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
	)
#endif
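
/*
 * The (c) & 0xff in the constant-c path mirrors the memset trick above:
 * memchr(p, '\xff', n) must compare against the byte 0xff even when char is
 * signed and '\xff' == -1. A hypothetical caller (illustrative only):
 *
 *	const char buf[] = "ab\xff" "cd";
 *	void *hit  = memchr(buf, '\xff', 5);	// constant c: inlined, hit == buf + 2
 *	void *miss = memchr(buf, 'z', 5);	// constant c, not found: NULL
 */
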
#endif /* _LIBC_STRING_i386_H */