/*
 * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
 */

#if !defined _STRING_H
#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
#endif

#ifndef _LIBC_STRING_i386_H
#define _LIBC_STRING_i386_H 1

static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	int ecx, edi;

	if (count == 0)
		return s;
	/* Very small blocks (two stores or less) are best done with direct
	 * mov <const>,<mem> instructions, which do not clobber any registers. */
	if (count == 1) {
		*(char *)(s + 0) = eax;
		return s;
	}
	/* You wonder why the "& 0xff" is needed? Try memset(p, '\xff', size):
	 * if char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */
	if (count == 2) {
		*(short *)(s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)(s + 0) = eax;
		*(char *) (s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)(s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) (s + 0) = eax;
		*(char *)(s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *)  (s + 0) = eax;
		*(short *)(s + 4) = eax;
		return s;
	}
	/* Small string stores: don't clobber ecx
	 * (only eax and edi are used) */
#define small_store(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&D" (edi) \
		: "a" (eax), "0" (s) \
		: "memory" \
	); \
	return s; \
}
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		((int *)s)[0] = eax;
		((int *)s)[1] = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is a 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_store
	/* Not small, but a multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; stosl\n"
		: "=&c" (ecx), "=&D" (edi)
		: "a" (eax), "0" (count / 4), "1" (s)
		: "memory"
	);
	return s;
}

#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
)
#endif
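
/* Illustrative dispatch examples (not part of the original header; buf and
 * len are hypothetical):
 *
 *	char buf[128];
 *	memset(buf, 0, 8);    // constant c and count <= 6*4: two 4-byte stores
 *	memset(buf, 0, 100);  // > 6*4 but a multiple of 4: inlined "rep; stosl"
 *	memset(buf, 0, 27);   // > 6*4 and not a multiple of 4: real memset()
 *	memset(buf, 0, len);  // non-constant len: real memset()
 *
 * The nested memset() in the macro body is not infinite recursion: a
 * function-like macro is not re-expanded inside its own replacement list,
 * so it refers to the real library function. */
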
static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	int ecx;
	char *esi, *edi;

	if (count == 0)
		return d;
	if (count == 1) {
		*(char *)d = *(char *)s;
		return d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return d + 2;
	}
	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
#define small_move(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&S" (esi), "=&D" (edi) \
		: "0" (s), "1" (d) \
		: "memory" \
	); \
	return edi; \
}
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is a 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_move
	/* Not small, but a multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; movsl\n"
		: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
		: "0" (count / 4), "1" (s), "2" (d)
		: "memory"
	);
	return edi;
}

static __always_inline
void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
{
	inlined_mempcpy_const_count4(d, s, count);
	return d;
}

#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
)
#endif
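
/* Illustrative examples (not part of the original header; buf, src, end
 * and len are hypothetical):
 *
 *	char buf[16], *end;
 *	end = mempcpy(buf, src, 8);	// inlined "movsl; movsl"; end == buf + 8
 *	memcpy(buf, src, 9);		// inlined "movsl; movsl; movsb"; returns buf
 *	memcpy(buf, src, len);		// non-constant len: real memcpy()
 *
 * mempcpy() returns a pointer just past the copied bytes, memcpy() returns
 * the destination; the wrapper above builds memcpy from mempcpy by simply
 * returning d. */
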
static __always_inline
size_t inlined_strlen(const char *s)
{
	int edi;
	int ecx;
	__asm__ __volatile__(
		"	repne; scasb\n"
		/* "	notl %0\n" */
		/* "	decl %0\n" */
		: "=c" (ecx), "=&D" (edi)
		: "1" (s), "a" (0), "0" (0xffffffffu)
		/* : no clobbers */
	);
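	/* repne scasb examines strlen(s) + 1 bytes (including the NUL) and
	 * decrements %ecx once per byte, so on exit
	 * ecx == 0xffffffff - (strlen(s) + 1). Therefore
	 * strlen(s) == ~ecx - 1 == -ecx - 2, which is what the commented-out
	 * "notl; decl" pair would compute. */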
	return -ecx - 2;
}

#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
#define strlen(s) inlined_strlen(s)
#endif

static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
		"1:	lodsb\n"
		"	stosb\n"
		"	testb %%al, %%al\n"
		"	jnz 1b\n"
		: "=&S" (esi), "=&D" (edi), "=&a" (eax)
		: "0" (src), "1" (dest)
		: "memory"
	);
	return edi - 1;
}

static __always_inline
char *inlined_strcpy(char *dest, const char *src)
{
	inlined_stpcpy(dest, src);
	return dest;
}

#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif
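
/* Illustrative example (not part of the original header; buf is
 * hypothetical):
 *
 *	char buf[8];
 *	char *nul = inlined_stpcpy(buf, "hi");	// copies 'h', 'i', '\0'
 *
 * Here nul == buf + 2, i.e. it points at the copied NUL terminator,
 * because %edi ends one past the NUL after the final stosb and the
 * function returns edi - 1. inlined_strcpy() does the same copy but
 * returns dest, matching the standard strcpy() contract. */
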
static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	void *edi;
	int ecx;
	/* Unfortunately, c gets loaded into the full %eax (wide insn), not %al */
	__asm__ __volatile__(
		"	jecxz 1f\n"
		"	repne; scasb\n"
		"	leal -1(%%edi), %%edi\n"
		"	je 2f\n"
		"1:\n"
		"	xorl %%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx)
		: "a" (c), "0" (s), "1" (count)
		/* : no clobbers */
	);
	return edi;
}

static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		"	jecxz 1f\n"
		"	movb %4, %%al\n" /* load the constant c into %al */
		"	repne; scasb\n"
		"	leal -1(%%edi), %%edi\n"
		"	je 2f\n"
		"1:\n"
		"	xorl %%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "i" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
}

#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
)
#endif
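
/* Illustrative flow (not part of the original header): when repne scasb
 * stops on a match, %edi points one past the matching byte, so the
 * "leal -1(%%edi), %%edi" steps back to it; leal does not touch the flags,
 * so the following "je 2f" still tests scasb's result. If the byte is
 * absent (or count == 0, caught by jecxz), the "xorl" path returns NULL.
 * For example, memchr("hello", 'l', 5) yields a pointer to the first 'l'. */
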
#endif /* _LIBC_STRING_i386_H */