|
@@ -28,21 +28,68 @@
|
|
|
* More importantly, these should provide a good example for
|
|
|
* others to follow when adding arch specific optimizations.
|
|
|
* -Erik
|
|
|
+ *
|
|
|
+ * 2009-04: modified by Denys Vlasenko <vda.linux@googlemail.com>
|
|
|
+ * Filling byte-by-byte is a bit too slow. I prefer a 46-byte function
|
|
|
+ * which fills 4x faster than the 21-byte one.
|
|
|
*/
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
-
|
|
|
#undef memset
|
|
|
/*
 * memset - fill a memory region with a byte value (32-bit x86 inline asm).
 *
 * s:     start of the region to fill
 * c:     fill value; only its low 8 bits are stored
 * count: number of bytes to fill
 *
 * Returns the original pointer s.
 *
 * The region is filled four bytes at a time with "rep stosl", then the
 * remaining 0-3 bytes are written one at a time with "stosb".
 */
void *memset(void *s, int c, size_t count)
{
	void *dst = s;		/* advanced by the string stores; s is kept for the return */
	unsigned int tail;	/* scratch register: copy of count, later count & 3 */

	/*
	 * The asm rewrites the destination pointer, the fill value (%eax)
	 * and the counter (%ecx), so all three are declared read-write.
	 * Declaring them input-only would let the compiler assume they
	 * still hold their original values afterwards.
	 */
	__asm__ __volatile__(
		/* keep a copy of count; %ecx becomes the word count */
		"	mov	%%ecx, %1\n"
		"	shr	$2, %%ecx\n"
		"	jz	1f\n"			/* no full words: go fill bytes */
		/* replicate the low byte of the fill value into all four bytes */
		"	movzx	%%al, %%eax\n"
		"	imul	$0x01010101, %%eax\n"
		/* store the full 32-bit words */
		"	rep; stosl\n"
		/* store the remaining 0-3 bytes */
		"1:	and	$3, %1\n"
		"	jz	9f\n"			/* (count & 3) == 0: done */
		"2:	stosb\n"
		"	dec	%1\n"
		"	jnz	2b\n"
		"9:\n"
		: "+D" (dst), "=&r" (tail), "+a" (c), "+c" (count)
		:
		: "memory", "cc"
	);
	return s;
}
|
|
|
libc_hidden_def(memset)
|
|
|
+
|
|
|
+/*
|
|
|
+gcc 4.3.1
|
|
|
+=========
|
|
|
+57 push %edi
|
|
|
+8b 7c 24 08 mov 0x8(%esp),%edi
|
|
|
+8b 4c 24 10 mov 0x10(%esp),%ecx
|
|
|
+8b 44 24 0c mov 0xc(%esp),%eax
|
|
|
+89 ca mov %ecx,%edx
|
|
|
+c1 e9 02 shr $0x2,%ecx
|
|
|
+74 0b je 1f <__GI_memset+0x1f>
|
|
|
+0f b6 c0 movzbl %al,%eax
|
|
|
+69 c0 01 01 01 01 imul $0x1010101,%eax,%eax
|
|
|
+f3 ab rep stos %eax,%es:(%edi)
|
|
|
+83 e2 03 and $0x3,%edx
|
|
|
+74 04 je 28 <__GI_memset+0x28>
|
|
|
+aa stos %al,%es:(%edi)
|
|
|
+4a dec %edx
|
|
|
+75 fc jne 24 <__GI_memset+0x24>
|
|
|
+8b 44 24 08 mov 0x8(%esp),%eax
|
|
|
+5f pop %edi
|
|
|
+c3 ret
|
|
|
+*/
|