/*
 * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
 */

#if !defined _STRING_H
#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
#endif

#ifndef _LIBC_STRING_i386_H
#define _LIBC_STRING_i386_H 1

static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	int ecx, edi;

	if (count == 0)
		return s;

	/* Very small counts (2 stores or less) are best done with direct
	 * mov <const>,<mem> instructions (they do not clobber registers) */
	if (count == 1) {
		*(char *)(s + 0) = eax;
		return s;
	}

	/* You wonder why & 0xff is needed? Try memset(p, '\xff', size).
	 * If char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */

	if (count == 2) {
		*(short *)(s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)(s + 0) = eax;
		*(char *) (s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)(s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) (s + 0) = eax;
		*(char *)(s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *)  (s + 0) = eax;
		*(short *)(s + 4) = eax;
		return s;
	}

	/* Small string stores: don't clobber ecx
	 * (clobbers only eax and edi) */
#define small_store(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&D" (edi) \
		: "a" (eax), "0" (s) \
		: "memory" \
	); \
	return s; \
}
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		((int *)s)[0] = eax;
		((int *)s)[1] = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is a 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_store

	/* Not small, but multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; stosl\n"
		: "=&c" (ecx), "=&D" (edi)
		: "a" (eax), "0" (count / 4), "1" (s)
		: "memory"
	);
	return s;
}
#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
	)
#endif
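
/* Usage sketch (illustrative; buf is a hypothetical caller variable):
 * when both the fill byte and the size are compile-time constants, e.g.
 *
 *	char buf[16];
 *	memset(buf, 0, sizeof(buf));
 *
 * the call expands to inlined_memset_const_c_count4(buf, 0, 16) and
 * compiles down to four stosl stores. A non-constant c or size, or a
 * constant size over 24 that is not a multiple of 4, falls through to
 * the real memset(): the inner memset in the macro body is not expanded
 * again, since function-like macros do not expand recursively.
 */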

static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	int ecx;
	char *esi, *edi;

	if (count == 0)
		return d;

	if (count == 1) {
		*(char *)d = *(char *)s;
		return d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return d + 2;
	}

	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
#define small_move(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&S" (esi), "=&D" (edi) \
		: "0" (s), "1" (d) \
		: "memory" \
	); \
	return edi; \
}
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is a 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	/* 7 bytes */
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	/* 5 bytes */
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	/* 6 bytes */
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	/* 7 bytes */
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	/* 6 bytes */
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */
#undef small_move

	/* Not small, but multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	__asm__ __volatile__(
		"	rep; movsl\n"
		: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
		: "0" (count / 4), "1" (s), "2" (d)
		: "memory"
	);
	return edi;
}
static __always_inline
void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
{
	inlined_mempcpy_const_count4(d, s, count);
	return d;
}
#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
	)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	  || ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
	)
#endif
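
/* Usage sketch (illustrative; dst and src are hypothetical caller
 * variables): with a compile-time constant size such as
 *
 *	memcpy(dst, src, 8);
 *
 * the call expands to inlined_memcpy_const_count4(dst, src, 8), i.e.
 * two movsl moves; mempcpy(dst, src, 8) does the same but evaluates
 * to dst + 8. A runtime size, or a constant size over 24 that is not
 * a multiple of 4, falls through to the real libc routines.
 */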

static __always_inline
size_t inlined_strlen(const char *s)
{
	int edi;
	int ecx;
	__asm__ __volatile__(
		"	repne; scasb\n"
	/*	"	notl	%0\n" */
	/*	"	decl	%0\n" */
		: "=c" (ecx), "=&D" (edi)
		: "1" (s), "a" (0), "0" (0xffffffffu)
		/* : no clobbers */
	);
	/* scasb decrements ecx for the terminating NUL too, so here
	 * ecx == -strlen(s) - 2; the expression below is the C
	 * equivalent of the commented-out "notl; decl" pair */
	return -ecx - 2;
}
#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
#define strlen(s) inlined_strlen(s)
#endif

static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
		"1:	lodsb\n"
		"	stosb\n"
		"	testb	%%al, %%al\n"
		"	jnz	1b\n"
		: "=&S" (esi), "=&D" (edi), "=&a" (eax)
		: "0" (src), "1" (dest)
		: "memory"
	);
	return edi - 1;
}
static __always_inline
char *inlined_strcpy(char *dest, const char *src)
{
	inlined_stpcpy(dest, src);
	return dest;
}
#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif

static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	void *edi;
	int ecx;
	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx)
		: "a" (c), "0" (s), "1" (count)
		/* : no clobbers */
	);
	return edi;
}
static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
#if defined __OPTIMIZE__
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	movb	%4, %%al\n" /* const c to %%al */
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "i" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#else
	/* With -O0, gcc can't figure out how to encode CONST c
	 * as an immediate operand. Generating slightly bigger code
	 * (usually "movl CONST,%eax", 3 bytes bigger than needed):
	 */
	void *edi;
	int ecx, eax;
	__asm__ __volatile__(
		"	jecxz	1f\n"
		"	repne; scasb\n"
		"	leal	-1(%%edi), %%edi\n"
		"	je	2f\n"
		"1:\n"
		"	xorl	%%edi, %%edi\n"
		"2:\n"
		: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
		: "0" (s), "2" (c), "1" (count)
		/* : no clobbers */
	);
	return edi;
#endif
}
#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
	)
#endif
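
/* Why the "(c) & 0xff" above? Same reason as in the memset comment:
 * with a signed char, a constant like '\xff' is -1, and masking gives
 * the "i" operand of inlined_memchr_const_c() a clean byte value.
 * A runtime c takes the inlined_memchr() path instead, where the whole
 * int is loaded into %eax and scasb compares only %al anyway.
 */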

#endif /* _LIBC_STRING_i386_H  */