
amd64 string ops: use alignment more carefully, and comment it.
By capping the maximum padding so that it is never bigger than the
next three insns, we avoid ridiculously big NOPs like this one:

  53:	66 66 66 66 2e 0f 1f 	nopw   %cs:0x0(%rax,%rax,1)
  5a:	84 00 00 00 00 00

which was bigger than the next three insns combined!
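
GAS's .p2align takes an optional third operand for exactly this: the
maximum number of padding bytes, beyond which no alignment is emitted
at all. A minimal sketch of the directive forms this patch uses:

	.p2align 4		/* always reach a 16-byte boundary,
				   emitting up to 15 bytes of NOPs */
	.p2align 4,,11		/* reach a 16-byte boundary only if it
				   costs at most 11 bytes, else emit
				   nothing; the empty second operand
				   leaves the fill byte at its default */
	.p2align 3,,8		/* 8-byte boundary; at most 7 bytes of
				   padding can ever be needed, so this
				   always aligns */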

Size changes:

text data bss dec hex filename
102 0 0 102 66 x86_64/memcpy.o
102 0 0 102 66 x86_64.old/memcpy.o

90 0 0 90 5a x86_64/mempcpy.o
102 0 0 102 66 x86_64.old/mempcpy.o

210 0 0 210 d2 x86_64/memset.o
242 0 0 242 f2 x86_64.old/memset.o

213 0 0 213 d5 x86_64/stpcpy.o
220 0 0 220 dc x86_64.old/stpcpy.o

428 0 0 428 1ac x86_64/strcat.o
444 0 0 444 1bc x86_64.old/strcat.o

417 0 0 417 1a1 x86_64/strchr.o
418 0 0 418 1a2 x86_64.old/strchr.o

33 0 0 33 21 x86_64/strcmp.o
33 0 0 33 21 x86_64.old/strcmp.o

213 0 0 213 d5 x86_64/strcpy.o
220 0 0 220 dc x86_64.old/strcpy.o

135 0 0 135 87 x86_64/strcspn.o
151 0 0 151 97 x86_64.old/strcspn.o

225 0 0 225 e1 x86_64/strlen.o
233 0 0 233 e9 x86_64.old/strlen.o

140 0 0 140 8c x86_64/strpbrk.o
156 0 0 156 9c x86_64.old/strpbrk.o

135 0 0 135 87 x86_64/strspn.o
151 0 0 151 97 x86_64.old/strspn.o

Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.
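
The table is standard size(1) output, and the relaxed .text alignment
is visible in the objects themselves; a quick check, assuming GNU
binutils and the build-tree paths implied by the table:

	$ size x86_64/*.o x86_64.old/*.o
	$ objdump -h x86_64/strcspn.o | grep '\.text'

For the files where every .p2align dropped to 3, such as strcspn.o and
strspn.o above, the Algn column should read 2**3 instead of 2**4.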

Denis Vlasenko, 16 years ago (commit df7958a960)

libc/string/x86_64/memcpy.S: +2 -2

@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
 	subq	$32, %rcx
 	js	2f
 
-	.p2align 4
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
 3:
-
 	/* Now correct the loop counter.  Please note that in the following
 	   code the flags are not changed anymore.  */
 	subq	$32, %rcx

libc/string/x86_64/memset.S: +13 -7

@@ -55,8 +55,10 @@ ENTRY (memset)
 	test	$0x7,%edi	/* Check for alignment.  */
 	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Fill 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -114,9 +118,11 @@ ENTRY (memset)
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti  %r8,0x8(%rcx)
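
Aside, not part of the patch: movnti is the integer non-temporal store,
so the "without polluting the cache" loop writes through write-combining
buffers instead of allocating each cache line; movntdq would do the same
16 bytes at a time from an XMM register, which the comment deliberately
avoids. A sketch of the contrast:

	mov	%r8,(%rcx)	/* ordinary store: allocates the line */
	movnti	%r8,(%rcx)	/* non-temporal store: bypasses the cache
				   and is weakly ordered, so an sfence is
				   needed where store ordering matters */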

libc/string/x86_64/strcat.S: +14 -5

@@ -45,7 +45,9 @@ ENTRY (BP_SYM (strcat))
 
 
 	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
@@ -103,8 +105,11 @@ ENTRY (BP_SYM (strcat))
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
@@ -160,7 +165,9 @@ ENTRY (BP_SYM (strcat))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 22:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -237,7 +244,9 @@ ENTRY (BP_SYM (strcat))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 23:
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */

libc/string/x86_64/strchr.S: +8 -4

@@ -92,7 +92,8 @@ ENTRY (BP_SYM (strchr))
 	 each of whose bytes is C.  This turns each byte that is C
 	 into a zero.  */
 
-	.p2align 4
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
@@ -230,8 +231,11 @@ ENTRY (BP_SYM (strchr))
 	   reversed.  */
 
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
+	/* Align, it's a jump target.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+3:
+	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
 	subq	$8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte C? */
 	jz 6f			/* yes => return pointer */
@@ -281,7 +285,7 @@ ENTRY (BP_SYM (strchr))
 	incq %rax
 
 6:
-	nop
+	/* nop - huh?? */
 	retq
 END (BP_SYM (strchr))
 

libc/string/x86_64/strcpy.S: +6 -2

@@ -53,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 1:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -130,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */

libc/string/x86_64/strcspn.S: +10 -2

@@ -55,7 +55,9 @@ ENTRY (strcspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+
 L(2):	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
@@ -88,7 +90,13 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
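
The comment block above is the subtle part of this patch: gas records a
section's alignment as the largest .p2align it has seen, and the
max-skip operand does not lower that, so even a conditional 16-byte
align forces 16-byte .text alignment and thus possible padding between
objects at link time. A hypothetical two-line test file makes this
visible:

	/* align-test.s: after assembling, objdump -h should show the
	   .text Algn column as 2**4 despite the max-skip operand */
	.p2align 4,,9
	ret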

libc/string/x86_64/strlen.S: +10 -4

@@ -40,8 +40,11 @@ ENTRY (strlen)
 
 1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
 
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
+	/* Align loop.  */
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
+4:
+	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
@@ -98,8 +101,11 @@ ENTRY (strlen)
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */

libc/string/x86_64/strspn.S: +12 -4

@@ -50,8 +50,10 @@ ENTRY (strspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+L(2):
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
@@ -83,8 +85,14 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+L(3):
+	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */
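
As a final spot-check, the oversized NOP quoted in the commit message
came straight from a disassembly, so an illustrative (not verified)
re-run of the same inspection would be:

	$ objdump -d x86_64/memcpy.o | grep -i nop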