
amd64 string ops: use alignment more carefully, and comment it.
By capping the max padding so that it is never bigger than the next three insns,
we avoid emitting ridiculously big NOPs like this one:

  53:  66 66 66 66 2e 0f 1f    nopw   %cs:0x0(%rax,%rax,1)
  5a:  84 00 00 00 00 00

which was bigger than the next three insns combined!
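
(For reference, gas's .p2align takes an optional third operand that caps the
padding: if the alignment would need more bytes than that, no padding is emitted
at all. With the fill operand left empty, code sections are padded with NOPs.
A quick sketch of the forms used below:

	.p2align 4		pad to a 16-byte boundary, however many bytes it takes
	.p2align 4,,11		pad to a 16-byte boundary only if at most 11 bytes are needed
	.p2align 3,,8		pad to an 8-byte boundary only if at most 8 bytes are needed)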

Size changes:

   text    data     bss     dec     hex filename
    102       0       0     102      66 x86_64/memcpy.o
    102       0       0     102      66 x86_64.old/memcpy.o

     90       0       0      90      5a x86_64/mempcpy.o
    102       0       0     102      66 x86_64.old/mempcpy.o

    210       0       0     210      d2 x86_64/memset.o
    242       0       0     242      f2 x86_64.old/memset.o

    213       0       0     213      d5 x86_64/stpcpy.o
    220       0       0     220      dc x86_64.old/stpcpy.o

    428       0       0     428     1ac x86_64/strcat.o
    444       0       0     444     1bc x86_64.old/strcat.o

    417       0       0     417     1a1 x86_64/strchr.o
    418       0       0     418     1a2 x86_64.old/strchr.o

     33       0       0      33      21 x86_64/strcmp.o
     33       0       0      33      21 x86_64.old/strcmp.o

    213       0       0     213      d5 x86_64/strcpy.o
    220       0       0     220      dc x86_64.old/strcpy.o

    135       0       0     135      87 x86_64/strcspn.o
    151       0       0     151      97 x86_64.old/strcspn.o

    225       0       0     225      e1 x86_64/strlen.o
    233       0       0     233      e9 x86_64.old/strlen.o

    140       0       0     140      8c x86_64/strpbrk.o
    156       0       0     156      9c x86_64.old/strpbrk.o

    135       0       0     135      87 x86_64/strspn.o
    151       0       0     151      97 x86_64.old/strspn.o

Also, a few files got their .text alignment relaxed from 16 to 8 bytes,
which reduces padding at link time.
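
(Why the 16-to-8 relaxation matters, as the new comments in strcspn.S and
strspn.S spell out: the assembler records the largest alignment requested
anywhere in a section as that section's alignment, and the linker then has to
honor it with padding between input sections. A minimal before/after sketch:

	.p2align 4,,9		up to 9 bytes of padding; raises the .text section alignment to 16
	.p2align 3,,8		up to 8 bytes of padding; the .text section alignment stays at 8

The alignment actually recorded in an object file can be checked with
readelf -S file.o.)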

Denis Vlasenko, 16 years ago
commit df7958a960

+ 2 - 2
libc/string/x86_64/memcpy.S

@@ -59,9 +59,9 @@ ENTRY (BP_SYM (memcpy))
 	subq	$32, %rcx
 	js	2f
 
-	.p2align 4
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
 3:
-
 	/* Now correct the loop counter.  Please note that in the following
 	   code the flags are not changed anymore.  */
 	subq	$32, %rcx

+ 13 - 7
libc/string/x86_64/memset.S

@@ -55,8 +55,10 @@ ENTRY (memset)
 	test	$0x7,%edi	/* Check for alignment.  */
 	jz	2f
 
-	.p2align 4
-1:	/* Align ptr to 8 byte.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+1:
+	/* Align ptr to 8 byte.  */
 	mov	%sil,(%rcx)
 	dec	%rdx
 	inc	%rcx
@@ -70,8 +72,10 @@ ENTRY (memset)
 	cmp	LARGE, %rdx
 	jae	11f
 
-	.p2align 4
-3:	/* Fill 64 bytes.  */
+	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+	.p2align 4,,11
+3:
+	/* Fill 64 bytes.  */
 	mov	%r8,(%rcx)
 	mov	%r8,0x8(%rcx)
 	mov	%r8,0x10(%rcx)
@@ -114,9 +118,11 @@ ENTRY (memset)
 #endif
 	retq
 
-	.p2align 4
-11:	/* Fill 64 bytes without polluting the cache.  */
-	/* We could use	movntdq    %xmm0,(%rcx) here to further
+	/* Next 3 insns are 14 bytes total, make sure we decode them in one go */
+	.p2align 4,,14
+11:
+	/* Fill 64 bytes without polluting the cache.  */
+	/* We could use	movntdq %xmm0,(%rcx) here to further
 	   speed up for large cases but let's not use XMM registers.  */
 	movnti	%r8,(%rcx)
 	movnti  %r8,0x8(%rcx)

+ 14 - 5
libc/string/x86_64/strcat.S

@@ -45,7 +45,9 @@ ENTRY (BP_SYM (strcat))
 
 
 	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
@@ -103,8 +105,11 @@ ENTRY (BP_SYM (strcat))
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */
@@ -160,7 +165,9 @@ ENTRY (BP_SYM (strcat))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 22:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -237,7 +244,9 @@ ENTRY (BP_SYM (strcat))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 23:
 	movb	%al, (%rdx)	/* 1st byte.  */
 	testb	%al, %al	/* Is it NUL.  */

+ 8 - 4
libc/string/x86_64/strchr.S

@@ -92,7 +92,8 @@ ENTRY (BP_SYM (strchr))
 	 each of whose bytes is C.  This turns each byte that is C
 	 into a zero.  */
 
-	.p2align 4
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 4:
 	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
@@ -230,8 +231,11 @@ ENTRY (BP_SYM (strchr))
 	   reversed.  */
 
 
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
+	/* Align, it's a jump target.  */
+	/* Next 3 insns are 9 bytes total, make sure we decode them in one go */
+	.p2align 4,,9
+3:
+	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
 	subq	$8,%rax		/* correct pointer increment.  */
 	testb %cl, %cl		/* is first byte C? */
 	jz 6f			/* yes => return pointer */
@@ -281,7 +285,7 @@ ENTRY (BP_SYM (strchr))
 	incq %rax
 
 6:
-	nop
+	/* nop - huh?? */
 	retq
 END (BP_SYM (strchr))
 

+ 6 - 2
libc/string/x86_64/strcpy.S

@@ -53,7 +53,9 @@ ENTRY (BP_SYM (STRCPY))
 	/* Now the sources is aligned.  Unfortunatly we cannot force
 	   to have both source and destination aligned, so ignore the
 	   alignment of the destination.  */
-	.p2align 4
+
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
 1:
 	/* 1st unroll.  */
 	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
@@ -130,7 +132,9 @@ ENTRY (BP_SYM (STRCPY))
 
 	/* Do the last few bytes. %rax contains the value to write.
 	   The loop is unrolled twice.  */
-	.p2align 4
+
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
 3:
 	/* Note that stpcpy needs to return with the value of the NUL
 	   byte.  */

+ 10 - 2
libc/string/x86_64/strcspn.S

@@ -55,7 +55,9 @@ ENTRY (strcspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+
 L(2):	movb (%rax), %cl	/* get byte from skipset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
@@ -88,7 +90,13 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+
 L(3):	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */

+ 10 - 4
libc/string/x86_64/strlen.S

@@ -40,8 +40,11 @@ ENTRY (strlen)
 
 1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
 
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
+	/* Align loop.  */
+	/* Next 3 insns are 10 bytes total, make sure we decode them in one go */
+	.p2align 4,,10
+4:
+	/* Main Loop is unrolled 4 times.  */
 	/* First unroll.  */
 	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
 	addq $8,%rax		/* adjust pointer for next word */
@@ -98,8 +101,11 @@ ENTRY (strlen)
 				   the addition will not result in 0.  */
 	jz 4b			/* no NUL found => continue loop */
 
-	.p2align 4		/* Align, it is a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
+	/* Align, it is a jump target.  */
+	/* Next 3 insns are 8 bytes total, make sure we decode them in one go */
+	.p2align 3,,8
+3:
+	subq $8,%rax		/* correct pointer increment.  */
 
 	testb %cl, %cl		/* is first byte NUL? */
 	jz 2f			/* yes => return */

+ 12 - 4
libc/string/x86_64/strspn.S

@@ -50,8 +50,10 @@ ENTRY (strspn)
    Although all the following instruction only modify %cl we always
    have a correct zero-extended 64-bit value in %rcx.  */
 
-	.p2align 4
-L(2):	movb (%rax), %cl	/* get byte from stopset */
+	/* Next 3 insns are 6 bytes total, make sure we decode them in one go */
+	.p2align 3,,6
+L(2):
+	movb (%rax), %cl	/* get byte from stopset */
 	testb %cl, %cl		/* is NUL char? */
 	jz L(1)			/* yes => start compare loop */
 	movb %cl, (%rsp,%rcx)	/* set corresponding byte in stopset table */
@@ -83,8 +85,14 @@ L(1):	leaq -4(%rdx), %rax	/* prepare loop */
 	   value in the table.  But the value of NUL is NUL so the loop
 	   terminates for NUL in every case.  */
 
-	.p2align 4
-L(3):	addq $4, %rax		/* adjust pointer for full loop round */
+	/* Next 3 insns are 9 bytes total. */
+	/* .p2align 4,,9 would make sure we decode them in one go, */
+	/* but it will also align entire function to 16 bytes, */
+	/* potentially creating largish padding at link time. */
+	/* We are aligning to 8 bytes instead: */
+	.p2align 3,,8
+L(3):
+	addq $4, %rax		/* adjust pointer for full loop round */
 
 	movb (%rax), %cl	/* get byte from string */
 	testb %cl, (%rsp,%rcx)	/* is it contained in skipset? */