Browse Source

arc: add optimized string functions for ARCv3

Add the ability to use optimized versions of string functions for ARCv3
32-bit CPUs with the UCLIBC_HAS_STRING_ARCH_OPT option. Add optimized
memcpy/memset/memcmp code for ARCv3 CPUs based on code from newlib,
and adapt the existing optimized strchr/strcmp/strcpy/strlen for ARCv3.

Link to the Synopsys newlib repo with code for ARCv3 on GitHub:
https://github.com/foss-for-synopsys-dwc-arc-processors/newlib

Signed-off-by: Pavel Kozlov <pavel.kozlov@synopsys.com>
Pavel Kozlov 2 years ago
parent
commit
663b8a0497

+ 93 - 1
libc/string/arc/memcmp.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -17,6 +17,8 @@
 #endif
 
 ENTRY(memcmp)
+
+#if defined(__ARC700__) || defined(__ARCHS__)
 	or	r12,r0,r1
 	asl_s	r12,r12,30
 	sub	r3,r2,1
@@ -149,6 +151,96 @@ ENTRY(memcmp)
 .Lnil:
 	j_s.d	[blink]
 	mov	r0,0
+
+#elif (__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcmp.S
+	cmp		r2, 32
+	bls.d	@.L_compare_1_bytes
+	mov		r3, r0	; "r0" will be used as return value
+
+	lsr		r12, r2, 4	; counter for 16-byte chunks
+	xor		r13, r13, r13	; the mask showing inequal registers
+
+.L_compare_16_bytes:
+	ld.ab	r4, [r3, +4]
+	ld.ab	r5, [r1, +4]
+	ld.ab	r6, [r3, +4]
+	ld.ab	r7, [r1, +4]
+	ld.ab	r8, [r3, +4]
+	ld.ab	r9, [r1, +4]
+	ld.ab	r10, [r3, +4]
+	ld.ab	r11, [r1, +4]
+	xor.f	0, r4, r5
+	xor.ne	r13, r13, 0b0001
+	xor.f	0, r6, r7
+	xor.ne	r13, r13, 0b0010
+	xor.f	0, r8, r9
+	xor.ne	r13, r13, 0b0100
+	xor.f	0, r10, r11
+	xor.ne	r13, r13, 0b1000
+	brne	r13, 0, @.L_unequal_find
+	dbnz	r12, @.L_compare_16_bytes
+
+	;; Adjusting the pointers because of the extra loads at the end
+	sub		r1, r1, 4
+	sub		r3, r3, 4
+	bmsk_s	  r2, r2, 3	; any remaining bytes to compare
+
+.L_compare_1_bytes:
+	cmp		r2, 0
+	jeq.d	[blink]
+	xor_s	r0, r0, r0
+
+2:
+	ldb.ab	r4, [r3, +1]
+	ldb.ab	r5, [r1, +1]
+	sub.f	r0, r4, r5
+	jne		[blink]
+	dbnz	r2, @2b
+	j_s		[blink]
+
+	;; At this point, we want to find the _first_ comparison that marked the
+	;; inequality of "lhs" and "rhs"
+.L_unequal_find:
+	ffs		r13, r13
+	asl		r13, r13, 2
+	bi		[r13]
+.L_unequal_r4r5:
+	mov		r1, r4
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r5
+	nop
+.L_unequal_r6r7:
+	mov		r1, r6
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r7
+	nop
+.L_unequal_r8r9:
+	mov		r1, r8
+	b.d		@.L_diff_byte_in_regs
+	mov		r2, r9
+	nop
+.L_unequal_r10r11:
+	mov		r1, r10
+	mov		r2, r11
+
+	;; fall-through
+	;; If we're here, that means the two operands are not equal.
+.L_diff_byte_in_regs:
+	xor		r0, r1, r2
+	ffs		r0, r0
+	and		r0, r0, 0x18
+	lsr		r1, r1, r0
+	lsr		r2, r2, r0
+	bmsk_s	r1, r1, 7
+	bmsk_s	r2, r2, 7
+	j_s.d	[blink]
+	sub		r0, r1, r2
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
+
 END(memcmp)
 libc_hidden_def(memcmp)
 

+ 56 - 9
libc/string/arc/memcpy.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memcpy)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 /* This memcpy implementation does not support objects of 1GB or larger -
    the check for alignment does not work then.  */
 /* We assume that most sources and destinations are aligned, and
@@ -73,9 +69,9 @@ ENTRY(memcpy)
 .Lendbloop:
 	j_s.d	[blink]
 	stb	r12,[r5,0]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
+
 #ifdef __LITTLE_ENDIAN__
 # define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
 # define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
@@ -299,7 +295,58 @@ ENTRY(memcpy)
 	stb.ab	r6, [r3,1]
 .Lcopybytewise_3:
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memcpy.S
+	lsr.f	r11, r2, 4		; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes
+	mov	r3, r0			; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	ldd.ab	r4, [r1, 8]
+	ldd.ab	r6, [r1, 8]
+	std.ab	r4, [r3, 8]
+	std.ab	r6, [r3, 8]
+	dbnz	r11, @.L_write_16_bytes
+#else
+	ld.ab	r4, [r1, 4]
+	ld.ab	r5, [r1, 4]
+	ld.ab	r6, [r1, 4]
+	ld.ab	r7, [r1, 4]
+	st.ab	r4, [r3, 4]
+	st.ab	r5, [r3, 4]
+	st.ab	r6, [r3, 4]
+	dbnz.d	r11, @.L_write_16_bytes
+	st.ab	r7, [r3, 4]
+#endif
+	bmsk_s	r2, r2, 3
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f
+	lsr	r11, r2, 2
+	ldh.ab	r4, [r1, 2]
+	sth.ab	r4, [r3, 2]
+1:
+	bbit0.d	r2, 0, @1f
+	xor	r11, r11, 3
+	ldb.ab	r4, [r1, 1]
+	stb.ab	r4, [r3, 1]
+1:
+	asl	r11, r11, 1
+	bi	[r11]
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld.ab	r4,[r1, 4]
+	st.ab	r4,[r3, 4]
+	ld	r4,[r1]
+	st	r4,[r3]
+
+	j_s	[blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memcpy)
 libc_hidden_def(memcpy)

+ 52 - 9
libc/string/arc/memset.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,13 +7,9 @@
 
 #include <sysdep.h>
 
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
-
 ENTRY(memset)
 
-#ifdef __ARC700__
+#if defined(__ARC700__)
 #define SMALL	7 /* Must be at least 6 to deal with alignment/loop issues.  */
 
 	mov_s	r4,r0
@@ -52,9 +48,8 @@ ENTRY(memset)
 	stb.ab	r1,[r4,1]
 .Ltiny_end:
 	j_s	[blink]
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 #ifdef DONT_USE_PREALLOC
 #define PREWRITE(A,B)	prefetchw [(A),(B)]
 #else
@@ -156,7 +151,55 @@ ENTRY(memset)
 .Lcopy3bytes:
 
 	j	[blink]
-#endif /* __ARCHS__ */
+
+#elif defined(__ARC64_ARCH32__)
+	;; Based on Synopsys code from newlib's arc64/memset.S
+
+	;; Assemble the bytes to 32bit words
+	bmsk_s	r1, r1, 7		; treat it like unsigned char
+	lsl8	r3, r1
+	or_s	r1, r1, r3
+	lsl16	r3, r1
+	or	r6, r1, r3
+	mov r7,r6
+
+	lsr.f	r5, r2, 4		; counter for 16-byte chunks
+	beq.d	@.L_write_15_bytes
+	mov	r4, r0			; work on a copy of "r0"
+
+.L_write_16_bytes:
+#if defined(__ARC64_LL64__)
+	std.ab	r6, [r4, 8]
+	std.ab	r6, [r4, 8]
+	dbnz	r5, @.L_write_16_bytes
+#else
+	st.ab	r6, [r4, 4]
+	st.ab	r6, [r4, 4]
+	st.ab	r6, [r4, 4]
+	dbnz.d	r5, @.L_write_16_bytes
+	st.ab	r6, [r4, 4]
+#endif
+	bmsk_s	r2, r2, 3
+
+.L_write_15_bytes:
+	bbit0.d	r2, 1, @1f
+	lsr	r3, r2, 2
+	sth.ab	r6, [r4, 2]
+1:
+	bbit0.d	r2, 0, @1f
+	xor	r3, r3, 3
+	stb.ab	r6, [r4, 1]
+1:
+	bi	[r3]
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+	st.ab	r6,[r4, 4]
+
+	j_s	[blink]
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(memset)
 libc_hidden_def(memset)

+ 13 - 12
libc/string/arc/strchr.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
 
 #include <sysdep.h>
 #include <features.h>
+#include <asm.h>
 
 /* ARC700 has a relatively long pipeline and branch prediction, so we want
    to avoid branches that are hard to predict.  On the other hand, the
@@ -21,7 +22,7 @@ ENTRY(strchr)
 	mov_s	r3,0x01010101
 	breq.d	r2,r0,.Laligned
 	asl	r4,r5,16
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 	asl	r7,r2,3
 	ld_s	r2,[r0]
 #ifdef __LITTLE_ENDIAN__
@@ -77,10 +78,10 @@ ENTRY(strchr)
 	sub	r3,r7,1
 	bic	r3,r3,r7
 	norm	r2,r3
-	sub_s	r0,r0,1
-	asr_s	r2,r2,3
+	SUBR_S	r0,r0,1
+	ASRR_S	r2,r2,3
 	j.d	[blink]
-	sub_s	r0,r0,r2
+	SUBR_S	r0,r0,r2
 
 	.balign	4
 .Lfound0_ua:
@@ -90,13 +91,13 @@ ENTRY(strchr)
 	bic	r3,r3,r6
 	and	r2,r3,r4
 	or_s	r12,r12,r2
-	sub_s	r3,r12,1
+	SUBR_S	r3,r12,1
 	bic_s	r3,r3,r12
 	norm	r3,r3
-	add_s	r0,r0,3
-	asr_s	r12,r3,3
+	ADDR_S	r0,r0,3
+	ASRR_S	r12,r3,3
 	asl.f	0,r2,r3
-	sub_s	r0,r0,r12
+	SUBR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.pl	r0,0
 #else /* BIG ENDIAN */
@@ -106,10 +107,10 @@ ENTRY(strchr)
 	bic	r2,r7,r6
 .Lfound_char_b:
 	norm	r2,r2
-	sub_s	r0,r0,4
+	SUBR_S	r0,r0,4
 	asr_s	r2,r2,3
 	j.d	[blink]
-	add_s	r0,r0,r2
+	ADDR_S	r0,r0,r2
 
 .Lfound0_ua:
 	mov_s	r3,r7
@@ -126,7 +127,7 @@ ENTRY(strchr)
 	add.pl	r3,r3,1
 	asr_s	r12,r3,3
 	asl.f	0,r2,r3
-	add_s	r0,r0,r12
+	ADDR_S	r0,r0,r12
 	j_s.d	[blink]
 	mov.mi	r0,0
 #endif /* ENDIAN */

+ 10 - 11
libc/string/arc/strcmp.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,14 +7,11 @@
 
 #include <features.h>
 #include <sysdep.h>
-
-#if !defined(__ARC700__) && !defined(__ARCHS__)
-#error "Neither ARC700 nor ARCHS is defined!"
-#endif
+#include <asm.h>
 
 ENTRY(strcmp)
 
-#ifdef __ARC700__
+#if defined(__ARC700__) || defined(__ARC64_ARCH32__)
 /* This is optimized primarily for the ARC700.
    It would be possible to speed up the loops by one cycle / word
    respective one cycle / byte by forcing double source 1 alignment, unrolling
@@ -38,7 +35,7 @@ ENTRY(strcmp)
 	breq	r2,r3,.Lwordloop
 #ifdef	__LITTLE_ENDIAN__
 	xor	r0,r2,r3	; mask for difference
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -55,7 +52,7 @@ ENTRY(strcmp)
 .Lfound0:
 	xor	r0,r2,r3	; mask for difference
 	or	r0,r0,r4	; or in zero indicator
-	sub_s	r1,r0,1
+	SUBR_S	r1,r0,1
 	bic_s	r0,r0,r1	; mask for least significant difference bit
 	sub	r1,r5,r0
 	xor	r0,r5,r1	; mask for least significant difference byte
@@ -99,9 +96,8 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0,r2,r3
-#endif /* __ARC700__ */
 
-#ifdef __ARCHS__
+#elif defined(__ARCHS__)
 	or	r2, r0, r1
 	bmsk_s	r2, r2, 1
 	brne	r2, 0, @.Lcharloop
@@ -168,7 +164,10 @@ ENTRY(strcmp)
 .Lcmpend:
 	j_s.d	[blink]
 	sub	r0, r2, r3
-#endif /* __ARCHS__ */
+
+#else
+#error "Unsupported ARC CPU type"
+#endif
 
 END(strcmp)
 libc_hidden_def(strcmp)

+ 4 - 3
libc/string/arc/strlen.S

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com)
+ * Copyright (C) 2013, 2022 Synopsys, Inc. (www.synopsys.com)
  * Copyright (C) 2007 ARC International (UK) LTD
  *
  * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
@@ -7,6 +7,7 @@
 
 
 #include <sysdep.h>
+#include <asm.h>
 
 ENTRY(strlen)
 	or	r3,r0,7
@@ -15,7 +16,7 @@ ENTRY(strlen)
 	mov	r4,0x01010101
 	; uses long immediate
 #ifdef __LITTLE_ENDIAN__
-	asl_s	r1,r0,3
+	ASLR_S	r1,r0,3
 	btst_s	r0,2
 	asl	r7,r4,r1
 	ror	r5,r4
@@ -59,7 +60,7 @@ ENTRY(strlen)
 	sub.ne	r3,r3,4
 	mov.eq	r1,r12
 #ifdef __LITTLE_ENDIAN__
-	sub_s	r2,r1,1
+	SUBR_S	r2,r1,1
 	bic_s	r2,r2,r1
 	norm	r1,r2
 	sub_s	r0,r0,3

+ 39 - 0
libc/sysdeps/linux/arc/asm.h

@@ -7,6 +7,13 @@
 #ifndef _ARC_ASM_H
 #define _ARC_ASM_H
 
+/*
+ * Some 16-bit instructions were excluded from the ARCv3 ISA.
+ * The following macros are introduced to handle these changes in one place.
+ * This allows existing ARCv2 code to remain unchanged: the 16-bit versions
+ * of the instructions are used for ARCv2, and they are replaced with 32-bit
+ * versions for ARCv3.
+ */
+
 #if defined (__ARC64_ARCH32__)
 
 .macro PUSHR reg
@@ -25,6 +32,22 @@
 	pop	\reg
 .endm
 
+.macro SUBR_S dst,src1,src2
+	sub	\dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+	add	\dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+	asr	\dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+	asl	\dst, \src1, \src2
+.endm
+
 #elif defined (__ARC64_ARCH64__)
 
 # error ARCv3 64-bit is not supported by uClibc-ng
@@ -47,6 +70,22 @@
 	pop_s	\reg
 .endm
 
+.macro SUBR_S dst,src1,src2
+	sub_s	\dst, \src1, \src2
+.endm
+
+.macro ADDR_S dst,src1,src2
+	add_s	\dst, \src1, \src2
+.endm
+
+.macro ASRR_S dst,src1,src2
+	asr_s	\dst, \src1, \src2
+.endm
+
+.macro ASLR_S dst,src1,src2
+	asl_s	\dst, \src1, \src2
+.endm
+
 #endif
 
 #endif /* _ARC_ASM_H  */