|
@@ -0,0 +1,313 @@
|
|
|
+/* Optimized strcmp for Xtensa.
|
|
|
+ Copyright (C) 2001, 2007 Free Software Foundation, Inc.
|
|
|
+ This file is part of the GNU C Library.
|
|
|
+
|
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
|
+ License as published by the Free Software Foundation; either
|
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
|
+
|
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
+ Lesser General Public License for more details.
|
|
|
+
|
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
|
+ License along with the GNU C Library; if not, write to the Free
|
|
|
+ Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
|
|
|
+ Boston, MA 02110-1301, USA. */
|
|
|
+
|
|
|
+#include "../../sysdeps/linux/xtensa/sysdep.h"
|
|
|
+#include <bits/xtensa-config.h>
|
|
|
+
|
|
|
+#ifdef __XTENSA_EB__ /* big-endian: byte 0 of a loaded word is its most significant byte */
|
|
|
+#define MASK0 0xff000000 /* selects byte 0 (first byte in memory) */
|
|
|
+#define MASK1 0x00ff0000 /* selects byte 1 */
|
|
|
+#define MASK2 0x0000ff00 /* selects byte 2 */
|
|
|
+#define MASK3 0x000000ff /* selects byte 3 */
|
|
|
+#else /* !__XTENSA_EB__: little-endian, byte 0 is the least significant byte */
|
|
|
+#define MASK0 0x000000ff /* selects byte 0 (first byte in memory) */
|
|
|
+#define MASK1 0x0000ff00 /* selects byte 1 */
|
|
|
+#define MASK2 0x00ff0000 /* selects byte 2 */
|
|
|
+#define MASK3 0xff000000 /* selects byte 3 */
|
|
|
+#endif /* __XTENSA_EB__ */
|
|
|
+
|
|
|
+#define MASK4 0x40404040 /* bit 6 of every byte; used by the quick "no zero byte" test */
|
|
|
+
|
|
|
+ .literal .Lmask0, MASK0 // literal copies of the masks, fetched with l32r below
|
|
|
+ .literal .Lmask1, MASK1
|
|
|
+ .literal .Lmask2, MASK2
|
|
|
+ .literal .Lmask3, MASK3
|
|
|
+ .literal .Lmask4, MASK4
|
|
|
+
|
|
|
+ .text
|
|
|
+ENTRY (strcmp)
|
|
|
+ /* In: a2 = s1, a3 = s2. Out: a2 = 0 if the strings are equal, otherwise negative or positive according to the first differing byte. a4-a11 are used as scratch. */
|
|
|
+
|
|
|
+ l8ui a8, a2, 0 // byte 0 from s1
|
|
|
+ l8ui a9, a3, 0 // byte 0 from s2
|
|
|
+ movi a10, 3 // mask for the low two (word-alignment) address bits
|
|
|
+ bne a8, a9, .Lretdiff // first bytes differ: return their difference
|
|
|
+
|
|
|
+ or a11, a2, a3 // low two bits are clear only if both pointers are aligned
|
|
|
+ bnone a11, a10, .Laligned // s1 and s2 are both word-aligned
|
|
|
+
|
|
|
+ xor a11, a2, a3 // compare low two bits of s1 and s2
|
|
|
+ bany a11, a10, .Lunaligned // if they have different alignment
|
|
|
+
|
|
|
+ /* s1 and s2 share the same nonzero misalignment: compare single bytes (at most three, counting byte 0 loaded above) until both pointers are word-aligned. */
|
|
|
+ addi a2, a2, 1 // advance s1
|
|
|
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
|
|
|
+ addi a3, a3, 1 // advance s2
|
|
|
+ bnone a2, a10, .Laligned // if s1/s2 now aligned
|
|
|
+ l8ui a8, a2, 0 // byte 1 from s1
|
|
|
+ l8ui a9, a3, 0 // byte 1 from s2
|
|
|
+ addi a2, a2, 1 // advance s1
|
|
|
+ bne a8, a9, .Lretdiff // if different, return difference
|
|
|
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
|
|
|
+ addi a3, a3, 1 // advance s2
|
|
|
+ bnone a2, a10, .Laligned // if s1/s2 now aligned
|
|
|
+ l8ui a8, a2, 0 // byte 2 from s1
|
|
|
+ l8ui a9, a3, 0 // byte 2 from s2
|
|
|
+ addi a2, a2, 1 // advance s1
|
|
|
+ bne a8, a9, .Lretdiff // if different, return difference
|
|
|
+ beqz a8, .Leq // bytes equal, if zero, strings are equal
|
|
|
+ addi a3, a3, 1 // advance s2
|
|
|
+ j .Laligned // three bytes consumed, so s1/s2 must be word-aligned now
|
|
|
+
|
|
|
+/* s1 and s2 have different alignment.
|
|
|
+
|
|
|
+ If the zero-overhead loop option is available, use an (almost)
|
|
|
+ infinite zero-overhead loop with conditional exits so we only pay
|
|
|
+ for taken branches when exiting the loop.
|
|
|
+
|
|
|
+ Note: It is important for this unaligned case to come before the
|
|
|
+ code for aligned strings, because otherwise some of the branches
|
|
|
+ above cannot reach and have to be transformed to branches around
|
|
|
+ jumps. The unaligned code is smaller and the branches can reach
|
|
|
+ over it. */
|
|
|
+
|
|
|
+ .align 4
|
|
|
+ /* (2 mod 4) alignment for loop instruction: the 2-byte _movi.n below follows the .align 4. NOTE(review): _movi.n is a narrow (Code Density) instruction and is used without any density-option guard -- confirm whether cores without that option must be supported. */
|
|
|
+.Lunaligned:
|
|
|
+#if XCHAL_HAVE_LOOPS /* zero-overhead loop option available */
|
|
|
+ _movi.n a8, 0 // set up for the maximum loop count
|
|
|
+ loop a8, .Lretdiff // loop forever (almost anyway)
|
|
|
+#endif /* XCHAL_HAVE_LOOPS */
|
|
|
+.Lnextbyte:
|
|
|
+ l8ui a8, a2, 0 // next byte from s1
|
|
|
+ l8ui a9, a3, 0 // next byte from s2
|
|
|
+ addi a2, a2, 1 // advance s1
|
|
|
+ bne a8, a9, .Lretdiff // bytes differ: return their difference
|
|
|
+ addi a3, a3, 1 // advance s2
|
|
|
+#if XCHAL_HAVE_LOOPS
|
|
|
+ beqz a8, .Lretdiff // equal bytes that are zero: end of both strings (difference is 0)
|
|
|
+#else
|
|
|
+ bnez a8, .Lnextbyte // equal and nonzero: next byte; otherwise fall through with a8 == a9 == 0
|
|
|
+#endif /* XCHAL_HAVE_LOOPS */
|
|
|
+.Lretdiff:
|
|
|
+ sub a2, a8, a9 // return (byte from s1) - (byte from s2)
|
|
|
+ retw
|
|
|
+
|
|
|
+/* s1 is word-aligned; s2 is word-aligned.
|
|
|
+
|
|
|
+ If the zero-overhead loop option is available, use an (almost)
|
|
|
+ infinite zero-overhead loop with conditional exits so we only pay
|
|
|
+ for taken branches when exiting the loop. */
|
|
|
+
|
|
|
+/* New algorithm, relying on the fact that all normal ASCII is between
|
|
|
+ 32 and 127.
|
|
|
+
|
|
|
+ Rather than check all bytes for zero:
|
|
|
+ Take one word (4 bytes). Call it w1.
|
|
|
+ Shift w1 left by one into w1'.
|
|
|
+ Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
|
|
|
+ Check that all 4 bit 6's (one for each byte) are one:
|
|
|
+ If they are, we are definitely not done.
|
|
|
+ If they are not, we are probably done, but need to check for zero. */
|
|
|
+
|
|
|
+ .align 4
|
|
|
+#if XCHAL_HAVE_LOOPS
|
|
|
+.Laligned:
|
|
|
+ .begin no-transform // NOTE(review): presumably so the hand-counted "(n mod 4)" alignments further down stay valid -- confirm
|
|
|
+ l32r a4, .Lmask0 // mask for byte 0
|
|
|
+ l32r a7, .Lmask4 // bit 6 of every byte
|
|
|
+ /* Loop forever. (a4 is more than the maximum number
|
|
|
+ of iterations) */
|
|
|
+ loop a4, .Laligned_done
|
|
|
+
|
|
|
+ /* First unrolled loop body. */
|
|
|
+ l32i a8, a2, 0 // get word from s1
|
|
|
+ l32i a9, a3, 0 // get word from s2
|
|
|
+ slli a5, a8, 1 // w1' = w1 << 1
|
|
|
+ bne a8, a9, .Lwne2 // words differ
|
|
|
+ or a9, a8, a5 // w1 | w1' (a9 is free: the words were equal)
|
|
|
+ bnall a9, a7, .Lprobeq // some byte lacks bit 6: it may be a zero byte
|
|
|
+
|
|
|
+ /* Second unrolled loop body. */
|
|
|
+ l32i a8, a2, 4 // get word from s1+4
|
|
|
+ l32i a9, a3, 4 // get word from s2+4
|
|
|
+ slli a5, a8, 1 // w1' = w1 << 1
|
|
|
+ bne a8, a9, .Lwne2 // words differ
|
|
|
+ or a9, a8, a5 // w1 | w1'
|
|
|
+ bnall a9, a7, .Lprobeq2 // possible zero byte in the word at offset 4
|
|
|
+
|
|
|
+ addi a2, a2, 8 // advance s1 pointer
|
|
|
+ addi a3, a3, 8 // advance s2 pointer
|
|
|
+.Laligned_done:
|
|
|
+ or a1, a1, a1 // nop
|
|
|
+
|
|
|
+.Lprobeq2:
|
|
|
+ /* Adjust pointers to account for the loop unrolling: the second body read its words at offset 4 without advancing a2/a3, so step both to the words now held in a8/a9. */
|
|
|
+ addi a2, a2, 4
|
|
|
+ addi a3, a3, 4
|
|
|
+
|
|
|
+#else /* !XCHAL_HAVE_LOOPS */
|
|
|
+
|
|
|
+.Laligned:
|
|
|
+ movi a4, MASK0 // mask for byte 0
|
|
|
+ movi a7, MASK4 // bit 6 of every byte
|
|
|
+ j .Lfirstword // do not advance the pointers before the first word
|
|
|
+.Lnextword:
|
|
|
+ addi a2, a2, 4 // advance s1 pointer
|
|
|
+ addi a3, a3, 4 // advance s2 pointer
|
|
|
+.Lfirstword:
|
|
|
+ l32i a8, a2, 0 // get word from s1
|
|
|
+ l32i a9, a3, 0 // get word from s2
|
|
|
+ slli a5, a8, 1 // w1' = w1 << 1
|
|
|
+ bne a8, a9, .Lwne2 // words differ
|
|
|
+ or a9, a8, a5 // w1 | w1' (a9 is free: the words were equal)
|
|
|
+ ball a9, a7, .Lnextword // every byte has bit 6 set: no zero byte, keep going
|
|
|
+#endif /* !XCHAL_HAVE_LOOPS */
|
|
|
+
|
|
|
+ /* align (0 mod 4) */
|
|
|
+.Lprobeq:
|
|
|
+ /* Words are equal and probably contain a zero byte, but check for sure.
|
|
|
+ If no byte is zero, loop over the rest of string using normal algorithm. */
|
|
|
+
|
|
|
+ bnone a8, a4, .Leq // if byte 0 is zero
|
|
|
+ l32r a5, .Lmask1 // mask for byte 1
|
|
|
+ l32r a6, .Lmask2 // mask for byte 2
|
|
|
+ bnone a8, a5, .Leq // if byte 1 is zero
|
|
|
+ l32r a7, .Lmask3 // mask for byte 3
|
|
|
+ bnone a8, a6, .Leq // if byte 2 is zero
|
|
|
+ bnone a8, a7, .Leq // if byte 3 is zero
|
|
|
+ addi.n a2, a2, 4 // advance s1 pointer
|
|
|
+ addi.n a3, a3, 4 // advance s2 pointer
|
|
|
+#if XCHAL_HAVE_LOOPS
|
|
|
+
|
|
|
+ /* align (1 mod 4): seven 3-byte instructions plus the two 2-byte addi.n above. NOTE(review): addi.n is a narrow (Code Density) instruction used without a density-option guard, and widening it would change this alignment -- confirm before touching. */
|
|
|
+ loop a4, .Leq // loop forever (a4 is bigger than max iters)
|
|
|
+ .end no-transform // closes the no-transform region opened at .Laligned
|
|
|
+
|
|
|
+ l32i a8, a2, 0 // get word from s1
|
|
|
+ l32i a9, a3, 0 // get word from s2
|
|
|
+ addi a2, a2, 4 // advance s1 pointer
|
|
|
+ bne a8, a9, .Lwne // words differ
|
|
|
+ bnone a8, a4, .Leq // if byte 0 is zero
|
|
|
+ bnone a8, a5, .Leq // if byte 1 is zero
|
|
|
+ bnone a8, a6, .Leq // if byte 2 is zero
|
|
|
+ bnone a8, a7, .Leq // if byte 3 is zero
|
|
|
+ addi a3, a3, 4 // advance s2 pointer
|
|
|
+
|
|
|
+#else /* !XCHAL_HAVE_LOOPS */
|
|
|
+
|
|
|
+ j .Lfirstword2 // s2 pointer was already advanced above
|
|
|
+.Lnextword2:
|
|
|
+ addi a3, a3, 4 // advance s2 pointer
|
|
|
+.Lfirstword2:
|
|
|
+ l32i a8, a2, 0 // get word from s1
|
|
|
+ l32i a9, a3, 0 // get word from s2
|
|
|
+ addi a2, a2, 4 // advance s1 pointer
|
|
|
+ bne a8, a9, .Lwne // words differ
|
|
|
+ bnone a8, a4, .Leq // if byte 0 is zero
|
|
|
+ bnone a8, a5, .Leq // if byte 1 is zero
|
|
|
+ bnone a8, a6, .Leq // if byte 2 is zero
|
|
|
+ bany a8, a7, .Lnextword2 // loop if byte 3 is nonzero; fall through to .Leq if it is zero
|
|
|
+#endif /* !XCHAL_HAVE_LOOPS */
|
|
|
+
|
|
|
+ /* Words are equal; some byte is zero. */
|
|
|
+.Leq: movi a2, 0 // return equal
|
|
|
+ retw
|
|
|
+
|
|
|
+.Lwne2: /* Words are not equal. On big-endian processors, if none of the
|
|
|
+ bytes are zero, the return value can be determined by a simple
|
|
|
+ comparison. */
|
|
|
+#ifdef __XTENSA_EB__
|
|
|
+ or a10, a8, a5 // a5 = a8 << 1 and a7 = MASK4, both set up before the branch here
|
|
|
+ bnall a10, a7, .Lsomezero // some byte of the s1 word lacks bit 6: take the careful path
|
|
|
+ bgeu a8, a9, .Lposreturn // unsigned word compare orders the strings (words are known to differ)
|
|
|
+ movi a2, -1 // s1 < s2
|
|
|
+ retw
|
|
|
+.Lposreturn:
|
|
|
+ movi a2, 1 // s1 > s2
|
|
|
+ retw
|
|
|
+.Lsomezero: // There is probably some zero byte.
|
|
|
+#endif /* __XTENSA_EB__ */
|
|
|
+.Lwne: /* Words are not equal. Expects a8/a9 = the words from s1/s2 and a4 = mask for byte 0; the masks for bytes 1 and 2 are loaded into a5/a6 below. */
|
|
|
+ xor a2, a8, a9 // get word with nonzero in byte that differs
|
|
|
+ bany a2, a4, .Ldiff0 // if byte 0 differs
|
|
|
+ movi a5, MASK1 // mask for byte 1
|
|
|
+ bnone a8, a4, .Leq // if byte 0 is zero
|
|
|
+ bany a2, a5, .Ldiff1 // if byte 1 differs
|
|
|
+ movi a6, MASK2 // mask for byte 2
|
|
|
+ bnone a8, a5, .Leq // if byte 1 is zero
|
|
|
+ bany a2, a6, .Ldiff2 // if byte 2 differs
|
|
|
+ bnone a8, a6, .Leq // if byte 2 is zero; otherwise byte 3 differs: fall through to .Ldiff3
|
|
|
+#ifdef __XTENSA_EB__
|
|
|
+.Ldiff3:
|
|
|
+.Ldiff2:
|
|
|
+.Ldiff1:
|
|
|
+ /* Byte 0 is equal (at least) and there is a difference before a zero
|
|
|
+ byte. Just subtract words to get the return value.
|
|
|
+ The high order equal bytes cancel, leaving room for the sign. */
|
|
|
+ sub a2, a8, a9
|
|
|
+ retw
|
|
|
+
|
|
|
+.Ldiff0:
|
|
|
+ /* Need to make room for the sign, so can't subtract whole words. */
|
|
|
+ extui a10, a8, 24, 8 // byte 0 of the s1 word (bits 31..24 on big-endian)
|
|
|
+ extui a11, a9, 24, 8 // byte 0 of the s2 word
|
|
|
+ sub a2, a10, a11
|
|
|
+ retw
|
|
|
+
|
|
|
+#else /* !__XTENSA_EB__ */
|
|
|
+ /* Little-endian is a little more difficult because can't subtract
|
|
|
+ whole words. */
|
|
|
+.Ldiff3:
|
|
|
+ /* Bytes 0-2 are equal; byte 3 is different.
|
|
|
+ For little-endian need to have a sign bit for the difference. */
|
|
|
+ extui a10, a8, 24, 8 // byte 3 of the s1 word (bits 31..24 on little-endian)
|
|
|
+ extui a11, a9, 24, 8 // byte 3 of the s2 word
|
|
|
+ sub a2, a10, a11
|
|
|
+ retw
|
|
|
+
|
|
|
+.Ldiff0:
|
|
|
+ /* Byte 0 is different. */
|
|
|
+ extui a10, a8, 0, 8 // byte 0 of the s1 word (bits 7..0)
|
|
|
+ extui a11, a9, 0, 8 // byte 0 of the s2 word
|
|
|
+ sub a2, a10, a11
|
|
|
+ retw
|
|
|
+
|
|
|
+.Ldiff1:
|
|
|
+ /* Byte 0 is equal; byte 1 is different. */
|
|
|
+ extui a10, a8, 8, 8 // byte 1 of the s1 word (bits 15..8)
|
|
|
+ extui a11, a9, 8, 8 // byte 1 of the s2 word
|
|
|
+ sub a2, a10, a11
|
|
|
+ retw
|
|
|
+
|
|
|
+.Ldiff2:
|
|
|
+ /* Bytes 0-1 are equal; byte 2 is different. */
|
|
|
+ extui a10, a8, 16, 8 // byte 2 of the s1 word (bits 23..16)
|
|
|
+ extui a11, a9, 16, 8 // byte 2 of the s2 word
|
|
|
+ sub a2, a10, a11
|
|
|
+ retw
|
|
|
+
|
|
|
+#endif /* !__XTENSA_EB__ */
|
|
|
+
|
|
|
+libc_hidden_def (strcmp)
|
|
|
+
|
|
|
+#ifndef __UCLIBC_HAS_LOCALE__ /* no locale support configured: strcoll is provided as an alias of strcmp */
|
|
|
+strong_alias (strcmp, strcoll)
|
|
|
+libc_hidden_def (strcoll)
|
|
|
+#endif /* !__UCLIBC_HAS_LOCALE__ */
|