
Add optimized memcpy implementation for sh4 (from Stuart Menefy @STMicroelectronics).
This implementation is based on 'backward copying'.

Signed-off-by: Carmelo Amoroso <carmelo.amoroso@st.com>

Carmelo Amoroso · 16 years ago · commit e4f55f33f6
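
A note on the approach: "backward copying" means the new SH4 routine walks both buffers from the last byte down to the first, using SH pre-decrement stores (`mov.b r1,@-r0` and friends). A minimal byte-wise C model of the idea, under a hypothetical name:

    #include <stddef.h>

    /* Sketch only: a descending-address copy, matching the @-r0
       pre-decrement stores in the SH4 assembly below.  */
    static void *bwd_memcpy_model (void *dst, const void *src, size_t n)
    {
      unsigned char *d = (unsigned char *) dst + n;
      const unsigned char *s = (const unsigned char *) src + n;
      while (n--)
        *--d = *--s;
      return dst;
    }

The direction matters beyond performance: the generic memmove fast path hands off to memcpy whenever the destination starts below the source, which is only safe if memcpy copies forwards. The memcopy.h, memmove.c and _memcpy_fwd.c hunks below exist to handle exactly that.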

+ 5 - 0
extra/Configs/Config.sh

@@ -48,3 +48,8 @@ config CONFIG_SH4
 	bool "SH4"
 
 endchoice
+
+config ARCH_HAS_BWD_MEMCPY
+       bool
+       default y
+       depends on CONFIG_SH4
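
Note that the Kconfig symbol ARCH_HAS_BWD_MEMCPY introduced here is what the C hunks below test as __ARCH_HAS_BWD_MEMCPY__ (in memcopy.h and memmove.c), through uClibc's usual config-symbol-to-preprocessor-define mapping.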

+ 28 - 3
libc/string/Makefile.in

@@ -8,6 +8,18 @@
 #
 # Arch specific fun
 #
+# Collect the subarch specific implementation (asm files)
+ifneq ($(strip $(TARGET_SUBARCH)),)
+STRING_SUBARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
+STRING_SUBARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)/$(TARGET_SUBARCH)
+
+STRING_SUBARCH_SSRC := $(wildcard $(STRING_SUBARCH_OUT)/*.S)
+STRING_SUBARCH_SOBJ := $(patsubst $(STRING_SUBARCH_DIR)/%.S,$(STRING_SUBARCH_OUT)/%.o,$(STRING_SUBARCH_SSRC))
+
+STRING_SUBARCH_OBJS := $(STRING_SUBARCH_SOBJ)
+endif
+
+# Collect the arch specific implementation (asm, c files)
 STRING_ARCH_DIR := $(top_srcdir)libc/string/$(TARGET_ARCH)
 STRING_ARCH_OUT := $(top_builddir)libc/string/$(TARGET_ARCH)
 
@@ -15,13 +27,19 @@ STRING_ARCH_SRC := $(wildcard $(STRING_ARCH_DIR)/*.c)
 STRING_ARCH_OBJ := $(patsubst $(STRING_ARCH_DIR)/%.c,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SRC))
 
 STRING_ARCH_SSRC := $(wildcard $(STRING_ARCH_DIR)/*.S)
+
+# Exclude the subarch implementation from the arch ones
+ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
+STRING_ARCH_SSRC := $(filter-out $(patsubst %.o,$(STRING_ARCH_DIR)/%.S,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_ARCH_SSRC))
+endif
+
 STRING_ARCH_SOBJ := $(patsubst $(STRING_ARCH_DIR)/%.S,$(STRING_ARCH_OUT)/%.o,$(STRING_ARCH_SSRC))
 
 STRING_ARCH_OBJS := $(STRING_ARCH_OBJ) $(STRING_ARCH_SOBJ)
 
-libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS)
+libc-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_OBJS) $(STRING_SUBARCH_OBJS)
 
-libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ)
+libc-nomulti-$(UCLIBC_HAS_STRING_ARCH_OPT) += $(STRING_ARCH_SOBJ) $(STRING_SUBARCH_OBJS)
 
 #
 # Generic stuff
@@ -30,11 +48,15 @@ STRING_GENERIC_DIR := $(top_srcdir)libc/string/generic
 STRING_GENERIC_OUT := $(top_builddir)libc/string/generic
 
 STRING_GENERIC_SRC := $(wildcard $(STRING_GENERIC_DIR)/*.c)
+STRING_GENERIC_SRC := $(filter-out $(STRING_GENERIC_DIR)/_memcpy_fwd.c, $(STRING_GENERIC_SRC))
 
 ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
 ifneq ($(strip $(STRING_ARCH_OBJS)),)
 STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_GENERIC_SRC))
 endif
+ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
+STRING_GENERIC_SRC := $(filter-out $(patsubst %.o,$(STRING_GENERIC_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_GENERIC_SRC))
+endif
 endif
 
 STRING_GENERIC_OBJS := $(patsubst $(STRING_GENERIC_DIR)/%.c,$(STRING_GENERIC_OUT)/%.o,$(STRING_GENERIC_SRC))
@@ -93,6 +115,9 @@ ifeq ($(UCLIBC_HAS_STRING_ARCH_OPT),y)
 ifneq ($(strip $(STRING_ARCH_OBJS)),)
 STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_ARCH_OBJS))),$(STRING_CSRC))
 endif
+ifneq ($(strip $(STRING_SUBARCH_OBJS)),)
+STRING_CSRC := $(filter-out $(patsubst %.o,$(STRING_DIR)/%.c,$(notdir $(STRING_SUBARCH_OBJS))),$(STRING_CSRC))
+endif
 endif
 
 ifeq ($(UCLIBC_HAS_STRING_GENERIC_OPT),y)
@@ -111,4 +136,4 @@ libc-nomulti-y += $(STRING_OUT)/__xpg_strerror_r.o
 objclean-y += string_objclean
 
 string_objclean:
-	$(RM) $(STRING_OUT)/{,*/}*.{o,os}
+	$(RM) $(STRING_OUT)/{,*/}{,*/}*.{o,os,oS}
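
The Makefile.in changes extend an existing pattern by one level: objects provided by a subarch directory (here libc/string/sh/sh4) shadow same-named sources at the arch and generic levels via filter-out. A toy sketch of the mechanism, with invented file names:

    # Sketch: a subarch object shadows the arch source of the same name.
    SUBARCH_OBJS := sh4/memcpy.o
    ARCH_SSRC    := memcpy.S memmove.S

    # Map each subarch object back to the source name it replaces,
    # then drop that source from the arch list:
    ARCH_SSRC := $(filter-out $(patsubst %.o,%.S,$(notdir $(SUBARCH_OBJS))),$(ARCH_SSRC))
    # ARCH_SSRC is now just "memmove.S"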

+ 185 - 0
libc/string/generic/_memcpy_fwd.c

@@ -0,0 +1,185 @@
+/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
+
+static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
+{
+  op_t a0, a1;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp)[0];
+      srcp -= 6 * OPSIZ;
+      dstp -= 7 * OPSIZ;
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp)[0];
+      srcp -= 5 * OPSIZ;
+      dstp -= 6 * OPSIZ;
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp)[0];
+      srcp -= 4 * OPSIZ;
+      dstp -= 5 * OPSIZ;
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp)[0];
+      srcp -= 3 * OPSIZ;
+      dstp -= 4 * OPSIZ;
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp)[0];
+      srcp -= 2 * OPSIZ;
+      dstp -= 3 * OPSIZ;
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp)[0];
+      srcp -= 1 * OPSIZ;
+      dstp -= 2 * OPSIZ;
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp)[0];
+      srcp -= 0 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp)[0];
+      srcp -=-1 * OPSIZ;
+      dstp -= 0 * OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+    do7:
+      a1 = ((op_t *) srcp)[1];
+      ((op_t *) dstp)[1] = a0;
+    do6:
+      a0 = ((op_t *) srcp)[2];
+      ((op_t *) dstp)[2] = a1;
+    do5:
+      a1 = ((op_t *) srcp)[3];
+      ((op_t *) dstp)[3] = a0;
+    do4:
+      a0 = ((op_t *) srcp)[4];
+      ((op_t *) dstp)[4] = a1;
+    do3:
+      a1 = ((op_t *) srcp)[5];
+      ((op_t *) dstp)[5] = a0;
+    do2:
+      a0 = ((op_t *) srcp)[6];
+      ((op_t *) dstp)[6] = a1;
+    do1:
+      a1 = ((op_t *) srcp)[7];
+      ((op_t *) dstp)[7] = a0;
+
+      srcp += 8 * OPSIZ;
+      dstp += 8 * OPSIZ;
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   DSTP should be aligned for memory operations on `op_t's, but SRCP must
+   *not* be aligned.  */
+
+static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
+{
+  op_t a0, a1, a2, a3;
+  int sh_1, sh_2;
+
+  /* Calculate how to shift a word read at the memory operation
+     aligned srcp to make it aligned for copy.  */
+
+  sh_1 = 8 * (srcp % OPSIZ);
+  sh_2 = 8 * OPSIZ - sh_1;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      a1 = ((op_t *) srcp)[0];
+      a2 = ((op_t *) srcp)[1];
+      srcp -= 1 * OPSIZ;
+      dstp -= 3 * OPSIZ;
+      len += 2;
+      goto do1;
+    case 3:
+      a0 = ((op_t *) srcp)[0];
+      a1 = ((op_t *) srcp)[1];
+      srcp -= 0 * OPSIZ;
+      dstp -= 2 * OPSIZ;
+      len += 1;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a3 = ((op_t *) srcp)[0];
+      a0 = ((op_t *) srcp)[1];
+      srcp -=-1 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      len += 0;
+      goto do3;
+    case 1:
+      a2 = ((op_t *) srcp)[0];
+      a3 = ((op_t *) srcp)[1];
+      srcp -=-2 * OPSIZ;
+      dstp -= 0 * OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
+    do3:
+      a1 = ((op_t *) srcp)[1];
+      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
+    do2:
+      a2 = ((op_t *) srcp)[2];
+      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
+    do1:
+      a3 = ((op_t *) srcp)[3];
+      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
+
+      srcp += 4 * OPSIZ;
+      dstp += 4 * OPSIZ;
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
+}

+ 3 - 0
libc/string/generic/memcopy.h

@@ -107,6 +107,7 @@ typedef unsigned char byte;
 	}								      \
     } while (0)
 
+#ifdef __ARCH_HAS_BWD_MEMCPY__
 /* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with
    the assumption that DST_BP is aligned on an OPSIZ multiple.  If
    not all bytes could be easily copied, store remaining number of bytes
@@ -125,6 +126,8 @@ typedef unsigned char byte;
       (nbytes_left) = (nbytes) % OPSIZ;					      \
     } while (0)
 
+#endif
+
 /* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR,
    beginning at the words (of type op_t) right before the pointers and
    continuing towards smaller addresses.  May take advantage of that

+ 0 - 186
libc/string/generic/memcpy.c

@@ -25,192 +25,6 @@
 
 /* Experimentally off - libc_hidden_proto(memcpy) */
 
-/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
-   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
-   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
-
-static void _wordcopy_fwd_aligned (long int dstp, long int srcp, size_t len)
-{
-  op_t a0, a1;
-
-  switch (len % 8)
-    {
-    case 2:
-      a0 = ((op_t *) srcp)[0];
-      srcp -= 6 * OPSIZ;
-      dstp -= 7 * OPSIZ;
-      len += 6;
-      goto do1;
-    case 3:
-      a1 = ((op_t *) srcp)[0];
-      srcp -= 5 * OPSIZ;
-      dstp -= 6 * OPSIZ;
-      len += 5;
-      goto do2;
-    case 4:
-      a0 = ((op_t *) srcp)[0];
-      srcp -= 4 * OPSIZ;
-      dstp -= 5 * OPSIZ;
-      len += 4;
-      goto do3;
-    case 5:
-      a1 = ((op_t *) srcp)[0];
-      srcp -= 3 * OPSIZ;
-      dstp -= 4 * OPSIZ;
-      len += 3;
-      goto do4;
-    case 6:
-      a0 = ((op_t *) srcp)[0];
-      srcp -= 2 * OPSIZ;
-      dstp -= 3 * OPSIZ;
-      len += 2;
-      goto do5;
-    case 7:
-      a1 = ((op_t *) srcp)[0];
-      srcp -= 1 * OPSIZ;
-      dstp -= 2 * OPSIZ;
-      len += 1;
-      goto do6;
-
-    case 0:
-      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
-	return;
-      a0 = ((op_t *) srcp)[0];
-      srcp -= 0 * OPSIZ;
-      dstp -= 1 * OPSIZ;
-      goto do7;
-    case 1:
-      a1 = ((op_t *) srcp)[0];
-      srcp -=-1 * OPSIZ;
-      dstp -= 0 * OPSIZ;
-      len -= 1;
-      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
-	goto do0;
-      goto do8;			/* No-op.  */
-    }
-
-  do
-    {
-    do8:
-      a0 = ((op_t *) srcp)[0];
-      ((op_t *) dstp)[0] = a1;
-    do7:
-      a1 = ((op_t *) srcp)[1];
-      ((op_t *) dstp)[1] = a0;
-    do6:
-      a0 = ((op_t *) srcp)[2];
-      ((op_t *) dstp)[2] = a1;
-    do5:
-      a1 = ((op_t *) srcp)[3];
-      ((op_t *) dstp)[3] = a0;
-    do4:
-      a0 = ((op_t *) srcp)[4];
-      ((op_t *) dstp)[4] = a1;
-    do3:
-      a1 = ((op_t *) srcp)[5];
-      ((op_t *) dstp)[5] = a0;
-    do2:
-      a0 = ((op_t *) srcp)[6];
-      ((op_t *) dstp)[6] = a1;
-    do1:
-      a1 = ((op_t *) srcp)[7];
-      ((op_t *) dstp)[7] = a0;
-
-      srcp += 8 * OPSIZ;
-      dstp += 8 * OPSIZ;
-      len -= 8;
-    }
-  while (len != 0);
-
-  /* This is the right position for do0.  Please don't move
-     it into the loop.  */
- do0:
-  ((op_t *) dstp)[0] = a1;
-}
-
-/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
-   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
-   DSTP should be aligned for memory operations on `op_t's, but SRCP must
-   *not* be aligned.  */
-
-static void _wordcopy_fwd_dest_aligned (long int dstp, long int srcp, size_t len)
-{
-  op_t a0, a1, a2, a3;
-  int sh_1, sh_2;
-
-  /* Calculate how to shift a word read at the memory operation
-     aligned srcp to make it aligned for copy.  */
-
-  sh_1 = 8 * (srcp % OPSIZ);
-  sh_2 = 8 * OPSIZ - sh_1;
-
-  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
-     it points in the middle of.  */
-  srcp &= -OPSIZ;
-
-  switch (len % 4)
-    {
-    case 2:
-      a1 = ((op_t *) srcp)[0];
-      a2 = ((op_t *) srcp)[1];
-      srcp -= 1 * OPSIZ;
-      dstp -= 3 * OPSIZ;
-      len += 2;
-      goto do1;
-    case 3:
-      a0 = ((op_t *) srcp)[0];
-      a1 = ((op_t *) srcp)[1];
-      srcp -= 0 * OPSIZ;
-      dstp -= 2 * OPSIZ;
-      len += 1;
-      goto do2;
-    case 0:
-      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
-	return;
-      a3 = ((op_t *) srcp)[0];
-      a0 = ((op_t *) srcp)[1];
-      srcp -=-1 * OPSIZ;
-      dstp -= 1 * OPSIZ;
-      len += 0;
-      goto do3;
-    case 1:
-      a2 = ((op_t *) srcp)[0];
-      a3 = ((op_t *) srcp)[1];
-      srcp -=-2 * OPSIZ;
-      dstp -= 0 * OPSIZ;
-      len -= 1;
-      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
-	goto do0;
-      goto do4;			/* No-op.  */
-    }
-
-  do
-    {
-    do4:
-      a0 = ((op_t *) srcp)[0];
-      ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
-    do3:
-      a1 = ((op_t *) srcp)[1];
-      ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2);
-    do2:
-      a2 = ((op_t *) srcp)[2];
-      ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2);
-    do1:
-      a3 = ((op_t *) srcp)[3];
-      ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2);
-
-      srcp += 4 * OPSIZ;
-      dstp += 4 * OPSIZ;
-      len -= 4;
-    }
-  while (len != 0);
-
-  /* This is the right position for do0.  Please don't move
-     it into the loop.  */
- do0:
-  ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2);
-}
-
 void *memcpy (void *dstpp, const void *srcpp, size_t len)
 {
   unsigned long int dstp = (long int) dstpp;

+ 13 - 4
libc/string/generic/memmove.c

@@ -24,12 +24,18 @@
 #include "memcopy.h"
 #include "pagecopy.h"
 
+#ifdef __ARCH_HAS_BWD_MEMCPY__
+/* generic-opt memmove assumes memcpy does forward copying! */
+#include "_memcpy_fwd.c"
+#endif
+
 /* Experimentally off - libc_hidden_proto(memmove) */
 /* Experimentally off - libc_hidden_proto(memcpy) */
 
 static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
 {
-  op_t a0, a1;
+  op_t a0 = 0;
+  op_t a1 = 0;
 
   switch (len % 8)
     {
@@ -133,7 +139,10 @@ static void _wordcopy_bwd_aligned (long int dstp, long int srcp, size_t len)
 
 static void _wordcopy_bwd_dest_aligned (long int dstp, long int srcp, size_t len)
 {
-  op_t a0, a1, a2, a3;
+  op_t a0 = 0;
+  op_t a1 = 0;
+  op_t a2 = 0;
+  op_t a3 = 0;
   int sh_1, sh_2;
 
   /* Calculate how to shift a word read at the memory operation
@@ -218,8 +227,8 @@ void *memmove (void *dest, const void *src, size_t len)
      Reduces the working set.  */
   if (dstp - srcp >= len)	/* *Unsigned* compare!  */
     {
-#if 1
+#ifndef __ARCH_HAS_BWD_MEMCPY__
-#warning REMINDER: generic-opt memmove assumes memcpy does forward copying!
+      /* Backward memcpy implementation cannot be used */
       memcpy(dest, src, len);
 #else
       /* Copy from the beginning to the end.  */
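
For context on the `#ifndef __ARCH_HAS_BWD_MEMCPY__` change above: memmove's fast path delegates to memcpy whenever `dstp - srcp >= len`, a single *unsigned* compare that covers both "dst below src" and "dst at least len above src". With a backward-copying memcpy the dst-below-src half of that set would clobber unread source bytes, so the shortcut is compiled out and the forward word-copy helpers pulled in from _memcpy_fwd.c are used instead. A sketch of the compare:

    #include <stddef.h>

    /* Sketch: memmove's one-compare overlap test.  When dstp < srcp the
       subtraction wraps to a huge unsigned value, so the test also passes
       for dst-below-src, where a forward copy is always safe.  */
    static int forward_copy_is_safe (unsigned long dstp, unsigned long srcp, size_t len)
    {
      return dstp - srcp >= len;
    }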

+ 808 - 0
libc/string/sh/sh4/memcpy.S

@@ -0,0 +1,808 @@
+/*
+ * "memcpy" implementation of SuperH
+ *
+ * Copyright (C) 1999  Niibe Yutaka
+ * Copyright (c) 2002  STMicroelectronics Ltd
+ *   Modified from memcpy.S and micro-optimised for SH4
+ *   Stuart Menefy (stuart.menefy@st.com)
+ *
+ */
+
+/*
+ * void *memcpy(void *dst, const void *src, size_t n);
+ *
+ * It is assumed that there is no overlap between src and dst.
+ * If there is an overlap, then the results are undefined.
+ */
+
+#include <endian.h>
+
+	!
+	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase1:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-1,r5		!  79 EX
+	mov	r4,r2		!   5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
+	add	#-4,r5		!  50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef __LITTLE_ENDIAN__
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll16	r3		! 103 EX
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shll8	r3		! 102 EX		! Oxxx
+
+	shlr8	r6		! 106 EX		! xNML
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! ONML
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
+	mov	r7,r3		!   5 MT (latency=0)	! OPQR
+
+	cmp/hi	r2,r0		!  57 MT
+	shlr16	r3		! 107 EX
+
+	shlr8	r3		! 106 EX		! xxxO
+	mov	r1,r6		!   5 MT (latency=0)
+
+	shll8	r6		! 102 EX		! LMNx
+	mov	r1,r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! LMNO
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#endif
+	! Finally, copy a byte at once, if necessary
+
+	add	#4,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
+	!
+
+	! Size is 16 or greater, and may have trailing bytes
+
+	.balign	32
+.Lcase3:
+	! Read a long word and write a long word at once
+	! At the start of each iteration, r7 contains last long load
+	add	#-3,r5		! 79 EX
+	mov	r4,r2		!  5 MT (0 cycles latency)
+
+	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
+	add	#-4,r5		! 50 EX
+
+	add	#7,r2		!  79 EX
+	!
+#ifdef __LITTLE_ENDIAN__
+	! 6 cycles, 4 bytes per iteration
+3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
+	mov	r7, r3		!   5 MT (latency=0)	! RQPO
+
+	cmp/hi	r2,r0		!  57 MT
+	shll8	r3		! 102 EX		! QPOx
+
+	mov	r1,r6		!   5 MT (latency=0)
+	shlr16	r6		! 107 EX
+
+	shlr8	r6		! 106 EX		! xxxN
+	mov	r1, r7		!   5 MT (latency=0)
+
+	or	r6,r3		!  82 EX		! QPON
+	bt/s	3b		! 109 BR
+
+	 mov.l	r3,@-r0		!  30 LS
+#else
+3:	mov	r1,r3		! OPQR
+	shlr8	r3		! xOPQ
+	mov.l	@(r0,r5),r1	! KLMN
+	mov	r1,r6
+	shll16	r6
+	shll8	r6		! Nxxx
+	or	r6,r3		! NOPQ
+	cmp/hi	r2,r0
+	bt/s	3b
+	 mov.l	r3,@-r0
+#endif
+
+	! Finally, copy a byte at once, if necessary
+
+	add	#6,r5		!  50 EX
+	cmp/eq	r4,r0		!  54 MT
+
+	add	#-6,r2		!  50 EX
+	bt	9f		! 109 BR
+
+8:	cmp/hi	r2,r0		!  57 MT
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+	bt/s	8b		! 109 BR
+
+	 mov.b	r1,@-r0		!  29 LS
+
+9:	rts
+	 nop
+
+/* void * memcpy(void *dst, const void *src, size_t len) */
+.text
+.align 4
+.type  memcpy,@function
+.globl memcpy;
+memcpy:
+
+	! Calculate the invariants which will be used in the remainder
+	! of the code:
+	!
+	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
+	!	         [ ...  ]                 [ ...  ]
+	!	           :                        :
+	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
+	!
+	!
+
+	! Short circuit the common case of src, dst and len being 32 bit aligned
+	! and test for zero length move
+
+	mov	r6, r0		!   5 MT (0 cycle latency)
+	or	r4, r0		!  82 EX
+
+	or	r5, r0		!  82 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	99f		! 111 BR		(zero len)
+	 tst	#3, r0		!  87 MT
+
+	mov	r4, r0		!   5 MT (0 cycle latency)
+	add	r6, r0		!  49 EX
+
+	mov	#16, r1		!   6 EX
+	bt/s	.Lcase00	! 111 BR		(aligned)
+
+	 sub	r4, r5		!  75 EX
+
+	! Arguments are not nicely long word aligned or zero len.
+	! Check for small copies, and if so do a simple byte at a time copy.
+	!
+	! Deciding on an exact value of 'small' is not easy, as the point at which
+	! using the optimised routines become worthwhile varies (these are the
+	! cycle counts for different sizes using byte-at-a-time vs. optimised):
+	!	size	byte-at-time	long	word	byte
+	!	16	42		39-40	46-50	50-55
+	!	24	58		43-44	54-58	62-67
+	!	36	82		49-50	66-70	80-85
+	! However the penalty for getting it 'wrong' is much higher for long word
+	! aligned data (and this is more common), so use a value of 16.
+
+	cmp/gt	r6,r1		!  56 MT
+
+	add	#-1,r5		!  50 EX
+	bf/s	6f		! 108 BR		(not small)
+
+	 mov	r5, r3		!   5 MT (latency=0)
+	shlr	r6		! 104 EX
+
+	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+	bf/s	4f		! 111 BR
+
+	 add	#-1,r3		!  50 EX
+	tst	r6, r6		!  86 MT
+
+	bt/s	98f		! 110 BR
+	 mov.b	r1,@-r0		!  29 LS
+
+	! 4 cycles, 2 bytes per iteration
+3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
+
+4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.b	r1,@-r0		!  29 LS
+	bf/s	3b		! 111 BR
+
+	 mov.b	r2,@-r0		!  29 LS
+98:
+	rts
+	 nop
+
+99:	rts
+	 mov	r4, r0
+
+	! Size is not small, so it's worthwhile looking for optimisations.
+	! First align destination to a long word boundary.
+	!
+	! r5 = normal value -1
+
+6:	tst	#3, r0		!  87 MT
+        mov	#3, r3		!   6 EX
+
+	bt/s	2f		! 111 BR
+	 and	r0,r3		!  78 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	dt	r3		!  67 EX
+	mov.b	@(r0,r5),r1	!  19 LS (latency=2)
+
+	add	#-1, r6		!  79 EX
+	bf/s	1b		! 109 BR
+
+	 mov.b	r1,@-r0		!  28 LS
+
+2:	add	#1, r5		!  79 EX
+
+	! Now select the appropriate bulk transfer code based on relative
+	! alignment of src and dst.
+
+	mov	r0, r3		!   5 MT (latency=0)
+
+	mov	r5, r0		!   5 MT (latency=0)
+	tst	#1, r0		!  87 MT
+
+	bf/s	1f		! 111 BR
+	 mov	#64, r7		!   6 EX
+
+	! bit 0 clear
+
+	cmp/ge	r7, r6		!  55 MT
+
+	bt/s	2f		! 111 BR
+	 tst	#2, r0		!  87 MT
+
+	! small
+	bt/s	.Lcase0
+	 mov	r3, r0
+
+	bra	.Lcase2
+	 nop
+
+	! big
+2:	bt/s	.Lcase0b
+	 mov	r3, r0
+
+	bra	.Lcase2b
+	 nop
+
+	! bit 0 set
+1:	tst	#2, r0		! 87 MT
+
+	bt/s	.Lcase1
+	 mov	r3, r0
+
+	bra	.Lcase3
+	 nop
+
+
+	!
+	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
+	!
+
+	! src, dst and size are all long word aligned
+	! size is non-zero
+
+	.balign	32
+.Lcase00:
+	mov	#64, r1		!   6 EX
+	mov	r5, r3		!   5 MT (latency=0)
+
+	cmp/gt	r6, r1		!  56 MT
+	add	#-4, r5		!  50 EX
+
+	bf	.Lcase00b	! 108 BR		(big loop)
+	shlr2	r6		! 105 EX
+
+	shlr	r6		! 104 EX
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+	bf/s	4f		! 111 BR
+	 add	#-8, r3		!  50 EX
+
+	tst	r6, r6		!  86 MT
+	bt/s	5f		! 110 BR
+
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	dt	r6		!  67 EX
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+5:	rts
+	 nop
+
+
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+	.balign	32
+.Lcase0:
+	add	#-4, r5		!  50 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	mov	#4, r2		!   6 EX
+
+	add	#11, r7		!  50 EX
+	tst	r2, r6		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r1, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+	add	#3,r5		!  50 EX
+
+	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	.balign	32
+.Lcase0b:
+	add	#-4, r5		!  50 EX
+
+.Lcase00b:
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! copy initial words until cache line aligned
+
+	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+	tst	#4, r0		!  87 MT
+
+	mov	r5, r6		!   5 MT (latency=0)
+	add	#-4, r6		!  50 EX
+
+	bt/s	4f		! 111 BR
+	 add	#8, r3		!  50 EX
+
+	tst	#0x18, r0	!  87 MT
+
+	bt/s	1f		! 109 BR
+	 mov.l	r1,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
+	cmp/eq	r3, r0		!  54 MT
+
+	mov.l	r1, @-r0	!  30 LS
+	bf/s	3b		! 109 BR
+
+	 mov.l	r7, @-r0	!  30 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1c, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	! 16 cycles, 32 bytes per iteration
+2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
+	add	#-0x20, r1	! 50 EX
+	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
+	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
+	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
+	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
+	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
+	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
+	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
+	movca.l	r0,@r1		! 40 LS (latency=3-7)
+	mov.l	r3,@(0x04,r1)	! 33 LS
+	mov.l	r6,@(0x08,r1)	! 33 LS
+	mov.l	r7,@(0x0c,r1)	! 33 LS
+
+	mov.l	r8,@(0x10,r1)	! 33 LS
+	add	#-0x20, r5	! 50 EX
+
+	mov.l	r9,@(0x14,r1)	! 33 LS
+	cmp/eq	r2,r1		! 54 MT
+
+	mov.l	r10,@(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11,@(0x1c,r1)	!  33 LS
+
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+	sub	r4, r1		!  75 EX		(len remaining)
+
+	! number of trailing bytes is non-zero
+	!
+	! invariants restored (r5 already decremented by 4)
+	! also r1=num bytes remaining
+
+	mov	#4, r2		!   6 EX
+	mov	r4, r7		!   5 MT (latency=0)
+
+	add	#0x1c, r5	!  50 EX		(back to -4)
+	cmp/hs	r2, r1		!  58 MT
+
+	bf/s	5f		! 108 BR
+	 add	 #11, r7	!  50 EX
+
+	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+	tst	r2, r1		!  86 MT
+
+	mov	r5, r3		!   5 MT (latency=0)
+	bt/s	4f		! 111 BR
+
+	 add	#-4, r3		!  50 EX
+	cmp/hs	r2, r1		!  58 MT
+
+	bt/s	5f		! 111 BR
+	 mov.l	r6,@-r0		!  30 LS
+
+	! 4 cycles, 2 long words per iteration
+3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
+
+4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
+	cmp/hi	r7, r0
+
+	mov.l	r6, @-r0	!  30 LS
+	bt/s	3b		! 109 BR
+
+	 mov.l	r2, @-r0	!  30 LS
+
+	! Copy the final 0-3 bytes
+
+5:	cmp/eq	r0, r4		!  54 MT
+	add	#-10, r7	!  50 EX
+
+	bt	9f		! 110 BR
+	add	#3,r5		!  50 EX
+
+	! 3 cycles, 1 byte per iteration
+1:	mov.b	@(r0,r5),r1	!  19 LS
+	cmp/hi	r7,r0		!  57 MT
+
+	bt/s	1b		! 111 BR
+	 mov.b	r1,@-r0		!  28 LS
+
+9:	rts
+	 nop
+
+	!
+	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
+	!
+
+	.balign	32
+.Lcase2:
+	! Size is 16 or greater and less than 64, but may have trailing bytes
+
+2:	mov	r5, r6		!   5 MT (latency=0)
+	add	#-2,r5		!  50 EX
+
+	mov	r4,r2		!   5 MT (latency=0)
+	add	#-4,r6		!  50 EX
+
+	add	#7,r2		!  50 EX
+3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+
+	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
+	cmp/hi	r2,r0		!  57 MT
+
+	mov.w	r1,@-r0		!  29 LS
+	bt/s	3b		! 111 BR
+
+	 mov.w	r3,@-r0		!  29 LS
+
+	bra	10f
+	 nop
+
+
+	.balign	32
+.Lcase2b:
+	! Size is at least 64 bytes, so will be going round the big loop at least once.
+	!
+	!   r2 = rounded up r4
+	!   r3 = rounded down r0
+
+	mov	r0, r3		!   5 MT (latency=0)
+	mov	#(~0x1f), r1	!   6 EX
+
+	and	r1, r3		!  78 EX
+	mov	r4, r2		!   5 MT (latency=0)
+
+	cmp/eq	r3, r0		!  54 MT
+	add	#0x1f, r2	!  50 EX
+
+	add	#-2, r5		!  50 EX
+	bt/s	1f		! 110 BR
+	 and	r1, r2		!  78 EX
+
+	! Copy a short word one at a time until we are cache line aligned
+	!   Normal values: r0, r2, r3, r4
+	!   Unused: r1, r6, r7
+	!   Mod: r5 (=r5-2)
+	!
+	add	#2, r3		!  50 EX
+
+2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
+	cmp/eq	r3,r0		!  54 MT
+
+	bf/s	2b		! 111 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+
+	! Copy the cache line aligned blocks
+	!
+	! In use: r0, r2, r4, r5 (=r5-2)
+	! Scratch: r1, r3, r6, r7
+	!
+	! We could do this with the four scratch registers, but if src
+	! and dest hit the same cache line, this will thrash, so make
+	! use of additional registers.
+	!
+	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
+	!   r5:	 src (was r0+r5)
+	!   r1:	 dest (was r0)
+	! this can be reversed at the end, so we don't need to save any extra
+	! state.
+	!
+1:	mov.l	r8, @-r15	!  30 LS
+	add	r0, r5		!  49 EX
+
+	mov.l	r9, @-r15	!  30 LS
+	mov	r0, r1		!   5 MT (latency=0)
+
+	mov.l	r10, @-r15	!  30 LS
+	add	#-0x1e, r5	!  50 EX
+
+	mov.l	r11, @-r15	!  30 LS
+
+	mov.l	r12, @-r15	!  30 LS
+
+	! 17 cycles, 32 bytes per iteration
+#ifdef __LITTLE_ENDIAN__
+2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
+	add	#-0x20, r1	!  50 EX
+
+	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK
+
+	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
+	shll16	r0		! 103 EX			JI..
+
+	mov.l	@r5+, r7	!  15 LS (latency=2)
+	xtrct	r3, r0		!  48 EX			LKJI
+
+	mov.l	@r5+, r8	!  15 LS (latency=2)
+	xtrct	r6, r3		!  48 EX			PONM
+
+	mov.l	@r5+, r9	!  15 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@r5+, r10	!  15 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@r5+, r11	!  15 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.w	@r5+, r12	!  15 LS (latency=2)
+	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r3, @(0x04,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r6, @(0x08,r1)	!  33 LS
+
+	mov.l	r7, @(0x0c,r1)	!  33 LS
+
+	mov.l	r8, @(0x10,r1)	!  33 LS
+	add	#-0x40, r5	!  50 EX
+
+	mov.l	r9, @(0x14,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x18,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x1c,r1)	!  33 LS
+#else
+2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
+	add	#-2, r5		!  50 EX
+
+	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
+	add	#-4, r1		!  50 EX
+
+	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
+	shll16	r0		! 103 EX
+
+	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
+	xtrct	r3, r0		!  48 EX
+
+	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
+	xtrct	r6, r3		!  48 EX
+
+	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
+	xtrct	r7, r6		!  48 EX
+
+	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
+	xtrct	r8, r7		!  48 EX
+
+	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
+	xtrct	r9, r8		!  48 EX
+
+	mov.w	@(0x02,r5), r12	!  18 LS (latency=2)
+	xtrct	r10, r9		!  48 EX
+
+	movca.l	r0,@r1		!  40 LS (latency=3-7)
+	add	#-0x1c, r1	!  50 EX
+
+	mov.l	r3, @(0x1c,r1)	!  33 LS
+	xtrct	r11, r10	!  48 EX
+
+	mov.l	r6, @(0x18,r1)	!  33 LS
+	xtrct	r12, r11	!  48 EX
+
+	mov.l	r7, @(0x14,r1)	!  33 LS
+
+	mov.l	r8, @(0x10,r1)	!  33 LS
+	add	#-0x3e, r5	!  50 EX
+
+	mov.l	r9, @(0x0c,r1)	!  33 LS
+	cmp/eq	r2,r1		!  54 MT
+
+	mov.l	r10, @(0x08,r1)	!  33 LS
+	bf/s	2b		! 109 BR
+
+	 mov.l	r11, @(0x04,r1)	!  33 LS
+#endif
+
+	mov.l	@r15+, r12
+	mov	r1, r0		!   5 MT (latency=0)
+
+	mov.l	@r15+, r11	!  15 LS
+	sub	r1, r5		!  75 EX
+
+	mov.l	@r15+, r10	!  15 LS
+	cmp/eq	r4, r0		!  54 MT
+
+	bf/s	1f		! 109 BR
+	 mov.l	 @r15+, r9	!  15 LS
+
+	rts
+1:	 mov.l	@r15+, r8	!  15 LS
+
+	add	#0x1e, r5	!  50 EX
+
+	! Finish off a short word at a time
+	! r5 must be invariant - 2
+10:	mov	r4,r2		!   5 MT (latency=0)
+	add	#1,r2		!  50 EX
+
+	cmp/hi	r2, r0		!  57 MT
+	bf/s	1f		! 109 BR
+
+	 add	#2, r2		!  50 EX
+
+3:	mov.w	@(r0,r5),r1	!  20 LS
+	cmp/hi	r2,r0		!  57 MT
+
+	bt/s	3b		! 109 BR
+
+	 mov.w	r1,@-r0		!  29 LS
+1:
+
+	!
+	! Finally, copy the last byte if necessary
+	cmp/eq	r4,r0		!  54 MT
+	bt/s	9b
+	 add	#1,r5
+	mov.b	@(r0,r5),r1
+	rts
+	 mov.b	r1,@-r0
+
+.size memcpy,.-memcpy;
+
+libc_hidden_def (memcpy)
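
A closing note on the unaligned paths: .Lcase1 and .Lcase3 handle a source that is misaligned by one or three bytes relative to the destination by reading whole long words and splicing adjacent pairs together with shifts, as the `GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.` diagrams indicate. A C model of the little-endian off-by-one splice (hypothetical helper; the real code additionally runs backwards and overlaps loads with stores):

    #include <stdint.h>

    /* Sketch of the shll/shlr merge in .Lcase1 (__LITTLE_ENDIAN__ path):
       each output word takes the top three bytes of the lower-addressed
       aligned load and the low byte of the higher-addressed one.  */
    static uint32_t splice_off_by_1 (uint32_t lo_word, uint32_t hi_word)
    {
      return (lo_word >> 8) | (hi_word << 24);
    }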