|
@@ -0,0 +1,348 @@
|
|
|
+/*
|
|
|
+ * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
|
|
|
+ * Copyright (C) 2008-2009 PetaLogix
|
|
|
+ * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
|
|
|
+ *
|
|
|
+ * This file is subject to the terms and conditions of the GNU General
|
|
|
+ * Public License. See the file COPYING in the main directory of this
|
|
|
+ * archive for more details.
|
|
|
+ *
|
|
|
+ * Written by Jim Law <jlaw@irispower.com>
|
|
|
+ *
|
|
|
+ * intended to replace:
|
|
|
+ * memcpy in memcpy.c and
|
|
|
+ * memmove in memmove.c
|
|
|
+ * ... in arch/microblaze/lib
|
|
|
+ *
|
|
|
+ *
|
|
|
+ * assly_fastcopy.S
|
|
|
+ *
|
|
|
+ * Attempt at quicker memcpy and memmove for MicroBlaze
|
|
|
+ * Input : Operand1 in Reg r5 - destination address
|
|
|
+ * Operand2 in Reg r6 - source address
|
|
|
+ * Operand3 in Reg r7 - number of bytes to transfer
|
|
|
+ * Output: Result in Reg r3 - starting destination address
|
|
|
+ *
|
|
|
+ *
|
|
|
+ * Explanation:
|
|
|
+ * Perform (possibly unaligned) copy of a block of memory
|
|
|
+ * between mem locations with size of xfer spec'd in bytes
|
|
|
+ */
|
|
|
+
|
|
|
+ .globl memmove
|
|
|
+ .type memmove, @function
|
|
|
+ .ent memmove
|
|
|
+
|
|
|
+memmove: /* entry: r5 = d (destination), r6 = s (source), r7 = c (byte count), as listed in the file header */
|
|
|
+ cmpu r4, r5, r6 /* n = s - d; the unsigned compare leaves n negative only when d > s */
|
|
|
+ bgei r4, HIDDEN_JUMPTARGET(memcpy) /* n >= 0 (d <= s): let memcpy do the whole copy; otherwise fall through to the descending copy */
|
|
|
+
|
|
|
+fast_memcpy_descending: /* d > s: copy from the end of both buffers down towards their start */
|
|
|
+ /* move d to return register as value of function */
|
|
|
+ addi r3, r5, 0 /* r3 = original d; r3 is not written again in this routine */
|
|
|
+
|
|
|
+ add r5, r5, r7 /* d = d + c (one past the last destination byte) */
|
|
|
+ add r6, r6, r7 /* s = s + c (one past the last source byte) */
|
|
|
+
|
|
|
+ addi r4, r0, 4 /* n = 4 */
|
|
|
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
|
|
|
+ blti r4,d_xfer_end /* if n < 0, less than one word to transfer: use the byte loop only */
|
|
|
+
|
|
|
+ /* transfer first 0~3 bytes to get aligned dest address */
|
|
|
+ andi r4, r5, 3 /* n = d & 3 (bytes between d and the word boundary below it) */
|
|
|
+ /* if zero, destination already aligned */
|
|
|
+ beqi r4,d_dalign_done
|
|
|
+ rsub r7, r4, r7 /* c = c - n: account for the n bytes the following byte loop copies */
|
|
|
+
|
|
|
+d_xfer_first_loop: /* byte loop: copy n (r4) bytes downwards so that d becomes word-aligned */
|
|
|
+ /* if no bytes left to transfer, transfer the bulk */
|
|
|
+ beqi r4,d_dalign_done
|
|
|
+ addi r6, r6, -1 /* s-- */
|
|
|
+ addi r5, r5, -1 /* d-- */
|
|
|
+ lbui r11, r6, 0 /* h = *s */
|
|
|
+ sbi r11, r5, 0 /* *d = h */
|
|
|
+ brid d_xfer_first_loop /* loop; the n == 0 exit test is at the loop head */
|
|
|
+ addi r4, r4, -1 /* n-- (IN DELAY SLOT, executes before the branch target) */
|
|
|
+
|
|
|
+d_dalign_done: /* d is word-aligned here; check whether at least one 32-byte block remains */
|
|
|
+ addi r4, r0, 32 /* n = 32 */
|
|
|
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
|
|
|
+ /* if n < 0, less than one block to transfer */
|
|
|
+ blti r4, d_block_done
|
|
|
+
|
|
|
+d_block_xfer: /* split c into n = bytes moved as whole 32-byte blocks and the remainder kept in c */
|
|
|
+ andi r4, r7, 0xffffffe0 /* n = c & ~31 */
|
|
|
+ rsub r7, r4, r7 /* c = c - n (remainder, less than 32) */
|
|
|
+
|
|
|
+ andi r9, r6, 3 /* t1 = s & 3 */
|
|
|
+ /* if temp != 0, unaligned transfers needed */
|
|
|
+ bnei r9, d_block_unaligned
|
|
|
+
|
|
|
+d_block_aligned: /* s and d both word-aligned: move 32 bytes per pass, highest word first, until n == 0 */
|
|
|
+ addi r6, r6, -32 /* s = s - 32 */
|
|
|
+ addi r5, r5, -32 /* d = d - 32 */
|
|
|
+ lwi r9, r6, 28 /* t1 = *(s + 28) */
|
|
|
+ lwi r10, r6, 24 /* t2 = *(s + 24) */
|
|
|
+ lwi r11, r6, 20 /* t3 = *(s + 20) */
|
|
|
+ lwi r12, r6, 16 /* t4 = *(s + 16) */
|
|
|
+ swi r9, r5, 28 /* *(d + 28) = t1 */
|
|
|
+ swi r10, r5, 24 /* *(d + 24) = t2 */
|
|
|
+ swi r11, r5, 20 /* *(d + 20) = t3 */
|
|
|
+ swi r12, r5, 16 /* *(d + 16) = t4 */
|
|
|
+ lwi r9, r6, 12 /* t1 = *(s + 12) */
|
|
|
+ lwi r10, r6, 8 /* t2 = *(s + 8) */
|
|
|
+ lwi r11, r6, 4 /* t3 = *(s + 4) */
|
|
|
+ lwi r12, r6, 0 /* t4 = *(s + 0) */
|
|
|
+ swi r9, r5, 12 /* *(d + 12) = t1 */
|
|
|
+ swi r10, r5, 8 /* *(d + 8) = t2 */
|
|
|
+ swi r11, r5, 4 /* *(d + 4) = t3 */
|
|
|
+ addi r4, r4, -32 /* n = n - 32 */
|
|
|
+ bneid r4, d_block_aligned /* while (n) loop */
|
|
|
+ swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
|
|
|
+ bri d_block_done /* all blocks moved: skip the unaligned variants */
|
|
|
+
|
|
|
+d_block_unaligned: /* s not word-aligned: read aligned words through as (r8) and merge neighbouring words with shifts */
|
|
|
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
|
|
|
+ rsub r6, r4, r6 /* s = s - n (the block loops below address the source through as, not s) */
|
|
|
+ lwi r11, r8, 0 /* h = *(as + 0) */
|
|
|
+
|
|
|
+ addi r9, r9, -1 /* dispatch on t1 = s & 3; NOTE(review): the shift directions in the u1/u2/u3 variants look big-endian specific - confirm for little-endian builds */
|
|
|
+ beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
|
|
|
+ addi r9, r9, -1
|
|
|
+ beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset; otherwise (t1 was 3) fall through to d_block_u3 */
|
|
|
+
|
|
|
+d_block_u3: /* s & 3 == 3: each stored word is (higher source word >> 8) | (next lower source word << 24) */
|
|
|
+ bsrli r11, r11, 8 /* h = h >> 8 */
|
|
|
+d_bu3_loop: /* one pass stores 32 bytes; h carries the leftover bytes of the last word read */
|
|
|
+ addi r8, r8, -32 /* as = as - 32 */
|
|
|
+ addi r5, r5, -32 /* d = d - 32 */
|
|
|
+ lwi r12, r8, 28 /* v = *(as + 28) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 28 /* *(d + 28) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 24 /* v = *(as + 24) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 24 /* *(d + 24) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 20 /* v = *(as + 20) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 20 /* *(d + 20) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 16 /* v = *(as + 16) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 16 /* *(d + 16) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 12 /* v = *(as + 12) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 12 /* *(d + 12) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 8 /* v = *(as + 8) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 8 /* *(d + 8) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 4 /* v = *(as + 4) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 4 /* *(d + 4) = t1 */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 */
|
|
|
+ lwi r12, r8, 0 /* v = *(as + 0) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 0 /* *(d + 0) = t1 */
|
|
|
+ addi r4, r4, -32 /* n = n - 32 */
|
|
|
+ bneid r4, d_bu3_loop /* while (n) loop */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
|
|
|
+ bri d_block_done /* all blocks moved: skip the other variants */
|
|
|
+
|
|
|
+d_block_u1: /* s & 3 == 1: each stored word is (higher source word >> 24) | (next lower source word << 8) */
|
|
|
+ bsrli r11, r11, 24 /* h = h >> 24 */
|
|
|
+d_bu1_loop: /* one pass stores 32 bytes; h carries the leftover byte of the last word read */
|
|
|
+ addi r8, r8, -32 /* as = as - 32 */
|
|
|
+ addi r5, r5, -32 /* d = d - 32 */
|
|
|
+ lwi r12, r8, 28 /* v = *(as + 28) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 28 /* *(d + 28) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 24 /* v = *(as + 24) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 24 /* *(d + 24) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 20 /* v = *(as + 20) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 20 /* *(d + 20) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 16 /* v = *(as + 16) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 16 /* *(d + 16) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 12 /* v = *(as + 12) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 12 /* *(d + 12) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 8 /* v = *(as + 8) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 8 /* *(d + 8) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 4 /* v = *(as + 4) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 4 /* *(d + 4) = t1 */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 */
|
|
|
+ lwi r12, r8, 0 /* v = *(as + 0) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 0 /* *(d + 0) = t1 */
|
|
|
+ addi r4, r4, -32 /* n = n - 32 */
|
|
|
+ bneid r4, d_bu1_loop /* while (n) loop */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
|
|
|
+ bri d_block_done /* all blocks moved: skip the remaining variant */
|
|
|
+
|
|
|
+d_block_u2: /* s & 3 == 2: each stored word is (higher source word >> 16) | (next lower source word << 16) */
|
|
|
+ bsrli r11, r11, 16 /* h = h >> 16 */
|
|
|
+d_bu2_loop: /* one pass stores 32 bytes; h carries the leftover half-word of the last word read */
|
|
|
+ addi r8, r8, -32 /* as = as - 32 */
|
|
|
+ addi r5, r5, -32 /* d = d - 32 */
|
|
|
+ lwi r12, r8, 28 /* v = *(as + 28) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 28 /* *(d + 28) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 24 /* v = *(as + 24) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 24 /* *(d + 24) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 20 /* v = *(as + 20) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 20 /* *(d + 20) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 16 /* v = *(as + 16) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 16 /* *(d + 16) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 12 /* v = *(as + 12) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 12 /* *(d + 12) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 8 /* v = *(as + 8) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 8 /* *(d + 8) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 4 /* v = *(as + 4) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 4 /* *(d + 4) = t1 */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 */
|
|
|
+ lwi r12, r8, 0 /* v = *(as + 0) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ swi r9, r5, 0 /* *(d + 0) = t1 */
|
|
|
+ addi r4, r4, -32 /* n = n - 32 */
|
|
|
+ bneid r4, d_bu2_loop /* while (n) loop; on exit execution falls through to the code after this block */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
|
|
|
+
|
|
|
+d_block_done: /* 32-byte blocks finished (or skipped); check whether at least one whole word remains */
|
|
|
+ addi r4, r0, 4 /* n = 4 */
|
|
|
+ cmpu r4, r4, r7 /* n = c - n (unsigned) */
|
|
|
+ blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
|
|
|
+
|
|
|
+d_word_xfer: /* n = bytes to move as whole words; d and s are lowered by n up front and indexed with +n in the loops */
|
|
|
+ andi r4, r7, 0xfffffffc /* n = c & ~3 */
|
|
|
+ rsub r5, r4, r5 /* d = d - n */
|
|
|
+ rsub r6, r4, r6 /* s = s - n */
|
|
|
+ rsub r7, r4, r7 /* c = c - n (remainder, less than 4) */
|
|
|
+
|
|
|
+ andi r9, r6, 3 /* t1 = s & 3 */
|
|
|
+ /* if temp != 0, unaligned transfers needed */
|
|
|
+ bnei r9, d_word_unaligned
|
|
|
+
|
|
|
+d_word_aligned: /* s word-aligned: copy one word per pass at offset n, counting n down to 0 */
|
|
|
+ addi r4, r4,-4 /* n = n - 4 */
|
|
|
+ lw r9, r6, r4 /* t1 = *(s+n) */
|
|
|
+ bneid r4, d_word_aligned /* loop while n != 0 */
|
|
|
+ sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
|
|
|
+
|
|
|
+ bri d_word_done /* skip the unaligned word variants */
|
|
|
+
|
|
|
+d_word_unaligned: /* s not word-aligned: read aligned words through as (r8) and merge neighbouring words with shifts */
|
|
|
+ andi r8, r6, 0xfffffffc /* as = s & ~3 */
|
|
|
+ lw r11, r8, r4 /* h = *(as + n) */
|
|
|
+
|
|
|
+ addi r9, r9, -1 /* dispatch on t1 = s & 3; NOTE(review): the shift directions in the u1/u2/u3 variants look big-endian specific - confirm for little-endian builds */
|
|
|
+ beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
|
|
|
+ addi r9, r9, -1
|
|
|
+ beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset; otherwise (t1 was 3) fall through to d_word_u3 */
|
|
|
+
|
|
|
+d_word_u3: /* s & 3 == 3: each stored word is (higher source word >> 8) | (next lower source word << 24) */
|
|
|
+ bsrli r11, r11, 8 /* h = h >> 8 */
|
|
|
+d_wu3_loop: /* one word per pass at offset n, counting n down to 0 */
|
|
|
+ addi r4, r4,-4 /* n = n - 4 */
|
|
|
+ lw r12, r8, r4 /* v = *(as + n) */
|
|
|
+ bslli r9, r12, 24 /* t1 = v << 24 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ sw r9, r5, r4 /* *(d + n) = t1 */
|
|
|
+ bneid r4, d_wu3_loop /* while (n) loop */
|
|
|
+ bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
|
|
|
+
|
|
|
+ bri d_word_done /* skip the other variants */
|
|
|
+
|
|
|
+d_word_u1: /* s & 3 == 1: each stored word is (higher source word >> 24) | (next lower source word << 8) */
|
|
|
+ bsrli r11, r11, 24 /* h = h >> 24 */
|
|
|
+d_wu1_loop: /* one word per pass at offset n, counting n down to 0 */
|
|
|
+ addi r4, r4,-4 /* n = n - 4 */
|
|
|
+ lw r12, r8, r4 /* v = *(as + n) */
|
|
|
+ bslli r9, r12, 8 /* t1 = v << 8 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ sw r9, r5, r4 /* *(d + n) = t1 */
|
|
|
+ bneid r4, d_wu1_loop /* while (n) loop */
|
|
|
+ bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
|
|
|
+
|
|
|
+ bri d_word_done /* skip the remaining variant */
|
|
|
+
|
|
|
+d_word_u2: /* s & 3 == 2: each stored word is (higher source word >> 16) | (next lower source word << 16) */
|
|
|
+ bsrli r11, r11, 16 /* h = h >> 16 */
|
|
|
+d_wu2_loop: /* one word per pass at offset n, counting n down to 0 */
|
|
|
+ addi r4, r4,-4 /* n = n - 4 */
|
|
|
+ lw r12, r8, r4 /* v = *(as + n) */
|
|
|
+ bslli r9, r12, 16 /* t1 = v << 16 */
|
|
|
+ or r9, r11, r9 /* t1 = h | t1 */
|
|
|
+ sw r9, r5, r4 /* *(d + n) = t1 */
|
|
|
+ bneid r4, d_wu2_loop /* while (n) loop; on exit execution falls through to the code after this block */
|
|
|
+ bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
|
|
|
+
|
|
|
+d_word_done: /* whole words finished; continue with the trailing bytes */
|
|
|
+
|
|
|
+d_xfer_end: /* also entered directly when fewer than 4 bytes are left to copy */
|
|
|
+d_xfer_end_loop: /* byte loop: copy the remaining c (r7) bytes downwards */
|
|
|
+ beqi r7, a_done /* while (c); leave the loop once c == 0 */
|
|
|
+ addi r6, r6, -1 /* s-- */
|
|
|
+ lbui r9, r6, 0 /* t1 = *s */
|
|
|
+ addi r5, r5, -1 /* d-- */
|
|
|
+ sbi r9, r5, 0 /* *d = t1 */
|
|
|
+ brid d_xfer_end_loop /* loop */
|
|
|
+ addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
|
|
|
+
|
|
|
+a_done: /* exit label targeted by the trailing byte loop */
|
|
|
+d_done: /* second label on the same return sequence; not referenced in the code visible here */
|
|
|
+ rtsd r15, 8 /* return to r15 + 8; r3 holds the original destination address */
|
|
|
+ nop /* (IN DELAY SLOT of rtsd) */
|
|
|
+
|
|
|
+.size memmove, . - memmove
|
|
|
+.end memmove
|
|
|
+libc_hidden_def(memmove)
|