/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License.  See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input :	Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 *
 * Register use inside the routine:
 *	r3  - return value (copy of the incoming destination address)
 *	r4  - n,  byte count of the phase currently running
 *	r5  - d,  destination cursor
 *	r6  - s,  source cursor
 *	r7  - c,  bytes still to be copied after the current phase
 *	r8  - as, source address rounded down to a word boundary
 *	r9  - t1, scratch
 *	r10 - offset, running index of the word-at-a-time loops
 *	r11 - h,  source bytes carried over from the previous word
 *	r12 - v,  source word fetched most recently
 *
 * Phases, in order:
 *	1. byte copy until the destination is word aligned
 *	2. 32-byte blocks (only taken when the source is not word aligned)
 *	3. single words (aligned or unaligned source)
 *	4. byte copy of whatever is left
 */

	.text
	.globl	memcpy
	.type  memcpy, @function
	.ent	memcpy

/*
 * BSLLI/BSRLI are the shifts used to merge two source words into one
 * destination word.  When __MICROBLAZEEL__ is defined the two mnemonics
 * are exchanged, so the merge sequences below are written only once.
 */
#ifndef __MICROBLAZEEL__
#	define BSLLI bslli
#	define BSRLI bsrli
#else
#	define BSLLI bsrli
#	define BSRLI bslli
#endif

/*
 * A_MERGE_STORE ld_off, st_off, h_sh, v_sh
 *
 * Emit one destination word of an unaligned block copy:
 *	v  = *(as + ld_off)
 *	*(d + st_off) = h | (v >> v_sh)
 *	h  = v << h_sh
 */
	.macro	A_MERGE_STORE ld_off, st_off, h_sh, v_sh
	lwi	r12, r8, \ld_off
	BSRLI	r9, r12, \v_sh
	or	r9, r11, r9
	swi	r9, r5, \st_off
	BSLLI	r11, r12, \h_sh
	.endm

/*
 * A_UNALIGNED_BLOCKS loop_label, h_sh, v_sh
 *
 * Prime h, then copy n (r4) bytes in 32-byte blocks.  The loop keeps
 * going until n reaches exactly zero, so n has to be a non-zero
 * multiple of 32 on entry.  Falls out of the bottom when finished.
 */
	.macro	A_UNALIGNED_BLOCKS loop_label, h_sh, v_sh
	BSLLI	r11, r11, \h_sh		/* h = h << h_sh */
\loop_label:
	A_MERGE_STORE	 4,  0, \h_sh, \v_sh
	A_MERGE_STORE	 8,  4, \h_sh, \v_sh
	A_MERGE_STORE	12,  8, \h_sh, \v_sh
	A_MERGE_STORE	16, 12, \h_sh, \v_sh
	A_MERGE_STORE	20, 16, \h_sh, \v_sh
	A_MERGE_STORE	24, 20, \h_sh, \v_sh
	A_MERGE_STORE	28, 24, \h_sh, \v_sh
	A_MERGE_STORE	32, 28, \h_sh, \v_sh
	addi	r8, r8, 32		/* as += 32 */
	addi	r4, r4, -32		/* n -= 32 */
	bneid	r4, \loop_label		/* repeat while n != 0 */
	addi	r5, r5, 32		/* d += 32 (IN DELAY SLOT) */
	.endm

/*
 * A_UNALIGNED_WORDS loop_label, h_sh, v_sh
 *
 * Prime h, then copy n (r4) bytes one word at a time, indexing source
 * and destination with the shared offset in r10.  As above, n has to be
 * a non-zero multiple of 4 on entry.  Falls out of the bottom when
 * finished.
 */
	.macro	A_UNALIGNED_WORDS loop_label, h_sh, v_sh
	BSLLI	r11, r11, \h_sh		/* h = h << h_sh */
\loop_label:
	lw	r12, r8, r10		/* v = *(as + offset) */
	BSRLI	r9, r12, \v_sh		/* t1 = v >> v_sh */
	or	r9, r11, r9		/* t1 = h | t1 */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	BSLLI	r11, r12, \h_sh		/* h = v << h_sh */
	addi	r4, r4, -4		/* n -= 4 */
	bneid	r4, \loop_label		/* repeat while n != 0 */
	addi	r10, r10, 4		/* offset += 4 (IN DELAY SLOT) */
	.endm

memcpy:
fast_memcpy_ascending:
	/* the function result is the untouched destination pointer */
	addi	r3, r5, 0

	/* fewer than 4 bytes in total: go straight to the byte tail */
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end

	/* ---- phase 1: bring the destination onto a word boundary ---- */
	andi	r4, r5, 3		/* n = d & 3 */
	beqi	r4, a_dalign_done	/* already aligned, nothing to do */
	rsubi	r4, r4, 4		/* n = 4 - n: 3, 2 or 1 bytes to copy */
	rsub	r7, r4, r7		/* c -= n */

a_xfer_first_loop:
	beqi	r4, a_dalign_done	/* alignment bytes all copied */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

a_dalign_done:
	/* fewer than 32 bytes left: no block phase */
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_block_done

a_block_xfer:
	/* a word-aligned source is handled entirely by the word phase */
	andi	r9, r6, 3		/* t1 = s & 3 */
	beqi	r9, a_word_xfer

	/* ---- phase 2: 32-byte blocks from an unaligned source ---- */
a_block_unaligned:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c -= n */
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s += n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	/* pick the loop that matches the source byte offset in t1 */
	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:				/* 3 byte offset: h << 24, v >> 8 */
	A_UNALIGNED_BLOCKS	a_bu3_loop, 24, 8
	bri	a_block_done

a_block_u1:				/* 1 byte offset: h << 8, v >> 24 */
	A_UNALIGNED_BLOCKS	a_bu1_loop, 8, 24
	bri	a_block_done

a_block_u2:				/* 2 byte offset: h << 16, v >> 16 */
	A_UNALIGNED_BLOCKS	a_bu2_loop, 16, 16
	/* drops through into a_block_done */

a_block_done:
	/* fewer than 4 bytes left: no word phase */
	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end

	/* ---- phase 3: one word at a time ---- */
a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */
	andi	r9, r6, 3		/* t1 = s & 3 */
	bnei	r9, a_word_unaligned	/* source not word aligned */

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s + offset) */
	sw	r9, r5, r10		/* *(d + offset) = t1 */
	addi	r4, r4, -4		/* n -= 4 */
	bneid	r4, a_word_aligned	/* repeat while n != 0 */
	addi	r10, r10, 4		/* offset += 4 (IN DELAY SLOT) */
	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as += 4: the loops read the next word */

	/* pick the loop that matches the source byte offset in t1 */
	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:				/* 3 byte offset: h << 24, v >> 8 */
	A_UNALIGNED_WORDS	a_wu3_loop, 24, 8
	bri	a_word_done

a_word_u1:				/* 1 byte offset: h << 8, v >> 24 */
	A_UNALIGNED_WORDS	a_wu1_loop, 8, 24
	bri	a_word_done

a_word_u2:				/* 2 byte offset: h << 16, v >> 16 */
	A_UNALIGNED_WORDS	a_wu2_loop, 16, 16
	/* drops through into a_word_done */

a_word_done:
	/* fold the word-loop index back into the cursors and the count */
	add	r5, r5, r10		/* d += offset */
	add	r6, r6, r10		/* s += offset */
	rsub	r7, r10, r7		/* c -= offset */

	/* ---- phase 4: remaining bytes ---- */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* stop once c == 0 */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8			/* return; the nop fills the delay slot */
	nop

.size  memcpy, . - memcpy
.end memcpy
libc_hidden_def(memcpy)