@@ -5,7 +5,7 @@
  * Copyright (C) 1999 Niibe Yutaka
  *
  * Copyright (c) 2009 STMicroelectronics Ltd
- * Optimised using 64bit data transfer via FPU
+ * Optimised using 64bit data transfer (via FPU) and the movca.l inst.
  * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
  *
  * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
@@ -24,9 +24,9 @@
  * Currenlty it has been only implemented and tested for little endian mode. */
 .macro FPU_SET_PAIRED_PREC
 	sts	fpscr, r3
-	mov	#0x10, r0	! PR=0 SZ=1
-	shll16	r0
-	lds	r0, fpscr
+	mov	#0x10, r1	! PR=0 SZ=1
+	shll16	r1
+	lds	r1, fpscr
 .endm
 .macro RESTORE_FPSCR
 	lds	r3, fpscr
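
FPU_SET_PAIRED_PREC flips the SZ bit of FPSCR so that a single fmov of a double register transfers 64 bits, and RESTORE_FPSCR puts the saved value back afterwards. The scratch register changes from r0 to r1 because, with this patch, r0 is loaded with the fill pattern before the macro runs and has to survive it: movca.l, used in the rewritten main loop below, only accepts r0 as its source operand. As a quick host-side illustration (not part of the patch) that the constant built by mov #0x10 / shll16 is exactly the SZ bit:

    #include <assert.h>

    #define FPSCR_PR (1u << 19)  /* PR: 0 keeps paired single precision */
    #define FPSCR_SZ (1u << 20)  /* SZ: 1 makes fmov transfer 64 bits   */

    int main(void)
    {
        unsigned int fpscr = 0x10u << 16;   /* what the macro loads   */
        assert(fpscr == FPSCR_SZ);          /* SZ set                 */
        assert((fpscr & FPSCR_PR) == 0);    /* PR clear               */
        return 0;
    }
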
@@ -34,12 +34,10 @@
 #endif

 ENTRY(memset)
-	tst	r6,r6
-	bt/s	5f		! if n=0, do nothing
-	add	r6,r4
 	mov	#12,r0
+	add	r6,r4
 	cmp/gt	r6,r0
-	bt/s	4f		! if it's too small, set a byte at once
+	bt/s	40f		! if it's too small, set a byte at once
 	mov	r4,r0
 	and	#3,r0
 	cmp/eq	#0,r0
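
The zero-length early exit at the top of memset is gone: short and zero counts now take the same branch to the byte-fill tail, which gets a new label, 40, and does its own zero test (see the last hunk). The destination pointer is still advanced to the end of the buffer up front, since every path stores downwards. A rough C outline of the entry decision, with illustrative names that are not in the source:

    #include <stddef.h>

    void *memset_outline(void *s, int c, size_t n)
    {
        unsigned char *end = (unsigned char *)s + n;   /* "add r6,r4" */

        if (n < 12)             /* "mov #12,r0; cmp/gt r6,r0; bt/s 40f" */
            goto byte_fill;     /* label 40 also handles n == 0 */

        /* ... alignment, pattern replication and the wide loops go here;
         *     they all end up at the same byte tail ... */

    byte_fill:
        while (n != 0) {
            *--end = (unsigned char)c;
            n--;
        }
        return s;
    }
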
@@ -56,7 +54,7 @@ ENTRY(memset)
 	swap.w	r5,r0	! VV00
 	or	r0,r5	! VVVV

-	! Enough bytes need to be copied
+	! Check if enough bytes need to be copied to be worth the big loop
 	mov	#0x40, r0	! (MT)
 	cmp/gt	r6,r0	! (MT) 64 > len => slow loop

@@ -84,6 +82,9 @@ ENTRY(memset)
 	mov	#-5,r0
 	shld	r0,r2		! number of loops

+	add	#-32, r4
+	mov	r5, r0
+
 #ifdef MEMSET_USES_FPU
 	lds	r5, fpul	! (CO)
 	fsts	fpul, fr0	! Dr0 will be 'VVVVVVVV'
@@ -91,36 +92,40 @@ ENTRY(memset)

 	FPU_SET_PAIRED_PREC
 12:
-	add	#-0x20, r6	!(MT)
+	movca.l	r0, @r4
+	mov.l	r5, @(4, r4)
+	add	#32, r4
 	fmov	dr0, @-r4
 	fmov	dr0, @-r4
+	add	#-0x20, r6
 	fmov	dr0, @-r4
 	dt	r2
-	bf/s	12b		!(BR)
-	fmov	dr0, @-r4
+	bf/s	12b
+	add	#-40, r4

 	RESTORE_FPSCR
 #else
 12:
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
-	mov.l	r5,@-r4
+	movca.l	r0,@r4
+	mov.l	r5,@(4, r4)
+	mov.l	r5,@(8, r4)
+	mov.l	r5,@(12,r4)
+	mov.l	r5,@(16,r4)
+	mov.l	r5,@(20,r4)
 	add	#-0x20, r6
-	mov.l	r5,@-r4
+	mov.l	r5,@(24,r4)
 	dt	r2
+	mov.l	r5,@(28,r4)
 	bf/s	12b
-	mov.l	r5,@-r4
-#endif
-	tst	r6,r6
-	bt/s	5f
-	mov	#8, r0
+	add	#-32, r4
+
+#endif
+	add	#32, r4
+	mov	#8, r0
 	cmp/ge	r0, r6
-	bf/s	4f
-	mov	r6,r0
+	bf	40f
+
+	mov	r6,r0
 22:
 	shlr2	r0
 	shlr	r0		! r0 = r6 >> 3
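
Both variants of the main loop now fill one 32-byte block per pass, which is the SH-4 cache line size. movca.l stores the first longword and allocates the line in the operand cache without fetching its old contents from memory, avoiding the line fill an ordinary store miss would cause; that is only safe because the remaining 28 bytes of the line are written immediately afterwards, and because the surrounding code (outside these hunks) is expected to have aligned r4 to a cache-line boundary first. The add #-32, r4 and mov r5, r0 added in the previous hunk set this up, since movca.l takes its data only from r0 and the block is now addressed from its base instead of by pre-decrement. In the FPU variant the upper 24 bytes are covered by three paired-precision fmov stores; the integer variant uses mov.l with displacement addressing. A plain-C picture of one pass of the integer loop, illustrative only, since the cache-allocation side effect has no C equivalent:

    typedef unsigned int u32;

    /* One 32-byte block per pass, v replicated into every longword.
     * p corresponds to r4 (the block base), v to r5 (and to r0 for
     * the first store). */
    void fill_block32(u32 *p, u32 v)
    {
        p[0] = v;   /* movca.l r0,@r4   : also allocates the cache line */
        p[1] = v;   /* mov.l r5,@(4,r4)                                  */
        p[2] = v;
        p[3] = v;
        p[4] = v;
        p[5] = v;
        p[6] = v;
        p[7] = v;   /* 8 longwords = 32 bytes = one SH-4 cache line */
    }
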
@@ -132,9 +137,10 @@ ENTRY(memset)
 !
 	mov	#7,r0
 	and	r0,r6
-	tst	r6,r6
+
+	! fill bytes (length may be zero)
+40:	tst	r6,r6
 	bt	5f
-	! fill bytes
 4:
 	dt	r6
 	bf/s	4b
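
Finally, the byte tail is reorganised around the new label 40: the zero test moves in front of the fill loop, so both the small-count branch at the top of the function and the leftover after the wide loops (n & 7, which may well be zero) can jump straight here without their own checks, which is what allowed the old early exits to be dropped. In C terms, with illustrative names:

    /* The shared byte tail, i.e. labels 40/4 above: safe to enter with
     * a zero count. 'end' plays the role of r4, which points one past
     * the last byte still to be written. */
    void byte_fill(unsigned char *end, unsigned char c, unsigned long n)
    {
        while (n != 0) {    /* 40:  tst r6,r6 ; bt 5f */
            n--;            /* 4:   dt r6             */
            *--end = c;     /*      one byte per pass */
        }
    }
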
|