/* memcpy.S — C-SKY (abiv1) assembly implementation of memcpy/wmemcpy */
/*
 * Copyright (C) 2017 Hangzhou C-SKY Microsystems co.,ltd.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */
/* GET_FRONT_BITS rx, ry
 * Shift \rx by \ry bits so that the bytes that come FIRST in memory are
 * moved into merge position.  On little-endian the first-in-memory bytes
 * are the low-order bits of a loaded word, so shift right; on big-endian
 * they are the high-order bits, so shift left.  \ry holds the dynamic
 * shift amount (8 * source misalignment — see r12 in the copy loop).
 * Clobbers: \rx (shifted in place).
 */
.macro GET_FRONT_BITS rx ry
#ifdef __cskyLE__
	lsr	\rx, \ry
#else
	lsl	\rx, \ry
#endif
.endm
/* GET_AFTER_BITS rx, ry
 * Counterpart of GET_FRONT_BITS: shift \rx by \ry bits so that the bytes
 * that come LATER in memory are moved into merge position (the opposite
 * shift direction per endianness).  \ry holds the complementary shift
 * amount (32 - 8 * source misalignment — see r13 in the copy loop).
 * OR-ing a GET_FRONT_BITS result with a GET_AFTER_BITS result rebuilds
 * one correctly aligned destination word from two aligned source loads.
 * Clobbers: \rx (shifted in place).
 */
.macro GET_AFTER_BITS rx ry
#ifdef __cskyLE__
	lsl	\rx, \ry
#else
	lsr	\rx, \ry
#endif
.endm
/* This file is built twice: with WANT_WIDE defined it provides wmemcpy(),
 * otherwise plain memcpy().  Wmemcpy is the alias used for the symbol
 * name throughout the rest of the file.  */
#ifdef WANT_WIDE
# define Wmemcpy wmemcpy
#else
# define Wmemcpy memcpy
#endif
/* void *memcpy(void *dest, const void *src, size_t n);
 *
 * C-SKY abiv1 — as used by this file: r2 = dest, r3 = src, r4 = n,
 * return value in r2.  r2 is never written after entry, so it still
 * holds dest when `rts` returns.
 *
 * Register roles:
 *   r7        running destination pointer (copy of dest)
 *   r3        running source pointer
 *   r4        bytes remaining
 *   r6        scratch: alignment tests, then the source-misalignment
 *             byte count carried into the unaligned-copy path
 *   r1, r5    data / merge scratch
 *   r8-r13    extra data and shift-amount scratch; callee-saved, so
 *             they are spilled to the stack before use
 *
 * Strategy: lengths < 4 are copied bytewise.  Otherwise dest is first
 * brought to 4-byte alignment bytewise; then, if src is also aligned,
 * words are copied 16-bytes-per-iteration (falling back to 4, then 1);
 * if src is misaligned, aligned words are loaded from the rounded-down
 * src and merged pairwise with GET_FRONT_BITS/GET_AFTER_BITS shifts.
 */
.text
.align 2
.global Wmemcpy
.type Wmemcpy, @function
Wmemcpy:
	mov	r7, r2			/* r7 = dest cursor; keep r2 intact for return */
	cmplti	r4, 4			/* If len less than 4 bytes */
	jbt	.L_copy_by_byte
	mov	r6, r2
	andi	r6, 3
	cmpnei	r6, 0
	jbt	.L_dest_not_aligned	/* If dest is not 4 bytes aligned */
.L0:
	mov	r6, r3
	andi	r6, 3			/* r6 = src misalignment (0..3) */
	cmpnei	r6, 0
	jbt	.L_dest_aligned_but_src_not_aligned /* If dest is aligned, but src is not aligned */
	cmplti	r4, 16			/* dest and src are all aligned */
	jbt	.L_aligned_and_len_less_16bytes	/* If len less than 16 bytes */
	subi	sp, 8			/* spill callee-saved r8/r9 for the wide loop */
	stw	r8, (sp, 0)
	stw	r9, (sp, 4)
.L_aligned_and_len_larger_16bytes:	/* src and dst are all aligned, and len > 16 bytes */
	ldw	r1, (r3, 0)		/* copy 16 bytes per iteration */
	ldw	r5, (r3, 4)
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)
	stw	r1, (r7, 0)
	stw	r5, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L_aligned_and_len_larger_16bytes
	ldw	r8, (sp, 0)		/* restore r8/r9 */
	ldw	r9, (sp, 4)
	addi	sp, 8
.L_aligned_and_len_less_16bytes:
	cmplti	r4, 4			/* copy remaining whole words */
	jbt	.L_copy_by_byte
	ldw	r1, (r3, 0)
	stw	r1, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	jbr	.L_aligned_and_len_less_16bytes
.L_copy_by_byte:			/* len less than 4 bytes (also the generic tail) */
	cmpnei	r4, 0
	jbf	.L_return
	ldb	r1, (r3, 0)
	stb	r1, (r7, 0)
	subi	r4, 1
	addi	r3, 1
	addi	r7, 1
	jbr	.L_copy_by_byte
.L_return:
	rts				/* return; r2 still holds the original dest */
/* If dest is not aligned, we copy some bytes to make dest align.
   Then we should judge whether src is aligned. */
.L_dest_not_aligned:
	mov	r5, r3			/* consider overlapped case: if |dest-src| < len */
	rsub	r5, r5, r7		/* a block copy could read bytes it already wrote, */
	abs	r5, r5			/* so fall back to the bytewise loop */
	cmplt	r5, r4
	jbt	.L_copy_by_byte
.L1:
	ldb	r1, (r3, 0)		/* copy bytes until dest is word-aligned; */
	stb	r1, (r7, 0)		/* r6 counts up from dest&3 to 4 */
	addi	r6, 1
	subi	r4, 1
	addi	r3, 1
	addi	r7, 1
	cmpnei	r6, 4
	jbt	.L1
	cmplti	r4, 4
	jbt	.L_copy_by_byte
	jbf	.L0			/* judge whether the src is aligned. */
.L_dest_aligned_but_src_not_aligned:
	mov	r5, r3			/* consider overlapped case (same guard as above) */
	rsub	r5, r5, r7
	abs	r5, r5
	cmplt	r5, r4
	jbt	.L_copy_by_byte
	bclri	r3, 0			/* round src down to its word boundary; */
	bclri	r3, 1			/* r6 still holds the misalignment (1..3) */
	ldw	r1, (r3, 0)		/* prime r1 with the first aligned word */
	addi	r3, 4
	subi	sp, 16			/* spill callee-saved r11-r13 (r8 slot at 12 used later) */
	stw	r11, (sp, 0)
	stw	r12, (sp, 4)
	stw	r13, (sp, 8)
	movi	r5, 8
	mult	r5, r6			/* r6 is used to store tne misaligned bits */
	mov	r12, r5			/* r12 = 8 * misalign  (GET_FRONT_BITS shift) */
	rsubi	r5, 31			/* r5 = 31 - r5 ... */
	addi	r5, 1			/* ... + 1  =>  r13 = 32 - 8 * misalign */
	mov	r13, r5			/* (GET_AFTER_BITS shift) */
	cmplti	r4, 16
	jbt	.L_not_aligned_and_len_less_16bytes
	stw	r8, (sp, 12)		/* spill r8 into the slot reserved above */
	subi	sp, 8			/* second adjust: spill r9/r10 below it */
	stw	r9, (sp, 0)
	stw	r10, (sp, 4)
.L_not_aligned_and_len_larger_16bytes:
	ldw	r5, (r3, 0)		/* 4 aligned loads; r1 carries the previous */
	ldw	r11, (r3, 4)		/* word's leftover bits into each merge */
	ldw	r8, (r3, 8)
	ldw	r9, (r3, 12)
	GET_FRONT_BITS r1 r12		/* little or big endian? */
	mov	r10, r5
	GET_AFTER_BITS r5 r13
	or	r5, r1			/* r5 = dest word 0 = front(prev) | after(w0) */
	GET_FRONT_BITS r10 r12
	mov	r1, r11
	GET_AFTER_BITS r11 r13
	or	r11, r10		/* r11 = dest word 1 */
	GET_FRONT_BITS r1 r12
	mov	r10, r8
	GET_AFTER_BITS r8 r13
	or	r8, r1			/* r8 = dest word 2 */
	GET_FRONT_BITS r10 r12
	mov	r1, r9			/* r1 = leftover of w3 for the next iteration */
	GET_AFTER_BITS r9 r13
	or	r9, r10			/* r9 = dest word 3 */
	stw	r5, (r7, 0)
	stw	r11, (r7, 4)
	stw	r8, (r7, 8)
	stw	r9, (r7, 12)
	subi	r4, 16
	addi	r3, 16
	addi	r7, 16
	cmplti	r4, 16
	jbf	.L_not_aligned_and_len_larger_16bytes
	ldw	r9, (sp, 0)		/* unwind the second spill area... */
	ldw	r10, (sp, 4)
	addi	sp, 8
	ldw	r8, (sp, 12)		/* ...then r8 from the first area's slot 12 */
.L_not_aligned_and_len_less_16bytes:
	cmplti	r4, 4
	jbf	.L2
	rsubi	r6, 4			/* r6 is used to stored the misaligned bits */
	subu	r3, r6			/* initial the position: undo the round-down+4 */
					/* so the byte tail reads from the true src */
	ldw	r11, (sp, 0)		/* restore r11-r13 and release the frame */
	ldw	r12, (sp, 4)
	ldw	r13, (sp, 8)
	addi	sp, 16
	jbr	.L_copy_by_byte
.L2:
	ldw	r5, (r3, 0)		/* one-word merge, same scheme as the wide loop */
	GET_FRONT_BITS r1 r12
	mov	r11, r1
	mov	r1, r5			/* keep leftover bits of this word for next round */
	GET_AFTER_BITS r5 r13
	or	r5, r11
	stw	r5, (r7, 0)
	subi	r4, 4
	addi	r3, 4
	addi	r7, 4
	jbr	.L_not_aligned_and_len_less_16bytes
.size Wmemcpy, .-Wmemcpy
libc_hidden_def(Wmemcpy)
.weak Wmemcpy