/* abiv2_memcpy.S */
/*
 * Copyright (C) 2017 Hangzhou C-SKY Microsystems co.,ltd.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */
  7. .macro GET_FRONT_BITS rx ry
  8. #ifdef __cskyLE__
  9. lsr \rx, \ry
  10. #else
  11. lsl \rx, \ry
  12. #endif
  13. .endm
  14. .macro GET_AFTER_BITS rx ry
  15. #ifdef __cskyLE__
  16. lsl \rx, \ry
  17. #else
  18. lsr \rx, \ry
  19. #endif
  20. .endm
/* Select the exported symbol name: the same implementation below is
 * assembled as wmemcpy when WANT_WIDE is defined, and as memcpy
 * otherwise.
 * NOTE(review): the copy loops treat the length in r2 as a byte
 * count -- confirm the WANT_WIDE build supplies a byte count here. */
#ifdef WANT_WIDE
# define Wmemcpy wmemcpy
#else
# define Wmemcpy memcpy
#endif
/* void *memcpy(void *dest, const void *src, size_t n);
 *
 * C-SKY ABIv2 entry: r0 = dest, r1 = src, r2 = n (bytes).
 * r0 is kept untouched as the return value; r3 is used as the running
 * destination cursor, r1 as the running source cursor, r2 as the
 * remaining byte count.
 *
 * Strategy:
 *   n < 4                    -> byte-by-byte loop
 *   dest unaligned           -> copy bytes until dest is 4-aligned
 *   dest & src both aligned  -> 16-byte word loop, then 4-byte, then bytes
 *   dest aligned, src not    -> load aligned words from the rounded-down
 *                               src address and recombine them with the
 *                               GET_FRONT_BITS/GET_AFTER_BITS shifts
 * If src and dest overlap within n bytes, the word paths are abandoned
 * and a plain forward byte copy is used instead.
 */
.text
.align 2
.global Wmemcpy
.type Wmemcpy, @function
Wmemcpy:
	mov r3, r0			/* r3 = dest cursor; preserve r0 for return */
	cmplti r2, 4			/* If len less than 4 bytes */
	jbt .L_copy_by_byte
	mov r12, r0
	andi r12, 3			/* r12 = dest & 3 */
	bnez r12, .L_dest_not_aligned	/* If dest is not 4 bytes aligned */
.L0:
	mov r12, r1
	andi r12, 3			/* r12 = src & 3 */
	bnez r12, .L_dest_aligned_but_src_not_aligned /* dest aligned, src not */
	cmplti r2, 16			/* dest and src are both 4-aligned here */
	jbt .L_aligned_and_len_less_16bytes /* If len less than 16 bytes */

/* Fast path: both pointers 4-aligned and at least 16 bytes remain.
 * Copy 16 bytes per iteration as four word load/store pairs. */
.L_aligned_and_len_larger_16bytes:
	ldw r18, (r1, 0)
	ldw r19, (r1, 4)
	ldw r20, (r1, 8)
	ldw r21, (r1, 12)
	stw r18, (r3, 0)
	stw r19, (r3, 4)
	stw r20, (r3, 8)
	stw r21, (r3, 12)
	subi r2, 16
	addi r1, 16
	addi r3, 16
	cmplti r2, 16
	jbf .L_aligned_and_len_larger_16bytes

/* 4..15 bytes left, both aligned: copy one word at a time. */
.L_aligned_and_len_less_16bytes:
	cmplti r2, 4
	jbt .L_copy_by_byte
	ldw r18, (r1, 0)
	stw r18, (r3, 0)
	subi r2, 4
	addi r1, 4
	addi r3, 4
	jbr .L_aligned_and_len_less_16bytes

/* Tail / fallback: copy the remaining r2 bytes one at a time. */
.L_copy_by_byte:			/* len less than 4 bytes */
	cmpnei r2, 0
	jbf .L_return
	ldb r18, (r1, 0)
	stb r18, (r3, 0)
	subi r2, 1
	addi r1, 1
	addi r3, 1
	jbr .L_copy_by_byte
.L_return:
	rts				/* r0 still holds the original dest */

/* If dest is not aligned, just copying some bytes makes the dest align.
   After that, we judge whether the src is aligned. */
.L_dest_not_aligned:
	rsub r13, r1, r3		/* r13 = dest - src: overlap check */
	abs r13, r13
	cmplt r13, r2			/* |dest - src| < n -> regions overlap */
	jbt .L_copy_by_byte		/* overlap: stay on the byte path */
.L1:
	ldb r18, (r1, 0)		/* copy bytes until dest is 4-aligned */
	stb r18, (r3, 0)
	addi r12, 1			/* r12 counts up from dest & 3 to 4 */
	subi r2, 1
	addi r1, 1
	addi r3, 1
	cmpnei r12, 4
	jbt .L1
	cmplti r2, 4
	jbt .L_copy_by_byte
	jbf .L0				/* re-check src alignment */

/* dest is 4-aligned but src is not: read whole words from the aligned-
 * down src address and shift-merge adjacent words into aligned stores. */
.L_dest_aligned_but_src_not_aligned:
	rsub r13, r1, r3		/* overlap check, as above */
	abs r13, r13
	cmplt r13, r2
	jbt .L_copy_by_byte
	bclri r1, 0			/* round src down to a 4-byte boundary */
	bclri r1, 1
	ldw r18, (r1, 0)		/* prime r18 with the first partial word */
	addi r1, 4
	movi r13, 8
	mult r13, r12			/* r13 = 8 * (src & 3) = misaligned bits */
	mov r24, r13			/* r24 = front-bits shift count */
	rsubi r13, 32			/* r13 = 32 - misaligned bits */
	mov r25, r13			/* r25 = after-bits shift count (== r13) */
	cmplti r2, 16
	jbt .L_not_aligned_and_len_less_16bytes

/* 16 bytes per iteration: each output word is the tail of one source
 * word (GET_FRONT_BITS) merged with the head of the next (GET_AFTER_BITS).
 * r18/r19 carry the leftover bits between words across the iteration. */
.L_not_aligned_and_len_larger_16bytes:
	ldw r20, (r1, 0)
	ldw r21, (r1, 4)
	ldw r22, (r1, 8)
	ldw r23, (r1, 12)
	GET_FRONT_BITS r18 r24		/* direction depends on endianness */
	mov r19, r20
	GET_AFTER_BITS r20 r25
	or r20, r18
	GET_FRONT_BITS r19 r24
	mov r18, r21
	GET_AFTER_BITS r21 r13		/* r13 == r25 here (copied above,
					   neither changes in this loop) */
	or r21, r19
	GET_FRONT_BITS r18 r24
	mov r19, r22
	GET_AFTER_BITS r22 r25
	or r22, r18
	GET_FRONT_BITS r19 r24
	mov r18, r23
	GET_AFTER_BITS r23 r25
	or r23, r19
	stw r20, (r3, 0)
	stw r21, (r3, 4)
	stw r22, (r3, 8)
	stw r23, (r3, 12)
	subi r2, 16
	addi r1, 16
	addi r3, 16
	cmplti r2, 16
	jbf .L_not_aligned_and_len_larger_16bytes

.L_not_aligned_and_len_less_16bytes:
	cmplti r2, 4
	jbf .L2
	rsubi r12, 4			/* r12 = 4 - (src & 3) */
	subu r1, r12			/* rewind src to the true byte position
					   before the byte-copy tail */
	jbr .L_copy_by_byte
.L2:
	ldw r21, (r1, 0)		/* one merged word per iteration */
	GET_FRONT_BITS r18 r24
	mov r19, r18
	mov r18, r21			/* keep leftover bits for next round */
	GET_AFTER_BITS r21 r25
	or r21, r19
	stw r21, (r3, 0)
	subi r2, 4
	addi r1, 4
	addi r3, 4
	jbr .L_not_aligned_and_len_less_16bytes
.size Wmemcpy, .-Wmemcpy
libc_hidden_def(Wmemcpy)
.weak Wmemcpy