/*
 * abiv2_memcpy.S: optimized memcpy for the C-SKY ABIv2 architecture.
 */
  1. .macro GET_FRONT_BITS rx ry
  2. #ifdef __cskyLE__
  3. lsr \rx, \ry
  4. #else
  5. lsl \rx, \ry
  6. #endif
  7. .endm
  8. .macro GET_AFTER_BITS rx ry
  9. #ifdef __cskyLE__
  10. lsl \rx, \ry
  11. #else
  12. lsr \rx, \ry
  13. #endif
  14. .endm
  15. #ifdef WANT_WIDE
  16. # define Wmemcpy wmemcpy
  17. #else
  18. # define Wmemcpy memcpy
  19. #endif
  20. /* void *memcpy(void *dest, const void *src, size_t n); */
  21. .text
  22. .align 2
  23. .global Wmemcpy
  24. .type Wmemcpy, @function
  25. Wmemcpy:
  26. mov r3, r0
  27. cmplti r2, 4 /* If len less than 4 bytes */
  28. jbt .L_copy_by_byte
  29. mov r12, r0
  30. andi r12, 3
  31. bnez r12, .L_dest_not_aligned /* If dest is not 4 bytes aligned */
  32. .L0:
  33. mov r12, r1
  34. andi r12, 3
  35. bnez r12, .L_dest_aligned_but_src_not_aligned /* If dest is aligned, but src is not aligned */
  36. cmplti r2, 16 /* dest and src are all aligned */
  37. jbt .L_aligned_and_len_less_16bytes /* If len less than 16 bytes */
  38. .L_aligned_and_len_larger_16bytes: /* src and dst are all aligned, and len > 16 bytes */
  39. ldw r18, (r1, 0)
  40. ldw r19, (r1, 4)
  41. ldw r20, (r1, 8)
  42. ldw r21, (r1, 12)
  43. stw r18, (r3, 0)
  44. stw r19, (r3, 4)
  45. stw r20, (r3, 8)
  46. stw r21, (r3, 12)
  47. subi r2, 16
  48. addi r1, 16
  49. addi r3, 16
  50. cmplti r2, 16
  51. jbf .L_aligned_and_len_larger_16bytes
  52. .L_aligned_and_len_less_16bytes:
  53. cmplti r2, 4
  54. jbt .L_copy_by_byte
  55. ldw r18, (r1, 0)
  56. stw r18, (r3, 0)
  57. subi r2, 4
  58. addi r1, 4
  59. addi r3, 4
  60. jbr .L_aligned_and_len_less_16bytes
  61. .L_copy_by_byte: /* len less than 4 bytes */
  62. cmpnei r2, 0
  63. jbf .L_return
  64. ldb r18, (r1, 0)
  65. stb r18, (r3, 0)
  66. subi r2, 1
  67. addi r1, 1
  68. addi r3, 1
  69. jbr .L_copy_by_byte
  70. .L_return:
  71. rts
  72. /* If dest is not aligned, just copying some bytes makes the dest align.
  73. After that, we judge whether the src is aligned. */
  74. .L_dest_not_aligned:
  75. rsub r13, r1, r3 /* consider overlapped case */
  76. abs r13, r13
  77. cmplt r13, r2
  78. jbt .L_copy_by_byte
  79. .L1:
  80. ldb r18, (r1, 0) /* makes the dest align. */
  81. stb r18, (r3, 0)
  82. addi r12, 1
  83. subi r2, 1
  84. addi r1, 1
  85. addi r3, 1
  86. cmpnei r12, 4
  87. jbt .L1
  88. cmplti r2, 4
  89. jbt .L_copy_by_byte
  90. jbf .L0 /* judge whether the src is aligned. */
  91. .L_dest_aligned_but_src_not_aligned:
  92. rsub r13, r1, r3 /* consider overlapped case */
  93. abs r13, r13
  94. cmplt r13, r2
  95. jbt .L_copy_by_byte
  96. bclri r1, 0
  97. bclri r1, 1
  98. ldw r18, (r1, 0)
  99. addi r1, 4
  100. movi r13, 8
  101. mult r13, r12
  102. mov r24, r13 /* r12 is used to store the misaligned bits */
  103. rsubi r13, 32
  104. mov r25, r13
  105. cmplti r2, 16
  106. jbt .L_not_aligned_and_len_less_16bytes
  107. .L_not_aligned_and_len_larger_16bytes:
  108. ldw r20, (r1, 0)
  109. ldw r21, (r1, 4)
  110. ldw r22, (r1, 8)
  111. ldw r23, (r1, 12)
  112. GET_FRONT_BITS r18 r24 /* little or big endian? */
  113. mov r19, r20
  114. GET_AFTER_BITS r20 r25
  115. or r20, r18
  116. GET_FRONT_BITS r19 r24
  117. mov r18, r21
  118. GET_AFTER_BITS r21 r13
  119. or r21, r19
  120. GET_FRONT_BITS r18 r24
  121. mov r19, r22
  122. GET_AFTER_BITS r22 r25
  123. or r22, r18
  124. GET_FRONT_BITS r19 r24
  125. mov r18, r23
  126. GET_AFTER_BITS r23 r25
  127. or r23, r19
  128. stw r20, (r3, 0)
  129. stw r21, (r3, 4)
  130. stw r22, (r3, 8)
  131. stw r23, (r3, 12)
  132. subi r2, 16
  133. addi r1, 16
  134. addi r3, 16
  135. cmplti r2, 16
  136. jbf .L_not_aligned_and_len_larger_16bytes
  137. .L_not_aligned_and_len_less_16bytes:
  138. cmplti r2, 4
  139. jbf .L2
  140. rsubi r12, 4 /* r12 is used to stored the misaligned bits */
  141. subu r1, r12 /* initial the position */
  142. jbr .L_copy_by_byte
  143. .L2:
  144. ldw r21, (r1, 0)
  145. GET_FRONT_BITS r18 r24
  146. mov r19, r18
  147. mov r18, r21
  148. GET_AFTER_BITS r21 r25
  149. or r21, r19
  150. stw r21, (r3, 0)
  151. subi r2, 4
  152. addi r1, 4
  153. addi r3, 4
  154. jbr .L_not_aligned_and_len_less_16bytes
  155. .size Wmemcpy, .-Wmemcpy
  156. libc_hidden_def(Wmemcpy)
  157. .weak Wmemcpy