memcpy.S 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. /* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
  2. /* Modified by SuperH, Inc. September 2003 */
  3. !
  4. ! Fast SH memcpy
  5. !
  6. ! by Toshiyasu Morita (tm@netcom.com)
  7. ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  8. ! SH5 code Copyright 2002 SuperH Ltd.
  9. !
  10. ! Entry: ARG0: destination pointer
  11. ! ARG1: source pointer
  12. ! ARG2: byte count
  13. !
  14. ! Exit: RESULT: destination pointer
  15. ! any other registers in the range r0-r7: trashed
  16. !
  17. ! Notes: Usually one wants to do small reads and write a longword, but
  18. ! unfortunately it is difficult in some cases to concatenate bytes
  19. ! into a longword on the SH, so this does a longword read and small
  20. ! writes.
  21. !
  22. ! This implementation makes two assumptions about how it is called:
  23. !
  24. ! 1.: If the byte count is nonzero, the address of the last byte to be
  25. ! copied is unsigned greater than the address of the first byte to
  26. ! be copied. This could be easily swapped for a signed comparison,
  27. ! but the algorithm used needs some comparison.
  28. !
  29. ! 2.: When there are two or three bytes in the last word of an 11-or-more
  30. ! bytes memory chunk to be copied, the rest of the word can be read
  31. ! without side effects.
  32. ! This could be easily changed by increasing the minimum size of
  33. ! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  34. ! however, this would cost a few extra cycles on average.
  35. ! For SHmedia, the assumption is that any quadword can be read in its
  36. ! entirety if at least one byte is included in the copy.
  37. !
  38. #include <features.h>
  39. .section .text..SHmedia32,"ax"
  40. .globl __memcpy
  41. .hidden __memcpy
  42. .type __memcpy, @function
  43. .align 5
      /*
       * __memcpy: copy r4 bytes from the address in r3 to the address in r2.
       * r2 is never written anywhere in this routine, so it still holds the
       * destination pointer when the routine returns.
       *
       * Counts of 25 or more branch to Large.  Smaller counts go through a
       * computed branch into a table of 32-byte code slots that starts at L1.
       * The slot index is 63 - nsb(count), which matches the slot comments
       * further down:
       *   0 -> slot 0,  1 -> slot 1,  2..3 -> slot 2,  4..7 -> slot 3,
       *   8..15 -> slot 4,  16..24 -> slot 5.
       *
       * Values handed to the small-copy slots:
       *   r5 = dst + count, r6 = src + count, tr1 = return target from r18.
       */
  44. __memcpy:
      /*
       * Unaligned access helpers.  Each expands to a lo/hi instruction pair
       * addressing offsets O and O+7 (quad) or O and O+3 (long) from P.  The
       * load forms leave the two parts in D0 and D1; the code below ORs them
       * together to form the full value.  STUAQ and STUAL are defined but not
       * used in the code shown here.
       */
  45. #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  46. #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  47. #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  48. #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  49. ld.b r3,0,r63 /* read src[0] into r63; the value is not used (presumably an early touch of the source - confirm) */
  50. pta/l Large,tr0 /* tr0 = Large */
  51. movi 25,r0 /* threshold: 25 bytes and up use the Large path */
  52. bgeu/u r4,r0,tr0 /* if count >= 25 (unsigned) goto Large */
  53. nsb r4,r0 /* r0 = nsb(count); 63 for count 0, one less per extra significant bit */
  54. shlli r0,5,r0 /* r0 = nsb(count) * 32 (slot size in bytes) */
  55. movi (L1-L0+63*32 + 1) & 0xffff,r1 /* offset of slot 63 relative to L0; the +1 is presumably the SHmedia mode bit of the target - confirm */
  56. sub r1, r0, r0 /* r0 = (L1 - L0) + (63 - nsb(count)) * 32 + 1 */
  57. L0: ptrel r0,tr0 /* tr0 = L0 + r0, i.e. the selected slot */
  58. add r2,r4,r5 /* r5 = dst + count (one past the last destination byte) */
  59. ptabs r18,tr1 /* tr1 = address held in r18; every small-copy path returns through tr1 */
  60. add r3,r4,r6 /* r6 = src + count (one past the last source byte) */
  61. blink tr0,r63 /* jump to the slot; the link value goes to r63 and is discarded */
  62. /* Rearranged to make cut2 safe */
      /*
       * Small-copy code, reached through the computed branch in __memcpy.
       * Register state on entry to a slot:
       *   r2 = dst, r3 = src, r4 = count,
       *   r5 = dst + count, r6 = src + count, tr1 = return target.
       *
       * The slot table starts at L1.  Every slot except the last is exactly
       * 8 instructions (32 bytes), because the dispatcher scales the slot
       * index by 32.  The spare room in slot 0 holds L2_3 and the spare room
       * in slot 1 holds L8_15, so instructions in this region must not be
       * added, removed or reordered without redoing the layout.  L4_7 sits
       * ahead of the table, outside any slot.
       *
       * The 4..7, 8..15 and 16..24 handlers copy the leading bytes and the
       * trailing bytes as separate unaligned accesses; the two ranges may
       * overlap, which covers every count in the range without a loop.
       * Besides r0-r7, the 16..24 handler also writes r8 and r9.
       */
  63. .balign 8
  64. L4_7: /* 4..7 byte memcpy cntd. */
  65. stlo.l r2, 0, r0 /* finish storing the first longword at dst[0..3] */
  66. or r6, r7, r6 /* r6 = last 4 source bytes (both parts combined) */
  67. sthi.l r5, -1, r6 /* store them at dst+count-4 .. dst+count-1 */
  68. stlo.l r5, -4, r6
  69. blink tr1,r63 /* return */
  70. .balign 8
  71. L1: /* 0 byte memcpy */
  72. nop
  73. blink tr1,r63 /* nothing to copy: return */
  74. nop /* padding so that L2_3 fills the end of slot 0 */
  75. nop
  76. nop
  77. nop
  78. L2_3: /* 2 or 3 byte memcpy cntd. */
  79. st.b r5,-1,r6 /* dst[count-1] = last source byte (loaded into r6 by the 2..3 slot) */
  80. blink tr1,r63 /* return */
  81. /* 1 byte memcpy */
  82. ld.b r3,0,r0 /* r0 = src[0] */
  83. st.b r2,0,r0 /* dst[0] = r0 */
  84. blink tr1,r63 /* return */
  85. L8_15: /* 8..15 byte memcpy cntd. */
  86. stlo.q r2, 0, r0 /* finish storing the first quadword at dst[0..7] */
  87. or r6, r7, r6 /* r6 = last 8 source bytes (both parts combined) */
  88. sthi.q r5, -1, r6 /* store them at dst+count-8 .. dst+count-1 */
  89. stlo.q r5, -8, r6
  90. blink tr1,r63 /* return */
  91. /* 2 or 3 byte memcpy */
  92. ld.b r3,0,r0 /* r0 = src[0] */
  93. ld.b r2,0,r63 /* read dst[0]; the value is discarded into r63 */
  94. ld.b r3,1,r1 /* r1 = src[1] */
  95. st.b r2,0,r0 /* dst[0] = src[0] */
  96. pta/l L2_3,tr0 /* continuation lives in the spare room of slot 0 */
  97. ld.b r6,-1,r6 /* r6 = src[count-1] (same byte as src[1] when count is 2) */
  98. st.b r2,1,r1 /* dst[1] = src[1] */
  99. blink tr0, r63 /* goto L2_3 */
  100. /* 4 .. 7 byte memcpy */
  101. LDUAL (r3, 0, r0, r1) /* r0, r1 = the two parts of src[0..3] */
  102. pta L4_7, tr0 /* continuation is placed ahead of the table */
  103. ldlo.l r6, -4, r7 /* r7 = one part of the last 4 source bytes */
  104. or r0, r1, r0 /* r0 = src[0..3] */
  105. sthi.l r2, 3, r0 /* start storing the first longword at dst[0..3] */
  106. ldhi.l r6, -1, r6 /* r6 = other part of the last 4 source bytes */
  107. blink tr0, r63 /* goto L4_7 */
  108. /* 8 .. 15 byte memcpy */
  109. LDUAQ (r3, 0, r0, r1) /* r0, r1 = the two parts of src[0..7] */
  110. pta L8_15, tr0 /* continuation lives in the spare room of slot 1 */
  111. ldlo.q r6, -8, r7 /* r7 = one part of the last 8 source bytes */
  112. or r0, r1, r0 /* r0 = src[0..7] */
  113. sthi.q r2, 7, r0 /* start storing the first quadword at dst[0..7] */
  114. ldhi.q r6, -1, r6 /* r6 = other part of the last 8 source bytes */
  115. blink tr0, r63 /* goto L8_15 */
  116. /* 16 .. 24 byte memcpy */
  117. LDUAQ (r3, 0, r0, r1) /* r0, r1 = the two parts of src[0..7] */
  118. LDUAQ (r3, 8, r8, r9) /* r8, r9 = the two parts of src[8..15] */
  119. or r0, r1, r0 /* r0 = src[0..7] */
  120. sthi.q r2, 7, r0 /* start storing dst[0..7] */
  121. or r8, r9, r8 /* r8 = src[8..15] */
  122. sthi.q r2, 15, r8 /* start storing dst[8..15] */
  123. ldlo.q r6, -8, r7 /* r7, r6 = the two parts of the last 8 source bytes */
  124. ldhi.q r6, -1, r6
  125. stlo.q r2, 8, r8 /* finish storing dst[8..15] */
  126. stlo.q r2, 0, r0 /* finish storing dst[0..7] */
  127. or r6, r7, r6 /* r6 = last 8 source bytes */
  128. sthi.q r5, -1, r6 /* store them at dst+count-8 .. dst+count-1 */
  129. stlo.q r5, -8, r6
  130. blink tr1,r63 /* return */
  131. Large:
      /*
       * Copy path for counts of 25 bytes or more (r2 = dst, r3 = src,
       * r4 = count).
       *
       * The source is read with ldx.q at r22 + r6, where r6 = src - dst and
       * r22 is a destination cursor chosen so that r22 + r6 is 8-byte
       * aligned.  The destination is written with sthi.q/stlo.q pairs, so it
       * may have any alignment.  r0 always holds the source quad that has
       * been loaded but not yet stored.
       *
       * Loop_line moves 32 bytes per iteration and is entered only when
       * count >= 72; Loop_ua moves 8 bytes per iteration; the tail stores
       * the pending quad and then the last 8 bytes of the buffer as one
       * unaligned quad.
       *
       * Registers written here: r0, r1, r5, r6, r7, r19-r25, r27, r36,
       * tr0 and tr1.  That is more than the r0-r7 range named in the file
       * header.
       */
  132. ld.b r2, 0, r63 /* read dst[0]; the value is discarded into r63 */
  133. pta/l Loop_ua, tr1 /* tr1 = Loop_ua (reloaded with the return target in the tail) */
  134. ori r3, -8, r7 /* r7 = (src & 7) - 8, in the range -8..-1 */
  135. sub r2, r7, r22 /* r22 = dst + 8 - (src & 7): destination cursor */
  136. sub r3, r2, r6 /* r6 = src - dst, so r22 + r6 is the matching source address */
  137. add r2, r4, r5 /* r5 = dst + count */
  138. ldlo.q r3, 0, r0 /* r0 = low part of the unaligned quad at src */
  139. addi r5, -16, r5 /* r5 = dst + count - 16: Loop_ua runs while r5 > r22 */
  140. movi 64+8, r27 // r27 = 72: counts below this skip Loop_line. could subtract r7 from that.
  141. stlo.q r2, 0, r0 /* store r0 as an unaligned quad at dst[0..7]; */
  142. sthi.q r2, 7, r0 /* bytes from r22 onward are rewritten by the stores below */
  143. ldx.q r22, r6, r0 /* r0 = first aligned source quad (at r22 + r6) */
  144. bgtu/l r27, r4, tr1 /* if 72 > count goto Loop_ua */
  145. addi r5, -48, r27 /* r27 = dst + count - 64: Loop_line bound */
  146. pta/l Loop_line, tr0
  147. addi r6, 64, r36 /* r36: offset of the source 64 bytes ahead of the cursor */
  148. addi r6, -24, r19 /* r19/r20/r21: source offsets for the three quads */
  149. addi r6, -16, r20 /* at cursor-24, cursor-16 and cursor-8, */
  150. addi r6, -8, r21 /* used after the cursor has been advanced by 32 */
  151. Loop_line:
  152. ldx.q r22, r36, r63 /* read the source 64 bytes ahead; value discarded (presumably a prefetch - confirm) */
  153. alloco r22, 32 /* cache-block allocate at destination address r22 + 32 (see the SH-5 manual for the exact effect) */
  154. addi r22, 32, r22 /* advance the cursor by 32; the offsets below refer back to the old position */
  155. ldx.q r22, r19, r23 /* r23 = source quad for dst[r22-24 .. r22-17] */
  156. sthi.q r22, -25, r0 /* store pending quad r0 at r22-32 .. r22-25 (with stlo.q below) */
  157. ldx.q r22, r20, r24 /* r24 = source quad for dst[r22-16 .. r22-9] */
  158. ldx.q r22, r21, r25 /* r25 = source quad for dst[r22-8 .. r22-1] */
  159. stlo.q r22, -32, r0
  160. ldx.q r22, r6, r0 /* r0 = next pending quad (source for the new cursor position) */
  161. sthi.q r22, -17, r23 /* store r23, r24 and r25 as unaligned quads */
  162. sthi.q r22, -9, r24
  163. sthi.q r22, -1, r25
  164. stlo.q r22, -24, r23
  165. stlo.q r22, -16, r24
  166. stlo.q r22, -8, r25
  167. bgeu r27, r22, tr0 /* loop while r27 >= r22 */
  168. Loop_ua:
  169. addi r22, 8, r22 /* advance the cursor by 8 */
  170. sthi.q r22, -1, r0 /* store pending quad r0 at r22-8 .. r22-1 */
  171. stlo.q r22, -8, r0
  172. ldx.q r22, r6, r0 /* r0 = next aligned source quad */
  173. bgtu/l r5, r22, tr1 /* loop while dst + count - 16 > r22 */
  174. add r3, r4, r7 /* r7 = src + count */
  175. ldlo.q r7, -8, r1 /* r1 = one part of the last 8 source bytes */
  176. sthi.q r22, 7, r0 /* store pending quad r0 at r22 .. r22+7 (with stlo.q below) */
  177. ldhi.q r7, -1, r7 /* r7 = other part of the last 8 source bytes */
  178. ptabs r18,tr1 /* tr1 = address held in r18, used for the return */
  179. stlo.q r22, 0, r0
  180. or r1, r7, r1 /* r1 = last 8 source bytes */
  181. sthi.q r5, 15, r1 /* store them at dst+count-8 .. dst+count-1 (r5 = dst+count-16) */
  182. stlo.q r5, 8, r1
  183. blink tr1, r63 /* return; r2 still holds dst */
  184. .size __memcpy,.-__memcpy
  185. strong_alias(__memcpy,memcpy) /* strong_alias is not defined in this file; presumably makes memcpy an alias of __memcpy - confirm */