memcpy.S 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /*
  2. * Copyright (C) 2020 Kalray Inc.
  3. *
  4. * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
  5. * in this tarball.
  6. */
  7. #include <sysdep.h>
  /*
   * memcpy: copy $r2 bytes from the address in $r1 to the address in $r0.
   *
   * Register use in this routine:
   *   $r0        store base, advanced as data is written; restored from $r6
   *              just before ret
   *   $r1        load base, advanced as data is read
   *   $r2        bytes still to copy
   *   $r3        result of the "count >= 256" compare
   *   $r6        copy of $r0 taken at entry
   *   $r7        trip count of the two hardware loops, later "count & 1"
   *   $r8-$r11   "count & 2", "count & 4", "count & 8", "count & 16"
   *   $r32-$r63  data in flight
   *
   * ";;" closes an instruction bundle; the instructions of one bundle issue
   * together, which is why every load that shares a bundle with a skip
   * branch below is predicated on the opposite condition.
   * This routine performs no alignment or overlap checks on its pointers.
   */
  8. .align 16
  9. ENTRY(memcpy)
  /*
   * Zero-length copy returns at once. $r3 becomes the "count >= 256" flag.
   * NOTE(review): .Lreturn reads $r6, so the copyd in this bundle must take
   * effect even when the branch is taken - confirm against the KVX bundle
   * semantics.
   */
  10. cb.deqz $r2? .Lreturn
  11. compd.geu $r3 = $r2, 256
  12. copyd $r6 = $r0
  13. ;;
  /* Fewer than 256 bytes: skip the streaming path entirely. */
  14. cb.deqz $r3? .Lremaining_256
  15. ;;
  /*
   * Streaming path prologue: preload the first 256 bytes into $r32-$r63,
   * 16 bytes (one register pair) per lq. $r2 is reduced by the 256 bytes
   * now in flight.
   * NOTE(review): ".u" is presumably the uncached load variant - confirm
   * against the ISA manual.
   */
  16. lq.u $r32r33 = 0[$r1]
  17. addd $r2 = $r2, -256
  18. ;;
  19. lq.u $r34r35 = 16[$r1]
  20. ;;
  /* $r7 = (count - 256) / 256: number of further full 256-byte blocks. */
  21. lq.u $r36r37 = 32[$r1]
  22. srld $r7 = $r2, 8
  23. ;;
  24. lq.u $r38r39 = 48[$r1]
  25. ;;
  26. lq.u $r40r41 = 64[$r1]
  27. ;;
  28. lq.u $r42r43 = 80[$r1]
  29. ;;
  30. lq.u $r44r45 = 96[$r1]
  31. ;;
  32. lq.u $r46r47 = 112[$r1]
  33. ;;
  34. lq.u $r48r49 = 128[$r1]
  35. ;;
  36. lq.u $r50r51 = 144[$r1]
  37. ;;
  38. lq.u $r52r53 = 160[$r1]
  39. ;;
  40. lq.u $r54r55 = 176[$r1]
  41. ;;
  42. lq.u $r56r57 = 192[$r1]
  43. ;;
  /*
   * NOTE(review): the value written to $r3 here is not read again anywhere
   * below in this routine; the compare looks redundant.
   */
  44. lq.u $r58r59 = 208[$r1]
  45. compd.geu $r3 = $r2, 256
  46. ;;
  47. lq.u $r60r61 = 224[$r1]
  48. ;;
  49. lq.u $r62r63 = 240[$r1]
  50. addd $r1 = $r1, 256
  51. ;;
  /* No further full block: go straight to flushing the preloaded data. */
  52. cb.deqz $r7? .Lstreaming_loop_end
  53. ;;
  /*
   * Hardware loop, $r7 trips, body ends at .Lstreaming_loop_end.
   * Each trip stores the 256 bytes currently held in $r32-$r63 and reloads
   * the same registers with the next 256 source bytes. Every register pair
   * is stored before it is reloaded. $r2 drops by 256 per trip.
   */
  54. loopdo $r7? .Lstreaming_loop_end
  55. ;;
  56. sq 0[$r0] = $r32r33
  57. addd $r2 = $r2, -256
  58. ;;
  59. lq.u $r32r33 = 0[$r1]
  60. ;;
  61. sq 16[$r0] = $r34r35
  62. ;;
  63. lq.u $r34r35 = 16[$r1]
  64. ;;
  65. sq 32[$r0] = $r36r37
  66. ;;
  67. lq.u $r36r37 = 32[$r1]
  68. ;;
  69. sq 48[$r0] = $r38r39
  70. ;;
  71. lq.u $r38r39 = 48[$r1]
  72. ;;
  73. sq 64[$r0] = $r40r41
  74. ;;
  75. lq.u $r40r41 = 64[$r1]
  76. ;;
  77. sq 80[$r0] = $r42r43
  78. ;;
  79. lq.u $r42r43 = 80[$r1]
  80. ;;
  81. sq 96[$r0] = $r44r45
  82. ;;
  83. lq.u $r44r45 = 96[$r1]
  84. ;;
  85. sq 112[$r0] = $r46r47
  86. ;;
  87. lq.u $r46r47 = 112[$r1]
  88. ;;
  89. sq 128[$r0] = $r48r49
  90. ;;
  91. lq.u $r48r49 = 128[$r1]
  92. ;;
  93. sq 144[$r0] = $r50r51
  94. ;;
  95. lq.u $r50r51 = 144[$r1]
  96. ;;
  97. sq 160[$r0] = $r52r53
  98. ;;
  99. lq.u $r52r53 = 160[$r1]
  100. ;;
  101. sq 176[$r0] = $r54r55
  102. ;;
  103. lq.u $r54r55 = 176[$r1]
  104. ;;
  105. sq 192[$r0] = $r56r57
  106. ;;
  107. lq.u $r56r57 = 192[$r1]
  108. ;;
  109. sq 208[$r0] = $r58r59
  110. ;;
  111. lq.u $r58r59 = 208[$r1]
  112. ;;
  113. sq 224[$r0] = $r60r61
  114. ;;
  115. lq.u $r60r61 = 224[$r1]
  116. ;;
  /*
   * The pointer bumps share a bundle with the last access at offset 240, so
   * that access presumably still uses the un-bumped base - confirm against
   * the KVX same-bundle read/write ordering.
   */
  117. sq 240[$r0] = $r62r63
  118. addd $r0 = $r0, 256
  119. ;;
  120. lq.u $r62r63 = 240[$r1]
  121. addd $r1 = $r1, 256
  122. ;;
  /*
   * Streaming path epilogue: flush the last 256 bytes still held in
   * $r32-$r63. No loads here, so the source is not read past the blocks
   * already accounted for.
   */
  123. .Lstreaming_loop_end:
  124. sq 0[$r0] = $r32r33
  125. ;;
  126. sq 16[$r0] = $r34r35
  127. ;;
  128. sq 32[$r0] = $r36r37
  129. ;;
  130. sq 48[$r0] = $r38r39
  131. ;;
  132. sq 64[$r0] = $r40r41
  133. ;;
  134. sq 80[$r0] = $r42r43
  135. ;;
  136. sq 96[$r0] = $r44r45
  137. ;;
  138. sq 112[$r0] = $r46r47
  139. ;;
  140. sq 128[$r0] = $r48r49
  141. ;;
  142. sq 144[$r0] = $r50r51
  143. ;;
  144. sq 160[$r0] = $r52r53
  145. ;;
  146. sq 176[$r0] = $r54r55
  147. ;;
  148. sq 192[$r0] = $r56r57
  149. ;;
  150. sq 208[$r0] = $r58r59
  151. ;;
  152. sq 224[$r0] = $r60r61
  153. ;;
  154. sq 240[$r0] = $r62r63
  155. addd $r0 = $r0, 256
  156. ;;
  /*
   * Tail: $r2 is below 256 on both ways in (direct branch for short copies,
   * or fall-through after the streaming path).
   * $r11 = count & 16 is taken now; it stays valid after the loop below
   * because that loop only subtracts multiples of 32 from $r2.
   * $r7 = count / 32 is the number of 32-byte chunks.
   */
  157. .Lremaining_256:
  158. andd $r11 = $r2, 16
  159. srld $r7 = $r2, 5
  160. ;;
  /* No 32-byte chunk: skip the loop explicitly. */
  161. cb.deqz $r7? .Lloop_32_end
  162. ;;
  /* Hardware loop, $r7 trips: move 32 bytes (four registers) per trip. */
  163. loopdo $r7? .Lloop_32_end
  164. ;;
  165. lo $r32r33r34r35 = 0[$r1]
  166. addd $r1 = $r1, 32
  167. addd $r2 = $r2, -32
  168. ;;
  169. so 0[$r0] = $r32r33r34r35
  170. addd $r0 = $r0, 32
  171. ;;
  /*
   * Fewer than 32 bytes remain; bits 4..0 of the count select one chunk each
   * of 16, 8, 4, 2 and 1 bytes. Each step's bundle also precomputes the flags
   * for later steps. The load is predicated on the same flag the branch
   * tests, so when the chunk is skipped nothing is read from the source.
   */
  172. .Lloop_32_end:
  173. andd $r10 = $r2, 8
  174. andd $r9 = $r2, 4
  175. cb.deqz $r11? .Lloop_remaining_16
  176. lq.u.dnez $r11? $r32r33 = 0[$r1]
  177. ;;
  /* 16-byte chunk present: store it and advance both pointers. */
  178. sq 0[$r0] = $r32r33
  179. addd $r1 = $r1, 16
  180. addd $r0 = $r0, 16
  181. ;;
  /* Despite the label name, this step handles the 8-byte chunk ($r10). */
  182. .Lloop_remaining_16:
  183. andd $r8 = $r2, 2
  184. andd $r7 = $r2, 1
  185. cb.deqz $r10? .Lloop_remaining_8
  186. ld.dnez $r10? $r32 = 0[$r1]
  187. ;;
  188. sd 0[$r0] = $r32
  189. addd $r1 = $r1, 8
  190. addd $r0 = $r0, 8
  191. ;;
  /* 4-byte chunk, selected by $r9 = count & 4. */
  192. .Lloop_remaining_8:
  193. cb.deqz $r9? .Lloop_remaining_4
  194. lwz.dnez $r9? $r32 = 0[$r1]
  195. ;;
  196. sw 0[$r0] = $r32
  197. addd $r1 = $r1, 4
  198. addd $r0 = $r0, 4
  199. ;;
  /* 2-byte chunk, selected by $r8 = count & 2. */
  200. .Lloop_remaining_4:
  201. cb.deqz $r8? .Lloop_remaining_2
  202. lhz.dnez $r8? $r32 = 0[$r1]
  203. ;;
  204. sh 0[$r0] = $r32
  205. addd $r1 = $r1, 2
  206. addd $r0 = $r0, 2
  207. ;;
  /*
   * Final odd byte, selected by $r7 = count & 1. Both the load and the store
   * are predicated on $r7, so no branch is needed; the pointers are not
   * advanced because nothing follows.
   */
  208. .Lloop_remaining_2:
  209. lbz.dnez $r7? $r32 = 0[$r1]
  210. ;;
  211. sb.dnez $r7? 0[$r0] = $r32
  212. ;;
  /*
   * Put the entry value of $r0 back before returning (presumably the ABI
   * return register, giving memcpy's usual "return dest" - confirm).
   */
  213. .Lreturn:
  214. copyd $r0 = $r6
  215. ret
  216. ;;
  217. END(memcpy)
  218. libc_hidden_def(memcpy)