memcpy.S 9.8 KB


  1. /*
  2. * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
  3. * Copyright (C) 2008-2009 PetaLogix
  4. * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
  5. *
  6. * This file is subject to the terms and conditions of the GNU General
  7. * Public License. See the file COPYING in the main directory of this
  8. * archive for more details.
  9. *
  10. * Written by Jim Law <jlaw@irispower.com>
  11. *
  12. * intended to replace:
  13. * memcpy in memcpy.c and
  14. * memmove in memmove.c
  15. * ... in arch/microblaze/lib
  16. *
  17. *
  18. * assly_fastcopy.S
  19. *
  20. * Attempt at quicker memcpy and memmove for MicroBlaze
  21. * Input : Operand1 in Reg r5 - destination address
  22. * Operand2 in Reg r6 - source address
  23. * Operand3 in Reg r7 - number of bytes to transfer
  24. * Output: Result in Reg r3 - starting destination address
  25. *
  26. *
  27. * Explanation:
  28. * Perform (possibly unaligned) copy of a block of memory
  29. * between mem locations with size of xfer spec'd in bytes
  30. */
  .text
  .globl  memcpy
  .type   memcpy, @function
  .ent    memcpy

/*
 * void *memcpy(void *d, const void *s, size_t c)
 *
 * In:      r5 = d (destination), r6 = s (source), r7 = c (bytes to copy)
 * Out:     r3 = the original value of d
 * Writes:  r3, r4, r8, r9, r10, r11, r12; r5/r6/r7 are advanced/consumed.
 *
 * Register roles used throughout:
 *   r4  n       bytes the current phase still has to move
 *   r8  as      source address rounded down to a word boundary
 *   r9  t1      scratch / merged word about to be stored
 *   r10 offset  running byte offset in the word phase
 *   r11 h       bytes held over from the previous aligned source word
 *   r12 v       aligned source word just loaded
 *
 * Phases (addresses ascend):
 *   1. c < 4             -> byte loop only (a_xfer_end).
 *   2. d not word aligned -> copy 1..3 bytes until it is.
 *   3. c >= 32 and s not word aligned -> 32-byte blocks, every destination
 *      word merged from two neighbouring aligned source words.
 *   4. remaining whole words, copied directly (s aligned) or merged.
 *   5. 0..3 trailing bytes.
 *
 * The merged paths only ever issue aligned word loads; the last word they
 * load is the aligned word that contains the final source byte they need.
 * They use the barrel-shift instructions bslli/bsrli.
 */

/*
 * Byte order of the merge.
 *
 * With k = s & 3, a destination word is made of the last (4 - k) bytes of
 * the held source word followed by the first k bytes of the next one.
 *   big-endian:    t1 = (h << 8k) | (v >> (32 - 8k))
 *   little-endian: t1 = (h >> 8k) | (v << (32 - 8k))
 * The shift amounts are the same, only the directions swap, so the
 * direction is chosen here once.  __MICROBLAZEEL__ is the macro the
 * compiler predefines for little-endian MicroBlaze targets.
 */
#ifdef __MICROBLAZEEL__
# define MEMCPY_HELD_SHIFT bsrli
# define MEMCPY_NEXT_SHIFT bslli
#else
# define MEMCPY_HELD_SHIFT bslli
# define MEMCPY_NEXT_SHIFT bsrli
#endif

/*
 * Block phase: emit one destination word at d + dst_off from the held
 * bytes in r11 and the aligned source word at as + src_off, then keep
 * the unused bytes of that source word in r11 for the next merge.
 */
  .macro  a_merge_word src_off, dst_off, held_sh, next_sh
  lwi     r12, r8, \src_off             /* v = *(as + src_off) */
  MEMCPY_NEXT_SHIFT r9, r12, \next_sh   /* t1 = leading bytes of v */
  or      r9, r11, r9                   /* t1 = h | t1 */
  swi     r9, r5, \dst_off              /* *(d + dst_off) = t1 */
  MEMCPY_HELD_SHIFT r11, r12, \held_sh  /* h = remaining bytes of v */
  .endm

/* Block phase: eight consecutive merges, i.e. one 32-byte block. */
  .macro  a_merge_block held_sh, next_sh
  a_merge_word 4,  0,  \held_sh, \next_sh
  a_merge_word 8,  4,  \held_sh, \next_sh
  a_merge_word 12, 8,  \held_sh, \next_sh
  a_merge_word 16, 12, \held_sh, \next_sh
  a_merge_word 20, 16, \held_sh, \next_sh
  a_merge_word 24, 20, \held_sh, \next_sh
  a_merge_word 28, 24, \held_sh, \next_sh
  a_merge_word 32, 28, \held_sh, \next_sh
  .endm

/*
 * Word phase: same merge as a_merge_word, but addressed with the running
 * offset in r10 (as + offset -> d + offset).
 */
  .macro  a_merge_indexed held_sh, next_sh
  lw      r12, r8, r10                  /* v = *(as + offset) */
  MEMCPY_NEXT_SHIFT r9, r12, \next_sh   /* t1 = leading bytes of v */
  or      r9, r11, r9                   /* t1 = h | t1 */
  sw      r9, r5, r10                   /* *(d + offset) = t1 */
  MEMCPY_HELD_SHIFT r11, r12, \held_sh  /* h = remaining bytes of v */
  .endm

memcpy:
fast_memcpy_ascending:
  addi    r3, r5, 0             /* return value = d */

  /* Phase 1: fewer than 4 bytes -> straight to the byte loop. */
  addi    r4, r0, 4             /* n = 4 */
  cmpu    r4, r4, r7            /* n = c - n, sign bit set if c < 4 (unsigned) */
  blti    r4, a_xfer_end

  /* Phase 2: byte-copy until d is word aligned. */
  andi    r4, r5, 3             /* n = d & 3 */
  beqi    r4, a_dalign_done     /* already aligned */
  rsubi   r4, r4, 4             /* n = 4 - n: 3, 2 or 1 bytes to copy */
  rsub    r7, r4, r7            /* c -= n (c >= 4 here, so c stays >= 1) */

a_xfer_first_loop:
  beqi    r4, a_dalign_done     /* alignment bytes done */
  lbui    r11, r6, 0            /* h = *s */
  sbi     r11, r5, 0            /* *d = h */
  addi    r6, r6, 1             /* s++ */
  addi    r5, r5, 1             /* d++ */
  brid    a_xfer_first_loop
  addi    r4, r4, -1            /* n-- (delay slot) */

a_dalign_done:
  /* Phase 3 is only entered with at least one full 32-byte block left. */
  addi    r4, r0, 32            /* n = 32 */
  cmpu    r4, r4, r7            /* n = c - n, sign bit set if c < 32 (unsigned) */
  blti    r4, a_block_done

a_block_xfer:
  andi    r9, r6, 3             /* t1 = s & 3 */
  beqi    r9, a_word_xfer       /* s aligned too: plain word copy does it all */

a_block_unaligned:
  andi    r4, r7, 0xffffffe0    /* n = c & ~31: bytes moved as whole blocks */
  rsub    r7, r4, r7            /* c -= n */
  andi    r8, r6, 0xfffffffc    /* as = s & ~3 */
  add     r6, r6, r4            /* s += n (loops below walk 'as', not s) */
  lwi     r11, r8, 0            /* h = *(as + 0) */

  /* Dispatch on the source misalignment k = s & 3 (1, 2 or 3). */
  addi    r9, r9, -1
  beqi    r9, a_block_u1        /* k == 1 */
  addi    r9, r9, -1
  beqi    r9, a_block_u2        /* k == 2 */

a_block_u3:                     /* k == 3: 1 held byte + 3 new bytes */
  MEMCPY_HELD_SHIFT r11, r11, 24
a_bu3_loop:
  a_merge_block 24, 8
  addi    r8, r8, 32            /* as += 32 */
  addi    r4, r4, -32           /* n -= 32 */
  bneid   r4, a_bu3_loop        /* while (n) */
  addi    r5, r5, 32            /* d += 32 (delay slot) */
  bri     a_block_done

a_block_u1:                     /* k == 1: 3 held bytes + 1 new byte */
  MEMCPY_HELD_SHIFT r11, r11, 8
a_bu1_loop:
  a_merge_block 8, 24
  addi    r8, r8, 32            /* as += 32 */
  addi    r4, r4, -32           /* n -= 32 */
  bneid   r4, a_bu1_loop        /* while (n) */
  addi    r5, r5, 32            /* d += 32 (delay slot) */
  bri     a_block_done

a_block_u2:                     /* k == 2: 2 held bytes + 2 new bytes */
  MEMCPY_HELD_SHIFT r11, r11, 16
a_bu2_loop:
  a_merge_block 16, 16
  addi    r8, r8, 32            /* as += 32 */
  addi    r4, r4, -32           /* n -= 32 */
  bneid   r4, a_bu2_loop        /* while (n) */
  addi    r5, r5, 32            /* d += 32 (delay slot) */
  /* falls through */

a_block_done:
  /* Phase 4 needs at least one whole word. */
  addi    r4, r0, 4             /* n = 4 */
  cmpu    r4, r4, r7            /* n = c - n, sign bit set if c < 4 (unsigned) */
  blti    r4, a_xfer_end

a_word_xfer:
  andi    r4, r7, 0xfffffffc    /* n = c & ~3 (non-zero: c >= 4 here) */
  addi    r10, r0, 0            /* offset = 0 */
  andi    r9, r6, 3             /* t1 = s & 3 */
  bnei    r9, a_word_unaligned  /* misaligned source: merge words */

a_word_aligned:
  lw      r9, r6, r10           /* t1 = *(s + offset) */
  sw      r9, r5, r10           /* *(d + offset) = t1 */
  addi    r4, r4, -4            /* n -= 4 */
  bneid   r4, a_word_aligned    /* while (n) */
  addi    r10, r10, 4           /* offset += 4 (delay slot) */
  bri     a_word_done

a_word_unaligned:
  andi    r8, r6, 0xfffffffc    /* as = s & ~3 */
  lwi     r11, r8, 0            /* h = *(as + 0) */
  addi    r8, r8, 4             /* as += 4: as + offset is now the next word */

  /* Dispatch on the source misalignment k = s & 3 (1, 2 or 3). */
  addi    r9, r9, -1
  beqi    r9, a_word_u1         /* k == 1 */
  addi    r9, r9, -1
  beqi    r9, a_word_u2         /* k == 2 */

a_word_u3:                      /* k == 3 */
  MEMCPY_HELD_SHIFT r11, r11, 24
a_wu3_loop:
  a_merge_indexed 24, 8
  addi    r4, r4, -4            /* n -= 4 */
  bneid   r4, a_wu3_loop        /* while (n) */
  addi    r10, r10, 4           /* offset += 4 (delay slot) */
  bri     a_word_done

a_word_u1:                      /* k == 1 */
  MEMCPY_HELD_SHIFT r11, r11, 8
a_wu1_loop:
  a_merge_indexed 8, 24
  addi    r4, r4, -4            /* n -= 4 */
  bneid   r4, a_wu1_loop        /* while (n) */
  addi    r10, r10, 4           /* offset += 4 (delay slot) */
  bri     a_word_done

a_word_u2:                      /* k == 2 */
  MEMCPY_HELD_SHIFT r11, r11, 16
a_wu2_loop:
  a_merge_indexed 16, 16
  addi    r4, r4, -4            /* n -= 4 */
  bneid   r4, a_wu2_loop        /* while (n) */
  addi    r10, r10, 4           /* offset += 4 (delay slot) */
  /* falls through */

a_word_done:
  add     r5, r5, r10           /* d += offset */
  add     r6, r6, r10           /* s += offset */
  rsub    r7, r10, r7           /* c -= offset */

  /* Phase 5: whatever is left (0..3 bytes, or all of a short copy). */
a_xfer_end:
a_xfer_end_loop:
  beqi    r7, a_done            /* while (c) */
  lbui    r9, r6, 0             /* t1 = *s */
  addi    r6, r6, 1             /* s++ */
  sbi     r9, r5, 0             /* *d = t1 */
  addi    r7, r7, -1            /* c-- */
  brid    a_xfer_end_loop
  addi    r5, r5, 1             /* d++ (delay slot) */

a_done:
  rtsd    r15, 8                /* return to caller; r3 holds the original d */
  nop                           /* delay slot */

  .size   memcpy, . - memcpy
  .end    memcpy
libc_hidden_def(memcpy)