memmove.S 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. /*
  2. * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
  3. * Copyright (C) 2008-2009 PetaLogix
  4. * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
  5. *
  6. * This file is subject to the terms and conditions of the GNU General
  7. * Public License. See the file COPYING in the main directory of this
  8. * archive for more details.
  9. *
  10. * Written by Jim Law <jlaw@irispower.com>
  11. *
  12. * intended to replace:
  13. * memcpy in memcpy.c and
  14. * memmove in memmove.c
  15. * ... in arch/microblaze/lib
  16. *
  17. *
  18. * assly_fastcopy.S
  19. *
  20. * Attempt at quicker memcpy and memmove for MicroBlaze
  21. * Input : Operand1 in Reg r5 - destination address
  22. * Operand2 in Reg r6 - source address
  23. * Operand3 in Reg r7 - number of bytes to transfer
  24. * Output: Result in Reg r3 - starting destination address
  25. *
  26. *
  27. * Explanation:
  28. * Perform (possibly unaligned) copy of a block of memory
  29. * between mem locations with size of xfer spec'd in bytes
  30. */
  31. .globl memmove
  32. .type memmove, @function
  33. .ent memmove
      /*
       * memmove: copy r7 bytes from r6 (source) to r5 (destination) and
       * return the original destination in r3.
       *
       * When the compare below shows that the destination is not above the
       * source (n >= 0), control jumps to memcpy (presumably an ascending
       * copy - confirm against the memcpy implementation). Otherwise both
       * pointers are advanced to the end of their buffers and the copy runs
       * downwards in four phases:
       *   1. single bytes until d is word aligned (skipped when c < 4)
       *   2. 32-byte blocks (aligned or unaligned source)
       *   3. remaining whole words (aligned or unaligned source)
       *   4. remaining bytes, one at a time
       *
       * Scratch registers written here: r4, r8, r9, r10, r11, r12.
       *
       * NOTE(review): the unaligned paths build each word as (v << k) | h
       * with h = v >> (32 - k); that ordering looks big-endian specific -
       * confirm before building for little-endian MicroBlaze.
       */
  34. memmove:
  35. cmpu r4, r5, r6 /* n = s - d (unsigned compare of d against s) */
  36. bgei r4, HIDDEN_JUMPTARGET(memcpy) /* n >= 0: let memcpy do the whole copy */
  37. fast_memcpy_descending:
  38. /* keep the original d in r3: it is the value returned by the function */
  39. addi r3, r5, 0 /* r3 = d */
  40. add r5, r5, r7 /* d = d + c (one past the last dest byte) */
  41. add r6, r6, r7 /* s = s + c (one past the last source byte) */
  42. addi r4, r0, 4 /* n = 4 */
  43. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  44. blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
  45. /* transfer first 0~3 bytes to get aligned dest address */
  46. andi r4, r5, 3 /* n = d & 3 (bytes above the word boundary) */
  47. /* if zero, destination already aligned */
  48. beqi r4,d_dalign_done
  49. rsub r7, r4, r7 /* c = c - n (the n alignment bytes are counted up front) */
  50. d_xfer_first_loop:
  51. /* once n reaches zero d is aligned: go transfer the bulk */
  52. beqi r4,d_dalign_done
  53. addi r6, r6, -1 /* s-- */
  54. addi r5, r5, -1 /* d-- */
  55. lbui r11, r6, 0 /* h = *s */
  56. sbi r11, r5, 0 /* *d = h */
  57. brid d_xfer_first_loop /* loop */
  58. addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
  59. d_dalign_done:
  60. addi r4, r0, 32 /* n = 32 */
  61. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  62. /* if n < 0, less than one block to transfer */
  63. blti r4, d_block_done
  64. d_block_xfer:
  65. andi r4, r7, 0xffffffe0 /* n = c & ~31 (bytes to move as 32-byte blocks) */
  66. rsub r7, r4, r7 /* c = c - n */
  67. andi r9, r6, 3 /* t1 = s & 3 (source offset within its word) */
  68. /* if temp != 0, unaligned transfers needed */
  69. bnei r9, d_block_unaligned
      /* source and dest both word aligned: move 32 bytes per pass, top word first */
  70. d_block_aligned:
  71. addi r6, r6, -32 /* s = s - 32 */
  72. addi r5, r5, -32 /* d = d - 32 */
  73. lwi r9, r6, 28 /* t1 = *(s + 28) */
  74. lwi r10, r6, 24 /* t2 = *(s + 24) */
  75. lwi r11, r6, 20 /* t3 = *(s + 20) */
  76. lwi r12, r6, 16 /* t4 = *(s + 16) */
  77. swi r9, r5, 28 /* *(d + 28) = t1 */
  78. swi r10, r5, 24 /* *(d + 24) = t2 */
  79. swi r11, r5, 20 /* *(d + 20) = t3 */
  80. swi r12, r5, 16 /* *(d + 16) = t4 */
  81. lwi r9, r6, 12 /* t1 = *(s + 12) */
  82. lwi r10, r6, 8 /* t2 = *(s + 8) */
  83. lwi r11, r6, 4 /* t3 = *(s + 4) */
  84. lwi r12, r6, 0 /* t4 = *(s + 0) */
  85. swi r9, r5, 12 /* *(d + 12) = t1 */
  86. swi r10, r5, 8 /* *(d + 8) = t2 */
  87. swi r11, r5, 4 /* *(d + 4) = t3 */
  88. addi r4, r4, -32 /* n = n - 32 */
  89. bneid r4, d_block_aligned /* while (n) loop */
  90. swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
  91. bri d_block_done
      /*
       * Source not word aligned: read whole words through the aligned
       * pointer "as" and merge neighbouring words with shifts. s itself is
       * moved down by n here, before the loops, because the loops only use as.
       */
  92. d_block_unaligned:
  93. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  94. rsub r6, r4, r6 /* s = s - n */
  95. lwi r11, r8, 0 /* h = *(as + 0) */
  96. addi r9, r9, -1 /* t1-- */
  97. beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
  98. addi r9, r9, -1 /* t1-- */
  99. beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
      /* fall through: t1 was 3 => 3 byte offset */
  100. d_block_u3:
  101. bsrli r11, r11, 8 /* h = h >> 8 */
  102. d_bu3_loop:
  103. addi r8, r8, -32 /* as = as - 32 */
  104. addi r5, r5, -32 /* d = d - 32 */
  105. lwi r12, r8, 28 /* v = *(as + 28) */
  106. bslli r9, r12, 24 /* t1 = v << 24 */
  107. or r9, r11, r9 /* t1 = h | t1 */
  108. swi r9, r5, 28 /* *(d + 28) = t1 */
  109. bsrli r11, r12, 8 /* h = v >> 8 */
  110. lwi r12, r8, 24 /* v = *(as + 24) */
  111. bslli r9, r12, 24 /* t1 = v << 24 */
  112. or r9, r11, r9 /* t1 = h | t1 */
  113. swi r9, r5, 24 /* *(d + 24) = t1 */
  114. bsrli r11, r12, 8 /* h = v >> 8 */
  115. lwi r12, r8, 20 /* v = *(as + 20) */
  116. bslli r9, r12, 24 /* t1 = v << 24 */
  117. or r9, r11, r9 /* t1 = h | t1 */
  118. swi r9, r5, 20 /* *(d + 20) = t1 */
  119. bsrli r11, r12, 8 /* h = v >> 8 */
  120. lwi r12, r8, 16 /* v = *(as + 16) */
  121. bslli r9, r12, 24 /* t1 = v << 24 */
  122. or r9, r11, r9 /* t1 = h | t1 */
  123. swi r9, r5, 16 /* *(d + 16) = t1 */
  124. bsrli r11, r12, 8 /* h = v >> 8 */
  125. lwi r12, r8, 12 /* v = *(as + 12) */
  126. bslli r9, r12, 24 /* t1 = v << 24 */
  127. or r9, r11, r9 /* t1 = h | t1 */
  128. swi r9, r5, 12 /* *(d + 12) = t1 */
  129. bsrli r11, r12, 8 /* h = v >> 8 */
  130. lwi r12, r8, 8 /* v = *(as + 8) */
  131. bslli r9, r12, 24 /* t1 = v << 24 */
  132. or r9, r11, r9 /* t1 = h | t1 */
  133. swi r9, r5, 8 /* *(d + 8) = t1 */
  134. bsrli r11, r12, 8 /* h = v >> 8 */
  135. lwi r12, r8, 4 /* v = *(as + 4) */
  136. bslli r9, r12, 24 /* t1 = v << 24 */
  137. or r9, r11, r9 /* t1 = h | t1 */
  138. swi r9, r5, 4 /* *(d + 4) = t1 */
  139. bsrli r11, r12, 8 /* h = v >> 8 */
  140. lwi r12, r8, 0 /* v = *(as + 0) */
  141. bslli r9, r12, 24 /* t1 = v << 24 */
  142. or r9, r11, r9 /* t1 = h | t1 */
  143. swi r9, r5, 0 /* *(d + 0) = t1 */
  144. addi r4, r4, -32 /* n = n - 32 */
  145. bneid r4, d_bu3_loop /* while (n) loop */
  146. bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
  147. bri d_block_done
      /* 1 byte offset: same merge as above with shifts of 24 and 8 */
  148. d_block_u1:
  149. bsrli r11, r11, 24 /* h = h >> 24 */
  150. d_bu1_loop:
  151. addi r8, r8, -32 /* as = as - 32 */
  152. addi r5, r5, -32 /* d = d - 32 */
  153. lwi r12, r8, 28 /* v = *(as + 28) */
  154. bslli r9, r12, 8 /* t1 = v << 8 */
  155. or r9, r11, r9 /* t1 = h | t1 */
  156. swi r9, r5, 28 /* *(d + 28) = t1 */
  157. bsrli r11, r12, 24 /* h = v >> 24 */
  158. lwi r12, r8, 24 /* v = *(as + 24) */
  159. bslli r9, r12, 8 /* t1 = v << 8 */
  160. or r9, r11, r9 /* t1 = h | t1 */
  161. swi r9, r5, 24 /* *(d + 24) = t1 */
  162. bsrli r11, r12, 24 /* h = v >> 24 */
  163. lwi r12, r8, 20 /* v = *(as + 20) */
  164. bslli r9, r12, 8 /* t1 = v << 8 */
  165. or r9, r11, r9 /* t1 = h | t1 */
  166. swi r9, r5, 20 /* *(d + 20) = t1 */
  167. bsrli r11, r12, 24 /* h = v >> 24 */
  168. lwi r12, r8, 16 /* v = *(as + 16) */
  169. bslli r9, r12, 8 /* t1 = v << 8 */
  170. or r9, r11, r9 /* t1 = h | t1 */
  171. swi r9, r5, 16 /* *(d + 16) = t1 */
  172. bsrli r11, r12, 24 /* h = v >> 24 */
  173. lwi r12, r8, 12 /* v = *(as + 12) */
  174. bslli r9, r12, 8 /* t1 = v << 8 */
  175. or r9, r11, r9 /* t1 = h | t1 */
  176. swi r9, r5, 12 /* *(d + 12) = t1 */
  177. bsrli r11, r12, 24 /* h = v >> 24 */
  178. lwi r12, r8, 8 /* v = *(as + 8) */
  179. bslli r9, r12, 8 /* t1 = v << 8 */
  180. or r9, r11, r9 /* t1 = h | t1 */
  181. swi r9, r5, 8 /* *(d + 8) = t1 */
  182. bsrli r11, r12, 24 /* h = v >> 24 */
  183. lwi r12, r8, 4 /* v = *(as + 4) */
  184. bslli r9, r12, 8 /* t1 = v << 8 */
  185. or r9, r11, r9 /* t1 = h | t1 */
  186. swi r9, r5, 4 /* *(d + 4) = t1 */
  187. bsrli r11, r12, 24 /* h = v >> 24 */
  188. lwi r12, r8, 0 /* v = *(as + 0) */
  189. bslli r9, r12, 8 /* t1 = v << 8 */
  190. or r9, r11, r9 /* t1 = h | t1 */
  191. swi r9, r5, 0 /* *(d + 0) = t1 */
  192. addi r4, r4, -32 /* n = n - 32 */
  193. bneid r4, d_bu1_loop /* while (n) loop */
  194. bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
  195. bri d_block_done
      /* 2 byte offset: same merge with shifts of 16 and 16 */
  196. d_block_u2:
  197. bsrli r11, r11, 16 /* h = h >> 16 */
  198. d_bu2_loop:
  199. addi r8, r8, -32 /* as = as - 32 */
  200. addi r5, r5, -32 /* d = d - 32 */
  201. lwi r12, r8, 28 /* v = *(as + 28) */
  202. bslli r9, r12, 16 /* t1 = v << 16 */
  203. or r9, r11, r9 /* t1 = h | t1 */
  204. swi r9, r5, 28 /* *(d + 28) = t1 */
  205. bsrli r11, r12, 16 /* h = v >> 16 */
  206. lwi r12, r8, 24 /* v = *(as + 24) */
  207. bslli r9, r12, 16 /* t1 = v << 16 */
  208. or r9, r11, r9 /* t1 = h | t1 */
  209. swi r9, r5, 24 /* *(d + 24) = t1 */
  210. bsrli r11, r12, 16 /* h = v >> 16 */
  211. lwi r12, r8, 20 /* v = *(as + 20) */
  212. bslli r9, r12, 16 /* t1 = v << 16 */
  213. or r9, r11, r9 /* t1 = h | t1 */
  214. swi r9, r5, 20 /* *(d + 20) = t1 */
  215. bsrli r11, r12, 16 /* h = v >> 16 */
  216. lwi r12, r8, 16 /* v = *(as + 16) */
  217. bslli r9, r12, 16 /* t1 = v << 16 */
  218. or r9, r11, r9 /* t1 = h | t1 */
  219. swi r9, r5, 16 /* *(d + 16) = t1 */
  220. bsrli r11, r12, 16 /* h = v >> 16 */
  221. lwi r12, r8, 12 /* v = *(as + 12) */
  222. bslli r9, r12, 16 /* t1 = v << 16 */
  223. or r9, r11, r9 /* t1 = h | t1 */
  224. swi r9, r5, 12 /* *(d + 12) = t1 */
  225. bsrli r11, r12, 16 /* h = v >> 16 */
  226. lwi r12, r8, 8 /* v = *(as + 8) */
  227. bslli r9, r12, 16 /* t1 = v << 16 */
  228. or r9, r11, r9 /* t1 = h | t1 */
  229. swi r9, r5, 8 /* *(d + 8) = t1 */
  230. bsrli r11, r12, 16 /* h = v >> 16 */
  231. lwi r12, r8, 4 /* v = *(as + 4) */
  232. bslli r9, r12, 16 /* t1 = v << 16 */
  233. or r9, r11, r9 /* t1 = h | t1 */
  234. swi r9, r5, 4 /* *(d + 4) = t1 */
  235. bsrli r11, r12, 16 /* h = v >> 16 */
  236. lwi r12, r8, 0 /* v = *(as + 0) */
  237. bslli r9, r12, 16 /* t1 = v << 16 */
  238. or r9, r11, r9 /* t1 = h | t1 */
  239. swi r9, r5, 0 /* *(d + 0) = t1 */
  240. addi r4, r4, -32 /* n = n - 32 */
  241. bneid r4, d_bu2_loop /* while (n) loop */
  242. bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
      /* the 2 byte offset loop falls through to here; the other block loops branch here */
  243. d_block_done:
  244. addi r4, r0, 4 /* n = 4 */
  245. cmpu r4, r4, r7 /* n = c - n (unsigned) */
  246. blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
      /*
       * Word phase: d, s and c are all moved down by n first, then n is used
       * as a descending index (n-4 ... 0) for the loads and stores.
       */
  247. d_word_xfer:
  248. andi r4, r7, 0xfffffffc /* n = c & ~3 */
  249. rsub r5, r4, r5 /* d = d - n */
  250. rsub r6, r4, r6 /* s = s - n */
  251. rsub r7, r4, r7 /* c = c - n */
  252. andi r9, r6, 3 /* t1 = s & 3 */
  253. /* if temp != 0, unaligned transfers needed */
  254. bnei r9, d_word_unaligned
  255. d_word_aligned:
  256. addi r4, r4,-4 /* n = n - 4 */
  257. lw r9, r6, r4 /* t1 = *(s+n) */
  258. bneid r4, d_word_aligned /* loop */
  259. sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
  260. bri d_word_done
      /* unaligned source: same shift-and-merge scheme as the block phase, one word per pass */
  261. d_word_unaligned:
  262. andi r8, r6, 0xfffffffc /* as = s & ~3 */
  263. lw r11, r8, r4 /* h = *(as + n) */
  264. addi r9, r9, -1 /* t1-- */
  265. beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
  266. addi r9, r9, -1 /* t1-- */
  267. beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
      /* fall through: t1 was 3 => 3 byte offset */
  268. d_word_u3:
  269. bsrli r11, r11, 8 /* h = h >> 8 */
  270. d_wu3_loop:
  271. addi r4, r4,-4 /* n = n - 4 */
  272. lw r12, r8, r4 /* v = *(as + n) */
  273. bslli r9, r12, 24 /* t1 = v << 24 */
  274. or r9, r11, r9 /* t1 = h | t1 */
  275. sw r9, r5, r4 /* *(d + n) = t1 */
  276. bneid r4, d_wu3_loop /* while (n) loop */
  277. bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
  278. bri d_word_done
  279. d_word_u1:
  280. bsrli r11, r11, 24 /* h = h >> 24 */
  281. d_wu1_loop:
  282. addi r4, r4,-4 /* n = n - 4 */
  283. lw r12, r8, r4 /* v = *(as + n) */
  284. bslli r9, r12, 8 /* t1 = v << 8 */
  285. or r9, r11, r9 /* t1 = h | t1 */
  286. sw r9, r5, r4 /* *(d + n) = t1 */
  287. bneid r4, d_wu1_loop /* while (n) loop */
  288. bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
  289. bri d_word_done
  290. d_word_u2:
  291. bsrli r11, r11, 16 /* h = h >> 16 */
  292. d_wu2_loop:
  293. addi r4, r4,-4 /* n = n - 4 */
  294. lw r12, r8, r4 /* v = *(as + n) */
  295. bslli r9, r12, 16 /* t1 = v << 16 */
  296. or r9, r11, r9 /* t1 = h | t1 */
  297. sw r9, r5, r4 /* *(d + n) = t1 */
  298. bneid r4, d_wu2_loop /* while (n) loop */
  299. bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
      /* tail: copy whatever is left in c one byte at a time, still descending */
  300. d_word_done:
  301. d_xfer_end:
  302. d_xfer_end_loop:
  303. beqi r7, a_done /* while (c) */
  304. addi r6, r6, -1 /* s-- */
  305. lbui r9, r6, 0 /* t1 = *s */
  306. addi r5, r5, -1 /* d-- */
  307. sbi r9, r5, 0 /* *d = t1 */
  308. brid d_xfer_end_loop /* loop */
  309. addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
      /* a_done and d_done mark the same address; only a_done is referenced in this routine */
  310. a_done:
  311. d_done:
  312. rtsd r15, 8 /* return: jump to r15 + 8; r3 still holds the original d */
  313. nop /* delay slot of rtsd */
  314. .size memmove, . - memmove
  315. .end memmove
      /* macro defined outside this file; presumably emits the hidden internal alias for memmove - confirm */
  316. libc_hidden_def(memmove)