/* _memcpy.S — ARM optimized memcpy/memmove core (uClibc, derived from NetBSD) */
/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Adapted for uClibc from NetBSD _memcpy.S,v 1.6 2003/10/09
 * by Erik Andersen <andersen@codepoet.org>
 */
  39. #include <endian.h>
  40. /*
  41. * This is one fun bit of code ...
  42. * Some easy listening music is suggested while trying to understand this
  43. * code e.g. Iron Maiden
  44. *
  45. * For anyone attempting to understand it :
  46. *
  47. * The core code is implemented here with simple stubs for memcpy()
  48. * memmove() and bcopy().
  49. *
  50. * All local labels are prefixed with Lmemcpy_
  51. * Following the prefix a label starting f is used in the forward copy code
  52. * while a label using b is used in the backwards copy code
  53. * The source and destination addresses determine whether a forward or
  54. * backward copy is performed.
  55. * Separate bits of code are used to deal with the following situations
  56. * for both the forward and backwards copy.
  57. * unaligned source address
  58. * unaligned destination address
  59. * Separate copy routines are used to produce an optimised result for each
  60. * of these cases.
  61. * The copy code will use LDM/STM instructions to copy up to 32 bytes at
  62. * a time where possible.
  63. *
  64. * Note: r12 (aka ip) can be trashed during the function along with
  65. * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
  66. * Additional registers are preserved prior to use i.e. r4, r5 & lr
  67. *
  68. * Apologies for the state of the comments ;-)
  69. */
  70. .text
  71. .global _memcpy;
  72. .type _memcpy,%function
  73. .align 4; \
  74. _memcpy:
  75. /* Determine copy direction */
  76. cmp r1, r0
  77. bcc .Lmemcpy_backwards
  78. moveq r0, #0 /* Quick abort for len=0 */
  79. moveq pc, lr
  80. stmdb sp!, {r0, lr} /* memcpy() returns dest addr */
  81. subs r2, r2, #4
  82. blt .Lmemcpy_fl4 /* less than 4 bytes */
  83. ands r12, r0, #3
  84. bne .Lmemcpy_fdestul /* oh unaligned destination addr */
  85. ands r12, r1, #3
  86. bne .Lmemcpy_fsrcul /* oh unaligned source addr */
  87. .Lmemcpy_ft8:
  88. /* We have aligned source and destination */
  89. subs r2, r2, #8
  90. blt .Lmemcpy_fl12 /* less than 12 bytes (4 from above) */
  91. subs r2, r2, #0x14
  92. blt .Lmemcpy_fl32 /* less than 32 bytes (12 from above) */
  93. stmdb sp!, {r4} /* borrow r4 */
  94. /* blat 32 bytes at a time */
  95. /* XXX for really big copies perhaps we should use more registers */
  96. .Lmemcpy_floop32:
  97. ldmia r1!, {r3, r4, r12, lr}
  98. stmia r0!, {r3, r4, r12, lr}
  99. ldmia r1!, {r3, r4, r12, lr}
  100. stmia r0!, {r3, r4, r12, lr}
  101. subs r2, r2, #0x20
  102. bge .Lmemcpy_floop32
  103. cmn r2, #0x10
  104. ldmgeia r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
  105. stmgeia r0!, {r3, r4, r12, lr}
  106. subge r2, r2, #0x10
  107. ldmia sp!, {r4} /* return r4 */
  108. .Lmemcpy_fl32:
  109. adds r2, r2, #0x14
  110. /* blat 12 bytes at a time */
  111. .Lmemcpy_floop12:
  112. ldmgeia r1!, {r3, r12, lr}
  113. stmgeia r0!, {r3, r12, lr}
  114. subges r2, r2, #0x0c
  115. bge .Lmemcpy_floop12
  116. .Lmemcpy_fl12:
  117. adds r2, r2, #8
  118. blt .Lmemcpy_fl4
  119. subs r2, r2, #4
  120. ldrlt r3, [r1], #4
  121. strlt r3, [r0], #4
  122. ldmgeia r1!, {r3, r12}
  123. stmgeia r0!, {r3, r12}
  124. subge r2, r2, #4
  125. .Lmemcpy_fl4:
  126. /* less than 4 bytes to go */
  127. adds r2, r2, #4
  128. ldmeqia sp!, {r0, pc} /* done */
  129. /* copy the crud byte at a time */
  130. cmp r2, #2
  131. ldrb r3, [r1], #1
  132. strb r3, [r0], #1
  133. ldrgeb r3, [r1], #1
  134. strgeb r3, [r0], #1
  135. ldrgtb r3, [r1], #1
  136. strgtb r3, [r0], #1
  137. ldmia sp!, {r0, pc}
  138. /* erg - unaligned destination */
  139. .Lmemcpy_fdestul:
  140. rsb r12, r12, #4
  141. cmp r12, #2
  142. /* align destination with byte copies */
  143. ldrb r3, [r1], #1
  144. strb r3, [r0], #1
  145. ldrgeb r3, [r1], #1
  146. strgeb r3, [r0], #1
  147. ldrgtb r3, [r1], #1
  148. strgtb r3, [r0], #1
  149. subs r2, r2, r12
  150. blt .Lmemcpy_fl4 /* less the 4 bytes */
  151. ands r12, r1, #3
  152. beq .Lmemcpy_ft8 /* we have an aligned source */
  153. /* erg - unaligned source */
  154. /* This is where it gets nasty ... */
  155. .Lmemcpy_fsrcul:
  156. bic r1, r1, #3
  157. ldr lr, [r1], #4
  158. cmp r12, #2
  159. bgt .Lmemcpy_fsrcul3
  160. beq .Lmemcpy_fsrcul2
  161. cmp r2, #0x0c
  162. blt .Lmemcpy_fsrcul1loop4
  163. sub r2, r2, #0x0c
  164. stmdb sp!, {r4, r5}
  165. .Lmemcpy_fsrcul1loop16:
  166. #if __BYTE_ORDER == __BIG_ENDIAN
  167. mov r3, lr, lsl #8
  168. ldmia r1!, {r4, r5, r12, lr}
  169. orr r3, r3, r4, lsr #24
  170. mov r4, r4, lsl #8
  171. orr r4, r4, r5, lsr #24
  172. mov r5, r5, lsl #8
  173. orr r5, r5, r12, lsr #24
  174. mov r12, r12, lsl #8
  175. orr r12, r12, lr, lsr #24
  176. #else
  177. mov r3, lr, lsr #8
  178. ldmia r1!, {r4, r5, r12, lr}
  179. orr r3, r3, r4, lsl #24
  180. mov r4, r4, lsr #8
  181. orr r4, r4, r5, lsl #24
  182. mov r5, r5, lsr #8
  183. orr r5, r5, r12, lsl #24
  184. mov r12, r12, lsr #8
  185. orr r12, r12, lr, lsl #24
  186. #endif
  187. stmia r0!, {r3-r5, r12}
  188. subs r2, r2, #0x10
  189. bge .Lmemcpy_fsrcul1loop16
  190. ldmia sp!, {r4, r5}
  191. adds r2, r2, #0x0c
  192. blt .Lmemcpy_fsrcul1l4
  193. .Lmemcpy_fsrcul1loop4:
  194. #if __BYTE_ORDER == __BIG_ENDIAN
  195. mov r12, lr, lsl #8
  196. ldr lr, [r1], #4
  197. orr r12, r12, lr, lsr #24
  198. #else
  199. mov r12, lr, lsr #8
  200. ldr lr, [r1], #4
  201. orr r12, r12, lr, lsl #24
  202. #endif
  203. str r12, [r0], #4
  204. subs r2, r2, #4
  205. bge .Lmemcpy_fsrcul1loop4
  206. .Lmemcpy_fsrcul1l4:
  207. sub r1, r1, #3
  208. b .Lmemcpy_fl4
  209. .Lmemcpy_fsrcul2:
  210. cmp r2, #0x0c
  211. blt .Lmemcpy_fsrcul2loop4
  212. sub r2, r2, #0x0c
  213. stmdb sp!, {r4, r5}
  214. .Lmemcpy_fsrcul2loop16:
  215. #if __BYTE_ORDER == __BIG_ENDIAN
  216. mov r3, lr, lsl #16
  217. ldmia r1!, {r4, r5, r12, lr}
  218. orr r3, r3, r4, lsr #16
  219. mov r4, r4, lsl #16
  220. orr r4, r4, r5, lsr #16
  221. mov r5, r5, lsl #16
  222. orr r5, r5, r12, lsr #16
  223. mov r12, r12, lsl #16
  224. orr r12, r12, lr, lsr #16
  225. #else
  226. mov r3, lr, lsr #16
  227. ldmia r1!, {r4, r5, r12, lr}
  228. orr r3, r3, r4, lsl #16
  229. mov r4, r4, lsr #16
  230. orr r4, r4, r5, lsl #16
  231. mov r5, r5, lsr #16
  232. orr r5, r5, r12, lsl #16
  233. mov r12, r12, lsr #16
  234. orr r12, r12, lr, lsl #16
  235. #endif
  236. stmia r0!, {r3-r5, r12}
  237. subs r2, r2, #0x10
  238. bge .Lmemcpy_fsrcul2loop16
  239. ldmia sp!, {r4, r5}
  240. adds r2, r2, #0x0c
  241. blt .Lmemcpy_fsrcul2l4
  242. .Lmemcpy_fsrcul2loop4:
  243. #if __BYTE_ORDER == __BIG_ENDIAN
  244. mov r12, lr, lsl #16
  245. ldr lr, [r1], #4
  246. orr r12, r12, lr, lsr #16
  247. #else
  248. mov r12, lr, lsr #16
  249. ldr lr, [r1], #4
  250. orr r12, r12, lr, lsl #16
  251. #endif
  252. str r12, [r0], #4
  253. subs r2, r2, #4
  254. bge .Lmemcpy_fsrcul2loop4
  255. .Lmemcpy_fsrcul2l4:
  256. sub r1, r1, #2
  257. b .Lmemcpy_fl4
  258. .Lmemcpy_fsrcul3:
  259. cmp r2, #0x0c
  260. blt .Lmemcpy_fsrcul3loop4
  261. sub r2, r2, #0x0c
  262. stmdb sp!, {r4, r5}
  263. .Lmemcpy_fsrcul3loop16:
  264. #if __BYTE_ORDER == __BIG_ENDIAN
  265. mov r3, lr, lsl #24
  266. ldmia r1!, {r4, r5, r12, lr}
  267. orr r3, r3, r4, lsr #8
  268. mov r4, r4, lsl #24
  269. orr r4, r4, r5, lsr #8
  270. mov r5, r5, lsl #24
  271. orr r5, r5, r12, lsr #8
  272. mov r12, r12, lsl #24
  273. orr r12, r12, lr, lsr #8
  274. #else
  275. mov r3, lr, lsr #24
  276. ldmia r1!, {r4, r5, r12, lr}
  277. orr r3, r3, r4, lsl #8
  278. mov r4, r4, lsr #24
  279. orr r4, r4, r5, lsl #8
  280. mov r5, r5, lsr #24
  281. orr r5, r5, r12, lsl #8
  282. mov r12, r12, lsr #24
  283. orr r12, r12, lr, lsl #8
  284. #endif
  285. stmia r0!, {r3-r5, r12}
  286. subs r2, r2, #0x10
  287. bge .Lmemcpy_fsrcul3loop16
  288. ldmia sp!, {r4, r5}
  289. adds r2, r2, #0x0c
  290. blt .Lmemcpy_fsrcul3l4
  291. .Lmemcpy_fsrcul3loop4:
  292. #if __BYTE_ORDER == __BIG_ENDIAN
  293. mov r12, lr, lsl #24
  294. ldr lr, [r1], #4
  295. orr r12, r12, lr, lsr #8
  296. #else
  297. mov r12, lr, lsr #24
  298. ldr lr, [r1], #4
  299. orr r12, r12, lr, lsl #8
  300. #endif
  301. str r12, [r0], #4
  302. subs r2, r2, #4
  303. bge .Lmemcpy_fsrcul3loop4
  304. .Lmemcpy_fsrcul3l4:
  305. sub r1, r1, #1
  306. b .Lmemcpy_fl4
  307. .Lmemcpy_backwards:
  308. add r1, r1, r2
  309. add r0, r0, r2
  310. subs r2, r2, #4
  311. blt .Lmemcpy_bl4 /* less than 4 bytes */
  312. ands r12, r0, #3
  313. bne .Lmemcpy_bdestul /* oh unaligned destination addr */
  314. ands r12, r1, #3
  315. bne .Lmemcpy_bsrcul /* oh unaligned source addr */
  316. .Lmemcpy_bt8:
  317. /* We have aligned source and destination */
  318. subs r2, r2, #8
  319. blt .Lmemcpy_bl12 /* less than 12 bytes (4 from above) */
  320. stmdb sp!, {r4, lr}
  321. subs r2, r2, #0x14 /* less than 32 bytes (12 from above) */
  322. blt .Lmemcpy_bl32
  323. /* blat 32 bytes at a time */
  324. /* XXX for really big copies perhaps we should use more registers */
  325. .Lmemcpy_bloop32:
  326. ldmdb r1!, {r3, r4, r12, lr}
  327. stmdb r0!, {r3, r4, r12, lr}
  328. ldmdb r1!, {r3, r4, r12, lr}
  329. stmdb r0!, {r3, r4, r12, lr}
  330. subs r2, r2, #0x20
  331. bge .Lmemcpy_bloop32
  332. .Lmemcpy_bl32:
  333. cmn r2, #0x10
  334. ldmgedb r1!, {r3, r4, r12, lr} /* blat a remaining 16 bytes */
  335. stmgedb r0!, {r3, r4, r12, lr}
  336. subge r2, r2, #0x10
  337. adds r2, r2, #0x14
  338. ldmgedb r1!, {r3, r12, lr} /* blat a remaining 12 bytes */
  339. stmgedb r0!, {r3, r12, lr}
  340. subge r2, r2, #0x0c
  341. ldmia sp!, {r4, lr}
  342. .Lmemcpy_bl12:
  343. adds r2, r2, #8
  344. blt .Lmemcpy_bl4
  345. subs r2, r2, #4
  346. ldrlt r3, [r1, #-4]!
  347. strlt r3, [r0, #-4]!
  348. ldmgedb r1!, {r3, r12}
  349. stmgedb r0!, {r3, r12}
  350. subge r2, r2, #4
  351. .Lmemcpy_bl4:
  352. /* less than 4 bytes to go */
  353. adds r2, r2, #4
  354. moveq pc, lr /* done */
  355. /* copy the crud byte at a time */
  356. cmp r2, #2
  357. ldrb r3, [r1, #-1]!
  358. strb r3, [r0, #-1]!
  359. ldrgeb r3, [r1, #-1]!
  360. strgeb r3, [r0, #-1]!
  361. ldrgtb r3, [r1, #-1]!
  362. strgtb r3, [r0, #-1]!
  363. mov pc, lr
  364. /* erg - unaligned destination */
  365. .Lmemcpy_bdestul:
  366. cmp r12, #2
  367. /* align destination with byte copies */
  368. ldrb r3, [r1, #-1]!
  369. strb r3, [r0, #-1]!
  370. ldrgeb r3, [r1, #-1]!
  371. strgeb r3, [r0, #-1]!
  372. ldrgtb r3, [r1, #-1]!
  373. strgtb r3, [r0, #-1]!
  374. subs r2, r2, r12
  375. blt .Lmemcpy_bl4 /* less than 4 bytes to go */
  376. ands r12, r1, #3
  377. beq .Lmemcpy_bt8 /* we have an aligned source */
  378. /* erg - unaligned source */
  379. /* This is where it gets nasty ... */
  380. .Lmemcpy_bsrcul:
  381. bic r1, r1, #3
  382. ldr r3, [r1, #0]
  383. cmp r12, #2
  384. blt .Lmemcpy_bsrcul1
  385. beq .Lmemcpy_bsrcul2
  386. cmp r2, #0x0c
  387. blt .Lmemcpy_bsrcul3loop4
  388. sub r2, r2, #0x0c
  389. stmdb sp!, {r4, r5, lr}
  390. .Lmemcpy_bsrcul3loop16:
  391. #if __BYTE_ORDER == __BIG_ENDIAN
  392. mov lr, r3, lsr #8
  393. ldmdb r1!, {r3-r5, r12}
  394. orr lr, lr, r12, lsl #24
  395. mov r12, r12, lsr #8
  396. orr r12, r12, r5, lsl #24
  397. mov r5, r5, lsr #8
  398. orr r5, r5, r4, lsl #24
  399. mov r4, r4, lsr #8
  400. orr r4, r4, r3, lsl #24
  401. #else
  402. mov lr, r3, lsl #8
  403. ldmdb r1!, {r3-r5, r12}
  404. orr lr, lr, r12, lsr #24
  405. mov r12, r12, lsl #8
  406. orr r12, r12, r5, lsr #24
  407. mov r5, r5, lsl #8
  408. orr r5, r5, r4, lsr #24
  409. mov r4, r4, lsl #8
  410. orr r4, r4, r3, lsr #24
  411. #endif
  412. stmdb r0!, {r4, r5, r12, lr}
  413. subs r2, r2, #0x10
  414. bge .Lmemcpy_bsrcul3loop16
  415. ldmia sp!, {r4, r5, lr}
  416. adds r2, r2, #0x0c
  417. blt .Lmemcpy_bsrcul3l4
  418. .Lmemcpy_bsrcul3loop4:
  419. #if __BYTE_ORDER == __BIG_ENDIAN
  420. mov r12, r3, lsr #8
  421. ldr r3, [r1, #-4]!
  422. orr r12, r12, r3, lsl #24
  423. #else
  424. mov r12, r3, lsl #8
  425. ldr r3, [r1, #-4]!
  426. orr r12, r12, r3, lsr #24
  427. #endif
  428. str r12, [r0, #-4]!
  429. subs r2, r2, #4
  430. bge .Lmemcpy_bsrcul3loop4
  431. .Lmemcpy_bsrcul3l4:
  432. add r1, r1, #3
  433. b .Lmemcpy_bl4
  434. .Lmemcpy_bsrcul2:
  435. cmp r2, #0x0c
  436. blt .Lmemcpy_bsrcul2loop4
  437. sub r2, r2, #0x0c
  438. stmdb sp!, {r4, r5, lr}
  439. .Lmemcpy_bsrcul2loop16:
  440. #if __BYTE_ORDER == __BIG_ENDIAN
  441. mov lr, r3, lsr #16
  442. ldmdb r1!, {r3-r5, r12}
  443. orr lr, lr, r12, lsl #16
  444. mov r12, r12, lsr #16
  445. orr r12, r12, r5, lsl #16
  446. mov r5, r5, lsr #16
  447. orr r5, r5, r4, lsl #16
  448. mov r4, r4, lsr #16
  449. orr r4, r4, r3, lsl #16
  450. #else
  451. mov lr, r3, lsl #16
  452. ldmdb r1!, {r3-r5, r12}
  453. orr lr, lr, r12, lsr #16
  454. mov r12, r12, lsl #16
  455. orr r12, r12, r5, lsr #16
  456. mov r5, r5, lsl #16
  457. orr r5, r5, r4, lsr #16
  458. mov r4, r4, lsl #16
  459. orr r4, r4, r3, lsr #16
  460. #endif
  461. stmdb r0!, {r4, r5, r12, lr}
  462. subs r2, r2, #0x10
  463. bge .Lmemcpy_bsrcul2loop16
  464. ldmia sp!, {r4, r5, lr}
  465. adds r2, r2, #0x0c
  466. blt .Lmemcpy_bsrcul2l4
  467. .Lmemcpy_bsrcul2loop4:
  468. #if __BYTE_ORDER == __BIG_ENDIAN
  469. mov r12, r3, lsr #16
  470. ldr r3, [r1, #-4]!
  471. orr r12, r12, r3, lsl #16
  472. #else
  473. mov r12, r3, lsl #16
  474. ldr r3, [r1, #-4]!
  475. orr r12, r12, r3, lsr #16
  476. #endif
  477. str r12, [r0, #-4]!
  478. subs r2, r2, #4
  479. bge .Lmemcpy_bsrcul2loop4
  480. .Lmemcpy_bsrcul2l4:
  481. add r1, r1, #2
  482. b .Lmemcpy_bl4
  483. .Lmemcpy_bsrcul1:
  484. cmp r2, #0x0c
  485. blt .Lmemcpy_bsrcul1loop4
  486. sub r2, r2, #0x0c
  487. stmdb sp!, {r4, r5, lr}
  488. .Lmemcpy_bsrcul1loop32:
  489. #if __BYTE_ORDER == __BIG_ENDIAN
  490. mov lr, r3, lsr #24
  491. ldmdb r1!, {r3-r5, r12}
  492. orr lr, lr, r12, lsl #8
  493. mov r12, r12, lsr #24
  494. orr r12, r12, r5, lsl #8
  495. mov r5, r5, lsr #24
  496. orr r5, r5, r4, lsl #8
  497. mov r4, r4, lsr #24
  498. orr r4, r4, r3, lsl #8
  499. #else
  500. mov lr, r3, lsl #24
  501. ldmdb r1!, {r3-r5, r12}
  502. orr lr, lr, r12, lsr #8
  503. mov r12, r12, lsl #24
  504. orr r12, r12, r5, lsr #8
  505. mov r5, r5, lsl #24
  506. orr r5, r5, r4, lsr #8
  507. mov r4, r4, lsl #24
  508. orr r4, r4, r3, lsr #8
  509. #endif
  510. stmdb r0!, {r4, r5, r12, lr}
  511. subs r2, r2, #0x10
  512. bge .Lmemcpy_bsrcul1loop32
  513. ldmia sp!, {r4, r5, lr}
  514. adds r2, r2, #0x0c
  515. blt .Lmemcpy_bsrcul1l4
  516. .Lmemcpy_bsrcul1loop4:
  517. #if __BYTE_ORDER == __BIG_ENDIAN
  518. mov r12, r3, lsr #24
  519. ldr r3, [r1, #-4]!
  520. orr r12, r12, r3, lsl #8
  521. #else
  522. mov r12, r3, lsl #24
  523. ldr r3, [r1, #-4]!
  524. orr r12, r12, r3, lsr #8
  525. #endif
  526. str r12, [r0, #-4]!
  527. subs r2, r2, #4
  528. bge .Lmemcpy_bsrcul1loop4
  529. .Lmemcpy_bsrcul1l4:
  530. add r1, r1, #1
  531. b .Lmemcpy_bl4