_memcpy.S

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Adapted for uClibc from NetBSD _memcpy.S,v 1.6 2003/10/09
 * by Erik Andersen <andersen@codepoet.org>
 */

#include <endian.h>
/*
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code, e.g. Iron Maiden.
 *
 * For anyone attempting to understand it:
 *
 * The core code is implemented here, with simple stubs for memcpy(),
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_.
 * Following the prefix, a label starting with 'f' is used in the forward
 * copy code, while a label starting with 'b' is used in the backwards
 * copy code.
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations,
 * for both the forward and backwards copy:
 *	- an unaligned source address
 *	- an unaligned destination address
 * Separate copy routines are used to produce an optimised result for
 * each of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Note: r12 (aka ip) can be trashed during the function, along with
 * r0-r3, although r0-r2 have defined uses (src, dest and len) throughout.
 * Additional registers (r4, r5 and lr) are preserved prior to use.
 *
 * Apologies for the state of the comments ;-)
 */
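
/*
 * As a rough map for the reader, the dispatch below behaves roughly
 * like this C sketch (illustrative only; copy_forwards/copy_backwards
 * are not real symbols in this file):
 *
 *	void *_memcpy(void *dst, const void *src, size_t len)
 *	{
 *		if ((uintptr_t)src < (uintptr_t)dst)
 *			copy_backwards(dst, src, len);	// tail first: overlap safe
 *		else if (src == dst)
 *			return 0;			// quick abort (moveq r0, #0)
 *		else
 *			copy_forwards(dst, src, len);
 *		return dst;
 *	}
 *
 * Both directions peel bytes until the destination is word aligned,
 * then branch on (src & 3) to one of four word-copy routines.
 */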
.text
.global _memcpy
.hidden _memcpy
.type _memcpy,%function
.align 4

_memcpy:
	/* Determine copy direction */
	cmp	r1, r0
	bcc	.Lmemcpy_backwards

	moveq	r0, #0			/* Quick abort for src = dst */
	moveq	pc, lr

	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */
	subs	r2, r2, #4
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_fdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_fsrcul		/* oh unaligned source addr */
.Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	.Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* return r4 */
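
/*
 * Counter bookkeeping: r2 has had 4 + 8 + 0x14 = 32 subtracted on the
 * way in, so the loop above runs while r2 >= 0, i.e. while at least 32
 * more bytes remain.  After the loop, "cmn r2, #0x10" tests r2 >= -16,
 * i.e. whether 16 or more of those 32 borrowed bytes are still owed.
 */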
.Lmemcpy_fl32:
	adds	r2, r2, #0x14

	/* blat 12 bytes at a time */
.Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	.Lmemcpy_floop12

.Lmemcpy_fl12:
	adds	r2, r2, #8
	blt	.Lmemcpy_fl4
	subs	r2, r2, #4
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	ldmeqia	sp!, {r0, pc}		/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	ldmia	sp!, {r0, pc}
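
/*
 * The byte-tail above moves exactly r2 (1-3) bytes without branching:
 * after "cmp r2, #2" the first byte copy is unconditional, the second
 * runs only if r2 >= 2 (ge) and the third only if r2 > 2 (gt).
 */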
/* erg - unaligned destination */
.Lmemcpy_fdestul:
	rsb	r12, r12, #4
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	.Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r1, #3
	beq	.Lmemcpy_ft8		/* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
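
/*
 * The approach for an unaligned source (here and in the backwards
 * variants): round r1 down to a word boundary, pre-load one aligned
 * word, then build every output word from two adjacent source words.
 * For a 1-byte misalignment on little-endian this is roughly the
 * following C (illustrative only, not part of the original):
 *
 *	prev = *src32++;			// aligned pre-load
 *	while (len >= 4) {
 *		next = *src32++;
 *		*dst32++ = (prev >> 8) | (next << 24);
 *		prev = next;
 *		len -= 4;
 *	}
 *
 * fsrcul2 uses 16/16 shifts and fsrcul3 uses 24/8, matching source
 * offsets of 2 and 3 bytes; big-endian swaps the shift directions.
 */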
.Lmemcpy_fsrcul:
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	.Lmemcpy_fsrcul3
	beq	.Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}
.Lmemcpy_fsrcul1loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r3, lr, lsl #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, lr, lsr #24
#else
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul1l4
.Lmemcpy_fsrcul1loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, lr, lsl #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsr #24
#else
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul1loop4
.Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3
	b	.Lmemcpy_fl4
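
/*
 * Exit fixup: at this point r1 points one aligned word past the last
 * load, but only byte 0 of that word has been consumed, so
 * "sub r1, r1, #3" repoints r1 at the first uncopied byte before the
 * byte-tail code runs.  The fsrcul2 and fsrcul3 exits adjust by 2 and
 * 1 for the same reason.
 */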
.Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}
.Lmemcpy_fsrcul2loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r3, lr, lsl #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, lr, lsr #16
#else
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul2l4
.Lmemcpy_fsrcul2loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, lr, lsl #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsr #16
#else
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul2loop4
.Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2
	b	.Lmemcpy_fl4

.Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	.Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}
.Lmemcpy_fsrcul3loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r3, lr, lsl #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, lr, lsr #8
#else
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
#endif
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_fsrcul3l4
.Lmemcpy_fsrcul3loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, lr, lsl #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsr #8
#else
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
#endif
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	.Lmemcpy_fsrcul3loop4
.Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1
	b	.Lmemcpy_fl4
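
/*
 * Backwards copy: both pointers are first advanced past the end of
 * their buffers, then the same alignment cases as above are handled
 * tail-to-head with ldmdb/stmdb and pre-decrement addressing, which
 * is what makes overlapping moves with dst > src safe.
 */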
.Lmemcpy_backwards:
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	.Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	.Lmemcpy_bdestul	/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	.Lmemcpy_bsrcul		/* oh unaligned source addr */

.Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	.Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4, lr}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	.Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
.Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	.Lmemcpy_bloop32

.Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4, lr}
.Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	.Lmemcpy_bl4
	subs	r2, r2, #4
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

.Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
	moveq	pc, lr			/* done */

	/* copy the crud byte at a time */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	mov	pc, lr
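
/*
 * Note that the backwards path never spills {r0, lr} for the return
 * value: r0 is advanced by len on entry and decremented back down as
 * the copy proceeds, so it already holds the original dest on return.
 */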
/* erg - unaligned destination */
.Lmemcpy_bdestul:
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	.Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	.Lmemcpy_bt8		/* we have an aligned source */
/* erg - unaligned source */
/* This is where it gets nasty ... */
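
/*
 * Same word-merging scheme as the forward fsrcul code, mirrored:
 * r3 is pre-loaded from the aligned word holding the final source
 * bytes, and each output word combines it with the next lower-
 * addressed word, so the shift directions are reversed relative to
 * the forward copy.
 */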
.Lmemcpy_bsrcul:
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	.Lmemcpy_bsrcul1
	beq	.Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}
.Lmemcpy_bsrcul3loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	lr, r3, lsr #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r3, lsl #24
#else
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul3l4
.Lmemcpy_bsrcul3loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, r3, lsr #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsl #24
#else
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul3loop4
.Lmemcpy_bsrcul3l4:
	add	r1, r1, #3
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}
.Lmemcpy_bsrcul2loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	lr, r3, lsr #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r3, lsl #16
#else
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul2l4
.Lmemcpy_bsrcul2loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, r3, lsr #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsl #16
#else
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul2loop4
.Lmemcpy_bsrcul2l4:
	add	r1, r1, #2
	b	.Lmemcpy_bl4

.Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	.Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5, lr}
.Lmemcpy_bsrcul1loop16:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	lr, r3, lsr #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r3, lsl #8
#else
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
#endif
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	.Lmemcpy_bsrcul1loop16
	ldmia	sp!, {r4, r5, lr}
	adds	r2, r2, #0x0c
	blt	.Lmemcpy_bsrcul1l4
.Lmemcpy_bsrcul1loop4:
#if __BYTE_ORDER == __BIG_ENDIAN
	mov	r12, r3, lsr #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsl #8
#else
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
#endif
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	.Lmemcpy_bsrcul1loop4
.Lmemcpy_bsrcul1l4:
	add	r1, r1, #1
	b	.Lmemcpy_bl4
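
/*
 * Callers: memcpy() and memmove() are the thin stubs mentioned in the
 * header comment; they enter here with dest in r0, src in r1 and len
 * in r2, and memmove relies on the direction check at the top of
 * _memcpy for overlap safety.
 */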