/*
 * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */

#include <sysdep.h>

ENTRY(memcpy)

#if defined(__ARC700__)
/* This memcpy implementation does not support objects of 1 GB or larger:
   the alignment check below does not work for such sizes.  */
/* We assume that most sources and destinations are aligned, and
   that lengths are also mostly a multiple of four, although to a lesser
   extent.  */
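/* The alignment check ORs src and dst and shifts the result left by 30,
   so only the two low (alignment) bits survive, as bits 31:30.  If both
   pointers are word aligned, r3 is zero and the bytewise path is taken
   only for a zero length.  If either pointer is unaligned, r3 is at
   least 0x40000000, so any length below 1 GB takes the bytewise path;
   only sizes of 1 GB or more can defeat the check, hence the
   limitation stated above.  */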
	or	r3,r0,r1
	asl_s	r3,r3,30		; r3 != 0 iff src or dst is unaligned
	mov_s	r5,r0			; copy dst; r0 is the return value
	brls.d	r2,r3,.Lcopy_bytewise
	sub.f	r3,r2,1			; delay slot: r3 = len - 1, C set iff len == 0
	ld_s	r12,[r1,0]		; first word in flight
	asr.f	lp_count,r3,3		; iterations of the two-word loop
	bbit0.d	r3,2,.Lnox4
	bmsk_s	r2,r2,1			; delay slot: r2 = len & 3 (trailing bytes)
	st.ab	r12,[r5,4]		; extra word to fix up loop parity
	ld.a	r12,[r1,4]
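
;;; Main loop: copies eight bytes per iteration, software pipelined with
;;; one word always in flight in r12 (first loaded in the prologue above).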
.Lnox4:
	lppnz	.Lendloop
	ld_s	r3,[r1,4]
	st.ab	r12,[r5,4]
	ld.a	r12,[r1,8]
	st.ab	r3,[r5,4]
.Lendloop:
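
;;; 0..3 trailing bytes remain (r2 = len & 3) and the final source word
;;; is still in r12.  Merge it with the current destination word via the
;;; xor/bmsk/xor trick below, so the full-word store at .Last_store does
;;; not clobber memory past the end of the copy.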
	breq	r2,0,.Last_store
	ld	r3,[r5,0]		; existing destination word
#ifdef __LITTLE_ENDIAN__
	add3	r2,-1,r2		; r2 = 8*r2 - 1; uses long immediate
	xor_s	r12,r12,r3
	bmsk	r12,r12,r2		; keep the low 8*count bits of the source
	xor_s	r12,r12,r3
#else /* BIG ENDIAN */
	sub3	r2,31,r2		; r2 = 31 - 8*r2; uses long immediate
	xor_s	r3,r3,r12
	bmsk	r3,r3,r2		; keep the high 8*count bits of the source
	xor_s	r12,r12,r3
#endif /* ENDIAN */
.Last_store:
	j_s.d	[blink]
	st	r12,[r5,0]

	.balign	4
.Lcopy_bytewise:
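	;; Entered with the flags of "sub.f r3,r2,1" still live: C is set
	;; iff the length was zero, so jcs returns immediately.  The loop
	;; mirrors the word loop above at byte granularity: two bytes per
	;; iteration with one byte in flight in r12, plus a final byte
	;; store at .Lendbloop.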
	jcs	[blink]
	ldb_s	r12,[r1,0]
	lsr.f	lp_count,r3
	bhs_s	.Lnox1
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,1]
.Lnox1:
	lppnz	.Lendbloop
	ldb_s	r3,[r1,1]
	stb.ab	r12,[r5,1]
	ldb.a	r12,[r1,2]
	stb.ab	r3,[r5,1]
.Lendbloop:
	j_s.d	[blink]
	stb	r12,[r5,0]

#elif defined(__ARCHS__)

#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif

#if defined(__LL64__) || defined(__ARC_LL64__)
# define PREFETCH_READ(RX)	prefetch  [RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw [RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch  [RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw [RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif
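
;;; SHIFT_1/SHIFT_2 split each freshly loaded word into the part that
;;; completes the previous partial word and the part carried over to the
;;; next store; MERGE_* builds the initial carry and EXTRACT_* drains it
;;; in the epilogue.  With 64-bit loads/stores (__LL64__) the aligned
;;; loop moves 32 bytes per iteration, otherwise 16; ZOLSHFT/ZOLAND
;;; convert the byte count accordingly.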
	prefetch  [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov.f	0, r2
;;; if size is zero
	jz.d	[blink]
	mov	r3, r0		; don't clobber ret val

;;; if size <= 8
	cmp	r2, 8
	bls.d	@.Lsmallchunk
	mov.f	lp_count, r2

	and.f	r4, r0, 0x03
	rsub	lp_count, r4, 4
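	;; "and.f" above set Z if the destination is already word aligned,
	;; in which case lpnz skips this loop entirely; otherwise it copies
	;; 4 - (dst & 3) bytes to align the destination.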
	lpnz	@.Laligndestination
	;; LOOP BEGIN
	ldb.ab	r5, [r1,1]
	sub	r2, r2, 1
	stb.ab	r5, [r3,1]
.Laligndestination:

;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned

;;; CASE 0: Both source and destination are 32-bit aligned
;;; Convert len to Dwords, unfold x4
	lsr.f	lp_count, r2, ZOLSHFT	; delay slot: executed either way;
					; recomputed on the unaligned paths
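	;; Four LOADX/STOREX pairs per iteration: 32 bytes with 64-bit
	;; loads/stores, 16 bytes otherwise.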
	lpnz	@.Lcopy32_64bytes
	;; LOOP START
	LOADX (r6, r1)
	PREFETCH_READ (r1)
	PREFETCH_WRITE (r3)
	LOADX (r8, r1)
	LOADX (r10, r1)
	LOADX (r4, r1)
	STOREX (r6, r3)
	STOREX (r8, r3)
	STOREX (r10, r3)
	STOREX (r4, r3)
.Lcopy32_64bytes:

	and.f	lp_count, r2, ZOLAND	; remaining bytes (<= ZOLAND)
.Lsmallchunk:
	lpnz	@.Lcopyremainingbytes
	;; LOOP START
	ldb.ab	r5, [r1,1]
	stb.ab	r5, [r3,1]
.Lcopyremainingbytes:

	j	[blink]
;;; END CASE 0

.Lsourceunaligned:
	cmp	r4, 2
	beq.d	@.LunalignedOffby2
	sub	r2, r2, 1		; delay slot: runs for all three cases

	bhi.d	@.LunalignedOffby3
	ldb.ab	r5, [r1, 1]		; delay slot: first byte, used by cases 1 and 3

;;; CASE 1: The source is unaligned, off by 1
;; Hence we need to read 1 byte to reach 16-bit alignment
;; and 2 more bytes to reach 32-bit alignment
	ldh.ab	r6, [r1, 2]
	sub	r2, r2, 2

	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
	MERGE_1 (r6, r6, 8)
	MERGE_2 (r5, r5, 24)
	or	r5, r5, r6
	;; Both src and dst are aligned
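	;; From here on r5 carries the bytes that have been read but not
	;; yet stored.  Each iteration loads two aligned source words and
	;; recombines them with the carry: 8 bits of each new word
	;; complete the current store, 24 bits become the next carry (the
	;; macros swap the shift directions on big-endian).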
	lpnz	@.Lcopy8bytes_1
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	; Prefetch the next write location
	SHIFT_1	(r7, r6, 24)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 8)
	SHIFT_1	(r9, r8, 24)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 8)
	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_1:
	;; Write back the remaining 16 bits of the carry
	EXTRACT_1 (r6, r5, 16)
	sth.ab	r6, [r3, 2]
	;; Write back the remaining 8 bits
	EXTRACT_2 (r5, r5, 16)
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; up to 7 trailing bytes
	lpnz	@.Lcopybytewise_1
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_1:
	j	[blink]

.LunalignedOffby2:
;;; CASE 2: The source is unaligned, off by 2
	ldh.ab	r5, [r1, 2]
	sub	r2, r2, 1

	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.nz	r5, r5, 16
#endif
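	;; Same carry scheme as case 1, but with the words split 16/16
	;; around the halfword loaded above.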
	lpnz	@.Lcopy8bytes_2
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	; Prefetch the next write location
	SHIFT_1	(r7, r6, 16)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 16)
	SHIFT_1	(r9, r8, 16)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 16)
	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_2:
#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 16
#endif
	sth.ab	r5, [r3, 2]

	and.f	lp_count, r2, 0x07	; up to 7 trailing bytes
	lpnz	@.Lcopybytewise_2
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_2:
	j	[blink]

.LunalignedOffby3:
;;; CASE 3: The source is unaligned, off by 3
;;; Hence we need to read 1 byte (already done in the delay slot at the
;;; dispatch above) to achieve 32-bit alignment
	;; Both src and dst are aligned
	;; Convert to words, unfold x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	asl.ne	r5, r5, 24
#endif
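	;; Same carry scheme once more, with the words split 24/8 around
	;; the single byte already in the carry.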
	lpnz	@.Lcopy8bytes_3
	;; LOOP START
	ld.ab	r6, [r1, 4]
	prefetch [r1, 28]	; Prefetch the next read location
	ld.ab	r8, [r1,4]
	prefetchw [r3, 32]	; Prefetch the next write location
	SHIFT_1	(r7, r6, 8)
	or	r7, r7, r5
	SHIFT_2	(r5, r6, 24)
	SHIFT_1	(r9, r8, 8)
	or	r9, r9, r5
	SHIFT_2	(r5, r8, 24)
	st.ab	r7, [r3, 4]
	st.ab	r9, [r3, 4]
.Lcopy8bytes_3:
#ifdef __BIG_ENDIAN__
	lsr.nz	r5, r5, 24
#endif
	stb.ab	r5, [r3, 1]

	and.f	lp_count, r2, 0x07	; up to 7 trailing bytes
	lpnz	@.Lcopybytewise_3
	;; LOOP START
	ldb.ab	r6, [r1,1]
	stb.ab	r6, [r3,1]
.Lcopybytewise_3:
	j	[blink]

#elif defined(__ARC64_ARCH32__)
	;; Based on Synopsys code from newlib's arc64/memcpy.S
	lsr.f	r11, r2, 4		; counter for 16-byte chunks
	beq.d	@.L_write_15_bytes
	mov	r3, r0			; delay slot: work on a copy of "r0"

.L_write_16_bytes:
#if defined(__ARC64_LL64__)
	ldd.ab	r4, [r1, 8]
	ldd.ab	r6, [r1, 8]
	std.ab	r4, [r3, 8]
	std.ab	r6, [r3, 8]
	dbnz	r11, @.L_write_16_bytes
#else
	ld.ab	r4, [r1, 4]
	ld.ab	r5, [r1, 4]
	ld.ab	r6, [r1, 4]
	ld.ab	r7, [r1, 4]
	st.ab	r4, [r3, 4]
	st.ab	r5, [r3, 4]
	st.ab	r6, [r3, 4]
	dbnz.d	r11, @.L_write_16_bytes
	st.ab	r7, [r3, 4]		; delay slot: last store of the chunk
#endif
	bmsk_s	r2, r2, 3		; r2 = remaining 0..15 bytes

.L_write_15_bytes:
	bbit0.d	r2, 1, @1f
	lsr	r11, r2, 2		; delay slot: r11 = remaining words (0..3)
	ldh.ab	r4, [r1, 2]		; bit 1 set: copy a halfword
	sth.ab	r4, [r3, 2]
1:
	bbit0.d	r2, 0, @1f
	xor	r11, r11, 3		; delay slot: r11 = 3 - remaining words
	ldb.ab	r4, [r1, 1]		; bit 0 set: copy a byte
	stb.ab	r4, [r3, 1]
1:
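	;; Computed branch into the copy ladder below: r11 holds
	;; 3 - remaining_words, doubled because each word takes an ld/st
	;; pair, and "bi" jumps forward by that many instruction slots,
	;; so exactly remaining_words pairs execute before falling
	;; through to the return.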
	asl	r11, r11, 1
	bi	[r11]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld.ab	r4,[r1, 4]
	st.ab	r4,[r3, 4]
	ld	r4,[r1]
	st	r4,[r3]

	j_s	[blink]
#else
#error "Unsupported ARC CPU type"
#endif

END(memcpy)
libc_hidden_def(memcpy)