/* memcpy.S */

/*
 * Copyright (C) 2013, 2014-2015, 2017 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */
  7. #include <sysdep.h>
  8. #if !defined(__ARC700__) && !defined(__ARCHS__)
  9. #error "Neither ARC700 nor ARCHS is defined!"
  10. #endif
  11. ENTRY(memcpy)
  12. #ifdef __ARC700__
  13. /* This memcpy implementation does not support objects of 1GB or larger -
  14. the check for alignment does not work then. */
  15. /* We assume that most sources and destinations are aligned, and
  16. that also lengths are mostly a multiple of four, although to a lesser
  17. extent. */
  18. or r3,r0,r1
  19. asl_s r3,r3,30
  20. mov_s r5,r0
  21. brls.d r2,r3,.Lcopy_bytewise
  22. sub.f r3,r2,1
  23. ld_s r12,[r1,0]
  24. asr.f lp_count,r3,3
  25. bbit0.d r3,2,.Lnox4
  26. bmsk_s r2,r2,1
  27. st.ab r12,[r5,4]
  28. ld.a r12,[r1,4]
  29. .Lnox4:
  30. lppnz .Lendloop
  31. ld_s r3,[r1,4]
  32. st.ab r12,[r5,4]
  33. ld.a r12,[r1,8]
  34. st.ab r3,[r5,4]
  35. .Lendloop:
  36. breq r2,0,.Last_store
  37. ld r3,[r5,0]
  38. #ifdef __LITTLE_ENDIAN__
  39. add3 r2,-1,r2
  40. ; uses long immediate
  41. xor_s r12,r12,r3
  42. bmsk r12,r12,r2
  43. xor_s r12,r12,r3
  44. #else /* BIG ENDIAN */
  45. sub3 r2,31,r2
  46. ; uses long immediate
  47. xor_s r3,r3,r12
  48. bmsk r3,r3,r2
  49. xor_s r12,r12,r3
  50. #endif /* ENDIAN */
  51. .Last_store:
  52. j_s.d [blink]
  53. st r12,[r5,0]
  54. .balign 4
  55. .Lcopy_bytewise:
  56. jcs [blink]
  57. ldb_s r12,[r1,0]
  58. lsr.f lp_count,r3
  59. bhs_s .Lnox1
  60. stb.ab r12,[r5,1]
  61. ldb.a r12,[r1,1]
  62. .Lnox1:
  63. lppnz .Lendbloop
  64. ldb_s r3,[r1,1]
  65. stb.ab r12,[r5,1]
  66. ldb.a r12,[r1,2]
  67. stb.ab r3,[r5,1]
  68. .Lendbloop:
  69. j_s.d [blink]
  70. stb r12,[r5,0]
  71. #endif /* __ARC700__ */
  72. #ifdef __ARCHS__
  73. #ifdef __LITTLE_ENDIAN__
  74. # define SHIFT_1(RX,RY,IMM) asl RX, RY, IMM ; <<
  75. # define SHIFT_2(RX,RY,IMM) lsr RX, RY, IMM ; >>
  76. # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM
  77. # define MERGE_2(RX,RY,IMM)
  78. # define EXTRACT_1(RX,RY,IMM) and RX, RY, 0xFFFF
  79. # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, IMM
  80. #else
  81. # define SHIFT_1(RX,RY,IMM) lsr RX, RY, IMM ; >>
  82. # define SHIFT_2(RX,RY,IMM) asl RX, RY, IMM ; <<
  83. # define MERGE_1(RX,RY,IMM) asl RX, RY, IMM ; <<
  84. # define MERGE_2(RX,RY,IMM) asl RX, RY, IMM ; <<
  85. # define EXTRACT_1(RX,RY,IMM) lsr RX, RY, IMM
  86. # define EXTRACT_2(RX,RY,IMM) lsr RX, RY, 0x08
  87. #endif
  88. #if defined(__LL64__) || defined(__ARC_LL64__)
  89. # define PREFETCH_READ(RX) prefetch [RX, 56]
  90. # define PREFETCH_WRITE(RX) prefetchw [RX, 64]
  91. # define LOADX(DST,RX) ldd.ab DST, [RX, 8]
  92. # define STOREX(SRC,RX) std.ab SRC, [RX, 8]
  93. # define ZOLSHFT 5
  94. # define ZOLAND 0x1F
  95. #else
  96. # define PREFETCH_READ(RX) prefetch [RX, 28]
  97. # define PREFETCH_WRITE(RX) prefetchw [RX, 32]
  98. # define LOADX(DST,RX) ld.ab DST, [RX, 4]
  99. # define STOREX(SRC,RX) st.ab SRC, [RX, 4]
  100. # define ZOLSHFT 4
  101. # define ZOLAND 0xF
  102. #endif
  103. prefetch [r1] ; Prefetch the read location
  104. prefetchw [r0] ; Prefetch the write location
  105. mov.f 0, r2
  106. ;;; if size is zero
  107. jz.d [blink]
  108. mov r3, r0 ; don't clobber ret val
  109. ;;; if size <= 8
  110. cmp r2, 8
  111. bls.d @.Lsmallchunk
  112. mov.f lp_count, r2
  113. and.f r4, r0, 0x03
  114. rsub lp_count, r4, 4
  115. lpnz @.Laligndestination
  116. ;; LOOP BEGIN
  117. ldb.ab r5, [r1,1]
  118. sub r2, r2, 1
  119. stb.ab r5, [r3,1]
  120. .Laligndestination:
  121. ;;; Check the alignment of the source
  122. and.f r4, r1, 0x03
  123. bnz.d @.Lsourceunaligned
  124. ;;; CASE 0: Both source and destination are 32bit aligned
  125. ;;; Convert len to Dwords, unfold x4
  126. lsr.f lp_count, r2, ZOLSHFT
  127. lpnz @.Lcopy32_64bytes
  128. ;; LOOP START
  129. LOADX (r6, r1)
  130. PREFETCH_READ (r1)
  131. PREFETCH_WRITE (r3)
  132. LOADX (r8, r1)
  133. LOADX (r10, r1)
  134. LOADX (r4, r1)
  135. STOREX (r6, r3)
  136. STOREX (r8, r3)
  137. STOREX (r10, r3)
  138. STOREX (r4, r3)
  139. .Lcopy32_64bytes:
  140. and.f lp_count, r2, ZOLAND ;Last remaining 31 bytes
  141. .Lsmallchunk:
  142. lpnz @.Lcopyremainingbytes
  143. ;; LOOP START
  144. ldb.ab r5, [r1,1]
  145. stb.ab r5, [r3,1]
  146. .Lcopyremainingbytes:
  147. j [blink]
  148. ;;; END CASE 0
  149. .Lsourceunaligned:
  150. cmp r4, 2
  151. beq.d @.LunalignedOffby2
  152. sub r2, r2, 1
  153. bhi.d @.LunalignedOffby3
  154. ldb.ab r5, [r1, 1]
  155. ;;; CASE 1: The source is unaligned, off by 1
  156. ;; Hence I need to read 1 byte for a 16bit alignment
  157. ;; and 2bytes to reach 32bit alignment
  158. ldh.ab r6, [r1, 2]
  159. sub r2, r2, 2
  160. ;; Convert to words, unfold x2
  161. lsr.f lp_count, r2, 3
  162. MERGE_1 (r6, r6, 8)
  163. MERGE_2 (r5, r5, 24)
  164. or r5, r5, r6
  165. ;; Both src and dst are aligned
  166. lpnz @.Lcopy8bytes_1
  167. ;; LOOP START
  168. ld.ab r6, [r1, 4]
  169. prefetch [r1, 28] ;Prefetch the next read location
  170. ld.ab r8, [r1,4]
  171. prefetchw [r3, 32] ;Prefetch the next write location
  172. SHIFT_1 (r7, r6, 24)
  173. or r7, r7, r5
  174. SHIFT_2 (r5, r6, 8)
  175. SHIFT_1 (r9, r8, 24)
  176. or r9, r9, r5
  177. SHIFT_2 (r5, r8, 8)
  178. st.ab r7, [r3, 4]
  179. st.ab r9, [r3, 4]
  180. .Lcopy8bytes_1:
  181. ;; Write back the remaining 16bits
  182. EXTRACT_1 (r6, r5, 16)
  183. sth.ab r6, [r3, 2]
  184. ;; Write back the remaining 8bits
  185. EXTRACT_2 (r5, r5, 16)
  186. stb.ab r5, [r3, 1]
  187. and.f lp_count, r2, 0x07 ;Last 8bytes
  188. lpnz @.Lcopybytewise_1
  189. ;; LOOP START
  190. ldb.ab r6, [r1,1]
  191. stb.ab r6, [r3,1]
  192. .Lcopybytewise_1:
  193. j [blink]
  194. .LunalignedOffby2:
  195. ;;; CASE 2: The source is unaligned, off by 2
  196. ldh.ab r5, [r1, 2]
  197. sub r2, r2, 1
  198. ;; Both src and dst are aligned
  199. ;; Convert to words, unfold x2
  200. lsr.f lp_count, r2, 3
  201. #ifdef __BIG_ENDIAN__
  202. asl.nz r5, r5, 16
  203. #endif
  204. lpnz @.Lcopy8bytes_2
  205. ;; LOOP START
  206. ld.ab r6, [r1, 4]
  207. prefetch [r1, 28] ;Prefetch the next read location
  208. ld.ab r8, [r1,4]
  209. prefetchw [r3, 32] ;Prefetch the next write location
  210. SHIFT_1 (r7, r6, 16)
  211. or r7, r7, r5
  212. SHIFT_2 (r5, r6, 16)
  213. SHIFT_1 (r9, r8, 16)
  214. or r9, r9, r5
  215. SHIFT_2 (r5, r8, 16)
  216. st.ab r7, [r3, 4]
  217. st.ab r9, [r3, 4]
  218. .Lcopy8bytes_2:
  219. #ifdef __BIG_ENDIAN__
  220. lsr.nz r5, r5, 16
  221. #endif
  222. sth.ab r5, [r3, 2]
  223. and.f lp_count, r2, 0x07 ;Last 8bytes
  224. lpnz @.Lcopybytewise_2
  225. ;; LOOP START
  226. ldb.ab r6, [r1,1]
  227. stb.ab r6, [r3,1]
  228. .Lcopybytewise_2:
  229. j [blink]
  230. .LunalignedOffby3:
  231. ;;; CASE 3: The source is unaligned, off by 3
  232. ;;; Hence, I need to read 1byte for achieve the 32bit alignment
  233. ;; Both src and dst are aligned
  234. ;; Convert to words, unfold x2
  235. lsr.f lp_count, r2, 3
  236. #ifdef __BIG_ENDIAN__
  237. asl.ne r5, r5, 24
  238. #endif
  239. lpnz @.Lcopy8bytes_3
  240. ;; LOOP START
  241. ld.ab r6, [r1, 4]
  242. prefetch [r1, 28] ;Prefetch the next read location
  243. ld.ab r8, [r1,4]
  244. prefetchw [r3, 32] ;Prefetch the next write location
  245. SHIFT_1 (r7, r6, 8)
  246. or r7, r7, r5
  247. SHIFT_2 (r5, r6, 24)
  248. SHIFT_1 (r9, r8, 8)
  249. or r9, r9, r5
  250. SHIFT_2 (r5, r8, 24)
  251. st.ab r7, [r3, 4]
  252. st.ab r9, [r3, 4]
  253. .Lcopy8bytes_3:
  254. #ifdef __BIG_ENDIAN__
  255. lsr.nz r5, r5, 24
  256. #endif
  257. stb.ab r5, [r3, 1]
  258. and.f lp_count, r2, 0x07 ;Last 8bytes
  259. lpnz @.Lcopybytewise_3
  260. ;; LOOP START
  261. ldb.ab r6, [r1,1]
  262. stb.ab r6, [r3,1]
  263. .Lcopybytewise_3:
  264. j [blink]
  265. #endif /* __ARCHS__ */
  266. END(memcpy)
  267. libc_hidden_def(memcpy)