memcpy.S 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913
  1. /*
  2. * "memcpy" implementation of SuperH
  3. *
  4. * Copyright (C) 1999 Niibe Yutaka
  5. * Copyright (c) 2002 STMicroelectronics Ltd
  6. * Modified from memcpy.S and micro-optimised for SH4
  7. * Stuart Menefy (stuart.menefy@st.com)
  8. *
  9. * Copyright (c) 2009 STMicroelectronics Ltd
  10. * Optimised using prefetching and 64bit data transfer via FPU
  11. * Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
  12. */
  13. /*
  14. * void *memcpy(void *dst, const void *src, size_t n);
  15. *
  16. * It is assumed that there is no overlap between src and dst.
  17. * If there is an overlap, then the results are undefined.
  18. */
  19. #include <endian.h>
  20. #if defined(__LITTLE_ENDIAN__) && defined(__SH_FPU_ANY__)
  21. #define MEMCPY_USES_FPU
  22. /* Use paired single precision load or store mode for 64-bit transferring.
  23. * FPSCR.SZ=1,FPSCR.SZ=0 is well defined on both SH4-200 and SH4-300.
  24. * Currently it has been only implemented and tested for little endian mode, and it is only enabled when the compiler targets an FPU (__SH_FPU_ANY__) because the fast path executes fmov and accesses FPSCR; otherwise the integer cache-line loop is assembled instead. */
  25. .macro FPU_SET_PAIRED_PREC
  26. sts fpscr, r7 ! save the caller FPSCR in r7; r7 must stay untouched until RESTORE_FPSCR
  27. mov #0x10, r6 ! PR=0 SZ=1 (clobbers r6)
  28. shll16 r6 ! r6 = 0x00100000
  29. lds r6, fpscr ! switch fmov to the paired (64-bit) transfer mode
  30. .endm
  31. .macro RESTORE_FPSCR
  32. lds r7, fpscr ! put back the FPSCR value saved by FPU_SET_PAIRED_PREC
  33. .endm
  34. #endif
  35. ! .Lcase1: bulk copy used when (src - dst) & 3 == 1
  36. ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
  37. ! On entry: r0 = long aligned end of dst, r4 = dst, r5 = src - dst. Copies downwards and returns with r0 == r4.
  38. ! Size is 16 or greater, and may have trailing bytes
  39. .balign 32
  40. .Lcase1:
  41. ! Read a long word and write a long word at once
  42. ! At the start of each iteration, r7 contains last long load
  43. add #-1,r5 ! 79 EX; r0+r5 is now a long aligned src address
  44. mov r4,r2 ! 5 MT (0 cycles latency)
  45. mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency); prime r7 with the aligned long that holds the last src byte
  46. add #-4,r5 ! 50 EX; r0+r5 = next lower src long
  47. add #7,r2 ! 79 EX; r2 = dst + 7, bound for the long loop
  48. !
  49. #ifdef __LITTLE_ENDIAN__
  50. ! 6 cycles, 4 bytes per iteration
  51. 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
  52. mov r7, r3 ! 5 MT (latency=0) ! RQPO
  53. cmp/hi r2,r0 ! 57 MT; T = r0 > dst + 7, evaluated before the store below
  54. shll16 r3 ! 103 EX
  55. mov r1,r6 ! 5 MT (latency=0)
  56. shll8 r3 ! 102 EX ! Oxxx
  57. shlr8 r6 ! 106 EX ! xNML
  58. mov r1, r7 ! 5 MT (latency=0); carry this load into the next iteration
  59. or r6,r3 ! 82 EX ! ONML
  60. bt/s 3b ! 109 BR
  61. mov.l r3,@-r0 ! 30 LS; delay slot: store the merged long, r0 -= 4
  62. #else
  63. 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
  64. mov r7,r3 ! 5 MT (latency=0) ! OPQR
  65. cmp/hi r2,r0 ! 57 MT; T = r0 > dst + 7, evaluated before the store below
  66. shlr16 r3 ! 107 EX
  67. shlr8 r3 ! 106 EX ! xxxO
  68. mov r1,r6 ! 5 MT (latency=0)
  69. shll8 r6 ! 102 EX ! LMNx
  70. mov r1,r7 ! 5 MT (latency=0); carry this load into the next iteration
  71. or r6,r3 ! 82 EX ! LMNO
  72. bt/s 3b ! 109 BR
  73. mov.l r3,@-r0 ! 30 LS; delay slot: store the merged long, r0 -= 4
  74. #endif
  75. ! Finally, copy a byte at once, if necessary
  76. add #4,r5 ! 50 EX; r5 = src - dst - 1, so @(r0,r5) is the src byte for dst address r0 - 1
  77. cmp/eq r4,r0 ! 54 MT; T = nothing left to copy
  78. add #-6,r2 ! 50 EX; r2 = dst + 1, bound for the byte loop
  79. bt 9f ! 109 BR
  80. 8: cmp/hi r2,r0 ! 57 MT; T = at least one more byte after this one
  81. mov.b @(r0,r5),r1 ! 20 LS (latency=2)
  82. bt/s 8b ! 109 BR
  83. mov.b r1,@-r0 ! 29 LS; delay slot
  84. 9: rts
  85. nop
  86. ! .Lcase3: bulk copy used when (src - dst) & 3 == 3
  87. ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
  88. ! On entry: r0 = long aligned end of dst, r4 = dst, r5 = src - dst. Copies downwards and returns with r0 == r4.
  89. ! Size is 16 or greater, and may have trailing bytes
  90. .balign 32
  91. .Lcase3:
  92. ! Read a long word and write a long word at once
  93. ! At the start of each iteration, r7 contains last long load
  94. add #-3,r5 ! 79 EX; r0+r5 is now a long aligned src address
  95. mov r4,r2 ! 5 MT (0 cycles latency)
  96. mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency); prime r7 with the aligned long that holds the last src bytes
  97. add #-4,r5 ! 50 EX; r0+r5 = next lower src long
  98. add #7,r2 ! 79 EX; r2 = dst + 7, bound for the long loop
  99. !
  100. #ifdef __LITTLE_ENDIAN__
  101. ! 6 cycles, 4 bytes per iteration
  102. 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
  103. mov r7, r3 ! 5 MT (latency=0) ! RQPO
  104. cmp/hi r2,r0 ! 57 MT; T = r0 > dst + 7, evaluated before the store below
  105. shll8 r3 ! 102 EX ! QPOx
  106. mov r1,r6 ! 5 MT (latency=0)
  107. shlr16 r6 ! 107 EX
  108. shlr8 r6 ! 106 EX ! xxxN
  109. mov r1, r7 ! 5 MT (latency=0); carry this load into the next iteration
  110. or r6,r3 ! 82 EX ! QPON
  111. bt/s 3b ! 109 BR
  112. mov.l r3,@-r0 ! 30 LS; delay slot: store the merged long, r0 -= 4
  113. #else
  114. 3: mov r7,r3 ! OPQR (previous load)
  115. shlr8 r3 ! xOPQ
  116. mov.l @(r0,r5),r7 ! KLMN (also becomes the previous load of the next iteration)
  117. mov r7,r6
  118. shll16 r6
  119. shll8 r6 ! Nxxx
  120. or r6,r3 ! NOPQ
  121. cmp/hi r2,r0 ! T = r0 > dst + 7, evaluated before the store below
  122. bt/s 3b
  123. mov.l r3,@-r0 ! delay slot: store the merged long, r0 -= 4
  124. #endif
  125. ! Finally, copy a byte at once, if necessary
  126. add #6,r5 ! 50 EX; r5 = src - dst - 1, so @(r0,r5) is the src byte for dst address r0 - 1
  127. cmp/eq r4,r0 ! 54 MT; T = nothing left to copy
  128. add #-6,r2 ! 50 EX; r2 = dst + 1, bound for the byte loop
  129. bt 9f ! 109 BR
  130. 8: cmp/hi r2,r0 ! 57 MT; T = at least one more byte after this one
  131. mov.b @(r0,r5),r1 ! 20 LS (latency=2)
  132. bt/s 8b ! 109 BR
  133. mov.b r1,@-r0 ! 29 LS; delay slot
  134. 9: rts
  135. nop
  136. /* void * memcpy(void *dst, const void *src, size_t len): the code below reads dst from r4, src from r5 and len from r6, and leaves dst in r0 on return. */
  137. .text
  138. .align 4
  139. .type memcpy,@function
  140. .globl memcpy;
  141. memcpy:
  142. ! Calculate the invariants which will be used in the remainder
  143. ! of the code:
  144. !
  145. ! r4 --> [ ... ] DST [ ... ] SRC
  146. ! [ ... ] [ ... ]
  147. ! : :
  148. ! r0 --> [ ... ] r0+r5 --> [ ... ]
  149. ! r0 starts at dst + len and is pre-decremented by every store, so the copy runs from the end towards r4;
  150. ! r5 holds src - dst (biased by a small constant in each path), so @(r0,r5) addresses the matching src data.
  151. ! Short circuit the common case of src, dst and len being 32 bit aligned
  152. ! and test for zero length move
  153. mov r6, r0 ! 5 MT (0 cycle latency)
  154. or r4, r0 ! 82 EX
  155. or r5, r0 ! 82 EX; r0 = len | dst | src
  156. tst r6, r6 ! 86 MT; T = (len == 0)
  157. bt/s 99f ! 111 BR (zero len)
  158. tst #3, r0 ! 87 MT; delay slot: T = dst, src and len are all long aligned
  159. mov r4, r0 ! 5 MT (0 cycle latency)
  160. add r6, r0 ! 49 EX; r0 = dst + len
  161. bt/s .Lcase00 ! 111 BR (aligned)
  162. sub r4, r5 ! 75 EX; delay slot: r5 = src - dst
  163. ! Arguments are not nicely long word aligned or zero len.
  164. ! Check for small copies, and if so do a simple byte at a time copy.
  165. !
  166. ! Deciding on an exact value of 'small' is not easy, as the point at which
  167. ! using the optimised routines become worthwhile varies (these are the
  168. ! cycle counts for different sizes using byte-at-a-time vs. optimised):
  169. ! size byte-at-time long word byte
  170. ! 16 42 39-40 46-50 50-55
  171. ! 24 58 43-44 54-58 62-67
  172. ! 36 82 49-50 66-70 80-85
  173. ! However the penalty for getting it 'wrong' is much higher for long word
  174. ! aligned data (and this is more common), so use a value of 16.
  175. mov #16, r1 ! 6 EX
  176. cmp/gt r6,r1 ! 56 MT; T = (16 > len), signed compare
  177. add #-1,r5 ! 50 EX; r0+r5 = last src byte
  178. bf/s 6f ! 108 BR (not small)
  179. mov r5, r3 ! 5 MT (latency=0)
  180. shlr r6 ! 104 EX; r6 = len / 2, T = len was odd
  181. mov.b @(r0,r5),r1 ! 20 LS (latency=2); last src byte
  182. bf/s 4f ! 111 BR (even len: join the pair loop at its second load)
  183. add #-1,r3 ! 50 EX; delay slot: r3 = r5 - 1, offset of the second byte of each pair
  184. tst r6, r6 ! 86 MT; odd len: T = no pairs remain (len was 1)
  185. bt/s 98f ! 110 BR
  186. mov.b r1,@-r0 ! 29 LS; delay slot: store the odd byte
  187. ! 4 cycles, 2 bytes per iteration
  188. 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
  189. 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
  190. dt r6 ! 67 EX; one pair fewer, T = none left
  191. mov.b r1,@-r0 ! 29 LS
  192. bf/s 3b ! 111 BR
  193. mov.b r2,@-r0 ! 29 LS; delay slot
  194. 98:
  195. rts
  196. nop
  197. 99: rts
  198. mov r4, r0 ! delay slot: zero length, hand back dst
  199. ! Size is not small, so it is worthwhile looking for optimisations.
  200. ! First align destination to a long word boundary.
  201. !
  202. ! r5 = normal value -1
  203. 6: tst #3, r0 ! 87 MT; T = end of dst is already long aligned
  204. mov #3, r3 ! 6 EX
  205. bt/s 2f ! 111 BR
  206. and r0,r3 ! 78 EX; delay slot: r3 = number of bytes to copy to reach a long boundary
  207. ! 3 cycles, 1 byte per iteration
  208. 1: dt r3 ! 67 EX
  209. mov.b @(r0,r5),r1 ! 19 LS (latency=2)
  210. add #-1, r6 ! 79 EX; keep r6 = bytes still to copy
  211. bf/s 1b ! 109 BR
  212. mov.b r1,@-r0 ! 28 LS; delay slot
  213. 2: add #1, r5 ! 79 EX; r5 = src - dst again
  214. ! Now select the appropriate bulk transfer code based on relative
  215. ! alignment of src and dst.
  216. mov r0, r3 ! 5 MT (latency=0); park the dst end pointer in r3
  217. mov r5, r0 ! 5 MT (latency=0); r0 = src - dst for the tst #imm tests
  218. tst #1, r0 ! 87 MT; T = bit 0 of (src - dst) clear
  219. bf/s 1f ! 111 BR
  220. mov #64, r7 ! 6 EX; delay slot: threshold between the small and big variants
  221. ! bit 0 clear
  222. cmp/ge r7, r6 ! 55 MT; T = (remaining len >= 64), signed compare
  223. bt/s 2f ! 111 BR
  224. tst #2, r0 ! 87 MT; delay slot: T = bit 1 of (src - dst) clear
  225. ! small
  226. bt/s .Lcase0
  227. mov r3, r0 ! delay slot: restore r0 = dst end pointer
  228. bra .Lcase2
  229. nop
  230. ! big
  231. 2: bt/s .Lcase0b
  232. mov r3, r0 ! delay slot: restore r0 = dst end pointer
  233. bra .Lcase2b
  234. nop
  235. ! bit 0 set
  236. 1: tst #2, r0 ! 87 MT; T = bit 1 of (src - dst) clear
  237. bt/s .Lcase1
  238. mov r3, r0 ! delay slot: restore r0 = dst end pointer
  239. bra .Lcase3
  240. nop
  241. ! .Lcase00: fast path taken straight from the entry test
  242. ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
  243. ! On entry: r0 = dst + len, r4 = dst, r5 = src - dst, r6 = len.
  244. ! src, dst and size are all long word aligned
  245. ! size is non-zero
  246. .balign 32
  247. .Lcase00:
  248. mov #64, r1 ! 6 EX
  249. mov r5, r3 ! 5 MT (latency=0)
  250. cmp/gt r6, r1 ! 56 MT; T = (64 > len), signed compare
  251. add #-4, r5 ! 50 EX; r0+r5 = last src long
  252. bf .Lcase00b ! 108 BR (big loop)
  253. shlr2 r6 ! 105 EX; r6 = len / 4 = number of long words
  254. shlr r6 ! 104 EX; r6 = number of long pairs, T = odd long count
  255. mov.l @(r0, r5), r1 ! 21 LS (latency=2); last src long
  256. bf/s 4f ! 111 BR (even count: join the pair loop at its second load)
  257. add #-8, r3 ! 50 EX; delay slot: r3 = src - dst - 8, offset of the second long of each pair
  258. tst r6, r6 ! 86 MT; odd count: T = no pairs remain (a single long)
  259. bt/s 5f ! 110 BR
  260. mov.l r1,@-r0 ! 30 LS; delay slot: store the odd long
  261. ! 4 cycles, 2 long words per iteration
  262. 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
  263. 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
  264. dt r6 ! 67 EX; one pair fewer, T = none left
  265. mov.l r1, @-r0 ! 30 LS
  266. bf/s 3b ! 109 BR
  267. mov.l r2, @-r0 ! 30 LS; delay slot
  268. 5: rts
  269. nop
  270. ! .Lcase0: (src - dst) & 3 == 0 with r0 = long aligned end of dst, r4 = dst, r5 = src - dst, r6 = bytes left. Size is 16 or greater and less than 64, but may have trailing bytes
  271. .balign 32
  272. .Lcase0:
  273. add #-4, r5 ! 50 EX; r0+r5 = last src long
  274. mov r4, r7 ! 5 MT (latency=0)
  275. mov.l @(r0, r5), r1 ! 21 LS (latency=2)
  276. mov #4, r2 ! 6 EX
  277. add #11, r7 ! 50 EX; r7 = dst + 11, bound for the pair loop
  278. tst r2, r6 ! 86 MT; T = bit 2 of the length clear (even number of longs)
  279. mov r5, r3 ! 5 MT (latency=0)
  280. bt/s 4f ! 111 BR (even count: join the pair loop at its second load)
  281. add #-4, r3 ! 50 EX; delay slot: r3 = r5 - 4, offset of the second long of each pair
  282. mov.l r1,@-r0 ! 30 LS; odd count: store one long first
  283. ! 4 cycles, 2 long words per iteration
  284. 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
  285. 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
  286. cmp/hi r7, r0 ! T = r0 > dst + 11, evaluated before the two stores below
  287. mov.l r1, @-r0 ! 30 LS
  288. bt/s 3b ! 109 BR
  289. mov.l r2, @-r0 ! 30 LS; delay slot
  290. ! Copy the final 0-3 bytes
  291. add #3,r5 ! 50 EX; r5 = src - dst - 1, so @(r0,r5) is the src byte for dst address r0 - 1
  292. cmp/eq r0, r4 ! 54 MT; T = nothing left to copy
  293. add #-10, r7 ! 50 EX; r7 = dst + 1, bound for the byte loop
  294. bt 9f ! 110 BR
  295. ! 3 cycles, 1 byte per iteration
  296. 1: mov.b @(r0,r5),r1 ! 19 LS
  297. cmp/hi r7,r0 ! 57 MT; T = at least one more byte after this one
  298. bt/s 1b ! 111 BR
  299. mov.b r1,@-r0 ! 28 LS; delay slot
  300. 9: rts
  301. nop
  302. ! .Lcase0b / .Lcase00b: src and dst relatively long aligned. Size is at least 64 bytes, so will be going round the big loop at least once.
  303. ! On entry: r0 = long aligned end of dst, r4 = dst, r5 = src - dst (.Lcase0b) or src - dst - 4 (.Lcase00b).
  304. ! r2 = rounded up r4
  305. ! r3 = rounded down r0
  306. .balign 32
  307. .Lcase0b:
  308. add #-4, r5 ! 50 EX; r0+r5 = last src long
  309. .Lcase00b:
  310. mov r0, r3 ! 5 MT (latency=0)
  311. mov #(~0x1f), r1 ! 6 EX; mask for a 32 byte boundary
  312. and r1, r3 ! 78 EX; r3 = r0 rounded down to 32 bytes
  313. mov r4, r2 ! 5 MT (latency=0)
  314. cmp/eq r3, r0 ! 54 MT; T = end of dst already on a 32 byte boundary
  315. add #0x1f, r2 ! 50 EX
  316. bt/s 1f ! 110 BR
  317. and r1, r2 ! 78 EX; delay slot: r2 = dst rounded up to 32 bytes, where the block loop stops
  318. ! copy initial words until cache line aligned
  319. mov.l @(r0, r5), r1 ! 21 LS (latency=2)
  320. tst #4, r0 ! 87 MT; T = bit 2 of r0 clear (even number of longs to the boundary)
  321. mov r5, r6 ! 5 MT (latency=0)
  322. add #-4, r6 ! 50 EX; r6 = offset of the second long of each pair
  323. bt/s 4f ! 111 BR
  324. add #8, r3 ! 50 EX; delay slot: r3 = boundary + 8, the value r0 has before the last pair is stored
  325. tst #0x18, r0 ! 87 MT; T = only this one long separates r0 from the boundary
  326. bt/s 1f ! 109 BR
  327. mov.l r1,@-r0 ! 30 LS; delay slot: store the odd long
  328. ! 4 cycles, 2 long words per iteration
  329. 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
  330. 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
  331. cmp/eq r3, r0 ! 54 MT; T = this is the last pair before the boundary
  332. mov.l r1, @-r0 ! 30 LS
  333. bf/s 3b ! 109 BR
  334. mov.l r7, @-r0 ! 30 LS; delay slot
  335. #ifdef MEMCPY_USES_FPU
  336. ! Copy the cache line aligned blocks by using the FPU registers.
  337. ! If src and dst are well aligned adopt 64-bit data transfer.
  338. ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
  339. ! r5: src (was r0+r5)
  340. ! r1: dest (was r0)
  341. 1:
  342. add r0, r5 ! r5 = absolute src address of the last long
  343. mov r0, r1 ! r1 = dst end pointer
  344. add #-0x1c, r5 ! r5 = start of the last 32 byte src block
  345. mov r5, r0
  346. tst #7, r0 ! src is 8byte aligned
  347. mov r5, r3
  348. add #-64, r3 ! To prefetch ahead (two blocks below the current one)
  349. bt/s 3f
  350. pref @r3 ! delay slot
  351. 2: fmov.s @r5+, fr0 ! 32-bit variant: eight single loads fill fr0-fr7 with one block
  352. mov r1, r6
  353. fmov.s @r5+, fr1
  354. add #-32, r6 ! r6 = start of the dst block about to be written
  355. fmov.s @r5+, fr2
  356. fmov.s @r5+, fr3
  357. fmov.s @r5+, fr4
  358. fmov.s @r5+, fr5
  359. fmov.s @r5+, fr6
  360. fmov.s @r5+, fr7
  361. add #-0x40, r5 ! undo the 32 bytes of post-increment and step one block down
  362. movca.l r0, @r6 ! Cache allocate + store on dst-32. The word written here is replaced by the fr0 store below.
  363. fmov.s fr7, @-r1
  364. fmov.s fr6, @-r1
  365. fmov.s fr5, @-r1
  366. fmov.s fr4, @-r1
  367. fmov.s fr3, @-r1
  368. fmov.s fr2, @-r1
  369. fmov.s fr1, @-r1
  370. fmov.s fr0, @-r1
  371. add #-32, r3
  372. cmp/eq r2,r1 ! T = reached the rounded up dst, no full block left
  373. bf/s 2b
  374. pref @r3 ! Prefetch the next cache line.
  375. bra 5f ! NOTE(review): no explicit delay slot instruction follows, so the first instruction of the macro expansion below (sts fpscr, r7) appears to occupy the slot; r7 is rewritten before its next use further down, so this looks harmless but fragile - confirm
  376. 3: FPU_SET_PAIRED_PREC
  377. 4: fmov @r5+, dr0 ! 64-bit variant: four paired loads fill dr0-dr6 with one block
  378. mov r1, r6
  379. fmov @r5+, dr2
  380. add #-32, r6 ! r6 = start of the dst block about to be written
  381. fmov @r5+, dr4
  382. fmov @r5+, dr6
  383. add #-0x40, r5 ! undo the 32 bytes of post-increment and step one block down
  384. movca.l r0, @r6 ! cache allocate on dst-32, as in the 32-bit variant
  385. fmov dr6, @-r1
  386. fmov dr4, @-r1
  387. fmov dr2, @-r1
  388. fmov dr0, @-r1
  389. add #-32, r3
  390. cmp/eq r2,r1 ! T = reached the rounded up dst, no full block left
  391. bf/s 4b
  392. pref @r3 ! delay slot: prefetch the next cache line
  393. RESTORE_FPSCR
  394. 5: mov r1, r0 ! r0 = dst pointer again
  395. cmp/eq r4, r0 ! 54 MT; T = whole buffer copied
  396. bf/s 1f ! 109 BR
  397. sub r1, r5 ! 75 EX; delay slot: r5 back to an offset relative to r0
  398. rts
  399. nop
  400. 1:
  401. #else
  402. ! Copy the cache line aligned blocks
  403. !
  404. ! In use: r0, r2, r4, r5
  405. ! Scratch: r1, r3, r6, r7
  406. !
  407. ! We could do this with the four scratch registers, but if src
  408. ! and dest hit the same cache line, this will thrash, so make
  409. ! use of additional registers.
  410. !
  411. ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
  412. ! r5: src (was r0+r5)
  413. ! r1: dest (was r0)
  414. ! this can be reversed at the end, so we don't need to save any extra
  415. ! state.
  416. !
  417. 1: mov.l r8, @-r15 ! 30 LS; push r8-r11, they are used as extra data registers below
  418. add r0, r5 ! 49 EX; r5 = absolute src address of the last long
  419. mov.l r9, @-r15 ! 30 LS
  420. mov r0, r1 ! 5 MT (latency=0)
  421. mov.l r10, @-r15 ! 30 LS
  422. add #-0x1c, r5 ! 50 EX; r5 = start of the last 32 byte src block
  423. mov.l r11, @-r15 ! 30 LS
  424. ! 16 cycles, 32 bytes per iteration
  425. 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
  426. add #-0x20, r1 ! 50 EX; r1 = start of the dst block about to be written
  427. mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
  428. mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
  429. mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
  430. mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
  431. mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
  432. mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
  433. mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
  434. movca.l r0,@r1 ! 40 LS (latency=3-7); first long of the block
  435. mov.l r3,@(0x04,r1) ! 33 LS
  436. mov.l r6,@(0x08,r1) ! 33 LS
  437. mov.l r7,@(0x0c,r1) ! 33 LS
  438. mov.l r8,@(0x10,r1) ! 33 LS
  439. add #-0x20, r5 ! 50 EX; step src one block down
  440. mov.l r9,@(0x14,r1) ! 33 LS
  441. cmp/eq r2,r1 ! 54 MT; T = reached the rounded up dst, no full block left
  442. mov.l r10,@(0x18,r1) ! 33 LS
  443. bf/s 2b ! 109 BR
  444. mov.l r11,@(0x1c,r1) ! 33 LS; delay slot
  445. mov r1, r0 ! 5 MT (latency=0); r0 = dst pointer again
  446. mov.l @r15+, r11 ! 15 LS
  447. sub r1, r5 ! 75 EX; r5 back to an offset relative to r0
  448. mov.l @r15+, r10 ! 15 LS
  449. cmp/eq r4, r0 ! 54 MT; T = whole buffer copied
  450. bf/s 1f ! 109 BR
  451. mov.l @r15+, r9 ! 15 LS; delay slot
  452. rts ! the pop of r8 on the next line doubles as the delay slot of this rts
  453. 1: mov.l @r15+, r8 ! 15 LS
  454. #endif
  455. sub r4, r1 ! 75 EX (len remaining)
  456. ! number of trailing bytes is non-zero
  457. !
  458. ! invariants restored (r5 already decremented by 4)
  459. ! also r1=num bytes remaining
  460. mov #4, r2 ! 6 EX
  461. mov r4, r7 ! 5 MT (latency=0)
  462. add #0x1c, r5 ! 50 EX (back to -4)
  463. cmp/hs r2, r1 ! 58 MT; T = (remaining >= 4), unsigned
  464. bf/s 5f ! 108 BR
  465. add #11, r7 ! 50 EX; delay slot: r7 = dst + 11, bound for the pair loop
  466. mov.l @(r0, r5), r6 ! 21 LS (latency=2)
  467. tst r2, r1 ! 86 MT; T = bit 2 of the remaining length clear (even number of longs)
  468. mov r5, r3 ! 5 MT (latency=0)
  469. bt/s 4f ! 111 BR
  470. add #-4, r3 ! 50 EX; delay slot: r3 = r5 - 4, offset of the second long of each pair
  471. cmp/hs r2, r1 ! 58 MT; NOTE(review): r2 and r1 are unchanged since the identical test above, so T looks always set here and the branch below always taken, which leaves any further longs to the byte loop (still correct, only slower) - confirm intent
  472. bt/s 5f ! 111 BR
  473. mov.l r6,@-r0 ! 30 LS; delay slot: store the odd long
  474. ! 4 cycles, 2 long words per iteration
  475. 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
  476. 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
  477. cmp/hi r7, r0 ! T = r0 > dst + 11, evaluated before the two stores below
  478. mov.l r6, @-r0 ! 30 LS
  479. bt/s 3b ! 109 BR
  480. mov.l r2, @-r0 ! 30 LS; delay slot
  481. ! Copy the final bytes one at a time
  482. 5: cmp/eq r0, r4 ! 54 MT; T = nothing left to copy
  483. add #-10, r7 ! 50 EX; r7 = dst + 1, bound for the byte loop
  484. bt 9f ! 110 BR
  485. add #3,r5 ! 50 EX; r5 = src - dst - 1, so @(r0,r5) is the src byte for dst address r0 - 1
  486. ! 3 cycles, 1 byte per iteration
  487. 1: mov.b @(r0,r5),r1 ! 19 LS
  488. cmp/hi r7,r0 ! 57 MT; T = at least one more byte after this one
  489. bt/s 1b ! 111 BR
  490. mov.b r1,@-r0 ! 28 LS; delay slot
  491. 9: rts
  492. nop
  493. ! .Lcase2: bulk copy used when (src - dst) & 3 == 2 and fewer than 64 bytes remain
  494. ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
  495. ! On entry: r0 = long aligned end of dst, r4 = dst, r5 = src - dst.
  496. .balign 32
  497. .Lcase2:
  498. ! Size is 16 or greater and less than 64, but may have trailing bytes
  499. 2: mov r5, r6 ! 5 MT (latency=0)
  500. add #-2,r5 ! 50 EX; r0+r5 = last src short word
  501. mov r4,r2 ! 5 MT (latency=0)
  502. add #-4,r6 ! 50 EX; r0+r6 = the short word below it
  503. add #7,r2 ! 50 EX; r2 = dst + 7, bound for the loop
  504. 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
  505. mov.w @(r0,r6),r3 ! 20 LS (latency=2)
  506. cmp/hi r2,r0 ! 57 MT; T = r0 > dst + 7, evaluated before the two stores below
  507. mov.w r1,@-r0 ! 29 LS
  508. bt/s 3b ! 111 BR
  509. mov.w r3,@-r0 ! 29 LS; delay slot
  510. bra 10f ! finish with the shared short word and byte tail further down
  511. nop
  512. .balign 32
  513. .Lcase2b:
  514. ! (src - dst) & 3 == 2. Size is at least 64 bytes, so will be going round the big loop at least once.
  515. ! On entry: r0 = long aligned end of dst, r4 = dst, r5 = src - dst.
  516. ! r2 = rounded up r4
  517. ! r3 = rounded down r0
  518. mov r0, r3 ! 5 MT (latency=0)
  519. mov #(~0x1f), r1 ! 6 EX; mask for a 32 byte boundary
  520. and r1, r3 ! 78 EX; r3 = r0 rounded down to 32 bytes
  521. mov r4, r2 ! 5 MT (latency=0)
  522. cmp/eq r3, r0 ! 54 MT; T = end of dst already on a 32 byte boundary
  523. add #0x1f, r2 ! 50 EX
  524. add #-2, r5 ! 50 EX; r0+r5 = last src short word
  525. bt/s 1f ! 110 BR
  526. and r1, r2 ! 78 EX; delay slot: r2 = dst rounded up to 32 bytes, where the block loop stops
  527. ! Copy a short word one at a time until we are cache line aligned
  528. ! Normal values: r0, r2, r3, r4
  529. ! Unused: r1, r6, r7
  530. ! Mod: r5 (=r5-2)
  531. !
  532. add #2, r3 ! 50 EX; r3 = boundary + 2, the value r0 has before the last short word is stored
  533. 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
  534. cmp/eq r3,r0 ! 54 MT; T = this is the last short word before the boundary
  535. bf/s 2b ! 111 BR
  536. mov.w r1,@-r0 ! 29 LS; delay slot
  537. ! Copy the cache line aligned blocks
  538. !
  539. ! In use: r0, r2, r4, r5 (=r5-2)
  540. ! Scratch: r1, r3, r6, r7
  541. !
  542. ! We could do this with the four scratch registers, but if src
  543. ! and dest hit the same cache line, this will thrash, so make
  544. ! use of additional registers.
  545. !
  546. ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
  547. ! r5: src (was r0+r5)
  548. ! r1: dest (was r0)
  549. ! this can be reversed at the end, so we don't need to save any extra
  550. ! state.
  551. !
  552. 1: mov.l r8, @-r15 ! 30 LS; push r8-r12, they are used as extra data registers below
  553. add r0, r5 ! 49 EX; r5 = absolute src address of the last short word
  554. mov.l r9, @-r15 ! 30 LS
  555. mov r0, r1 ! 5 MT (latency=0)
  556. mov.l r10, @-r15 ! 30 LS
  557. add #-0x1e, r5 ! 50 EX; r5 = src address matching the start of the last 32 byte dst block
  558. mov.l r11, @-r15 ! 30 LS
  559. mov.l r12, @-r15 ! 30 LS
  560. ! 17 cycles, 32 bytes per iteration
  561. #ifdef __LITTLE_ENDIAN__
  562. 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
  563. add #-0x20, r1 ! 50 EX; r1 = start of the dst block about to be written
  564. mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
  565. mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
  566. shll16 r0 ! 103 EX JI..
  567. mov.l @r5+, r7 ! 15 LS (latency=2)
  568. xtrct r3, r0 ! 48 EX LKJI
  569. mov.l @r5+, r8 ! 15 LS (latency=2)
  570. xtrct r6, r3 ! 48 EX PONM
  571. mov.l @r5+, r9 ! 15 LS (latency=2)
  572. xtrct r7, r6 ! 48 EX
  573. mov.l @r5+, r10 ! 15 LS (latency=2)
  574. xtrct r8, r7 ! 48 EX
  575. mov.l @r5+, r11 ! 15 LS (latency=2)
  576. xtrct r9, r8 ! 48 EX
  577. mov.w @r5+, r12 ! 15 LS (latency=2); trailing short word that completes the last long
  578. xtrct r10, r9 ! 48 EX
  579. movca.l r0,@r1 ! 40 LS (latency=3-7); first long of the block
  580. xtrct r11, r10 ! 48 EX
  581. mov.l r3, @(0x04,r1) ! 33 LS
  582. xtrct r12, r11 ! 48 EX
  583. mov.l r6, @(0x08,r1) ! 33 LS
  584. mov.l r7, @(0x0c,r1) ! 33 LS
  585. mov.l r8, @(0x10,r1) ! 33 LS
  586. add #-0x40, r5 ! 50 EX; undo the 32 bytes of post-increment and step one block down
  587. mov.l r9, @(0x14,r1) ! 33 LS
  588. cmp/eq r2,r1 ! 54 MT; T = reached the rounded up dst, no full block left
  589. mov.l r10, @(0x18,r1) ! 33 LS
  590. bf/s 2b ! 109 BR
  591. mov.l r11, @(0x1c,r1) ! 33 LS; delay slot
  592. #else
  593. 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
  594. add #-2, r5 ! 50 EX; make r5 long aligned for the loads below
  595. mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
  596. add #-4, r1 ! 50 EX; r1 = highest long of the dst block
  597. mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
  598. shll16 r0 ! 103 EX
  599. mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
  600. xtrct r3, r0 ! 48 EX
  601. mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
  602. xtrct r6, r3 ! 48 EX
  603. mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
  604. xtrct r7, r6 ! 48 EX
  605. mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
  606. xtrct r8, r7 ! 48 EX
  607. mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
  608. xtrct r9, r8 ! 48 EX
  609. mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
  610. xtrct r10, r9 ! 48 EX
  611. movca.l r0,@r1 ! 40 LS (latency=3-7); highest long of the block
  612. add #-0x1c, r1 ! 50 EX; r1 = start of the dst block
  613. mov.l r3, @(0x18,r1) ! 33 LS
  614. xtrct r11, r10 ! 48 EX
  615. mov.l r6, @(0x14,r1) ! 33 LS
  616. xtrct r12, r11 ! 48 EX
  617. mov.l r7, @(0x10,r1) ! 33 LS
  618. mov.l r8, @(0x0c,r1) ! 33 LS
  619. add #-0x1e, r5 ! 50 EX; together with the -2 above this steps src one block down
  620. mov.l r9, @(0x08,r1) ! 33 LS
  621. cmp/eq r2,r1 ! 54 MT; T = reached the rounded up dst, no full block left
  622. mov.l r10, @(0x04,r1) ! 33 LS
  623. bf/s 2b ! 109 BR
  624. mov.l r11, @(0x00,r1) ! 33 LS; delay slot
  625. #endif
  626. mov.l @r15+, r12 ! pop the saved registers in reverse order of the pushes
  627. mov r1, r0 ! 5 MT (latency=0); r0 = dst pointer again
  628. mov.l @r15+, r11 ! 15 LS
  629. sub r1, r5 ! 75 EX; r5 back to an offset relative to r0
  630. mov.l @r15+, r10 ! 15 LS
  631. cmp/eq r4, r0 ! 54 MT; T = whole buffer copied
  632. bf/s 1f ! 109 BR
  633. mov.l @r15+, r9 ! 15 LS; delay slot
  634. rts ! the pop of r8 on the next line doubles as the delay slot of this rts
  635. 1: mov.l @r15+, r8 ! 15 LS
  636. add #0x1e, r5 ! 50 EX; r5 = src - dst - 2 again
  637. ! Finish off a short word at a time
  638. ! r5 must be invariant - 2
  639. 10: mov r4,r2 ! 5 MT (latency=0)
  640. add #1,r2 ! 50 EX
  641. cmp/hi r2, r0 ! 57 MT; T = r0 > dst + 1, at least one short word left
  642. bf/s 1f ! 109 BR
  643. add #2, r2 ! 50 EX; delay slot: r2 = dst + 3, bound for the short word loop
  644. 3: mov.w @(r0,r5),r1 ! 20 LS
  645. cmp/hi r2,r0 ! 57 MT; T = another short word follows this one
  646. bt/s 3b ! 109 BR
  647. mov.w r1,@-r0 ! 29 LS; delay slot
  648. 1:
  649. !
  650. ! Finally, copy the last byte if necessary
  651. cmp/eq r4,r0 ! 54 MT; T = nothing left to copy
  652. bt/s 9b ! 9b resolves to the nearest preceding 9: label, an rts/nop pair
  653. add #1,r5 ! delay slot: r5 = src - dst - 1, offset of the src byte for dst address r0 - 1
  654. mov.b @(r0,r5),r1
  655. rts
  656. mov.b r1,@-r0 ! delay slot: store the last byte, leaving r0 == r4
  657. .size memcpy,.-memcpy;
  658. libc_hidden_def (memcpy)