/* memcpy.S — IA-64 (Itanium/McKinley) optimized memcpy.
   NOTE(review): this file was recovered from a web extraction; the original
   "filename / size" caption and the fused line-number gutter were removed.  */
/* Optimized version of the standard memcpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

/* Return: dest
   Inputs:
        in0:    dest
        in1:    src
        in2:    byte count

   An assembly implementation of the algorithm used by the generic C
   version from glibc.  The case when source and dest are aligned is
   treated separately, for extra performance.

   In this form, memcpy assumes little endian mode.  For big endian mode,
   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
   and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
   shrp instruction.  */
  30. #define USE_LFETCH
  31. #define USE_FLP
  32. #include "sysdep.h"
  33. #undef ret
  34. #define LFETCH_DIST 500
  35. #define ALIGN_UNROLL_no 4 // no. of elements
  36. #define ALIGN_UNROLL_sh 2 // (shift amount)
  37. #define MEMLAT 8
  38. #define Nrot ((4*(MEMLAT+2) + 7) & ~7)
  39. #define OP_T_THRES 16
  40. #define OPSIZ 8
  41. #define loopcnt r14
  42. #define elemcnt r15
  43. #define saved_pr r16
  44. #define saved_lc r17
  45. #define adest r18
  46. #define dest r19
  47. #define asrc r20
  48. #define src r21
  49. #define len r22
  50. #define tmp2 r23
  51. #define tmp3 r24
  52. #define tmp4 r25
  53. #define ptable r26
  54. #define ploop56 r27
  55. #define loopaddr r28
  56. #define sh1 r29
  57. #define ptr1 r30
  58. #define ptr2 r31
  59. #define movi0 mov
  60. #define p_scr p6
  61. #define p_xtr p7
  62. #define p_nxtr p8
  63. #define p_few p9
  64. #if defined(USE_FLP)
  65. #define load ldf8
  66. #define store stf8
  67. #define tempreg f6
  68. #define the_r fr
  69. #define the_s fs
  70. #define the_t ft
  71. #define the_q fq
  72. #define the_w fw
  73. #define the_x fx
  74. #define the_y fy
  75. #define the_z fz
  76. #elif defined(USE_INT)
  77. #define load ld8
  78. #define store st8
  79. #define tempreg tmp2
  80. #define the_r r
  81. #define the_s s
  82. #define the_t t
  83. #define the_q q
  84. #define the_w w
  85. #define the_x x
  86. #define the_y y
  87. #define the_z z
  88. #endif
  89. #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
  90. /* Manually force proper loop-alignment. Note: be sure to
  91. double-check the code-layout after making any changes to
  92. this routine! */
  93. # define ALIGN(n) { nop 0 }
  94. #else
  95. # define ALIGN(n) .align n
  96. #endif
  97. #if defined(USE_LFETCH)
  98. #define LOOP(shift) \
  99. ALIGN(32); \
  100. .loop##shift##: \
  101. { .mmb \
  102. (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
  103. (p[0]) lfetch.nt1 [ptr1], 16 ; \
  104. nop.b 0 ; \
  105. } { .mib \
  106. (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
  107. (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
  108. nop.b 0 ;; \
  109. } { .mmb \
  110. (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
  111. (p[0]) lfetch.nt1 [ptr2], 16 ; \
  112. nop.b 0 ; \
  113. } { .mib \
  114. (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
  115. (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
  116. br.ctop.sptk.many .loop##shift \
  117. ;; } \
  118. { .mib \
  119. br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
  120. }
  121. #else
  122. #define LOOP(shift) \
  123. ALIGN(32); \
  124. .loop##shift##: \
  125. { .mmb \
  126. (p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
  127. nop.b 0 ; \
  128. } { .mib \
  129. (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
  130. (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
  131. nop.b 0 ;; \
  132. } { .mmb \
  133. (p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
  134. nop.b 0 ; \
  135. } { .mib \
  136. (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
  137. (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
  138. br.ctop.sptk.many .loop##shift \
  139. ;; } \
  140. { .mib \
  141. br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
  142. }
  143. #endif
  144. ENTRY(memcpy)
  145. { .mmi
  146. .prologue
  147. alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
  148. .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
  149. .rotp p[MEMLAT+2]
  150. .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
  151. mov ret0 = in0 // return tmp2 = dest
  152. .save pr, saved_pr
  153. movi0 saved_pr = pr // save the predicate registers
  154. } { .mmi
  155. and tmp4 = 7, in0 // check if destination is aligned
  156. mov dest = in0 // dest
  157. mov src = in1 // src
  158. ;; }
  159. { .mii
  160. cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
  161. .save ar.lc, saved_lc
  162. movi0 saved_lc = ar.lc // save the loop counter
  163. .body
  164. cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
  165. } { .mbb
  166. mov len = in2 // len
  167. (p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
  168. (p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
  169. ;; }
  170. { .mmi
  171. #if defined(USE_LFETCH)
  172. lfetch.nt1 [dest] //
  173. lfetch.nt1 [src] //
  174. #endif
  175. shr.u elemcnt = len, 3 // elemcnt = len / 8
  176. } { .mib
  177. cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
  178. sub loopcnt = 7, tmp4 //
  179. (p_scr) br.cond.dptk.many .dest_aligned
  180. ;; }
  181. { .mmi
  182. ld1 tmp2 = [src], 1 //
  183. sub len = len, loopcnt, 1 // reduce len
  184. movi0 ar.lc = loopcnt //
  185. } { .mib
  186. cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
  187. ;; }
  188. .l0: // ---------------------------- // L0: Align src on 8-byte boundary
  189. { .mmi
  190. st1 [dest] = tmp2, 1 //
  191. (p_scr) ld1 tmp2 = [src], 1 //
  192. } { .mib
  193. cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
  194. add loopcnt = -1, loopcnt
  195. br.cloop.dptk.few .l0 //
  196. ;; }
  197. .dest_aligned:
  198. { .mmi
  199. and tmp4 = 7, src // ready for alignment check
  200. shr.u elemcnt = len, 3 // elemcnt = len / 8
  201. ;; }
  202. { .mib
  203. cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
  204. tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
  205. } { .mib // is not 16B aligned
  206. add ptr2 = LFETCH_DIST, dest // prefetch address
  207. add ptr1 = LFETCH_DIST, src
  208. (p_scr) br.cond.dptk.many .src_not_aligned
  209. ;; }
  210. // The optimal case, when dest, and src are aligned
  211. .both_aligned:
  212. { .mmi
  213. .pred.rel "mutex",p_xtr,p_nxtr
  214. (p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
  215. (p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
  216. movi0 pr.rot = 1 << 16 // set rotating predicates
  217. } { .mib
  218. (p_scr) br.cond.dpnt.many .copy_full_words
  219. ;; }
  220. { .mmi
  221. (p_xtr) load tempreg = [src], 8
  222. (p_xtr) add elemcnt = -1, elemcnt
  223. movi0 ar.ec = MEMLAT + 1 // set the epilog counter
  224. ;; }
  225. { .mmi
  226. (p_xtr) add len = -8, len //
  227. add asrc = 16, src // one bank apart (for USE_INT)
  228. shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
  229. ;;}
  230. { .mmi
  231. add loopcnt = -1, loopcnt
  232. (p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
  233. nop.i 0
  234. ;; }
  235. { .mib
  236. add adest = 16, dest
  237. movi0 ar.lc = loopcnt // set the loop counter
  238. ;; }
  239. #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
  240. { nop 0 }
  241. #else
  242. .align 32
  243. #endif
  244. #if defined(USE_FLP)
  245. .l1: // ------------------------------- // L1: Everything a multiple of 8
  246. { .mmi
  247. #if defined(USE_LFETCH)
  248. (p[0]) lfetch.nt1 [ptr2],32
  249. #endif
  250. (p[0]) ldfp8 the_r[0],the_q[0] = [src], 16
  251. (p[0]) add len = -32, len
  252. } {.mmb
  253. (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
  254. (p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
  255. ;; }
  256. { .mmi
  257. #if defined(USE_LFETCH)
  258. (p[0]) lfetch.nt1 [ptr1],32
  259. #endif
  260. (p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
  261. } {.mmb
  262. (p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
  263. (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
  264. br.ctop.dptk.many .l1
  265. ;; }
  266. #elif defined(USE_INT)
  267. .l1: // ------------------------------- // L1: Everything a multiple of 8
  268. { .mmi
  269. (p[0]) load the_r[0] = [src], 8
  270. (p[0]) load the_q[0] = [asrc], 8
  271. (p[0]) add len = -32, len
  272. } {.mmb
  273. (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
  274. (p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
  275. ;; }
  276. { .mmi
  277. (p[0]) load the_s[0] = [src], 24
  278. (p[0]) load the_t[0] = [asrc], 24
  279. } {.mmb
  280. (p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
  281. (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
  282. #if defined(USE_LFETCH)
  283. ;; }
  284. { .mmb
  285. (p[0]) lfetch.nt1 [ptr2],32
  286. (p[0]) lfetch.nt1 [ptr1],32
  287. #endif
  288. br.ctop.dptk.many .l1
  289. ;; }
  290. #endif
  291. .copy_full_words:
  292. { .mib
  293. cmp.gt p_scr, p0 = 8, len //
  294. shr.u elemcnt = len, 3 //
  295. (p_scr) br.cond.dpnt.many .copy_bytes
  296. ;; }
  297. { .mii
  298. load tempreg = [src], 8
  299. add loopcnt = -1, elemcnt //
  300. ;; }
  301. { .mii
  302. cmp.ne p_scr, p0 = 0, loopcnt //
  303. mov ar.lc = loopcnt //
  304. ;; }
  305. .l2: // ------------------------------- // L2: Max 4 words copied separately
  306. { .mmi
  307. store [dest] = tempreg, 8
  308. (p_scr) load tempreg = [src], 8 //
  309. add len = -8, len
  310. } { .mib
  311. cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
  312. add loopcnt = -1, loopcnt
  313. br.cloop.dptk.few .l2
  314. ;; }
  315. .copy_bytes:
  316. { .mib
  317. cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
  318. add loopcnt = -1, len // len--;
  319. (p_scr) br.cond.spnt .restore_and_exit
  320. ;; }
  321. { .mii
  322. ld1 tmp2 = [src], 1
  323. movi0 ar.lc = loopcnt
  324. cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
  325. ;; }
  326. .l3: // ------------------------------- // L3: Final byte move
  327. { .mmi
  328. st1 [dest] = tmp2, 1
  329. (p_scr) ld1 tmp2 = [src], 1
  330. } { .mib
  331. cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
  332. add loopcnt = -1, loopcnt
  333. br.cloop.dptk.few .l3
  334. ;; }
  335. .restore_and_exit:
  336. { .mmi
  337. movi0 pr = saved_pr, -1 // restore the predicate registers
  338. ;; }
  339. { .mib
  340. movi0 ar.lc = saved_lc // restore the loop counter
  341. br.ret.sptk.many b0
  342. ;; }
  343. .src_not_aligned:
  344. { .mmi
  345. cmp.gt p_scr, p0 = 16, len
  346. and sh1 = 7, src // sh1 = src % 8
  347. shr.u loopcnt = len, 4 // element-cnt = len / 16
  348. } { .mib
  349. add tmp4 = @ltoff(.table), gp
  350. add tmp3 = @ltoff(.loop56), gp
  351. (p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
  352. ;; }
  353. { .mmi
  354. and asrc = -8, src // asrc = (-8) -- align src for loop
  355. add loopcnt = -1, loopcnt // loopcnt--
  356. shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
  357. } { .mmi
  358. ld8 ptable = [tmp4] // ptable = &table
  359. ld8 ploop56 = [tmp3] // ploop56 = &loop56
  360. and tmp2 = -16, len // tmp2 = len & -OPSIZ
  361. ;; }
  362. { .mmi
  363. add tmp3 = ptable, sh1 // tmp3 = &table + sh1
  364. add src = src, tmp2 // src += len & (-16)
  365. movi0 ar.lc = loopcnt // set LC
  366. ;; }
  367. { .mmi
  368. ld8 tmp4 = [tmp3] // tmp4 = loop offset
  369. sub len = len, tmp2 // len -= len & (-16)
  370. movi0 ar.ec = MEMLAT + 2 // one more pass needed
  371. ;; }
  372. { .mmi
  373. ld8 s[1] = [asrc], 8 // preload
  374. sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
  375. movi0 pr.rot = 1 << 16 // set rotating predicates
  376. ;; }
  377. { .mib
  378. nop.m 0
  379. movi0 b6 = loopaddr
  380. br b6 // jump to the appropriate loop
  381. ;; }
  382. LOOP(8)
  383. LOOP(16)
  384. LOOP(24)
  385. LOOP(32)
  386. LOOP(40)
  387. LOOP(48)
  388. LOOP(56)
  389. END(memcpy)
  390. libc_hidden_def (memcpy)
  391. .rodata
  392. .align 8
  393. .table:
  394. data8 0 // dummy entry
  395. data8 .loop56 - .loop8
  396. data8 .loop56 - .loop16
  397. data8 .loop56 - .loop24
  398. data8 .loop56 - .loop32
  399. data8 .loop56 - .loop40
  400. data8 .loop56 - .loop48
  401. data8 .loop56 - .loop56