123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436 |
- /* Optimized version of the standard memcpy() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
- Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
- Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
- /* Return: dest
- Inputs:
- in0: dest
- in1: src
- in2: byte count
- An assembly implementation of the algorithm used by the generic C
- version from glibc. The case when source and dest are aligned is
- treated separately, for extra performance.
- In this form, memcpy assumes little endian mode. For big endian mode,
- sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
- and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
- shrp instruction. */
- #define USE_LFETCH // enables the lfetch prefetch instructions guarded by USE_LFETCH below
- #define USE_FLP // selects the ldf8/stf8 (floating-point register) flavour of load/store below
- #include "sysdep.h"
- #undef ret
- #define LFETCH_DIST 500 // byte distance added to src/dest to form the prefetch pointers ptr1/ptr2
- #define ALIGN_UNROLL_no 4 // no. of 8-byte elements copied per iteration of the aligned loop .l1
- #define ALIGN_UNROLL_sh 2 // (shift amount) log2 of ALIGN_UNROLL_no, turns elemcnt into an .l1 iteration count
- #define MEMLAT 8 // pipeline stages between a load (predicate p[0]) and the store that uses it (p[MEMLAT])
- #define Nrot ((4*(MEMLAT+2) + 7) & ~7) // rotating-register count passed to alloc, rounded up to a multiple of 8
- #define OP_T_THRES 16 // lengths <= this value are copied byte by byte
- #define OPSIZ 8 // presumably the word size in bytes; only mentioned in comments in the code shown here
- #define loopcnt r14 // scratch iteration counter, also the value loaded into ar.lc
- #define elemcnt r15 // number of 8-byte elements, len / 8
- #define saved_pr r16 // copy of pr taken at entry, restored at exit
- #define saved_lc r17 // copy of ar.lc taken at entry, restored at exit
- #define adest r18 // second store pointer (dest + 16) used by .l1
- #define dest r19 // running destination pointer
- #define asrc r20 // second source pointer (.l1, USE_INT) or 8-byte aligned source pointer (LOOP)
- #define src r21 // running source pointer
- #define len r22 // bytes still to be copied
- #define tmp2 r23 // scratch
- #define tmp3 r24 // scratch
- #define tmp4 r25 // scratch
- #define ptable r26 // address of .table
- #define ploop56 r27 // address of .loop56
- #define loopaddr r28 // address of the .loopNN selected for the current misalignment
- #define sh1 r29 // src % 8, later multiplied by 8 to index .table
- #define ptr1 r30 // prefetch pointer running ahead of src
- #define ptr2 r31 // prefetch pointer running ahead of dest
- #define movi0 mov // plain mov; used in this file for moves involving pr, ar.lc, ar.ec and b6
- #define p_scr p6 // scratch predicate
- #define p_xtr p7 // set when bit 3 of src is set: one "extra" word is copied before .l1
- #define p_nxtr p8 // complement of p_xtr
- #define p_few p9 // set when len <= OP_T_THRES
- #if defined(USE_FLP) // variant 1: move 8-byte words through floating-point registers
- #define load ldf8
- #define store stf8
- #define tempreg f6 // single non-rotating temporary used by the extra-word and .l2 copies
- #define the_r fr // the_r/the_s/the_t/the_q map onto the rotating FP names declared with .rotf
- #define the_s fs
- #define the_t ft
- #define the_q fq
- #define the_w fw
- #define the_x fx
- #define the_y fy
- #define the_z fz
- #elif defined(USE_INT) // variant 2: move 8-byte words through general registers
- #define load ld8
- #define store st8
- #define tempreg tmp2 // single non-rotating temporary used by the extra-word and .l2 copies
- #define the_r r // the_r/the_s/the_t/the_q map onto the rotating GR names declared with .rotr
- #define the_s s
- #define the_t t
- #define the_q q
- #define the_w w
- #define the_x x
- #define the_y y
- #define the_z z
- #endif
- #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO /* ALIGN(n): loop-alignment helper used at the top of every LOOP(shift) body */
- /* Manually force proper loop-alignment. Note: be sure to
- double-check the code-layout after making any changes to
- this routine! */
- # define ALIGN(n) { nop 0 } /* n is ignored here: a nop is emitted instead of a .align directive */
- #else
- # define ALIGN(n) .align n /* normal case: let the assembler align to n bytes */
- #endif
- #if defined(USE_LFETCH) /* LOOP(shift): pipelined copy loop for a source misaligned by shift bits; this variant also prefetches */
- #define LOOP(shift) \
- ALIGN(32); \
- .loop##shift##: \
- { .mmb \
- (p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* load the next aligned source word into r[] */ \
- (p[0]) lfetch.nt1 [ptr1], 16 ; /* prefetch ahead of src */ \
- nop.b 0 ; \
- } { .mib \
- (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one iteration earlier */ \
- (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* merge two adjacent source words into one dest word */ \
- nop.b 0 ;; \
- } { .mmb \
- (p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* load the following aligned source word into s[] */ \
- (p[0]) lfetch.nt1 [ptr2], 16 ; /* prefetch ahead of dest */ \
- nop.b 0 ; \
- } { .mib \
- (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the second word merged one iteration earlier */ \
- (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* merge the next pair of adjacent source words */ \
- br.ctop.sptk.many .loop##shift \
- ;; } \
- { .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
- }
- #else /* same loop without the lfetch prefetches */
- #define LOOP(shift) \
- ALIGN(32); \
- .loop##shift##: \
- { .mmb \
- (p[0]) ld8.nt1 r[0] = [asrc], 8 ; /* load the next aligned source word into r[] */ \
- nop.b 0 ; \
- } { .mib \
- (p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; /* store the word merged one iteration earlier */ \
- (p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; /* merge two adjacent source words into one dest word */ \
- nop.b 0 ;; \
- } { .mmb \
- (p[0]) ld8.nt1 s[0] = [asrc], 8 ; /* load the following aligned source word into s[] */ \
- nop.b 0 ; \
- } { .mib \
- (p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; /* store the second word merged one iteration earlier */ \
- (p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; /* merge the next pair of adjacent source words */ \
- br.ctop.sptk.many .loop##shift \
- ;; } \
- { .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
- }
- #endif
- ENTRY(memcpy) /* memcpy: in0 = dest, in1 = src, in2 = byte count; returns dest */
- { .mmi
- .prologue
- alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot // 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating registers
- .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1] // names for the rotating general registers
- .rotp p[MEMLAT+2] // names for the rotating predicates
- .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1] // names for the rotating FP registers
- mov ret0 = in0 // return value = dest
- .save pr, saved_pr
- movi0 saved_pr = pr // save the predicate registers
- } { .mmi
- and tmp4 = 7, in0 // tmp4 = dest % 8, used below to check if destination is aligned
- mov dest = in0 // dest
- mov src = in1 // src
- ;; }
- { .mii
- cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
- .save ar.lc, saved_lc
- movi0 saved_lc = ar.lc // save the loop counter
- .body
- cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRES
- } { .mbb
- mov len = in2 // len
- (p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
- (p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
- ;; }
- { .mmi
- #if defined(USE_LFETCH)
- lfetch.nt1 [dest] // prefetch the start of the destination
- lfetch.nt1 [src] // prefetch the start of the source
- #endif
- shr.u elemcnt = len, 3 // elemcnt = len / 8
- } { .mib
- cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
- sub loopcnt = 7, tmp4 // bytes needed to align dest, minus one (loop-count form)
- (p_scr) br.cond.dptk.many .dest_aligned
- ;; }
- { .mmi
- ld1 tmp2 = [src], 1 // load the first byte ahead of the loop
- sub len = len, loopcnt, 1 // reduce len by loopcnt + 1, the bytes .l0 copies
- movi0 ar.lc = loopcnt // .l0 runs loopcnt + 1 times
- } { .mib
- cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
- ;; }
- .l0: // ---------------------------- // L0: copy single bytes until dest is on an 8-byte boundary
- { .mmi
- st1 [dest] = tmp2, 1 // store the byte loaded on the previous pass
- (p_scr) ld1 tmp2 = [src], 1 // load the next byte only if another pass follows
- } { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt // keep the general-register copy of the count in step with ar.lc
- br.cloop.dptk.few .l0 // loop while ar.lc != 0
- ;; }
- .dest_aligned: // dest is 8-byte aligned here; pick the aligned or the shifted copy path from src
- { .mmi
- and tmp4 = 7, src // ready for alignment check
- shr.u elemcnt = len, 3 // elemcnt = len / 8
- ;; }
- { .mib
- cmp.ne p_scr, p0 = tmp4, r0 // p_scr set if src is NOT 8-byte aligned
- tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
- } { .mib // is not 16B aligned
- add ptr2 = LFETCH_DIST, dest // prefetch address
- add ptr1 = LFETCH_DIST, src // prefetch address on the source side
- (p_scr) br.cond.dptk.many .src_not_aligned
- ;; }
- // The optimal case, when dest, and src are aligned
- .both_aligned:
- { .mmi
- .pred.rel "mutex",p_xtr,p_nxtr
- (p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
- (p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
- movi0 pr.rot = 1 << 16 // set rotating predicates
- } { .mib
- (p_scr) br.cond.dpnt.many .copy_full_words // too few words for the unrolled loop
- ;; }
- { .mmi
- (p_xtr) load tempreg = [src], 8 // fetch the "extra" word so src advances past bit 3
- (p_xtr) add elemcnt = -1, elemcnt // account for the extra word
- movi0 ar.ec = MEMLAT + 1 // set the epilog counter
- ;; }
- { .mmi
- (p_xtr) add len = -8, len // account for the extra word in len too
- add asrc = 16, src // one bank apart (for USE_INT)
- shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
- ;;}
- { .mmi
- add loopcnt = -1, loopcnt // ar.lc takes iterations minus one
- (p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
- nop.i 0
- ;; }
- { .mib
- add adest = 16, dest // second store pointer, 16 bytes past dest
- movi0 ar.lc = loopcnt // set the loop counter
- ;; }
- #ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
- { nop 0 }
- #else
- .align 32
- #endif
- #if defined(USE_FLP) // FP-register flavour of the aligned loop: 32 bytes per iteration via two ldfp8 pairs
- .l1: // ------------------------------- // L1: Everything a multiple of 8
- { .mmi
- #if defined(USE_LFETCH)
- (p[0]) lfetch.nt1 [ptr2],32 // prefetch ahead of dest
- #endif
- (p[0]) ldfp8 the_r[0],the_q[0] = [src], 16 // load words 0 and 1 of this 32-byte group
- (p[0]) add len = -32, len // len is reduced in the load stage
- } {.mmb
- (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 // word 0 goes to dest
- (p[MEMLAT]) store [adest] = the_s[MEMLAT], 8 // word 2 goes to adest (dest + 16)
- ;; }
- { .mmi
- #if defined(USE_LFETCH)
- (p[0]) lfetch.nt1 [ptr1],32 // prefetch ahead of src
- #endif
- (p[0]) ldfp8 the_s[0], the_t[0] = [src], 16 // load words 2 and 3 of this 32-byte group
- } {.mmb
- (p[MEMLAT]) store [dest] = the_q[MEMLAT], 24 // word 1; +24 moves dest to the next group
- (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 // word 3; +24 moves adest to the next group
- br.ctop.dptk.many .l1
- ;; }
- #elif defined(USE_INT) // general-register flavour: src/asrc and dest/adest each run 16 bytes apart
- .l1: // ------------------------------- // L1: Everything a multiple of 8
- { .mmi
- (p[0]) load the_r[0] = [src], 8 // word 0
- (p[0]) load the_q[0] = [asrc], 8 // word 2 (asrc = src + 16)
- (p[0]) add len = -32, len // len is reduced in the load stage
- } {.mmb
- (p[MEMLAT]) store [dest] = the_r[MEMLAT], 8 // word 0
- (p[MEMLAT]) store [adest] = the_q[MEMLAT], 8 // word 2
- ;; }
- { .mmi
- (p[0]) load the_s[0] = [src], 24 // word 1; +24 moves src to the next group
- (p[0]) load the_t[0] = [asrc], 24 // word 3; +24 moves asrc to the next group
- } {.mmb
- (p[MEMLAT]) store [dest] = the_s[MEMLAT], 24 // word 1
- (p[MEMLAT]) store [adest] = the_t[MEMLAT], 24 // word 3
- #if defined(USE_LFETCH)
- ;; }
- { .mmb
- (p[0]) lfetch.nt1 [ptr2],32 // prefetch ahead of dest
- (p[0]) lfetch.nt1 [ptr1],32 // prefetch ahead of src
- #endif
- br.ctop.dptk.many .l1
- ;; }
- #endif
- .copy_full_words: // copy the remaining whole 8-byte words one at a time
- { .mib
- cmp.gt p_scr, p0 = 8, len // fewer than 8 bytes left?
- shr.u elemcnt = len, 3 // elemcnt = len / 8
- (p_scr) br.cond.dpnt.many .copy_bytes
- ;; }
- { .mii
- load tempreg = [src], 8 // load the first word ahead of the loop
- add loopcnt = -1, elemcnt // ar.lc takes iterations minus one
- ;; }
- { .mii
- cmp.ne p_scr, p0 = 0, loopcnt // only load again if more than one word remains
- mov ar.lc = loopcnt // .l2 runs loopcnt + 1 times
- ;; }
- .l2: // ------------------------------- // L2: Max 4 words copied separately
- { .mmi
- store [dest] = tempreg, 8 // store the word loaded on the previous pass
- (p_scr) load tempreg = [src], 8 // load the next word only if another pass follows
- add len = -8, len // one word less to copy
- } { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt // keep the general-register copy of the count in step with ar.lc
- br.cloop.dptk.few .l2
- ;; }
- .copy_bytes: // copy the remaining len bytes one at a time, then fall into the exit code
- { .mib
- cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
- add loopcnt = -1, len // len--;
- (p_scr) br.cond.spnt .restore_and_exit
- ;; }
- { .mii
- ld1 tmp2 = [src], 1 // load the first byte ahead of the loop
- movi0 ar.lc = loopcnt // .l3 runs loopcnt + 1 = len times
- cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
- ;; }
- .l3: // ------------------------------- // L3: Final byte move
- { .mmi
- st1 [dest] = tmp2, 1 // store the byte loaded on the previous pass
- (p_scr) ld1 tmp2 = [src], 1 // load the next byte only if another pass follows
- } { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt // keep the general-register copy of the count in step with ar.lc
- br.cloop.dptk.few .l3
- ;; }
- .restore_and_exit: // common exit: put back the state saved at entry and return (ret0 was set at entry)
- { .mmi
- movi0 pr = saved_pr, -1 // restore the predicate registers
- ;; }
- { .mib
- movi0 ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
- ;; }
- .src_not_aligned: // dest is 8-byte aligned but src is not: set up and jump to the LOOP(shift) matching src % 8
- { .mmi
- cmp.gt p_scr, p0 = 16, len // fewer than 16 bytes left?
- and sh1 = 7, src // sh1 = src % 8
- shr.u loopcnt = len, 4 // element-cnt = len / 16
- } { .mib
- add tmp4 = @ltoff(.table), gp // address of the linkage-table slot holding &.table
- add tmp3 = @ltoff(.loop56), gp // address of the linkage-table slot holding &.loop56
- (p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
- ;; }
- { .mmi
- and asrc = -8, src // asrc = src & -8 -- align src for loop
- add loopcnt = -1, loopcnt // loopcnt--
- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
- } { .mmi
- ld8 ptable = [tmp4] // ptable = &table
- ld8 ploop56 = [tmp3] // ploop56 = &loop56
- and tmp2 = -16, len // tmp2 = len & -16, the bytes the loop will copy
- ;; }
- { .mmi
- add tmp3 = ptable, sh1 // tmp3 = &table + sh1
- add src = src, tmp2 // src += len & (-16)
- movi0 ar.lc = loopcnt // set LC
- ;; }
- { .mmi
- ld8 tmp4 = [tmp3] // tmp4 = loop offset
- sub len = len, tmp2 // len -= len & (-16)
- movi0 ar.ec = MEMLAT + 2 // one more pass needed
- ;; }
- { .mmi
- ld8 s[1] = [asrc], 8 // preload
- sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
- movi0 pr.rot = 1 << 16 // set rotating predicates
- ;; }
- { .mib
- nop.m 0
- movi0 b6 = loopaddr // indirect branch target
- br b6 // jump to the appropriate loop
- ;; }
- LOOP(8) /* one loop body per misalignment: shift = 8 * (src % 8) bits, src % 8 = 1..7 */
- LOOP(16)
- LOOP(24)
- LOOP(32)
- LOOP(40)
- LOOP(48)
- LOOP(56)
- END(memcpy)
- libc_hidden_def (memcpy)
- .rodata
- .align 8
- .table: // indexed by 8 * (src % 8); each entry is the distance from .loopNN back from .loop56
- data8 0 // dummy entry
- data8 .loop56 - .loop8 // src % 8 == 1
- data8 .loop56 - .loop16 // src % 8 == 2
- data8 .loop56 - .loop24 // src % 8 == 3
- data8 .loop56 - .loop32 // src % 8 == 4
- data8 .loop56 - .loop40 // src % 8 == 5
- data8 .loop56 - .loop48 // src % 8 == 6
- data8 .loop56 - .loop56 // src % 8 == 7
|