/* memset.S */
/*
 * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
 */
  7. #include <sysdep.h>
  8. ENTRY(memset)
  9. #if defined(__ARC700__)
  10. #define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */
  11. mov_s r4,r0
  12. or r12,r0,r2
  13. bmsk.f r12,r12,1
  14. extb_s r1,r1
  15. asl r3,r1,8
  16. beq.d .Laligned
  17. or_s r1,r1,r3
  18. brls r2,SMALL,.Ltiny
  19. add r3,r2,r0
  20. stb r1,[r3,-1]
  21. bclr_s r3,r3,0
  22. stw r1,[r3,-2]
  23. bmsk.f r12,r0,1
  24. add_s r2,r2,r12
  25. sub.ne r2,r2,4
  26. stb.ab r1,[r4,1]
  27. and r4,r4,-2
  28. stw.ab r1,[r4,2]
  29. and r4,r4,-4
  30. .Laligned: ; This code address should be aligned for speed.
  31. asl r3,r1,16
  32. lsr.f lp_count,r2,2
  33. or_s r1,r1,r3
  34. lpne .Loop_end
  35. st.ab r1,[r4,4]
  36. .Loop_end:
  37. j_s [blink]
  38. .balign 4
  39. .Ltiny:
  40. mov.f lp_count,r2
  41. lpne .Ltiny_end
  42. stb.ab r1,[r4,1]
  43. .Ltiny_end:
  44. j_s [blink]
  45. #elif defined(__ARCHS__)
  46. #ifdef DONT_USE_PREALLOC
  47. #define PREWRITE(A,B) prefetchw [(A),(B)]
  48. #else
  49. #define PREWRITE(A,B) prealloc [(A),(B)]
  50. #endif
  51. prefetchw [r0] ; Prefetch the write location
  52. mov.f 0, r2
  53. ;;; if size is zero
  54. jz.d [blink]
  55. mov r3, r0 ; don't clobber ret val
  56. ;;; if length < 8
  57. brls.d.nt r2, 8, .Lsmallchunk
  58. mov.f lp_count,r2
  59. and.f r4, r0, 0x03
  60. rsub lp_count, r4, 4
  61. lpnz @.Laligndestination
  62. ;; LOOP BEGIN
  63. stb.ab r1, [r3,1]
  64. sub r2, r2, 1
  65. .Laligndestination:
  66. ;;; Destination is aligned
  67. and r1, r1, 0xFF
  68. asl r4, r1, 8
  69. or r4, r4, r1
  70. asl r5, r4, 16
  71. or r5, r5, r4
  72. mov r4, r5
  73. sub3 lp_count, r2, 8
  74. cmp r2, 64
  75. bmsk.hi r2, r2, 5
  76. mov.ls lp_count, 0
  77. add3.hi r2, r2, 8
  78. ;;; Convert len to Dwords, unfold x8
  79. lsr.f lp_count, lp_count, 6
  80. lpnz @.Lset64bytes
  81. ;; LOOP START
  82. PREWRITE(r3, 64) ;Prefetch the next write location
  83. #if defined(__LL64__) || defined(__ARC_LL64__)
  84. std.ab r4, [r3, 8]
  85. std.ab r4, [r3, 8]
  86. std.ab r4, [r3, 8]
  87. std.ab r4, [r3, 8]
  88. std.ab r4, [r3, 8]
  89. std.ab r4, [r3, 8]
  90. std.ab r4, [r3, 8]
  91. std.ab r4, [r3, 8]
  92. #else
  93. st.ab r4, [r3, 4]
  94. st.ab r4, [r3, 4]
  95. st.ab r4, [r3, 4]
  96. st.ab r4, [r3, 4]
  97. st.ab r4, [r3, 4]
  98. st.ab r4, [r3, 4]
  99. st.ab r4, [r3, 4]
  100. st.ab r4, [r3, 4]
  101. st.ab r4, [r3, 4]
  102. st.ab r4, [r3, 4]
  103. st.ab r4, [r3, 4]
  104. st.ab r4, [r3, 4]
  105. st.ab r4, [r3, 4]
  106. st.ab r4, [r3, 4]
  107. st.ab r4, [r3, 4]
  108. st.ab r4, [r3, 4]
  109. #endif
  110. .Lset64bytes:
  111. lsr.f lp_count, r2, 5 ;Last remaining max 124 bytes
  112. lpnz .Lset32bytes
  113. ;; LOOP START
  114. prefetchw [r3, 32] ;Prefetch the next write location
  115. #if defined(__LL64__) || defined(__ARC_LL64__)
  116. std.ab r4, [r3, 8]
  117. std.ab r4, [r3, 8]
  118. std.ab r4, [r3, 8]
  119. std.ab r4, [r3, 8]
  120. #else
  121. st.ab r4, [r3, 4]
  122. st.ab r4, [r3, 4]
  123. st.ab r4, [r3, 4]
  124. st.ab r4, [r3, 4]
  125. st.ab r4, [r3, 4]
  126. st.ab r4, [r3, 4]
  127. st.ab r4, [r3, 4]
  128. st.ab r4, [r3, 4]
  129. #endif
  130. .Lset32bytes:
  131. and.f lp_count, r2, 0x1F ;Last remaining 31 bytes
  132. .Lsmallchunk:
  133. lpnz .Lcopy3bytes
  134. ;; LOOP START
  135. stb.ab r1, [r3, 1]
  136. .Lcopy3bytes:
  137. j [blink]
  138. #elif defined(__ARC64_ARCH32__)
  139. ;; Based on Synopsys code from newlib's arc64/memset.S
  140. ;; Assemble the bytes to 32bit words
  141. bmsk_s r1, r1, 7 ; treat it like unsigned char
  142. lsl8 r3, r1
  143. or_s r1, r1, r3
  144. lsl16 r3, r1
  145. or r6, r1, r3
  146. mov r7,r6
  147. lsr.f r5, r2, 4 ; counter for 16-byte chunks
  148. beq.d @.L_write_15_bytes
  149. mov r4, r0 ; work on a copy of "r0"
  150. .L_write_16_bytes:
  151. #if defined(__ARC64_LL64__)
  152. std.ab r6, [r4, 8]
  153. std.ab r6, [r4, 8]
  154. dbnz r5, @.L_write_16_bytes
  155. #else
  156. st.ab r6, [r4, 4]
  157. st.ab r6, [r4, 4]
  158. st.ab r6, [r4, 4]
  159. dbnz.d r5, @.L_write_16_bytes
  160. st.ab r6, [r4, 4]
  161. #endif
  162. bmsk_s r2, r2, 3
  163. .L_write_15_bytes:
  164. bbit0.d r2, 1, @1f
  165. lsr r3, r2, 2
  166. sth.ab r6, [r4, 2]
  167. 1:
  168. bbit0.d r2, 0, @1f
  169. xor r3, r3, 3
  170. stb.ab r6, [r4, 1]
  171. 1:
  172. bi [r3]
  173. st.ab r6,[r4, 4]
  174. st.ab r6,[r4, 4]
  175. st.ab r6,[r4, 4]
  176. j_s [blink]
  177. #else
  178. #error "Unsupported ARC CPU type"
  179. #endif
  180. END(memset)
  181. libc_hidden_def(memset)