memset.S 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. /* Optimized memset for Xtensa.
  2. Copyright (C) 2001, 2007 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library; if not, write to the Free
  14. Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
  15. Boston, MA 02110-1301, USA. */
  16. #include "../../sysdeps/linux/xtensa/sysdep.h"
  17. #include <bits/xtensa-config.h>
  18. /* Do not use .literal_position in the ENTRY macro. */
  19. #undef LITERAL_POSITION
  20. #define LITERAL_POSITION
  21. /* void *memset (void *dst, int c, size_t length)
  22. The algorithm is as follows:
  23. Create a word with c in all byte positions.
  24. If the destination is aligned, set 16B chunks with a loop, and then
  25. finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
  26. If the destination is unaligned, align it by conditionally
  27. setting 1B and/or 2B and then go to aligned case.
  28. This code tries to use fall-through branches for the common
  29. case of an aligned destination (except for the branches to
  30. the alignment labels). */
  31. /* Byte-by-byte set. */
  32. .text
  33. .align 4
  34. .literal_position
  35. __memset_aux:
  36. /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
  37. (0 mod 4 alignment for LBEG). */
  38. .byte 0
  39. .Lbyteset:
  40. #if XCHAL_HAVE_LOOPS
  41. loopnez a4, 2f
  42. #else
  43. beqz a4, 2f
  44. add a6, a5, a4 /* a6 = ending address */
  45. #endif
  46. 1: s8i a3, a5, 0
  47. addi a5, a5, 1
  48. #if !XCHAL_HAVE_LOOPS
  49. blt a5, a6, 1b
  50. #endif
  51. 2: retw
  52. /* Destination is unaligned. */
  53. .align 4
  54. .Ldst1mod2: /* dst is only byte aligned */
  55. /* Do short sizes byte-by-byte. */
  56. bltui a4, 8, .Lbyteset
  57. /* Set 1 byte. */
  58. s8i a3, a5, 0
  59. addi a5, a5, 1
  60. addi a4, a4, -1
  61. /* Now retest if dst is aligned. */
  62. _bbci.l a5, 1, .Ldstaligned
  63. .Ldst2mod4: /* dst has 16-bit alignment */
  64. /* Do short sizes byte-by-byte. */
  65. bltui a4, 8, .Lbyteset
  66. /* Set 2 bytes. */
  67. s16i a3, a5, 0
  68. addi a5, a5, 2
  69. addi a4, a4, -2
  70. /* dst is now aligned; return to main algorithm */
  71. j .Ldstaligned
  72. ENTRY (memset)
  73. /* a2 = dst, a3 = c, a4 = length */
  74. /* Duplicate character into all bytes of word. */
  75. extui a3, a3, 0, 8
  76. slli a7, a3, 8
  77. or a3, a3, a7
  78. slli a7, a3, 16
  79. or a3, a3, a7
  80. mov a5, a2 /* copy dst so that a2 is return value */
  81. /* Check if dst is unaligned. */
  82. _bbsi.l a2, 0, .Ldst1mod2
  83. _bbsi.l a2, 1, .Ldst2mod4
  84. .Ldstaligned:
  85. /* Get number of loop iterations with 16B per iteration. */
  86. srli a7, a4, 4
  87. /* Destination is word-aligned. */
  88. #if XCHAL_HAVE_LOOPS
  89. loopnez a7, 2f
  90. #else
  91. beqz a7, 2f
  92. slli a6, a7, 4
  93. add a6, a6, a5 /* a6 = end of last 16B chunk */
  94. #endif
  95. /* Set 16 bytes per iteration. */
  96. 1: s32i a3, a5, 0
  97. s32i a3, a5, 4
  98. s32i a3, a5, 8
  99. s32i a3, a5, 12
  100. addi a5, a5, 16
  101. #if !XCHAL_HAVE_LOOPS
  102. blt a5, a6, 1b
  103. #endif
  104. /* Set any leftover pieces smaller than 16B. */
  105. 2: bbci.l a4, 3, 3f
  106. /* Set 8 bytes. */
  107. s32i a3, a5, 0
  108. s32i a3, a5, 4
  109. addi a5, a5, 8
  110. 3: bbci.l a4, 2, 4f
  111. /* Set 4 bytes. */
  112. s32i a3, a5, 0
  113. addi a5, a5, 4
  114. 4: bbci.l a4, 1, 5f
  115. /* Set 2 bytes. */
  116. s16i a3, a5, 0
  117. addi a5, a5, 2
  118. 5: bbci.l a4, 0, 6f
  119. /* Set 1 byte. */
  120. s8i a3, a5, 0
  121. 6: retw
  122. libc_hidden_def (memset)