memset.S

/*
 * Copyright (C) 2019 Kalray Inc.
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB
 * in this tarball.
 */

#define REPLICATE_BYTE_MASK	0x0101010101010101
#define MIN_SIZE_FOR_ALIGN	128

/*
 * Optimized memset for the kvx architecture
 *
 * In order to optimize memset on kvx, we can use several things:
 * - conditional stores, which avoid branch penalties
 * - store half/word/double/quad/octuple to store up to 16 bytes at a time
 * - hardware loops for the steady-state case.
 *
 * First, we check whether the size is below a minimum size. If so, we skip
 * the alignment part. Indeed, the kvx supports misalignment, and the penalty
 * for letting it do unaligned accesses is lower than the cost of realigning,
 * so for small sizes we don't even bother to realign.
 * In order to create the 64-bit pattern, we use sbmm to replicate the pattern
 * byte across a whole register in one instruction.
 * Once alignment has been reached, we run the hardware loop using store
 * octuple in order to maximize throughput. Care must be taken to align
 * hardware loops on at least 8 bytes for performance.
 * Once the main loop is done, we finish the copy by checking the remaining
 * length and issuing the stores needed for the trailing bytes.
 */
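
/*
 * For illustration only (not part of the build): a rough C-level sketch of
 * the strategy described above. The function name is made up and the
 * byte-wise head/tail loops are simplifications of what the assembly below
 * does with conditional 1/2/4/8/16-byte stores.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	void *memset_sketch(void *dst, int c, size_t len)
 *	{
 *		unsigned char *p = dst;
 *		// Same trick as sbmm8 with REPLICATE_BYTE_MASK: replicate
 *		// the pattern byte across a 64-bit value.
 *		uint64_t pat = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
 *
 *		if (len >= MIN_SIZE_FOR_ALIGN) {
 *			// Peel off up to 31 bytes to reach a 32-byte boundary
 *			// (always less than len here, since len >= 128).
 *			size_t head = -(uintptr_t)p & 31;
 *			for (size_t i = 0; i < head; i++)
 *				*p++ = (unsigned char)c;
 *			len -= head;
 *		}
 *		while (len >= 32) {	// the 'so' hardware loop
 *			memcpy(p, &pat, 8); memcpy(p + 8, &pat, 8);
 *			memcpy(p + 16, &pat, 8); memcpy(p + 24, &pat, 8);
 *			p += 32; len -= 32;
 *		}
 *		for (size_t i = 0; i < len; i++)	// tail: sq/sd/sw/sh/sb
 *			*p++ = (unsigned char)c;
 *		return dst;
 *	}
 */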

#include <sysdep.h>

.align 16
ENTRY(memset)
	/* Preserve return value */
	copyd $r3 = $r0
	/* Replicate the first pattern byte on all bytes */
	sbmm8 $r32 = $r1, REPLICATE_BYTE_MASK
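	/* e.g. a pattern byte of 0x42 yields $r32 = 0x4242424242424242 */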
	/* Check if length < MIN_SIZE_FOR_ALIGN */
	compd.geu $r7 = $r2, MIN_SIZE_FOR_ALIGN
	/* Negate the address to compute how many bytes must be stored to reach
	 * 32-byte alignment */
	negd $r5 = $r0
	;;
	/* Check if we are aligned on 32 bytes */
	andw $r9 = $r0, 0x1F
	/* Compute the length that will be copied to align on a 32-byte boundary */
	andw $r6 = $r5, 0x1F
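	/*
	 * Worked example: if $r0 ends in 0x13 (19 bytes past a 32-byte
	 * boundary), then $r6 = (-$r0) & 0x1F = 0x0D, i.e. 13 bytes must be
	 * stored before the pointer is 32-byte aligned.
	 */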
	/*
	 * If size < MIN_SIZE_FOR_ALIGN bytes, go directly to the 'so' loop: it
	 * will run unaligned, but that is still better than what we could do
	 * with sb
	 */
	cb.deqz $r7? .Laligned_32
	;;
	/* Remove the unaligned part from the length */
	sbfd $r2 = $r6, $r2
	/* If we are already aligned on 32 bytes, jump to the main "so" loop */
	cb.deqz $r9? .Laligned_32
	/* Check if we need to copy 1 byte */
	andw $r4 = $r5, (1 << 0)
	;;
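	/*
	 * Alignment ladder: bit k of $r5 (the negated start address) tells
	 * whether a (1 << k)-byte store is needed to reach the next 32-byte
	 * boundary. $r4 is either 0 or (1 << k), so it serves both as the
	 * predicate for the conditional store and as the increment added to
	 * $r0 afterwards.
	 */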
	/* If we are not aligned, store a byte */
	sb.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 2 bytes */
	andw $r4 = $r5, (1 << 1)
	/* Advance the pointer by the amount just stored (possibly 0) */
	addd $r0 = $r0, $r4
	;;
	sh.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 4 bytes */
	andw $r4 = $r5, (1 << 2)
	addd $r0 = $r0, $r4
	;;
	sw.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 8 bytes */
	andw $r4 = $r5, (1 << 3)
	addd $r0 = $r0, $r4
	/* Copy the second part of the pattern for sq */
	copyd $r33 = $r32
	;;
	sd.dnez $r4? [$r0] = $r32
	/* Check if we need to copy 16 bytes */
	andw $r4 = $r5, (1 << 4)
	addd $r0 = $r0, $r4
	;;
	sq.dnez $r4? [$r0] = $r32r33
	addd $r0 = $r0, $r4
	;;
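	/*
	 * Fall-through path: every bit of the original misalignment has been
	 * consumed by the matching conditional store above, so $r0 is now
	 * 32-byte aligned.
	 */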
.Laligned_32:
	/* Copy the second part of the pattern for sq */
	copyd $r33 = $r32
	/* Number of 32-byte stores to perform */
	srld $r10 = $r2, 5
	nop
	nop
	;;
	copyq $r34r35 = $r32, $r33
	/* Remaining bytes for the 16-byte store */
	andw $r8 = $r2, (1 << 4)
	make $r11 = 32
	/* Check if there is enough data for a 32-byte store */
	cb.deqz $r10? .Laligned_32_done
	;;
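	/*
	 * Hardware loop: each iteration stores one 32-byte octuple and
	 * advances the pointer by $r11 = 32, for $r10 = length >> 5
	 * iterations in total.
	 */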
	loopdo $r10, .Laligned_32_done
	;;
	so 0[$r0] = $r32r33r34r35
	addd $r0 = $r0, $r11
	;;
.Laligned_32_done:
	/*
	 * Now that every aligned 32-byte block has been handled using 'so',
	 * handle the remainder of the length with progressively smaller
	 * stores. We also exploit the fact that we are aligned to simply
	 * check the remaining size.
	 */
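	/*
	 * Worked example: with 23 bytes left (0b10111), the tail issues one
	 * sq (16), no sd, one sw (4), one sh (2) and one sb (1).
	 */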
	sq.dnez $r8? [$r0] = $r32r33
	addd $r0 = $r0, $r8
	/* Remaining bytes for the 8-byte store */
	andw $r8 = $r2, (1 << 3)
	cb.deqz $r2? .Lmemset_done
	;;
	sd.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for the 4-byte store */
	andw $r8 = $r2, (1 << 2)
	;;
	sw.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	/* Remaining bytes for the 2-byte store */
	andw $r8 = $r2, (1 << 1)
	;;
	sh.dnez $r8? [$r0] = $r32
	addd $r0 = $r0, $r8
	;;
	/* Store the last byte if the length is odd */
	sb.odd $r2? [$r0] = $r32
	/* Restore the return value */
	copyd $r0 = $r3
	ret
	;;
.Lmemset_done:
	/* Restore the return value */
	copyd $r0 = $r3
	ret
	;;
END(memset)
libc_hidden_def(memset)