urem.S 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /* This file is generated from divrem.m4; DO NOT EDIT! */
  2. /*
  3. * Division and remainder, from Appendix E of the Sparc Version 8
  4. * Architecture Manual, with fixes from Gordon Irlam.
  5. */
  6. /*
  7. * Input: dividend and divisor in %o0 and %o1 respectively.
  8. *
  9. * m4 parameters:
  10. * .urem name of function to generate
  11. * rem rem=div => %o0 / %o1; rem=rem => %o0 % %o1
  12. * false false=true => signed; false=false => unsigned
  13. *
  14. * Algorithm parameters:
  15. * N how many bits per iteration we try to get (4)
  16. * WORDSIZE total number of bits (32)
  17. *
  18. * Derived constants:
  19. * TOPBITS number of bits in the top decade of a number
  20. *
  21. * Important variables:
  22. * Q the partial quotient under development (initially 0)
  23. * R the remainder so far, initially the dividend
  24. * ITER number of main division loop iterations required;
  25. * equal to ceil(log2(quotient) / N). Note that this
  26. * is the log base (2^N) of the quotient.
  27. * V the current comparand, initially divisor*2^(ITER*N-1)
  28. *
  29. * Cost:
  30. * Current estimate for non-large dividend is
  31. * ceil(log2(quotient) / N) * (10 + 7N/2) + C
  32. * A large dividend is one greater than 2^(31-TOPBITS) and takes a
  33. * different path, as the upper bits of the quotient must be developed
  34. * one bit at a time.
  35. */
  36. #include <sys/syscall.h>
  37. .global .urem;
  38. .align 4;
  39. .type .urem ,@function;
  40. .urem:
  41. ! Ready to divide. Compute size of quotient; scale comparand.
  42. orcc %o1, %g0, %o5
  43. bne 1f
  44. mov %o0, %o3
  45. ! Divide by zero trap. If it returns, return 0 (about as
  46. ! wrong as possible, but that is what SunOS does...).
  47. ta 0x02
  48. retl
  49. clr %o0
  50. 1:
  51. cmp %o3, %o5 ! if %o1 exceeds %o0, done
  52. blu .Lgot_result ! (and algorithm fails otherwise)
  53. clr %o2
  54. sethi %hi(1 << (32 - 4 - 1)), %g1
  55. cmp %o3, %g1
  56. blu .Lnot_really_big
  57. clr %o4
  58. ! Here the dividend is >= 2**(31-N) or so. We must be careful here,
  59. ! as our usual N-at-a-shot divide step will cause overflow and havoc.
  60. ! The number of bits in the result here is N*ITER+SC, where SC <= N.
  61. ! Compute ITER in an unorthodox manner: know we need to shift V into
  62. ! the top decade: so do not even bother to compare to R.
  63. 1:
  64. cmp %o5, %g1
  65. bgeu 3f
  66. mov 1, %g2
  67. sll %o5, 4, %o5
  68. b 1b
  69. add %o4, 1, %o4
  70. ! Now compute %g2.
  71. 2: addcc %o5, %o5, %o5
  72. bcc .Lnot_too_big
  73. add %g2, 1, %g2
  74. ! We get here if the %o1 overflowed while shifting.
  75. ! This means that %o3 has the high-order bit set.
  76. ! Restore %o5 and subtract from %o3.
  77. sll %g1, 4, %g1 ! high order bit
  78. srl %o5, 1, %o5 ! rest of %o5
  79. add %o5, %g1, %o5
  80. b .Ldo_single_div
  81. sub %g2, 1, %g2
  82. .Lnot_too_big:
  83. 3: cmp %o5, %o3
  84. blu 2b
  85. nop
  86. be .Ldo_single_div
  87. nop
  88. /* NB: these are commented out in the V8-Sparc manual as well */
  89. /* (I do not understand this) */
  90. ! %o5 > %o3: went too far: back up 1 step
  91. ! srl %o5, 1, %o5
  92. ! dec %g2
  93. ! do single-bit divide steps
  94. !
  95. ! We have to be careful here. We know that %o3 >= %o5, so we can do the
  96. ! first divide step without thinking. BUT, the others are conditional,
  97. ! and are only done if %o3 >= 0. Because both %o3 and %o5 may have the high-
  98. ! order bit set in the first step, just falling into the regular
  99. ! division loop will mess up the first time around.
  100. ! So we unroll slightly...
  101. .Ldo_single_div:
  102. subcc %g2, 1, %g2
  103. bl .Lend_regular_divide
  104. nop
  105. sub %o3, %o5, %o3
  106. mov 1, %o2
  107. b .Lend_single_divloop
  108. nop
  109. .Lsingle_divloop:
  110. sll %o2, 1, %o2
  111. bl 1f
  112. srl %o5, 1, %o5
  113. ! %o3 >= 0
  114. sub %o3, %o5, %o3
  115. b 2f
  116. add %o2, 1, %o2
  117. 1: ! %o3 < 0
  118. add %o3, %o5, %o3
  119. sub %o2, 1, %o2
  120. 2:
  121. .Lend_single_divloop:
  122. subcc %g2, 1, %g2
  123. bge .Lsingle_divloop
  124. tst %o3
  125. b,a .Lend_regular_divide
  126. .Lnot_really_big:
  127. 1:
  128. sll %o5, 4, %o5
  129. cmp %o5, %o3
  130. bleu 1b
  131. addcc %o4, 1, %o4
  132. be .Lgot_result
  133. sub %o4, 1, %o4
  134. tst %o3 ! set up for initial iteration
  135. .Ldivloop:
  136. sll %o2, 4, %o2
  137. ! depth 1, accumulated bits 0
  138. bl .L1.16
  139. srl %o5,1,%o5
  140. ! remainder is positive
  141. subcc %o3,%o5,%o3
  142. ! depth 2, accumulated bits 1
  143. bl .L2.17
  144. srl %o5,1,%o5
  145. ! remainder is positive
  146. subcc %o3,%o5,%o3
  147. ! depth 3, accumulated bits 3
  148. bl .L3.19
  149. srl %o5,1,%o5
  150. ! remainder is positive
  151. subcc %o3,%o5,%o3
  152. ! depth 4, accumulated bits 7
  153. bl .L4.23
  154. srl %o5,1,%o5
  155. ! remainder is positive
  156. subcc %o3,%o5,%o3
  157. b 9f
  158. add %o2, (7*2+1), %o2
  159. .L4.23:
  160. ! remainder is negative
  161. addcc %o3,%o5,%o3
  162. b 9f
  163. add %o2, (7*2-1), %o2
  164. .L3.19:
  165. ! remainder is negative
  166. addcc %o3,%o5,%o3
  167. ! depth 4, accumulated bits 5
  168. bl .L4.21
  169. srl %o5,1,%o5
  170. ! remainder is positive
  171. subcc %o3,%o5,%o3
  172. b 9f
  173. add %o2, (5*2+1), %o2
  174. .L4.21:
  175. ! remainder is negative
  176. addcc %o3,%o5,%o3
  177. b 9f
  178. add %o2, (5*2-1), %o2
  179. .L2.17:
  180. ! remainder is negative
  181. addcc %o3,%o5,%o3
  182. ! depth 3, accumulated bits 1
  183. bl .L3.17
  184. srl %o5,1,%o5
  185. ! remainder is positive
  186. subcc %o3,%o5,%o3
  187. ! depth 4, accumulated bits 3
  188. bl .L4.19
  189. srl %o5,1,%o5
  190. ! remainder is positive
  191. subcc %o3,%o5,%o3
  192. b 9f
  193. add %o2, (3*2+1), %o2
  194. .L4.19:
  195. ! remainder is negative
  196. addcc %o3,%o5,%o3
  197. b 9f
  198. add %o2, (3*2-1), %o2
  199. .L3.17:
  200. ! remainder is negative
  201. addcc %o3,%o5,%o3
  202. ! depth 4, accumulated bits 1
  203. bl .L4.17
  204. srl %o5,1,%o5
  205. ! remainder is positive
  206. subcc %o3,%o5,%o3
  207. b 9f
  208. add %o2, (1*2+1), %o2
  209. .L4.17:
  210. ! remainder is negative
  211. addcc %o3,%o5,%o3
  212. b 9f
  213. add %o2, (1*2-1), %o2
  214. .L1.16:
  215. ! remainder is negative
  216. addcc %o3,%o5,%o3
  217. ! depth 2, accumulated bits -1
  218. bl .L2.15
  219. srl %o5,1,%o5
  220. ! remainder is positive
  221. subcc %o3,%o5,%o3
  222. ! depth 3, accumulated bits -1
  223. bl .L3.15
  224. srl %o5,1,%o5
  225. ! remainder is positive
  226. subcc %o3,%o5,%o3
  227. ! depth 4, accumulated bits -1
  228. bl .L4.15
  229. srl %o5,1,%o5
  230. ! remainder is positive
  231. subcc %o3,%o5,%o3
  232. b 9f
  233. add %o2, (-1*2+1), %o2
  234. .L4.15:
  235. ! remainder is negative
  236. addcc %o3,%o5,%o3
  237. b 9f
  238. add %o2, (-1*2-1), %o2
  239. .L3.15:
  240. ! remainder is negative
  241. addcc %o3,%o5,%o3
  242. ! depth 4, accumulated bits -3
  243. bl .L4.13
  244. srl %o5,1,%o5
  245. ! remainder is positive
  246. subcc %o3,%o5,%o3
  247. b 9f
  248. add %o2, (-3*2+1), %o2
  249. .L4.13:
  250. ! remainder is negative
  251. addcc %o3,%o5,%o3
  252. b 9f
  253. add %o2, (-3*2-1), %o2
  254. .L2.15:
  255. ! remainder is negative
  256. addcc %o3,%o5,%o3
  257. ! depth 3, accumulated bits -3
  258. bl .L3.13
  259. srl %o5,1,%o5
  260. ! remainder is positive
  261. subcc %o3,%o5,%o3
  262. ! depth 4, accumulated bits -5
  263. bl .L4.11
  264. srl %o5,1,%o5
  265. ! remainder is positive
  266. subcc %o3,%o5,%o3
  267. b 9f
  268. add %o2, (-5*2+1), %o2
  269. .L4.11:
  270. ! remainder is negative
  271. addcc %o3,%o5,%o3
  272. b 9f
  273. add %o2, (-5*2-1), %o2
  274. .L3.13:
  275. ! remainder is negative
  276. addcc %o3,%o5,%o3
  277. ! depth 4, accumulated bits -7
  278. bl .L4.9
  279. srl %o5,1,%o5
  280. ! remainder is positive
  281. subcc %o3,%o5,%o3
  282. b 9f
  283. add %o2, (-7*2+1), %o2
  284. .L4.9:
  285. ! remainder is negative
  286. addcc %o3,%o5,%o3
  287. b 9f
  288. add %o2, (-7*2-1), %o2
  289. 9:
  290. .Lend_regular_divide:
  291. subcc %o4, 1, %o4
  292. bge .Ldivloop
  293. tst %o3
  294. bl,a .Lgot_result
  295. ! non-restoring fixup here (one instruction only!)
  296. add %o3, %o1, %o3
  297. .Lgot_result:
  298. retl
  299. mov %o3, %o0
  300. .size .urem , . -.urem