Browse Source

import different optimized versions of div funcs based upon target sparc arch

Mike Frysinger 19 years ago
parent
commit
46b06e9c1c

+ 9 - 367
libc/sysdeps/linux/sparc/rem.S

@@ -1,367 +1,9 @@
-   /* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .rem	name of function to generate
- *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- *  true		true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-
-#include <sys/syscall.h>
-
-
-.global   .rem;
-.align 4;
-.type  .rem ,@function; 
-
-.rem: 
-	! compute sign of result; if neither is negative, no problem
-	orcc	%o1, %o0, %g0	! either negative?
-	bge	2f			! no, go do the divide
-	mov	%o0, %g3		! sign of remainder matches %o0
-	tst	%o1
-	bge	1f
-	tst	%o0
-	! %o1 is definitely negative; %o0 might also be negative
-	bge	2f			! if %o0 not negative...
-	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
-1:	! %o0 is negative, %o1 is nonnegative
-	sub	%g0, %o0, %o0	! make %o0 nonnegative
-2:
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	0x02
-		retl
-		clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	.Lgot_result		! (and algorithm fails otherwise)
-	clr	%o2
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-	cmp	%o3, %g1
-	blu	.Lnot_really_big
-	clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		mov	1, %g2
-		sll	%o5, 4, %o5
-		b	1b
-		add	%o4, 1, %o4
-
-	! Now compute %g2.
-	2:	addcc	%o5, %o5, %o5
-		bcc	.Lnot_too_big
-		add	%g2, 1, %g2
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-		b	.Ldo_single_div
-		sub	%g2, 1, %g2
-
-	.Lnot_too_big:
-	3:	cmp	%o5, %o3
-		blu	2b
-		nop
-		be	.Ldo_single_div
-		nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g2
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	.Ldo_single_div:
-		subcc	%g2, 1, %g2
-		bl	.Lend_regular_divide
-		nop
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-		b	.Lend_single_divloop
-		nop
-	.Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	.Lend_single_divloop:
-		subcc	%g2, 1, %g2
-		bge	.Lsingle_divloop
-		tst	%o3
-		b,a	.Lend_regular_divide
-
-.Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	addcc	%o4, 1, %o4
-	be	.Lgot_result
-	sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-.Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	.L1.16
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	.L2.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	.L3.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	.L4.23
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2+1), %o2
-	
-.L4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2-1), %o2
-	
-	
-.L3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	.L4.21
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2+1), %o2
-	
-.L4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2-1), %o2
-	
-	
-	
-.L2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	.L3.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	.L4.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2+1), %o2
-	
-.L4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2-1), %o2
-	
-	
-.L3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	.L4.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2+1), %o2
-	
-.L4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2-1), %o2
-	
-	
-	
-	
-.L1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	.L2.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	.L3.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	.L4.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2+1), %o2
-	
-.L4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2-1), %o2
-	
-	
-.L3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	.L4.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2+1), %o2
-	
-.L4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2-1), %o2
-	
-	
-	
-.L2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	.L3.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	.L4.11
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2+1), %o2
-	
-.L4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2-1), %o2
-	
-	
-.L3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	.L4.9
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2+1), %o2
-	
-.L4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2-1), %o2
-	
-	
-	
-	
-	9:
-.Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	.Ldivloop
-	tst	%o3
-	bl,a	.Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	add	%o3, %o1, %o3
-
-
-.Lgot_result:
-	! check to see if answer should be < 0
-	tst	%g3
-	bl,a	1f
-	sub %g0, %o3, %o3
-1:
-	retl
-	mov %o3, %o0
-
-.size .rem,.-.rem;
+#include "_math_inc.h"
+
+#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__)
+# include "sparcv9/rem.S"
+#elif defined(__CONFIG_SPARC_V8__)
+# include "sparcv8/rem.S"
+#else
+# include "sparcv7/rem.S"
+#endif

+ 9 - 366
libc/sysdeps/linux/sparc/sdiv.S

@@ -1,366 +1,9 @@
-   /* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .div	name of function to generate
- *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
- *  true		true=true => signed; true=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-
-#include <sys/syscall.h>
-
-.global   .div;
-.align 4;
-.type  .div ,@function; 
-
-.div: 
-	! compute sign of result; if neither is negative, no problem
-	orcc	%o1, %o0, %g0	! either negative?
-	bge	2f			! no, go do the divide
-	xor	%o1, %o0, %g3	! compute sign in any case
-	tst	%o1
-	bge	1f
-	tst	%o0
-	! %o1 is definitely negative; %o0 might also be negative
-	bge	2f			! if %o0 not negative...
-	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
-1:	! %o0 is negative, %o1 is nonnegative
-	sub	%g0, %o0, %o0	! make %o0 nonnegative
-2:
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	0x02
-		retl
-		clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	.Lgot_result		! (and algorithm fails otherwise)
-	clr	%o2
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-	cmp	%o3, %g1
-	blu	.Lnot_really_big
-	clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		mov	1, %g2
-		sll	%o5, 4, %o5
-		b	1b
-		add	%o4, 1, %o4
-
-	! Now compute %g2.
-	2:	addcc	%o5, %o5, %o5
-		bcc	.Lnot_too_big
-		add	%g2, 1, %g2
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-		b	.Ldo_single_div
-		sub	%g2, 1, %g2
-
-	.Lnot_too_big:
-	3:	cmp	%o5, %o3
-		blu	2b
-		nop
-		be	.Ldo_single_div
-		nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g2
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	.Ldo_single_div:
-		subcc	%g2, 1, %g2
-		bl	.Lend_regular_divide
-		nop
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-		b	.Lend_single_divloop
-		nop
-	.Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	.Lend_single_divloop:
-		subcc	%g2, 1, %g2
-		bge	.Lsingle_divloop
-		tst	%o3
-		b,a	.Lend_regular_divide
-
-.Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	addcc	%o4, 1, %o4
-	be	.Lgot_result
-	sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-.Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	.L1.16
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	.L2.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	.L3.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	.L4.23
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2+1), %o2
-	
-.L4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2-1), %o2
-	
-	
-.L3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	.L4.21
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2+1), %o2
-	
-.L4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2-1), %o2
-	
-	
-	
-.L2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	.L3.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	.L4.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2+1), %o2
-	
-.L4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2-1), %o2
-	
-	
-.L3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	.L4.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2+1), %o2
-	
-.L4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2-1), %o2
-	
-	
-	
-	
-.L1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	.L2.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	.L3.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	.L4.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2+1), %o2
-	
-.L4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2-1), %o2
-	
-	
-.L3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	.L4.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2+1), %o2
-	
-.L4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2-1), %o2
-	
-	
-	
-.L2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	.L3.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	.L4.11
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2+1), %o2
-	
-.L4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2-1), %o2
-	
-	
-.L3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	.L4.9
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2+1), %o2
-	
-.L4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2-1), %o2
-	
-	
-	
-	
-	9:
-.Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	.Ldivloop
-	tst	%o3
-	bl,a	.Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	sub	%o2, 1, %o2
-
-
-.Lgot_result:
-	! check to see if answer should be < 0
-	tst	%g3
-	bl,a	1f
-	sub %g0, %o2, %o2
-1:
-	retl
-	mov %o2, %o0
-
-.size .div,.-.div;
+#include "_math_inc.h"
+
+#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__)
+# include "sparcv9/sdiv.S"
+#elif defined(__CONFIG_SPARC_V8__)
+# include "sparcv8/sdiv.S"
+#else
+# include "sparcv7/sdiv.S"
+#endif

+ 360 - 0
libc/sysdeps/linux/sparc/sparcv7/rem.S

@@ -0,0 +1,360 @@
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .rem	name of function to generate
+ *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
+ *  true		true=true => signed; true=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+ENTRY(.rem)
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	2f			! no, go do the divide
+	mov	%o0, %g3		! sign of remainder matches %o0
+	tst	%o1
+	bge	1f
+	tst	%o0
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	2f			! if %o0 not negative...
+	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+2:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3
+
+
+LOC(got_result):
+	! check to see if answer should be < 0
+	tst	%g3
+	bl,a	1f
+	sub %g0, %o3, %o3
+1:
+	retl
+	mov %o3, %o0
+
+END(.rem)

+ 360 - 0
libc/sysdeps/linux/sparc/sparcv7/sdiv.S

@@ -0,0 +1,360 @@
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .div	name of function to generate
+ *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
+ *  true		true=true => signed; true=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+ENTRY(.div)
+	! compute sign of result; if neither is negative, no problem
+	orcc	%o1, %o0, %g0	! either negative?
+	bge	2f			! no, go do the divide
+	xor	%o1, %o0, %g3	! compute sign in any case
+	tst	%o1
+	bge	1f
+	tst	%o0
+	! %o1 is definitely negative; %o0 might also be negative
+	bge	2f			! if %o0 not negative...
+	sub	%g0, %o1, %o1	! in any case, make %o1 nonneg
+1:	! %o0 is negative, %o1 is nonnegative
+	sub	%g0, %o0, %o0	! make %o0 nonnegative
+2:
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2
+
+
+LOC(got_result):
+	! check to see if answer should be < 0
+	tst	%g3
+	bl,a	1f
+	sub %g0, %o2, %o2
+1:
+	retl
+	mov %o2, %o0
+
+END(.div)

+ 343 - 0
libc/sysdeps/linux/sparc/sparcv7/udiv.S

@@ -0,0 +1,343 @@
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .udiv	name of function to generate
+ *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
+ *  false		false=true => signed; false=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+ENTRY(.udiv)
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	sub	%o2, 1, %o2
+
+
+LOC(got_result):
+
+	retl
+	mov %o2, %o0
+
+END(.udiv)

+ 153 - 0
libc/sysdeps/linux/sparc/sparcv7/umul.S

@@ -0,0 +1,153 @@
+/*
+ * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
+ * upper 32 bits of the 64-bit product).
+ *
+ * This code optimizes short (less than 13-bit) multiplies.  Short
+ * multiplies require 25 instruction cycles, and long ones require
+ * 45 instruction cycles.
+ *
+ * On return, overflow has occurred (%o1 is not zero) if and only if
+ * the Z condition code is clear, allowing, e.g., the following:
+ *
+ *	call	.umul
+ *	nop
+ *	bnz	overflow	(or tnz)
+ */
+
+ENTRY(.umul)
+	or	%o0, %o1, %o4
+	mov	%o0, %y			! multiplier -> Y
+	andncc	%o4, 0xfff, %g0		! test bits 12..31 of *both* args
+	be	LOC(mul_shortway)	! if zero, can do it the short way
+	 andcc	%g0, %g0, %o4		! zero the partial product; clear N & V
+
+	/*
+	 * Long multiply.  32 steps, followed by a final shift step.
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %o1, %o4	! 13
+	mulscc	%o4, %o1, %o4	! 14
+	mulscc	%o4, %o1, %o4	! 15
+	mulscc	%o4, %o1, %o4	! 16
+	mulscc	%o4, %o1, %o4	! 17
+	mulscc	%o4, %o1, %o4	! 18
+	mulscc	%o4, %o1, %o4	! 19
+	mulscc	%o4, %o1, %o4	! 20
+	mulscc	%o4, %o1, %o4	! 21
+	mulscc	%o4, %o1, %o4	! 22
+	mulscc	%o4, %o1, %o4	! 23
+	mulscc	%o4, %o1, %o4	! 24
+	mulscc	%o4, %o1, %o4	! 25
+	mulscc	%o4, %o1, %o4	! 26
+	mulscc	%o4, %o1, %o4	! 27
+	mulscc	%o4, %o1, %o4	! 28
+	mulscc	%o4, %o1, %o4	! 29
+	mulscc	%o4, %o1, %o4	! 30
+	mulscc	%o4, %o1, %o4	! 31
+	mulscc	%o4, %o1, %o4	! 32
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * Normally, with the shift-and-add approach, if both numbers are
+	 * positive you get the correct result.  With 32-bit two's-complement
+	 * numbers, -x is represented as
+	 *
+	 *		  x		    32
+	 *	( 2  -  ------ ) mod 2  *  2
+	 *		   32
+	 *		  2
+	 *
+	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
+	 * we can treat this as if the radix point were just to the left
+	 * of the sign bit (multiply by 2^32), and get
+	 *
+	 *	-x  =  (2 - x) mod 2
+	 *
+	 * Then, ignoring the `mod 2's for convenience:
+	 *
+	 *   x *  y	= xy
+	 *  -x *  y	= 2y - xy
+	 *   x * -y	= 2x - xy
+	 *  -x * -y	= 4 - 2x - 2y + xy
+	 *
+	 * For signed multiplies, we subtract (x << 32) from the partial
+	 * product to fix this problem for negative multipliers (see mul.s).
+	 * Because of the way the shift into the partial product is calculated
+	 * (N xor V), this term is automatically removed for the multiplicand,
+	 * so we don't have to adjust.
+	 *
+	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
+	 * and the correction is wrong.  So for unsigned multiplies where the
+	 * high order bit is one, we end up with xy - (y << 32).  To fix it
+	 * we add y << 32.
+	 */
+#if 0
+	tst	%o1
+	bl,a	1f		! if %o1 < 0 (high order bit = 1),
+	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
+1:	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
+#else
+	/* Faster code from tege@sics.se.  */
+	sra	%o1, 31, %o2	! make mask from sign bit
+	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
+	rd	%y, %o0		! get lower half of product
+	retl
+	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place
+#endif
+
+LOC(mul_shortway):
+	/*
+	 * Short multiply.  12 steps, followed by a final shift step.
+	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
+	 * but there is no problem with %o0 being negative (unlike above),
+	 * and overflow is impossible (the answer is at most 24 bits long).
+	 */
+	mulscc	%o4, %o1, %o4	! 1
+	mulscc	%o4, %o1, %o4	! 2
+	mulscc	%o4, %o1, %o4	! 3
+	mulscc	%o4, %o1, %o4	! 4
+	mulscc	%o4, %o1, %o4	! 5
+	mulscc	%o4, %o1, %o4	! 6
+	mulscc	%o4, %o1, %o4	! 7
+	mulscc	%o4, %o1, %o4	! 8
+	mulscc	%o4, %o1, %o4	! 9
+	mulscc	%o4, %o1, %o4	! 10
+	mulscc	%o4, %o1, %o4	! 11
+	mulscc	%o4, %o1, %o4	! 12
+	mulscc	%o4, %g0, %o4	! final shift
+
+	/*
+	 * %o4 has 20 of the bits that should be in the result; %y has
+	 * the bottom 12 (as %y's top 12).  That is:
+	 *
+	 *	  %o4		    %y
+	 * +----------------+----------------+
+	 * | -12- |   -20-  | -12- |   -20-  |
+	 * +------(---------+------)---------+
+	 *	   -----result-----
+	 *
+	 * The 12 bits of %o4 left of the `result' area are all zero;
+	 * in fact, all top 20 bits of %o4 are zero.
+	 */
+
+	rd	%y, %o5
+	sll	%o4, 12, %o0	! shift middle bits left 12
+	srl	%o5, 20, %o5	! shift low bits right 20
+	or	%o5, %o0, %o0
+	retl
+	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
+
+END(.umul)

+ 343 - 0
libc/sysdeps/linux/sparc/sparcv7/urem.S

@@ -0,0 +1,343 @@
+   /* This file is generated from divrem.m4; DO NOT EDIT! */
+/*
+ * Division and remainder, from Appendix E of the Sparc Version 8
+ * Architecture Manual, with fixes from Gordon Irlam.
+ */
+
+/*
+ * Input: dividend and divisor in %o0 and %o1 respectively.
+ *
+ * m4 parameters:
+ *  .urem	name of function to generate
+ *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
+ *  false		false=true => signed; false=false => unsigned
+ *
+ * Algorithm parameters:
+ *  N		how many bits per iteration we try to get (4)
+ *  WORDSIZE	total number of bits (32)
+ *
+ * Derived constants:
+ *  TOPBITS	number of bits in the top decade of a number
+ *
+ * Important variables:
+ *  Q		the partial quotient under development (initially 0)
+ *  R		the remainder so far, initially the dividend
+ *  ITER	number of main division loop iterations required;
+ *		equal to ceil(log2(quotient) / N).  Note that this
+ *		is the log base (2^N) of the quotient.
+ *  V		the current comparand, initially divisor*2^(ITER*N-1)
+ *
+ * Cost:
+ *  Current estimate for non-large dividend is
+ *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
+ *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
+ *  different path, as the upper bits of the quotient must be developed
+ *  one bit at a time.
+ */
+
+
+
+ENTRY(.urem)
+
+	! Ready to divide.  Compute size of quotient; scale comparand.
+	orcc	%o1, %g0, %o5
+	bne	1f
+	mov	%o0, %o3
+
+		! Divide by zero trap.  If it returns, return 0 (about as
+		! wrong as possible, but that is what SunOS does...).
+		ta	ST_DIV0
+		retl
+		clr	%o0
+
+1:
+	cmp	%o3, %o5			! if %o1 exceeds %o0, done
+	blu	LOC(got_result)		! (and algorithm fails otherwise)
+	clr	%o2
+	sethi	%hi(1 << (32 - 4 - 1)), %g1
+	cmp	%o3, %g1
+	blu	LOC(not_really_big)
+	clr	%o4
+
+	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
+	! as our usual N-at-a-shot divide step will cause overflow and havoc.
+	! The number of bits in the result here is N*ITER+SC, where SC <= N.
+	! Compute ITER in an unorthodox manner: know we need to shift V into
+	! the top decade: so do not even bother to compare to R.
+	1:
+		cmp	%o5, %g1
+		bgeu	3f
+		mov	1, %g2
+		sll	%o5, 4, %o5
+		b	1b
+		add	%o4, 1, %o4
+
+	! Now compute %g2.
+	2:	addcc	%o5, %o5, %o5
+		bcc	LOC(not_too_big)
+		add	%g2, 1, %g2
+
+		! We get here if the %o1 overflowed while shifting.
+		! This means that %o3 has the high-order bit set.
+		! Restore %o5 and subtract from %o3.
+		sll	%g1, 4, %g1	! high order bit
+		srl	%o5, 1, %o5		! rest of %o5
+		add	%o5, %g1, %o5
+		b	LOC(do_single_div)
+		sub	%g2, 1, %g2
+
+	LOC(not_too_big):
+	3:	cmp	%o5, %o3
+		blu	2b
+		nop
+		be	LOC(do_single_div)
+		nop
+	/* NB: these are commented out in the V8-Sparc manual as well */
+	/* (I do not understand this) */
+	! %o5 > %o3: went too far: back up 1 step
+	!	srl	%o5, 1, %o5
+	!	dec	%g2
+	! do single-bit divide steps
+	!
+	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
+	! first divide step without thinking.  BUT, the others are conditional,
+	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
+	! order bit set in the first step, just falling into the regular
+	! division loop will mess up the first time around.
+	! So we unroll slightly...
+	LOC(do_single_div):
+		subcc	%g2, 1, %g2
+		bl	LOC(end_regular_divide)
+		nop
+		sub	%o3, %o5, %o3
+		mov	1, %o2
+		b	LOC(end_single_divloop)
+		nop
+	LOC(single_divloop):
+		sll	%o2, 1, %o2
+		bl	1f
+		srl	%o5, 1, %o5
+		! %o3 >= 0
+		sub	%o3, %o5, %o3
+		b	2f
+		add	%o2, 1, %o2
+	1:	! %o3 < 0
+		add	%o3, %o5, %o3
+		sub	%o2, 1, %o2
+	2:
+	LOC(end_single_divloop):
+		subcc	%g2, 1, %g2
+		bge	LOC(single_divloop)
+		tst	%o3
+		b,a	LOC(end_regular_divide)
+
+LOC(not_really_big):
+1:
+	sll	%o5, 4, %o5
+	cmp	%o5, %o3
+	bleu	1b
+	addcc	%o4, 1, %o4
+	be	LOC(got_result)
+	sub	%o4, 1, %o4
+
+	tst	%o3	! set up for initial iteration
+LOC(divloop):
+	sll	%o2, 4, %o2
+		! depth 1, accumulated bits 0
+	bl	LOC(1.16)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 2, accumulated bits 1
+	bl	LOC(2.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 3
+	bl	LOC(3.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 7
+	bl	LOC(4.23)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2+1), %o2
+	
+LOC(4.23):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (7*2-1), %o2
+	
+	
+LOC(3.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 5
+	bl	LOC(4.21)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2+1), %o2
+	
+LOC(4.21):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (5*2-1), %o2
+	
+	
+	
+LOC(2.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits 1
+	bl	LOC(3.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 3
+	bl	LOC(4.19)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2+1), %o2
+	
+LOC(4.19):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (3*2-1), %o2
+	
+	
+LOC(3.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits 1
+	bl	LOC(4.17)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2+1), %o2
+	
+LOC(4.17):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (1*2-1), %o2
+	
+	
+	
+	
+LOC(1.16):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 2, accumulated bits -1
+	bl	LOC(2.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -1
+	bl	LOC(3.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -1
+	bl	LOC(4.15)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2+1), %o2
+	
+LOC(4.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-1*2-1), %o2
+	
+	
+LOC(3.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -3
+	bl	LOC(4.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2+1), %o2
+	
+LOC(4.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-3*2-1), %o2
+	
+	
+	
+LOC(2.15):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 3, accumulated bits -3
+	bl	LOC(3.13)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -5
+	bl	LOC(4.11)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2+1), %o2
+	
+LOC(4.11):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-5*2-1), %o2
+	
+	
+LOC(3.13):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+			! depth 4, accumulated bits -7
+	bl	LOC(4.9)
+	srl	%o5,1,%o5
+	! remainder is positive
+	subcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2+1), %o2
+	
+LOC(4.9):
+	! remainder is negative
+	addcc	%o3,%o5,%o3
+		b	9f
+		add	%o2, (-7*2-1), %o2
+	
+	
+	
+	
+	9:
+LOC(end_regular_divide):
+	subcc	%o4, 1, %o4
+	bge	LOC(divloop)
+	tst	%o3
+	bl,a	LOC(got_result)
+	! non-restoring fixup here (one instruction only!)
+	add	%o3, %o1, %o3
+
+
+LOC(got_result):
+
+	retl
+	mov %o3, %o0
+
+END(.urem)

+ 19 - 0
libc/sysdeps/linux/sparc/sparcv8/rem.S

@@ -0,0 +1,19 @@
+/*
+ * Sparc v8 has divide.
+ */
+
+ENTRY(.rem)
+
+	sra	%o0, 31, %o2
+	wr	%o2, 0, %y
+	nop
+	nop
+	nop
+	sdivcc	%o0, %o1, %o2
+	bvs,a	1f
+	 xnor	%o2, %g0, %o2
+1:	smul	%o2, %o1, %o2
+	retl
+	 sub	%o0, %o2, %o0
+
+END(.rem)

+ 18 - 0
libc/sysdeps/linux/sparc/sparcv8/sdiv.S

@@ -0,0 +1,18 @@
+/*
+ * Sparc v8 has divide.
+ */
+
+ENTRY(.div)
+
+	sra	%o0, 31, %o2
+	wr	%o2, 0, %y
+	nop
+	nop
+	nop
+	sdivcc	%o0, %o1, %o0
+	bvs,a	1f
+	 xnor	%o0, %g0, %o0
+1:	retl
+	 nop
+
+END(.div)

+ 13 - 0
libc/sysdeps/linux/sparc/sparcv8/udiv.S

@@ -0,0 +1,13 @@
+/*
+ * Sparc v8 has divide.
+ */
+
+ENTRY(.udiv)
+
+	wr	%g0, 0, %y
+	nop
+	nop
+	retl
+	 udiv	%o0, %o1, %o0
+
+END(.udiv)

+ 11 - 0
libc/sysdeps/linux/sparc/sparcv8/umul.S

@@ -0,0 +1,11 @@
+/*
+ * Sparc v8 has multiply.
+ */
+
+ENTRY(.umul)
+
+	umul	%o0, %o1, %o0
+	retl
+	 rd	%y, %o1
+
+END(.umul)

+ 16 - 0
libc/sysdeps/linux/sparc/sparcv8/urem.S

@@ -0,0 +1,16 @@
+/*
+ * Sparc v8 has divide.
+ */
+
+ENTRY(.urem)
+
+	wr	%g0, 0, %y
+	nop
+	nop
+	nop
+	udiv	%o0, %o1, %o2
+	umul	%o2, %o1, %o2
+	retl
+	 sub	%o0, %o2, %o0
+
+END(.urem)

+ 20 - 0
libc/sysdeps/linux/sparc/sparcv9/rem.S

@@ -0,0 +1,20 @@
+/*
+ * Sparc v9 has divide.
+ * As divx takes 68 cycles and sdivcc only 36,
+ * we use sdivcc eventhough it is deprecated.
+ */
+
+	.text
+	.align		32
+ENTRY(.rem)
+
+	sra		%o0, 31, %o2
+	wr		%o2, 0, %y
+	sdivcc		%o0, %o1, %o2
+	xnor		%o2, %g0, %o3
+	movvs		%icc, %o3, %o2
+	smul		%o2, %o1, %o2
+	retl
+	 sub		%o0, %o2, %o0
+
+END(.rem)

+ 18 - 0
libc/sysdeps/linux/sparc/sparcv9/sdiv.S

@@ -0,0 +1,18 @@
+/*
+ * Sparc v9 has divide.
+ * As divx takes 68 cycles and sdivcc only 36,
+ * we use sdivcc eventhough it is deprecated.
+ */
+
+	.text
+	.align		32
+ENTRY(.div)
+
+	sra		%o0, 31, %o2
+	wr		%o2, 0, %y
+	sdivcc		%o0, %o1, %o0
+	xnor		%o0, %g0, %o2
+	retl
+	 movvs		%icc, %o2, %o0
+
+END(.div)

+ 15 - 0
libc/sysdeps/linux/sparc/sparcv9/udiv.S

@@ -0,0 +1,15 @@
+/*
+ * Sparc v9 has divide.
+ * As divx takes 68 cycles and udiv only 37,
+ * we use udiv eventhough it is deprecated.
+ */
+
+	.text
+	.align		32
+ENTRY(.udiv)
+
+	wr		%g0, 0, %y
+	retl
+	 udiv		%o0, %o1, %o0
+
+END(.udiv)

+ 15 - 0
libc/sysdeps/linux/sparc/sparcv9/umul.S

@@ -0,0 +1,15 @@
+/*
+ * Sparc v9 has multiply.
+ */
+
+	.text
+	.align		32
+ENTRY(.umul)
+
+	srl		%o0, 0, %o0
+	srl		%o1, 0, %o1
+	mulx		%o0, %o1, %o0
+	retl
+	 srlx		%o0, 32, %o1
+
+END(.umul)

+ 17 - 0
libc/sysdeps/linux/sparc/sparcv9/urem.S

@@ -0,0 +1,17 @@
+/*
+ * Sparc v9 has divide.
+ * As divx takes 68 cycles and udiv only 37,
+ * we use udiv eventhough it is deprecated.
+ */
+
+	.text
+	.align		32
+ENTRY(.urem)
+
+	wr		%g0, 0, %y
+	udiv		%o0, %o1, %o2
+	umul		%o2, %o1, %o2
+	retl
+	 sub		%o0, %o2, %o0
+
+END(.urem)

+ 9 - 348
libc/sysdeps/linux/sparc/udiv.S

@@ -1,348 +1,9 @@
-   /* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .udiv	name of function to generate
- *  div		div=div => %o0 / %o1; div=rem => %o0 % %o1
- *  false		false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-
-#include <sys/syscall.h>
-
-.global   .udiv;
-.align 4;
-.type  .udiv ,@function; 
-
-.udiv: 
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	0x02
-		retl
-		clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	.Lgot_result  		! (and algorithm fails otherwise)
-	clr	%o2
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-	cmp	%o3, %g1
-	blu	.Lnot_really_big  
-	clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		mov	1, %g2
-		sll	%o5, 4, %o5
-		b	1b
-		add	%o4, 1, %o4
-
-	! Now compute %g2.
-	2:	addcc	%o5, %o5, %o5
-		bcc	.Lnot_too_big  
-		add	%g2, 1, %g2
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-		b	.Ldo_single_div
-		sub	%g2, 1, %g2
-
-	.Lnot_too_big:
-	3:	cmp	%o5, %o3
-		blu	2b
-		nop
-		be	.Ldo_single_div
-		nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g2
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	.Ldo_single_div:
-		subcc	%g2, 1, %g2
-		bl	.Lend_regular_divide
-		nop
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-		b	.Lend_single_divloop
-		nop
-	.Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	.Lend_single_divloop:
-		subcc	%g2, 1, %g2
-		bge	.Lsingle_divloop
-		tst	%o3
-		b,a	.Lend_regular_divide
-
-.Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	addcc	%o4, 1, %o4
-	be	.Lgot_result
-	sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-.Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	.L1.16
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	.L2.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	.L3.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	.L4.23
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2+1), %o2
-	
-.L4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2-1), %o2
-	
-	
-.L3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	.L4.21
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2+1), %o2
-	
-.L4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2-1), %o2
-	
-	
-	
-.L2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	.L3.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	.L4.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2+1), %o2
-	
-.L4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2-1), %o2
-	
-	
-.L3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	.L4.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2+1), %o2
-	
-.L4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2-1), %o2
-	
-	
-	
-	
-.L1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	.L2.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	.L3.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	.L4.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2+1), %o2
-	
-.L4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2-1), %o2
-	
-	
-.L3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	.L4.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2+1), %o2
-	
-.L4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2-1), %o2
-	
-	
-	
-.L2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	.L3.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	.L4.11
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2+1), %o2
-	
-.L4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2-1), %o2
-	
-	
-.L3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	.L4.9
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2+1), %o2
-	
-.L4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2-1), %o2
-	
-	
-	
-	
-	9:
-.Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	.Ldivloop
-	tst	%o3
-	bl,a	.Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	sub	%o2, 1, %o2
-
-
-.Lgot_result:
-
-	retl
-	mov %o2, %o0
-
-.size .udiv,.-.udiv;
+#include "_math_inc.h"
+
+#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__)
+# include "sparcv9/udiv.S"
+#elif defined(__CONFIG_SPARC_V8__)
+# include "sparcv8/udiv.S"
+#else
+# include "sparcv7/udiv.S"
+#endif

+ 6 - 157
libc/sysdeps/linux/sparc/umul.S

@@ -1,160 +1,9 @@
-/*
- * Unsigned multiply.  Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
- * upper 32 bits of the 64-bit product).
- *
- * This code optimizes short (less than 13-bit) multiplies.  Short
- * multiplies require 25 instruction cycles, and long ones require
- * 45 instruction cycles.
- *
- * On return, overflow has occurred (%o1 is not zero) if and only if
- * the Z condition code is clear, allowing, e.g., the following:
- *
- *	call	.umul
- *	nop
- *	bnz	overflow	(or tnz)
- */
+#include "_math_inc.h"
 
-#include <sys/syscall.h>
-
-
-.global   .umul;
-.align 4;
-.type  .umul ,@function; 
-
-.umul: 
-	or	%o0, %o1, %o4
-	mov	%o0, %y			! multiplier -> Y
-	andncc	%o4, 0xfff, %g0		! test bits 12..31 of *both* args
-	be	.Lmul_shortway	! if zero, can do it the short way
-	 andcc	%g0, %g0, %o4		! zero the partial product; clear N & V
-
-	/*
-	 * Long multiply.  32 steps, followed by a final shift step.
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %o1, %o4	! 13
-	mulscc	%o4, %o1, %o4	! 14
-	mulscc	%o4, %o1, %o4	! 15
-	mulscc	%o4, %o1, %o4	! 16
-	mulscc	%o4, %o1, %o4	! 17
-	mulscc	%o4, %o1, %o4	! 18
-	mulscc	%o4, %o1, %o4	! 19
-	mulscc	%o4, %o1, %o4	! 20
-	mulscc	%o4, %o1, %o4	! 21
-	mulscc	%o4, %o1, %o4	! 22
-	mulscc	%o4, %o1, %o4	! 23
-	mulscc	%o4, %o1, %o4	! 24
-	mulscc	%o4, %o1, %o4	! 25
-	mulscc	%o4, %o1, %o4	! 26
-	mulscc	%o4, %o1, %o4	! 27
-	mulscc	%o4, %o1, %o4	! 28
-	mulscc	%o4, %o1, %o4	! 29
-	mulscc	%o4, %o1, %o4	! 30
-	mulscc	%o4, %o1, %o4	! 31
-	mulscc	%o4, %o1, %o4	! 32
-	mulscc	%o4, %g0, %o4	! final shift
-
-	/*
-	 * Normally, with the shift-and-add approach, if both numbers are
-	 * positive you get the correct result.  With 32-bit two's-complement
-	 * numbers, -x is represented as
-	 *
-	 *		  x		    32
-	 *	( 2  -  ------ ) mod 2  *  2
-	 *		   32
-	 *		  2
-	 *
-	 * (the `mod 2' subtracts 1 from 1.bbbb).  To avoid lots of 2^32s,
-	 * we can treat this as if the radix point were just to the left
-	 * of the sign bit (multiply by 2^32), and get
-	 *
-	 *	-x  =  (2 - x) mod 2
-	 *
-	 * Then, ignoring the `mod 2's for convenience:
-	 *
-	 *   x *  y	= xy
-	 *  -x *  y	= 2y - xy
-	 *   x * -y	= 2x - xy
-	 *  -x * -y	= 4 - 2x - 2y + xy
-	 *
-	 * For signed multiplies, we subtract (x << 32) from the partial
-	 * product to fix this problem for negative multipliers (see mul.s).
-	 * Because of the way the shift into the partial product is calculated
-	 * (N xor V), this term is automatically removed for the multiplicand,
-	 * so we don't have to adjust.
-	 *
-	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
-	 * and the correction is wrong.  So for unsigned multiplies where the
-	 * high order bit is one, we end up with xy - (y << 32).  To fix it
-	 * we add y << 32.
-	 */
-#if 0
-	tst	%o1
-	bl,a	1f		! if %o1 < 0 (high order bit = 1),
-	 add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)
-1:	rd	%y, %o0		! get lower half of product
-	retl
-	 addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
+#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__)
+# include "sparcv9/umul.S"
+#elif defined(__CONFIG_SPARC_V8__)
+# include "sparcv8/umul.S"
 #else
-	/* Faster code from tege@sics.se.  */
-	sra	%o1, 31, %o2	! make mask from sign bit
-	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
-	rd	%y, %o0		! get lower half of product
-	retl
-	 addcc	%o4, %o2, %o1	! add compensation and put upper half in place
+# include "sparcv7/umul.S"
 #endif
-
-.Lmul_shortway:
-	/*
-	 * Short multiply.  12 steps, followed by a final shift step.
-	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
-	 * but there is no problem with %o0 being negative (unlike above),
-	 * and overflow is impossible (the answer is at most 24 bits long).
-	 */
-	mulscc	%o4, %o1, %o4	! 1
-	mulscc	%o4, %o1, %o4	! 2
-	mulscc	%o4, %o1, %o4	! 3
-	mulscc	%o4, %o1, %o4	! 4
-	mulscc	%o4, %o1, %o4	! 5
-	mulscc	%o4, %o1, %o4	! 6
-	mulscc	%o4, %o1, %o4	! 7
-	mulscc	%o4, %o1, %o4	! 8
-	mulscc	%o4, %o1, %o4	! 9
-	mulscc	%o4, %o1, %o4	! 10
-	mulscc	%o4, %o1, %o4	! 11
-	mulscc	%o4, %o1, %o4	! 12
-	mulscc	%o4, %g0, %o4	! final shift
-
-	/*
-	 * %o4 has 20 of the bits that should be in the result; %y has
-	 * the bottom 12 (as %y's top 12).  That is:
-	 *
-	 *	  %o4		    %y
-	 * +----------------+----------------+
-	 * | -12- |   -20-  | -12- |   -20-  |
-	 * +------(---------+------)---------+
-	 *	   -----result-----
-	 *
-	 * The 12 bits of %o4 left of the `result' area are all zero;
-	 * in fact, all top 20 bits of %o4 are zero.
-	 */
-
-	rd	%y, %o5
-	sll	%o4, 12, %o0	! shift middle bits left 12
-	srl	%o5, 20, %o5	! shift low bits right 20
-	or	%o5, %o0, %o0
-	retl
-	 addcc	%g0, %g0, %o1	! %o1 = zero, and set Z
-
-.size  .umul , . -.umul

+ 9 - 350
libc/sysdeps/linux/sparc/urem.S

@@ -1,350 +1,9 @@
-   /* This file is generated from divrem.m4; DO NOT EDIT! */
-/*
- * Division and remainder, from Appendix E of the Sparc Version 8
- * Architecture Manual, with fixes from Gordon Irlam.
- */
-
-/*
- * Input: dividend and divisor in %o0 and %o1 respectively.
- *
- * m4 parameters:
- *  .urem	name of function to generate
- *  rem		rem=div => %o0 / %o1; rem=rem => %o0 % %o1
- *  false		false=true => signed; false=false => unsigned
- *
- * Algorithm parameters:
- *  N		how many bits per iteration we try to get (4)
- *  WORDSIZE	total number of bits (32)
- *
- * Derived constants:
- *  TOPBITS	number of bits in the top decade of a number
- *
- * Important variables:
- *  Q		the partial quotient under development (initially 0)
- *  R		the remainder so far, initially the dividend
- *  ITER	number of main division loop iterations required;
- *		equal to ceil(log2(quotient) / N).  Note that this
- *		is the log base (2^N) of the quotient.
- *  V		the current comparand, initially divisor*2^(ITER*N-1)
- *
- * Cost:
- *  Current estimate for non-large dividend is
- *	ceil(log2(quotient) / N) * (10 + 7N/2) + C
- *  A large dividend is one greater than 2^(31-TOPBITS) and takes a
- *  different path, as the upper bits of the quotient must be developed
- *  one bit at a time.
- */
-
-
-
-#include <sys/syscall.h>
-
-
-.global   .urem;
-.align 4;
-.type  .urem ,@function; 
-
-.urem: 
-
-	! Ready to divide.  Compute size of quotient; scale comparand.
-	orcc	%o1, %g0, %o5
-	bne	1f
-	mov	%o0, %o3
-
-		! Divide by zero trap.  If it returns, return 0 (about as
-		! wrong as possible, but that is what SunOS does...).
-		ta	0x02 
-		retl
-		clr	%o0
-
-1:
-	cmp	%o3, %o5			! if %o1 exceeds %o0, done
-	blu	.Lgot_result		! (and algorithm fails otherwise)
-	clr	%o2
-	sethi	%hi(1 << (32 - 4 - 1)), %g1
-	cmp	%o3, %g1
-	blu	.Lnot_really_big
-	clr	%o4
-
-	! Here the dividend is >= 2**(31-N) or so.  We must be careful here,
-	! as our usual N-at-a-shot divide step will cause overflow and havoc.
-	! The number of bits in the result here is N*ITER+SC, where SC <= N.
-	! Compute ITER in an unorthodox manner: know we need to shift V into
-	! the top decade: so do not even bother to compare to R.
-	1:
-		cmp	%o5, %g1
-		bgeu	3f
-		mov	1, %g2
-		sll	%o5, 4, %o5
-		b	1b
-		add	%o4, 1, %o4
-
-	! Now compute %g2.
-	2:	addcc	%o5, %o5, %o5
-		bcc	.Lnot_too_big
-		add	%g2, 1, %g2
-
-		! We get here if the %o1 overflowed while shifting.
-		! This means that %o3 has the high-order bit set.
-		! Restore %o5 and subtract from %o3.
-		sll	%g1, 4, %g1	! high order bit
-		srl	%o5, 1, %o5		! rest of %o5
-		add	%o5, %g1, %o5
-		b	.Ldo_single_div
-		sub	%g2, 1, %g2
-
-	.Lnot_too_big:
-	3:	cmp	%o5, %o3
-		blu	2b
-		nop
-		be	.Ldo_single_div
-		nop
-	/* NB: these are commented out in the V8-Sparc manual as well */
-	/* (I do not understand this) */
-	! %o5 > %o3: went too far: back up 1 step
-	!	srl	%o5, 1, %o5
-	!	dec	%g2
-	! do single-bit divide steps
-	!
-	! We have to be careful here.  We know that %o3 >= %o5, so we can do the
-	! first divide step without thinking.  BUT, the others are conditional,
-	! and are only done if %o3 >= 0.  Because both %o3 and %o5 may have the high-
-	! order bit set in the first step, just falling into the regular
-	! division loop will mess up the first time around.
-	! So we unroll slightly...
-	.Ldo_single_div:
-		subcc	%g2, 1, %g2
-		bl	.Lend_regular_divide
-		nop
-		sub	%o3, %o5, %o3
-		mov	1, %o2
-		b	.Lend_single_divloop
-		nop
-	.Lsingle_divloop:
-		sll	%o2, 1, %o2
-		bl	1f
-		srl	%o5, 1, %o5
-		! %o3 >= 0
-		sub	%o3, %o5, %o3
-		b	2f
-		add	%o2, 1, %o2
-	1:	! %o3 < 0
-		add	%o3, %o5, %o3
-		sub	%o2, 1, %o2
-	2:
-	.Lend_single_divloop:
-		subcc	%g2, 1, %g2
-		bge	.Lsingle_divloop
-		tst	%o3
-		b,a	.Lend_regular_divide
-
-.Lnot_really_big:
-1:
-	sll	%o5, 4, %o5
-	cmp	%o5, %o3
-	bleu	1b
-	addcc	%o4, 1, %o4
-	be	.Lgot_result
-	sub	%o4, 1, %o4
-
-	tst	%o3	! set up for initial iteration
-.Ldivloop:
-	sll	%o2, 4, %o2
-		! depth 1, accumulated bits 0
-	bl	.L1.16
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 2, accumulated bits 1
-	bl	.L2.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 3
-	bl	.L3.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 7
-	bl	.L4.23
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2+1), %o2
-	
-.L4.23:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (7*2-1), %o2
-	
-	
-.L3.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 5
-	bl	.L4.21
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2+1), %o2
-	
-.L4.21:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (5*2-1), %o2
-	
-	
-	
-.L2.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits 1
-	bl	.L3.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 3
-	bl	.L4.19
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2+1), %o2
-	
-.L4.19:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (3*2-1), %o2
-	
-	
-.L3.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits 1
-	bl	.L4.17
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2+1), %o2
-	
-.L4.17:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (1*2-1), %o2
-	
-	
-	
-	
-.L1.16:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 2, accumulated bits -1
-	bl	.L2.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -1
-	bl	.L3.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -1
-	bl	.L4.15
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2+1), %o2
-	
-.L4.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-1*2-1), %o2
-	
-	
-.L3.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -3
-	bl	.L4.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2+1), %o2
-	
-.L4.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-3*2-1), %o2
-	
-	
-	
-.L2.15:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 3, accumulated bits -3
-	bl	.L3.13
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -5
-	bl	.L4.11
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2+1), %o2
-	
-.L4.11:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-5*2-1), %o2
-	
-	
-.L3.13:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-			! depth 4, accumulated bits -7
-	bl	.L4.9
-	srl	%o5,1,%o5
-	! remainder is positive
-	subcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2+1), %o2
-	
-.L4.9:
-	! remainder is negative
-	addcc	%o3,%o5,%o3
-		b	9f
-		add	%o2, (-7*2-1), %o2
-	
-	
-	
-	
-	9:
-.Lend_regular_divide:
-	subcc	%o4, 1, %o4
-	bge	.Ldivloop
-	tst	%o3
-	bl,a	.Lgot_result
-	! non-restoring fixup here (one instruction only!)
-	add	%o3, %o1, %o3
-
-
-.Lgot_result:
-
-	retl
-	mov %o3, %o0
-
-.size  .urem , . -.urem
+#include "_math_inc.h"
+
+#if defined(__CONFIG_SPARC_V9__) || defined(__CONFIG_SPARC_V9B__)
+# include "sparcv9/urem.S"
+#elif defined(__CONFIG_SPARC_V8__)
+# include "sparcv8/urem.S"
+#else
+# include "sparcv7/urem.S"
+#endif