V9/libc/sun/Fmuld.s

	.data
	.asciz	"@(#)Fmuld.s 1.1 86/02/03 Copyr 1985 Sun Micro"
	.even
	.text

|	Copyright (c) 1985 by Sun Microsystems, Inc.

#include "fpcrtdefs.h"

/*
 *	double-precision floating math run-time support
 *
 *	copyright 1981, 1982 Richard E. James III
 *	translated to SUN idiom 10/11 March 1983 rt
 *	parameter passing re-done 22 July 1983 rt
 */

ARG2PTR = a0

/*
 *	ieee double floating divide
 *	copyright 1981, Richard E. James III
 *	translated to SUN idiom 10 March 1983 rt
 */

/*
 *	entry conditions:
 *	    first argument in d0/d1
 *	    second argument on stack
 *	exit conditions:
 *	    result (8 bytes) in d0/d1
 *
 *	register conventions:
 *	    d0/d1	divisor
 *	    d2/d3	dividend
 *	    d4		count
 *	    d5		sign an exponent
 *	    d6		result exponent
 *	    d6/d7	quotient
 */
	SAVEMASK = 0x3f00	| registers d2-d7
	RESTMASK = 0xfc
	NSAVED   = 6*4		| 6 registers * sizeof(register)

RTENTRY(Fdivd)
|	save registers and load operands into registers
	moveml	#0x3f00,sp@-	| registers d2-d7
	movl	d0,d2
	movl	d1,d3
	movl	ARG2PTR@+,d0
	movl	ARG2PTR@ ,d1
	| save sign of result
	movl	d0,d5
	eorl	d2,d5		| sign of result
	clrl	d4		| flag for divide
	bsrs	extrem
	| compute resulting exponent
	movw	d6,d5
	subw	d7,d5
	| do top 30-31 bits of divide (d_rcp will post normalize)
	movw	#30,d4	| count 30..0
	bsr	shsub	| shift and subtract loop
	movl	d7,d6	| top of answer
	| do next 22 bits of divide
	movw	#23,d4	| count 23..0 (total = 54-55 bits)
	bsr	shsub
	| put together answer
	lsll	#8,d7	| line up bottom
	orl	d3,d2
	beqs	1$
	bset	#1,d7	| sticky bit on if remainder <> 0
1$:	lsll	#1,d7
	roxll	#1,d6
	| normalize (once), round, check result exp, pack
	movl	d6,d2	| upper
	movl	d7,d3	| lower
	movw	d5,d6	| exponent
	addw	#1023,d6	| re-bias
	jbsr	d_nrcp	| norm once, rnd, ck, pack
	bra	dmsign


/*
 * round, check for over/underflow, and pack in the exponent.
 * d_nrcp does one normalize and then calls d_rcp.
 * d_rcp rounds the double value and packs the exponent in,
 * catching infinity, zero, and denormalized numbers.
 * d_usel puts together the larger argument.
 * 
 * input:
 *      d2/d3   mantissa (- if norm)
 *      d6      biased exponent
 *      (need sign, sticky)
 * output:
 *      d2/d3   most of number, no sign or hidden bit,
 *              waiting to shift sign in.
 * other:
 *      d4      lost
 *      d5      unchanged
 */
 
d_nrcp:
        tstl    d2
        jmi     d_rcp   | already normalized
        subqw   #1,d6
        lsll    #1,d3   | do extra normalize (for mul/div)
        roxll   #1,d2
	jra	d_rcp

/*
 * subroutine for unpacking one operand, and normalizing a denormalized number
 * input:
 *	d0/d1	number
 * output:
 *	d0/d1	mantissa
 *	d7.w	exponent
 *	z	on iff mantissa is zero_
 *
 * unchanged:
 *	d4	bottom = 0xf77
 */

unp:	movl	d0,d7	| start getting exp
	andl	#0xfffff,d0	| clear out sign and exp
	swap	d7
	lsrw	#(16-1-11),d7
	andw	d4,d7	| expondnt
	bnes	3$	| normal number
	| denormalized number or zero:
	tstl	d0	| upper
	bnes	1$
	tstl	d1	| lower
	beqs	3$	|zero
1$:	addqw	#1,d7
2$:	subql	#1,d7
	lsll	#1,d1
	roxll	#1,d0	| normalize
	btst	#20,d0
	beqs	2$	| loop until normalized
3$:	rts


/*
 * subroutine for extracting and taking care of 0/GU/INF/NAN.
 * input:
 *	d0/d1, d2/d3
 *	d4	+ for div; - for mod
 *
 * output:
 *	d0/d1 (bottom) and d2/d3 (top) converted to
 *		000XXXXX/XXXXXXXX
 *	d6	exponent of top
 *	d7	exponent of bottom
 *
 * unchanged:
 *	d5	sign
 */

extrem:
	movw	#0x7ff,d4	| mask, sign.l untouched
	exg	d2,d0
	exg	d3,d1
	bsrs	unp	| unpack top
	exg	d0,d2
	exg	d1,d3
	exg	d7,d6
	beqs	topzero	| top is zero
	cmpw	d4,d6
	beqs	topbig	| top is inf or nan
	bset	#20,d2	| set hidden bit
topzero:bsrs	unp	| unpack bottom
	beqs	botzero	| bottom is 0
	cmpw	d4,d7
	beqs	botbig	| bottom is inf or nan
	bset	#20,d0
	lsll	#1,d1	| left shift bottom
	roxll	#1,d0
	rts

/*
 * EXCEPTION HANDLING
 */

topbig:	| top is inf or nan
	bsrs	unp	| unpack bottom
	tstl	d4	
	bmis	isnan	| inf or nan / ...
	cmpw	d6,d7
	beqs	isnan	| both inf/nan
	tstl	d2	| top
	beqs	geninf	| inf / ... = inf
	bras	isnan	| nan / ... = nan

botzero:| bottom is 0
	tstl	d4
	bmis	gennan	| dmod(... 0) = nan
	orl	d2,d3	| top
	beqs	gennan	| 0/0 = nan
			| nonzero/0 = inf
	| generate infinity for answer
geninf:	movl	#0x7ff00000*2,d2	| infinity
	bras	clrbot

botbig:	tstl	d0	| bottom = nan, result = nan
	bnes	isnan
	tstl	d4
	bpls	genzero	| ... / inf = 0
	addqw	#4,sp
	addw    #11,d6 		| append sign
        jbsr    d_norm          | normalize
        jbsr    d_rcp           | adjust exp, ck extremes, pack
dmsign:	roxll	#1,d5		| sign -> x
	roxrl	#1, d2		| rotate in sign
	| answer is now in:
	|	d2	most significant 32 bits
	|	d3	least significant 32 bits
dmexit:	movl	d2,d0
	movl	d3,d1
	| restore registers and split
	moveml	sp@+,#RESTMASK
	RET


	| invalid operand/operation
isnan:	cmpw	d7,d6
	bnes	1$	| exponents equal
	cmpl	d0, d2
1$:	bges	2$
	movw	d7,d6
	movl	d0,d2
2$:	swap	d2
	lslw	#(16-1-11),d6
	orw	d6,d2	| put back together
	swap	d2
	lsll	#1,d2
	cmpl	#0x7ff00000*2, d2
	bhis	gotnan	| use larger nan

gennan:	movl	#0x7ff00004*2,d2
	tstl	d4
	bpls	gotnan	| nan 4 for div
	addql	#2,d2	| nan 5 for mod
gotnan:	clrl	d5
	bras	clrbot

genzero:clrl	d2
clrbot:	clrl	d3
sign:	addqw	#4,sp
	bra	dmsign

/*
 *  the shsub subroutine does a shift-subtract loop
 * that is the heart of divide and mod.
 * the algorithm is a simple shift and subtract loop,
 * but it adds when it overshoots.
 * why not use the divs/divu instructions? That approach is slower!
 *
 * registers:
 *	d2/d3	current dividend (updated)
 *	d0/d1	divisor (unchanged)
 *	d4.w	(inpout) number of interations -1, and bit number
 *	d5/d6	-untouched-
 *	d7	quotient being devloped (ignored by mod)
 */


shsub:
	clrl	d7	| quotient
	| shift once, see if subtract needed
1$:	addl	d3,d3
	addxl	d2,d2	|(64-bit left shift)
	cmpl	d0,d2
	dbge	d4,1$	| loop while divident is small
	| tally quotient and subtract
	bset	d4,d7	| quotient bit
	subl	d1,d3
	subxl	d0,d2	| 64-bit subtract
	dbmi	d4,1$	| loop (d4) times
	| now one of three things has happeded:
	| 1. count exhausted and extra subtract done (first DB hit count)
	| 2. count exhausted in second DB
	| 3. overshot because compare didn't check lower parts
	bpls	2$	| case 2
	addl	d1,d3	| take care of overshoot
	addxl	d0,d2
	bclr	d4,d7
	tstw	d4
	dblt	d4,1$	| case 3
			| case 1
2$:	rts

/*
 *	ieee double floating multiply
 *	copyright 1981, Richard E. James III
 *	translated to SUN idiom 10 March 1983 rt
 */

/*
	Revised to do correct IEEE rounding by dgh 24 Aug 84
 *	Conventions in the main multiplication section are as follows:
 *	r is the argument passed in the registers d0/d1
 *	s is the argument passed on the stack, saved in a0/a1
 *		while multiplying
 *	r1,r2,r3,r4 are the 16 bit components of r in descending order
 *      s1,s2,s3,s4 are the 16 bit components of s
 *	r is kept in d0 and d1, sometimes with words swapped
 *	s is kept in a0 and a1 unchanged
 *      the product is kept in d2/d3/d4/d5
 *	d6 contains a current partial product
 *	d7 contains a current partial product
 *	d7 contains 0 when needed for addxl
 *	a2 saves the sign and exponent of the result
 *
 *	At the end, d4 and d5 if nonzero are jammed into lsb of d3.
 */
/*
 *	entry conditions:
 *	    first argument in d0/d1
 *	    second argument on stack
 *	exit conditions:
 *	    result (8 bytes) in d0/d1
 *
 *	register conventions:
 *	    d0-d3	operands or pieces of result
 *	    d5		exponent of larger
 *	    d5		temp for multiplying
 *	    d6		sign and exponent
 *	    d7		exponent of smaller
 *	    d7		zero
 */
	SAVEMASK = 0x3fe0	| registers d2-d7,a0-a2
	RESTMASK = 0x7fc
	NSAVED   = 6*4		| 6 registers * sizeof(register)

	SAVE03	 = 0xf000	| registers d0-d3
	FETCH03  = 0x000f

RTENTRY(Fsqrd)
	moveml	#SAVEMASK,sp@- 	| registers d2-d7
	movel	d0,d2
	movel	d1,d3
	bras	Fmuld2
RTENTRY(Fmuld)
|	save registers and load operands into registers
	moveml	#SAVEMASK,sp@- 	| registers d2-d7
	movl	ARG2PTR@+,d2
	movl	ARG2PTR@ ,d3
Fmuld2:
				| save sign of result
	movl	d0,d5
	eorl	d2,d5		| sign of result
	asll	#1,d0		| toss sign
	asll	#1,d2		| EEmmmm0
	cmpl	d2,d0
	| order operands (exponents at least)
	blss	eswap
	exg	d0,d2		| d2/d3 = larger
	exg	d1,d3
	| extract and check exponents
eswap:	jbsr	d_exte
	movw	d6,d5		| larger exp
	movl	d5,d6
	addw	d7,d6		| result exp (and sign)
	cmpw	#0x7ff,d5 	| check larger
	jeq	ovfl		| inf or nan
	tstw	d7
	jeq	ufl		| 0 or 	gradual underflow
	| set hidden bits
	bset	#31,d0
back:	bset	#31,d2

|		Main multiply sequence.

	movl	d2,a0		| a0 gets s1s2.
	movl	d3,a1		| a1 gets s3s4.
	movl	d6,a2		| a2 saves sign and exponent of result.

	movl	a0,d3		| d3 gets s1s2.
	swap	d3		| d3 gets s2s1.
	movw	d3,d4		| d4 gets s1.
	mulu	d1,d4		| d4 gets r4*s1.
	mulu	d0,d3		| d3 gets r2*s1.
	clrw	d2		| d2 gets 0; only gets carries in this phase.

	movl	a1,d6		| d6 gets s3s4.
	swap	d6		| d6 gets s4s3.
	movw	d6,d5		| d5 gets s3.
	jeq	s3zero		| Branch if s3=0 to avoid following.
	mulu	d1,d5		| d5 gets r4*s3.
	movw	d6,d7		| d7 gets s3.
	mulu	d0,d7		| d7 gets r2*s3.
	addl	d7,d4
	clrl	d7		| Make a zero for addx.
	addxl	d7,d3
	addxw	d7,d2
phase3:
	swap	d0		| d0 gets r2r1.
	swap	d1		| d1 gets r4r3.
	movw	a0,d6		| d6 gets s2.
	beqs	4$		| Skip following if s2=0.
	movw	d6,d7		| d7 gets s2.
	mulu	d0,d6		| d6 gets r1*s2.
	mulu	d1,d7		| d7 gets r3*s2.
	addl	d7,d4
	addxl	d6,d3
	clrw	d7
	addxw	d7,d2
4$:
	movw	a1,d6		| d6 gets s4.
	beqs	5$		| Skip if s4=0.
	movw	d6,d7	 	| d7 gets s4.
	mulu	d0,d6		| d6 gets r1*s4.
	mulu	d1,d7		| d7 gets r3*s4.
	addl	d7,d5
	addxl	d6,d4
	clrl	d7
	addxl	d7,d3
	addxw	d7,d2
5$:
	swap	d2		| Exchange order of registers which contain
	swap	d3		| all the "odd" partial products.
	swap	d4
	swap	d5
	movw	d3,d2		| It's really a 16 bit shift!
	movw	d4,d3
	movw	d5,d4
	clrw	d5

	movl	a1,d6		| d6 gets s3s4.
	swap	d6		| d6 gets s4s3.
	movw	d6,d7		| d7 gets s3.
	beqs	6$
	mulu	d0,d6		| d6 gets r1*s3.
	mulu	d1,d7		| d7 gets r3*s3.
	addl	d7,d4
	addxl	d6,d3
	clrl	d7
	addxl	d7,d2

6$:
	movl	a0,d6		| d6 gets s1s2.
	swap	d6		| d6 gets s2s1.
	movw	d6,d7		| d7 gets s1.
	mulu	d0,d6		| d6 gets r1*s1.
	mulu	d1,d7		| d7 gets r3*s1.
	addl	d7,d3
	addxl	d6,d2

	swap	d0		| d0 gets r1r2.
	swap	d1		| d1 gets r3r4.
	movw	a0,d6		| d6 gets s2.
	beqs	8$
	movw	d6,d7		| d7 gets s2.
	mulu	d0,d6		| d6 gets r2*s2.
	mulu	d1,d7		| d7 gets r4*s2.
	addl	d7,d4
	addxl	d6,d3
	clrl	d7
	addxl	d7,d2
8$:
	movw	a1,d6		| d6 gets s4.
	beqs	9$
	movw	d6,d7		| d7 gets s4.
	mulu	d0,d6		| d6 gets r2*s4.
	mulu	d1,d7		| d7 gets r4*s4.
	addl	d7,d5
	addxl	d6,d4
	clrl	d7
	addxl	d7,d3
	addxl	d7,d2
9$:
	orl	d5,d4
	beqs	10$		| Branch if no sticky bits.
	bset	#0,d3		| Set that sticky!
10$:
	movl	a2,d6		| Restore sign/exponent of result.

	subw	#1022,d6	| toss xtra bias
	jbsr	d_nrcp		| norm, rnd, ck, pack
msign:	roxll	#1,d6
	roxrl	#1,d2		| append sign
mexit: | answer is now in d2/d3: put in d0/d1
	movl	d2,d0
	movl	d3,d1
	| restore registers and split
	moveml	sp@+,#RESTMASK
	| slide down return address to pop off junk
	RET

s3zero:
	clrl	d5		| We do need to clear d5 sometime.
	jra	phase3

| EXCEPTION CASES

ovfl:	movl	d2,d5	| larger mantissa, if it is nan, use it
	orw	d7,d5	| smaller exponent
	orl	d0,d5
	orl	d1,d5	| smaller value
	beqs	m_gennan| inf * 0
	| else nan*x or inf*nonzero:
	movw	#0x7ff,d6
	jbsr	d_usel
	bras	msign

ufl:	movl	d0,d7	| mantissa of smaller
	orl	d1,d7
	beqs	signed0	| 0*number
normu:	subqw	#1,d6
	lsll	#1,d1	| adjust denormalized number
	roxll	#1,d0
	bpls	normu
	addqw	#1,d6
	jra	back
	| (if both are denormalized, answer will be zero anyway)
m_gennan:movl	#0x7ff00002,d2
	bras	mexit
signed0:clrl	d2
	clrl	d3
	bras	msign

	SAVEMASK = 0x3f00
	RESTMASK = 0x00fc

	EXP	= d2
	TYPE	= d3
	/* type values: */
	    ZERO  = 1 | wonderful
	    GU    = 2
	    PLAIN = 3
	    INF   = 4
	    NAN   = 5

RTENTRY(Fscaleid)
	moveml	#SAVEMASK,sp@-	| state save
	jbsr	d_unpk
	cmpb	#PLAIN,TYPE	| is it a funny number?
	bgts	gohome		| yes -- return argument
	cmpb	#ZERO,TYPE
	beqs	gohome		| is zero -- return arg
	| normal path through here
	movw	EXP,d7		
	extl	d7		| d7 gets long exponent.
	addl	a0@,d7		| d7 gets modified exponent.
	bvcs	nooflo		| Branch if no overflow.
	bmis	posov		| Branch if positive overflow to -.
				| Branch if negative overflow to +.
negov:
	movw	#-2000,EXP
	bras	gohome
posov:	
	movw	#2000,EXP
	bras	gohome
nooflo:
	cmpl	#2000,d7
	bges	posov
	cmpl	#-2000,d7
	bles	negov
	movw	d7,EXP		| Final OK exponent.
gohome:
	jbsr	d_pack
gone:	moveml	sp@+,#RESTMASK
	RET