// Description: Unsigned multiplication routines
// Original code by: Jackasser (found on codebase)
// Partially adapted by Freshness.                                           

// Number of fractional bits of the fixed point format. mult16x16 shifts its
// 32-bit product left by (16-fpprecbits) bits before it returns.
.const	fpprecbits		=	15				// Fixed point precision bits

// Multiply functions
// ******************

// Generic 8x8 => 16 unsigned multiplier (quarter-square table lookup)
// ********************************************************************
// IN:       A = first factor
//           Y = second factor
// OUT:      PRODUCT+0 = low byte, PRODUCT+1 = high byte of A * Y
// CLOBBERS: A and the status flags. X and Y are left untouched.
// NOTE:     Self-modifying. Only the low address byte of the four table
//           reads is patched, so the square tables must be page aligned.
mul8x8:
                sta m83a+1              // square1 reads are offset by A ...
                sta m81a+1
                eor #$ff
                sta m84a+1              // ... square2 reads by 255-A
                sta m82a+1
                sec                     // no borrow going into the 16-bit subtraction
m81a:           lda square1_lo,y        // (A+Y)^2/4, low byte
m82a:           sbc square2_lo,y        // minus (Y-A)^2/4, low byte
                sta PRODUCT+0
m83a:           lda square1_hi,y        // same for the high bytes, borrow carried over
m84a:           sbc square2_hi,y
                sta PRODUCT+1
                rts


// 16x16 => 32 unsigned multiplier with fixed point adjustment
// ************************************************************
// Input: 16-bit unsigned value in ARG1
//        16-bit unsigned value in ARG2
//        Carry=0: Re-use ARG1 from previous multiplication (faster)
//        Carry=1: Set ARG1 (slower)
//
// Output: PRODUCT+0..PRODUCT+3 = ARG1 * ARG2 (32-bit, low byte first),
//         shifted left by (16-fpprecbits) bits. Bits shifted out of
//         PRODUCT+3 are dropped.
//
// Clobbered: PRODUCT, X, A, C
//
// Allocation setup: ARG1, ARG2 and PRODUCT preferably on zero page.
//                   square1_lo, square1_hi, square2_lo, square2_hi must be
//                   page aligned. Each table is 512 bytes. Total 2kb.
//
// The routine builds four 8x8 partial products from the square tables:
//   lo(ARG1)*lo(ARG2) = AAaa      hi(ARG1)*lo(ARG2) = CCcc
//   lo(ARG1)*hi(ARG2) = BBbb      hi(ARG1)*hi(ARG2) = DDdd
// and sums them as aa + (AA+bb+cc)<<8 + (BB+CC+dd)<<16 + DD<<24.
mult16x16:
                bcc !+
                    // Patch the low address byte of every table read:
                    // square1 reads get the ARG1 byte, square2 reads its complement.
                    lda ARG1+0
                    sta m161a+1
                    sta m163a+1
                    sta m165a+1
                    sta m167a+1
                    eor #$ff
                    sta m162a+1
                    sta m164a+1
                    sta m166a+1
                    sta m168a+1
                    lda ARG1+1
                    sta m161b+1
                    sta m163b+1
                    sta m165b+1
                    sta m167b+1
                    eor #$ff
                    sta m162b+1
                    sta m164b+1
                    sta m166b+1
                    sta m168b+1
                !:
                // lo(ARG1) * lo(ARG2) = AAaa
                ldx ARG2+0
                sec
m161a:          lda square1_lo,x
m162a:          sbc square2_lo,x
                sta PRODUCT+0                   // aa is the final low byte
m163a:          lda square1_hi,x
m164a:          sbc square2_hi,x
                sta m16AA+1                     // AA -> immediate operand used in the summation

                // hi(ARG1) * lo(ARG2) = CCcc
                sec
m161b:          lda square1_lo,x
m162b:          sbc square2_lo,x
                sta m16cc+1
m163b:          lda square1_hi,x
m164b:          sbc square2_hi,x
                sta m16CC+1

                // lo(ARG1) * hi(ARG2) = BBbb
                ldx ARG2+1
                sec
m165a:          lda square1_lo,x
m166a:          sbc square2_lo,x
                sta m16bb+1
m167a:          lda square1_hi,x
m168a:          sbc square2_hi,x
                sta m16BB+1

                // hi(ARG1) * hi(ARG2) = DDdd
                sec
m165b:          lda square1_lo,x
m166b:          sbc square2_lo,x
                sta m16dd+1
m167b:          lda square1_hi,x
m168b:          sbc square2_hi,x
                sta PRODUCT+3                   // DD starts out as the top byte

                // Sum the partial products. The #0 operands below were
                // overwritten by the stores above.
                clc
m16AA:          lda #0
m16bb:          adc #0
                sta PRODUCT+1                   // PRODUCT+1 = AA + bb
m16BB:          lda #0
m16CC:          adc #0
                sta PRODUCT+2                   // PRODUCT+2 = BB + CC + carry
                bcc !+
                    inc PRODUCT+3               // propagate carry into the top byte
                    clc
                !:
m16cc:          lda #0
                adc PRODUCT+1
                sta PRODUCT+1                   // PRODUCT+1 += cc
m16dd:          lda #0
                adc PRODUCT+2
                sta PRODUCT+2                   // PRODUCT+2 += dd + carry
                bcc !+
                    inc PRODUCT+3
                !:
				// Fixed point adjustment: shift the 32-bit result left once per
				// missing precision bit (unrolled at assembly time).
				.for(var i=0;i<[16-fpprecbits];i++) {
					asl PRODUCT+0
					rol PRODUCT+1
					rol PRODUCT+2
					rol PRODUCT+3
				}
                rts     


// 24bit/16bit slow division (taken from codebase)
// ***********************************************
// Shift-and-subtract division producing one quotient bit per iteration.
//
// In:       DIVIDEND+0..+2 = 24-bit unsigned dividend (low byte first)
//           DIVISOR+0..+1  = 16-bit unsigned divisor
// Out:      DIVIDEND+0..+2 = 24-bit quotient (overwrites the dividend)
//           temp / temp2   = low / high byte of the 16-bit remainder
// Clobbers: A, X, status flags
// Note:     A zero divisor is not trapped; the subtraction then fits on
//           every iteration and the quotient comes out as $FFFFFF.
divide24x16:
	lda #0	        	//preset remainder to 0
	sta temp
	sta temp2
	ldx #24	        	//repeat for each bit: ...
divloop:	
	asl DIVIDEND	//dividend lb & hb*2, msb -> Carry
	rol DIVIDEND+1
	rol DIVIDEND+2
	rol temp		//remainder lb & hb * 2 + msb from carry
	rol temp2
	// The doubled remainder can need 17 bits when the divisor is above $8000.
	// A set carry here means it is >= $10000 and therefore larger than any
	// 16-bit divisor, so the subtraction must happen unconditionally.
	bcs !subtract+
	lda temp
	cmp DIVISOR		//16-bit compare: remainder - divisor, result discarded
	lda temp2
	sbc DIVISOR+1
	bcc skip		//if carry=0 then divisor didn't fit in yet
!subtract:
	// Carry is set on both paths into here, so no sec is needed. In the
	// 17-bit case the low 16 bits of the difference are the exact remainder.
	lda temp
	sbc DIVISOR
	sta temp
	lda temp2
	sbc DIVISOR+1
	sta temp2
	inc DIVIDEND	//bit 0 is free after the asl: record that the divisor fit
skip:	
	dex
	bne divloop	
	rts				
	
// 2k precalc table
// ****************
// Quarter-square lookup tables shared by mul8x8 and mult16x16.
// Four tables of 512 entries each, split into low and high bytes; i is the
// entry index supplied by .fill.
//   square1: i*i/4              (read at index a+b)
//   square2: (i-255)*(i-255)/4  (read at index (255-a)+b)
// The multipliers patch only the low address byte of their table reads, so
// the tables have to start on a page boundary. The .align below guarantees
// that for the first table, and the 512-byte size keeps the rest aligned.
// NOTE(review): assumes the assembler drops the fractional part of the /4
// result when taking the byte - confirm for the Kick Assembler version used.
.align $100
square1_lo:
.fill 512,<[[i*i]/4]                               
square1_hi:
.fill 512,>[[i*i]/4]                               
square2_lo:
.fill 512,<[[[i-255]*[i-255]]/4]                   
square2_hi:
.fill 512,>[[[i-255]*[i-255]]/4]                   

