@ arm-note:  adcs r0, r0, r0   does a 33 bit rotate left through the carry.
@ arm-note:  movs r0, r0, rrx  does a 33 bit rotate right through the carry.

@  clz  | count leading zeros - not available on the arm4 / strongarm.
@  tst  | test with and
@  teq  | test with eor
@  cmp  | test with sub
@  cmn  | test with add

@  adc  | Rd := Rn + operand + C  ; C = carryfromcalc
@  add  | Rd := Rn + operand
@  rsb  | Rd := operand - Rn      ; C = !borrowfromcalc
@  rsc  | Rd := operand - Rn - !C ; C = !borrowfromcalc
@  sbc  | Rd := Rn - operand - !C ; C = !borrowfromcalc
@  sub  | Rd := Rn - operand      ; C = !borrowfromcalc

@  and  | Rd := Rn & operand
@  orr  | Rd := Rn | operand
@  eor  | Rd := Rn ^ operand
@  mvn  | Rd := ~operand
@  mov  | Rd := operand

@  mla  | Rd := Rm*Rs+Rn
@  mul  | Rd := Rm*Rs
@  smlal| RdHi.RdLo += Rm*Rs
@  smull| RdHi.RdLo = Rm*Rs
@  umlal| RdHi.RdLo += Rm*Rs
@  umull| RdHi.RdLo = Rm*Rs

@  msr  | set flags in the PSR.

@    flags
@       N = alu.bit31
@       Z = alu==0
@       C = carryfromcalc
@       V = overflowfromcalc

@ EQ    Z==1
@ NE    Z==0
@ CS/HS C==1 carry-set / higher-same
@ CC/LO C==0 carry-clear/ lower
@ MI    N==1
@ PL    N==0
@ VS    V==1
@ VC    V==0
@ HI    C==1 && Z==0
@ LS    C==0 || Z==1
@ GE    N==V
@ LT    N!=V
@ GT    N==V && Z==0
@ LE    N!=V || Z==1

@ ----------------------------------------------------------------------
@ fxpt_wjdiv32 - signed fixed-point divide:  r0 = round( (r0 << r2) / r1 )
@
@ Syntax: GNU as, ARM (32-bit) instruction set, no clz required.
@
@ In:    r0 = dividend (signed 32 bit)
@        r1 = divisor  (signed 32 bit)
@        r2 = left shift applied to the dividend before dividing
@ Out:   r0 = quotient, rounded to nearest (halves round away from zero)
@             0x7fffffff / 0x80000000 when the result does not fit
@             (saturated according to the sign of the result)
@             0x7fffffff / 0x80000000 when r1 == 0
@             (according to the sign of r0; r0 >= 0 gives 0x7fffffff)
@ Clobb: r1, r2, r3, flags.  r4-r7 are saved and restored (r5 is not used).
@ NOTE(review): presumably called from C as
@        int fxpt_wjdiv32(int num, int den, int shift)  - confirm in the header.
@
@ Method: both operands are made positive and shifted left until bit 31 is
@ set, folding the shift counts into r2.  The quotient of the two normalized
@ values lies in (0.5, 2), so the result magnitude lies in (2^(r2-1), 2^(r2+1))
@ and r2+1 rounds of restoring division produce every integer bit.
@
@ Register use inside the routine:
@        r0.r6 = 64 bit remainder   (r0 = high word)
@        r1.r7 = 64 bit divisor     (r1 = high word), halved every round
@        r2    = binary exponent of the result / loop counter
@        r3    = quotient being built
@        r4    = bit 0 set when the result is negative
@ ----------------------------------------------------------------------
    .text
    .align 2
    .global fxpt_wjdiv32
fxpt_wjdiv32:
	stmfd	sp!, {r4-r7,lr}

	mov	r3, #0			@ quotient = 0
	mov	r4, #0			@ result sign = positive

@ ---- divisor: take the absolute value and normalize ----
tstr1neg:
	tst	r1, r1
	beq	divzero
	bpl	normalizer1
	rsbs	r1, r1, #0		@ r1 = -r1
	eor	r4, r4, #1
	@ only 0x80000000 is still "negative" after negation; as an unsigned
	@ magnitude (2^31) it already has bit 31 set, so skip the shifting.
	bmi	tstr0neg

normalizer1:				@ r2 += clz(r1)
	movs	r1, r1, lsl #1
	add	r2, r2, #1
	bpl	normalizer1		@ movs does not write V, so test N only

@ ---- dividend: take the absolute value and normalize ----
tstr0neg:
	tst	r0, r0
	beq	exitdiv			@ 0 / x = 0, r0 already holds 0
	bpl	normalizer0
	rsbs	r0, r0, #0		@ r0 = -r0
	eor	r4, r4, #1
	bmi	checkoverflow		@ 0x80000000: already normalized

normalizer0:				@ r2 -= clz(r0)
	movs	r0, r0, lsl #1
	sub	r2, r2, #1
	bpl	normalizer0		@ movs does not write V, so test N only

@ ---- range check on the exponent ----
@  -> now computing  (r0 / r1) * 2^r2  with r0, r1 both in [2^31, 2^32)
checkoverflow:
	cmp	r2, #32
	bge	divoverflow		@ magnitude > 2^31: cannot fit

	mov	r6, #0			@ low word of remainder (mov keeps flags)
	mov	r7, #0			@ low word of divisor

	cmn	r2, #1			@ compare r2 with -1
	blt	divunderflow		@ r2 < -1: magnitude < 0.5, rounds to 0
	beq	roundoff		@ r2 == -1: no integer bits, only rounding

@ ---- restoring division, r2+1 rounds ----
divloop:
	cmp	r0, r1
	cmpeq	r6, r7
	blo	shiftbit		@ r0.r6 < r1.r7 (unsigned): bit is 0, C == 0
	subs	r6, r6, r7		@ 64 bit subtract, low word first ...
	sbcs	r0, r0, r1		@ ... then high word with the borrow
	@ remainder >= divisor here, so no borrow: C == 1

shiftbit:
	adc	r3, r3, r3		@ quotient = quotient*2 + C

	movs	r1, r1, lsr #1		@ divisor >>= 1 across both words
	mov	r7, r7, rrx

	subs	r2, r2, #1
	bge	divloop

@ ---- round to nearest: add 1 when remainder >= divisor/2 ----
roundoff:
	subs	r6, r6, r7
	sbcs	r0, r0, r1		@ C == 1 iff r0.r6 >= r1.r7
	adc	r3, r3, #0

	movs	r0, r3
	bmi	divoverflow		@ magnitude >= 2^31
@ now correct sign
	tst	r4, #1
	rsbne	r0, r0, #0		@ negate if needed
exitdiv:
	ldmfd	sp!, {r4-r7,pc}

@ divide by zero: saturate according to the sign of the dividend
divzero:
	tst	r0, r0			@ N = sign of dividend
	mov	r0, #0x80000000		@ (mov without S keeps the flags)
	mvnpl	r0, r0			@ dividend >= 0 -> 0x7fffffff
	b	exitdiv

@ overflow: saturate according to the sign of the result
divoverflow:
	tst	r4, #1
	mov	r0, #0x80000000		@ negative result -> 0x80000000
	mvneq	r0, r0			@ positive result -> 0x7fffffff
	b	exitdiv

@ underflow: result is closer to 0 than to +/-1
divunderflow:
	mov	r0, #0
	b	exitdiv

