; 65816-llvm-mos/runtime/src/libgcc.s

; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs.
; Provides the helpers that the LLVM backend lowers integer multiply,
; shift, divide, and modulo operations to.  Implementations are
; correct-but-unoptimised; they exist to unblock end-to-end testing,
; not to compete with hand-tuned 65816 math libraries.
;
; Calling convention (matches W65816ISelLowering::LowerCall):
;   - Arg 0 in A (16-bit M).
;   - Arg 1 pushed via PHA before the JSL.  Reads as (4,S) inside the
;     callee (3-byte JSL return address sits at 1..3,S).
;   - Return value in A.  Caller releases pushed args.
;   - Routines run in 16-bit M, 16-bit X (REP #$30 by convention).
;
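; Direct-page scratch lives at DP+$E0..DP+$FF (32 bytes): the i16
; helpers stay within $E0..$EF, the signed 32-bit helpers also use $F0,
; and the i64 helpers use $F0..$FF.  Programs that use this runtime must
; keep DP=0 or remap accordingly.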
;
; Assembled with: tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \
;                                                 -filetype=obj
;                                                 runtime/src/libgcc.s
;                                                 -o runtime/libgcc.o

	.text

; --------------------------------------------------------------------
; Indirect-call trampoline.  An indirect call (function pointer) stores
; the target's 16-bit address to __indirTarget before JSL'ing here.
; This routine does a JMP indirect through that variable: control
; transfers to the target with the original caller's JSL frame still
; on the stack, so target's RTL returns to the original caller (one
; frame, no double-RTL).
;
; Caller emit sequence in W65816ISelLowering::LowerCall:
;   sta __indirTarget    ; store ptr (must precede any A clobber for args)
;   ... arg pushes ...
;   jsl __jsl_indir
;
; Single-bank only (the IIgs convention assumes code in bank 0/1
; via JSL — JMP indirect is bank-local).
; --------------------------------------------------------------------
	; 2-byte slot holding the 16-bit address of the function to call.
	; The caller stores the target here before it JSLs to __jsl_indir.
	.globl __indirTarget
	.bss
__indirTarget:
	.zero 2

	.text
	; Trampoline entered via JSL.  It pushes nothing itself, so the
	; caller's 3-byte JSL return address is still on top of the stack
	; when the target starts executing.
	.globl __jsl_indir
__jsl_indir:
	; Hand-encoded JMP (__indirTarget): 6C is "jmp (a)" — the assembler
	; doesn't yet parse the `(abs)` syntax, so emit the bytes directly
	; with a 16-bit relocation against the variable.  Effective transfer:
	; PC <- mem[__indirTarget].
	; NOTE(review): on the 65816 the operand of JMP (a) names a pointer
	; in bank 0 and the jump stays in the current program bank, so
	; __indirTarget presumably has to be linked into bank 0 -- confirm
	; against the linker script.
	.byte	0x6C
	.word	__indirTarget

; --------------------------------------------------------------------
; __mulhi3 — 16-bit multiply.  A * (4,S) -> A.
; Signed and unsigned share an implementation: only the low 16 bits of
; the product are returned, which is identical for both.  Uses
; shift-and-add over the multiplier bits.
; --------------------------------------------------------------------
	.globl __mulhi3
; In:   A = multiplier, (4,S) = multiplicand.
; Out:  A = low 16 bits of the product.
; Uses: DP $e0 (multiplier bits still to process), $e2 (multiplicand,
;       doubled every round), $e4 (accumulated product).
__mulhi3:
	sta	0xe0
	lda	0x4, s
	sta	0xe2
	lda	#0x0
	sta	0xe4
	bra	.Lmul_test
.Lmul_step:
	lsr	0xe0		; C = next multiplier bit
	bcc	.Lmul_shift
	clc
	lda	0xe4
	adc	0xe2
	sta	0xe4
.Lmul_shift:
	asl	0xe2
.Lmul_test:
	lda	0xe0		; finished once no multiplier bits are left
	bne	.Lmul_step
	lda	0xe4
	rtl

; --------------------------------------------------------------------
; __ashlhi3 — A << (4,S) -> A.  Shift count is i16 but only the low 4
; bits are meaningful (counts >=16 are undefined behaviour in C).
; --------------------------------------------------------------------
	.globl __ashlhi3
; In:   A = value, (4,S) = shift count.
; Out:  A = value << count.  X serves as the counter and ends at 0.
__ashlhi3:
	pha			; park the value while A fetches the count
	lda	0x6, s		; count: 4,S became 6,S because of the PHA
	tax
	pla
	bra	.Lashl_test
.Lashl_step:
	asl	a
	dex
.Lashl_test:
	cpx	#0x0
	bne	.Lashl_step
	rtl

; --------------------------------------------------------------------
; __lshrhi3 — A logical >> (4,S) -> A.  Same shape as __ashlhi3 with
; LSR instead of ASL.
; --------------------------------------------------------------------
	.globl __lshrhi3
; In:   A = value, (4,S) = shift count.
; Out:  A = value >> count with zeros shifted in.  X serves as the
;       counter and ends at 0.
__lshrhi3:
	pha			; park the value while A fetches the count
	lda	0x6, s		; count: 4,S became 6,S because of the PHA
	tax
	pla
	bra	.Llshr_test
.Llshr_step:
	lsr	a
	dex
.Llshr_test:
	cpx	#0x0
	bne	.Llshr_step
	rtl

; --------------------------------------------------------------------
; __ashrhi3 — A arithmetic >> (4,S) -> A.  Sign bit is preserved by
; copying it into carry before each ROR via CMP #$8000 (which sets
; carry exactly when the sign bit is set on a 16-bit unsigned compare).
; --------------------------------------------------------------------
	.globl __ashrhi3
; In:   A = value, (4,S) = shift count.
; Out:  A = value >> count with the sign bit replicated.  X serves as
;       the counter and ends at 0.
__ashrhi3:
	pha			; park the value while A fetches the count
	lda	0x6, s		; count: 4,S became 6,S because of the PHA
	tax
	pla
	bra	.Lashr_test
.Lashr_step:
	cmp	#0x8000		; C = bit 15 of A
	ror	a		; shift right, re-inserting that bit on top
	dex
.Lashr_test:
	cpx	#0x0
	bne	.Lashr_step
	rtl

; --------------------------------------------------------------------
; __udivhi3 — A unsigned / (4,S) -> A.
; Restoring shift-subtract division.  Common helper; __umodhi3 reuses
; the algorithm and returns the remainder instead.
; Scratch:  $e6 = numerator,  $e8 = denominator,
;           $ea = quotient,   $ec = remainder.
; --------------------------------------------------------------------
	.globl __udivhi3
; In:   A = dividend, (4,S) = divisor.
; Out:  A = quotient, read from $ea after __udivmod_core has run.
__udivhi3:
	tax			; hold the dividend while A fetches the divisor
	lda	0x4, s
	sta	0xe8
	stx	0xe6
	jsr	__udivmod_core	; reloads X as its loop counter
	lda	0xea
	rtl

	.globl __umodhi3
; In:   A = dividend, (4,S) = divisor.
; Out:  A = remainder, read from $ec after __udivmod_core has run.
__umodhi3:
	tax			; hold the dividend while A fetches the divisor
	lda	0x4, s
	sta	0xe8
	stx	0xe6
	jsr	__udivmod_core	; reloads X as its loop counter
	lda	0xec
	rtl

; --------------------------------------------------------------------
; __divhi3 / __modhi3 — signed 16-bit divide and modulo.  Strategy:
; - Stash sign of dividend in $ee bit 0 (used by modulo).
; - Stash result sign of quotient (sign(a) XOR sign(b)) in $ee bit 1
;   (used by divide).
; - Take absolute values, run the unsigned core, then negate the
;   appropriate result if its sign bit is set.
; C99: quotient truncates toward zero; remainder takes the sign of the
; dividend.
; --------------------------------------------------------------------
	.globl __divhi3
; In:   A = dividend, (4,S) = divisor (signed).
; Out:  A = quotient.  __divmod_setup leaves |a|, |b| and the sign
;       tracker ($ee); the unsigned quotient in $ea is negated when
;       bit 1 of the tracker is set.
__divhi3:
	jsr	__divmod_setup
	jsr	__udivmod_core
	lda	0xee
	and	#0x2
	beq	.Ldiv_keep
	sec
	lda	#0x0
	sbc	0xea		; A = 0 - quotient
	rtl
.Ldiv_keep:
	lda	0xea
	rtl

	.globl __modhi3
; In:   A = dividend, (4,S) = divisor (signed).
; Out:  A = remainder.  The unsigned remainder in $ec is negated when
;       bit 0 of the sign tracker ($ee) is set, i.e. when the dividend
;       was negative.
__modhi3:
	jsr	__divmod_setup
	jsr	__udivmod_core
	lda	0xee
	and	#0x1
	beq	.Lmod_keep
	sec
	lda	#0x0
	sbc	0xec		; A = 0 - remainder
	rtl
.Lmod_keep:
	lda	0xec
	rtl

; --------------------------------------------------------------------
; __divmod_setup — common prologue for __divhi3/__modhi3.  Reads
; A=dividend and (4,S)=divisor (the public-entry stack frame is intact
; because we used JSR not JSL, so (4,S) still points to the user's
; pushed arg1 relative to the original JSL).  Computes |a| -> $e6,
; |b| -> $e8, and sign tracker -> $ee:
;   bit 0 = 1 if dividend was negative (modulo result sign)
;   bit 1 = 1 if dividend XOR divisor signs differ (quotient sign)
; Uses JSR/RTS, same bank.
; --------------------------------------------------------------------
__divmod_setup:
	; In:  A = dividend; the divisor is at (6,S) here: the public
	;      entry's (4,S) plus the 2-byte return address of our JSR.
	; Out: $e6 = |dividend|, $e8 = |divisor|, $ee = sign tracker.
	sta	0xe6		; provisional |a|
	lda	#0x0
	sta	0xee		; tracker starts out clear
	lda	0xe6
	bpl	.Lset_b
	; Dividend negative: bits 0 and 1 both set, then negate.
	lda	#0x3
	sta	0xee
	sec
	lda	#0x0
	sbc	0xe6
	sta	0xe6
.Lset_b:
	lda	0x6, s
	sta	0xe8		; provisional |b|
	bpl	.Lset_done
	; Divisor negative: toggle the quotient-sign bit, then negate.
	lda	0xee
	eor	#0x2
	sta	0xee
	sec
	lda	#0x0
	sbc	0xe8
	sta	0xe8
.Lset_done:
	rts

; --------------------------------------------------------------------
; __udivmod_core — internal restoring divide.  Inputs at $e6/$e8,
; outputs quotient at $ea, remainder at $ec.  JSR/RTS local helper.
; --------------------------------------------------------------------
__udivmod_core:
	; In:  $e6 = dividend (consumed), $e8 = divisor.
	; Out: $ea = quotient, $ec = remainder.  X counts the 16 rounds.
	ldx	#0x10
	lda	#0x0
	sta	0xec
	sta	0xea
.Lcore_next:
	asl	0xea		; make room for this round's quotient bit
	asl	0xe6		; top dividend bit -> C ...
	rol	0xec		; ... -> bottom of the remainder
	lda	0xec
	cmp	0xe8
	bcc	.Lcore_keep	; remainder < divisor: quotient bit stays 0
	sbc	0xe8		; C is already set by the CMP
	sta	0xec
	inc	0xea
.Lcore_keep:
	dex
	bne	.Lcore_next
	rts

; ====================================================================
; 32-bit (long / si) helpers.
;
; ABI for these is the natural extension of the i16 libcalls:
;   - arg0_lo in A, arg0_hi in X
;   - arg1_lo at (4,s)         (or shift count, for the shift helpers)
;   - arg1_hi at (6,s)
;   - return: result_lo in A, result_hi in X
;
; All are correct-but-unoptimised; goal is unblocking end-to-end builds,
; not winning a 65816 codegolf.
;
; Direct-page scratch for these (the i16 helpers above use a different
; layout of the same bytes):
;   $e0..$e3  = a (lo, hi)
;   $e4..$e7  = b (lo, hi)
;   $e8..$eb  = result / quotient (lo, hi)
;   $ec..$ef  = remainder (lo, hi)
;   $f0..$f1  = sign tracker for __divsi3 / __modsi3
; ====================================================================

; --------------------------------------------------------------------
; __mulsi3 — 32-bit multiply.  Shift-and-add over 32 bits of the
; multiplier.  Result = (a * b) mod 2^32.
;
; ABI: A = a_lo, X = a_hi (the i32-first-arg in A:X convention),
;      (4,s) = b_lo, (6,s) = b_hi.  Result returned in A:X (lo:hi).
; --------------------------------------------------------------------
	.globl __mulsi3
; In:   A = a_lo, X = a_hi, (4,S) = b_lo, (6,S) = b_hi.
; Out:  A = product lo, X = product hi (low 32 bits of a * b).
; Uses: $e0/$e2 multiplier, $e4/$e6 multiplicand, $e8/$ea product;
;       Y counts the 32 rounds.
__mulsi3:
	sta	0xe0
	stx	0xe2
	lda	0x4, s
	sta	0xe4
	lda	0x6, s
	sta	0xe6
	lda	#0x0
	sta	0xe8
	sta	0xea
	ldy	#0x20
.Lmulsi_loop:
	; 32-bit right shift of the multiplier: hi first, so its bit 0
	; drops into bit 15 of lo, and lo's old bit 0 ends up in C.
	lsr	0xe2
	ror	0xe0
	bcc	.Lmulsi_noadd
	; Bit was set: product += multiplicand (32-bit).
	clc
	lda	0xe8
	adc	0xe4
	sta	0xe8
	lda	0xea
	adc	0xe6
	sta	0xea
.Lmulsi_noadd:
	; Multiplicand <<= 1 for the next bit position.
	asl	0xe4
	rol	0xe6
	dey
	bne	.Lmulsi_loop
	ldx	0xea
	lda	0xe8
	rtl

; --------------------------------------------------------------------
; __ashlsi3 — (A:X) << (4,s) -> A:X.  Shift count is i16 in low byte;
; counts >= 32 are UB in C.  Uses a per-bit loop (cheap on 65816 — one
; ASL + ROL per bit).
;
; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = count.
; --------------------------------------------------------------------
	.globl __ashlsi3
; In:   A = value lo, X = value hi, (4,S) = shift count.
; Out:  A = result lo, X = result hi.
; Uses: $e0/$e2 as the working value; Y as the counter.
__ashlsi3:
	stx	0xe2
	sta	0xe0
	lda	0x4, s
	tay
	bra	.Lashlsi_test
.Lashlsi_step:
	asl	0xe0
	rol	0xe2		; bit 15 of lo carries into hi
	dey
.Lashlsi_test:
	cpy	#0x0
	bne	.Lashlsi_step
	ldx	0xe2
	lda	0xe0
	rtl

; --------------------------------------------------------------------
; __lshrsi3 — logical >> shift.  LSR hi, ROR lo: hi gets a 0, lo gets
; hi's old bit 0.  Per-bit loop.
; --------------------------------------------------------------------
	.globl __lshrsi3
; In:   A = value lo, X = value hi, (4,S) = shift count.
; Out:  A = result lo, X = result hi (zeros shifted in at the top).
; Uses: $e0/$e2 as the working value; Y as the counter.
__lshrsi3:
	stx	0xe2
	sta	0xe0
	lda	0x4, s
	tay
	bra	.Llshrsi_test
.Llshrsi_step:
	lsr	0xe2
	ror	0xe0		; bit 0 of hi carries into lo
	dey
.Llshrsi_test:
	cpy	#0x0
	bne	.Llshrsi_step
	ldx	0xe2
	lda	0xe0
	rtl

; --------------------------------------------------------------------
; __ashrsi3 — arithmetic >> shift.  Sign bit must be preserved on each
; iteration: copy bit 15 of hi into carry (via CMP #$8000), then ROR
; hi, ROR lo.  Per-bit loop.
; --------------------------------------------------------------------
	.globl __ashrsi3
; In:   A = value lo, X = value hi, (4,S) = shift count.
; Out:  A = result lo, X = result hi (sign bit replicated).
; Uses: $e0/$e2 as the working value; Y as the counter.
__ashrsi3:
	stx	0xe2
	sta	0xe0
	lda	0x4, s
	tay
	bra	.Lashrsi_test
.Lashrsi_step:
	lda	0xe2
	asl	a		; C = bit 15 of hi (A itself is scratch)
	ror	0xe2		; hi >>= 1 with the sign re-inserted
	ror	0xe0
	dey
.Lashrsi_test:
	cpy	#0x0
	bne	.Lashrsi_step
	ldx	0xe2
	lda	0xe0
	rtl

; --------------------------------------------------------------------
; __udivmodsi_core — internal 32-bit unsigned divide.  Inputs in
; $e0/$e2 (numerator) and $e4/$e6 (denominator); outputs quotient in
; $e8/$ea and remainder in $ec/$ee.  32-iteration restoring divide.
; JSR/RTS local helper.
; --------------------------------------------------------------------
__udivmodsi_core:
	; In:  $e0/$e2 = numerator (consumed), $e4/$e6 = denominator.
	; Out: $e8/$ea = quotient, $ec/$ee = remainder.
	; Y counts the 32 rounds.
	lda	#0x0
	sta	0xec
	sta	0xee
	sta	0xe8
	sta	0xea
	ldy	#0x20
.Lcoresi_next:
	; Make room for this round's quotient bit.
	asl	0xe8
	rol	0xea
	; Move the top numerator bit into the bottom of the remainder.
	asl	0xe0
	rol	0xe2
	rol	0xec
	rol	0xee
	; 32-bit unsigned compare: the high words decide unless they are
	; equal, in which case the low words do.  Either way C ends up
	; set exactly when remainder >= denominator.
	lda	0xee
	cmp	0xe6
	bne	.Lcoresi_decided
	lda	0xec
	cmp	0xe4
.Lcoresi_decided:
	bcc	.Lcoresi_keep
	; remainder -= denominator; quotient bit 0 = 1.
	sec
	lda	0xec
	sbc	0xe4
	sta	0xec
	lda	0xee
	sbc	0xe6
	sta	0xee
	inc	0xe8
.Lcoresi_keep:
	dey
	bne	.Lcoresi_next
	rts

; --------------------------------------------------------------------
; __udivsi3 — unsigned 32/32 -> 32 divide.
; --------------------------------------------------------------------
	.globl __udivsi3
; In:   A = a_lo, X = a_hi, (4,S) = b_lo, (6,S) = b_hi.
; Out:  A = quotient lo, X = quotient hi ($e8/$ea after the core).
__udivsi3:
	stx	0xe2
	sta	0xe0
	lda	0x6, s
	sta	0xe6
	lda	0x4, s
	sta	0xe4
	jsr	__udivmodsi_core
	ldx	0xea
	lda	0xe8
	rtl

; --------------------------------------------------------------------
; __umodsi3 — unsigned 32/32 -> 32 modulo.
; --------------------------------------------------------------------
	.globl __umodsi3
; In:   A = a_lo, X = a_hi, (4,S) = b_lo, (6,S) = b_hi.
; Out:  A = remainder lo, X = remainder hi ($ec/$ee after the core).
__umodsi3:
	stx	0xe2
	sta	0xe0
	lda	0x6, s
	sta	0xe6
	lda	0x4, s
	sta	0xe4
	jsr	__udivmodsi_core
	ldx	0xee
	lda	0xec
	rtl

; --------------------------------------------------------------------
; __divsi3 / __modsi3 — signed 32-bit divide / modulo.  Strategy mirrors
; the i16 helpers: stash signs, take abs, run unsigned core, negate
; result(s) as needed.  Sign tracker bits in $f0:
;   bit 0 = dividend was negative (modulo result sign)
;   bit 1 = quotient sign (sign(a) XOR sign(b))
; --------------------------------------------------------------------
	.globl __divsi3
; In:   A = a_lo, X = a_hi, (4,S) = b_lo, (6,S) = b_hi (signed).
; Out:  A = quotient lo, X = quotient hi.  The unsigned quotient in
;       $e8/$ea is negated when bit 1 of the sign tracker ($f0) is set.
__divsi3:
	jsr	__divmodsi_setup
	jsr	__udivmodsi_core
	lda	0xf0
	and	#0x2
	beq	.Ldivsi_keep
	; quotient = 0 - quotient (borrow ripples from lo into hi).
	sec
	lda	#0x0
	sbc	0xe8
	sta	0xe8
	lda	#0x0
	sbc	0xea
	sta	0xea
.Ldivsi_keep:
	ldx	0xea
	lda	0xe8
	rtl

	.globl __modsi3
; In:   A = a_lo, X = a_hi, (4,S) = b_lo, (6,S) = b_hi (signed).
; Out:  A = remainder lo, X = remainder hi.  The unsigned remainder in
;       $ec/$ee is negated when bit 0 of the sign tracker ($f0) is
;       set, i.e. when the dividend was negative.
__modsi3:
	jsr	__divmodsi_setup
	jsr	__udivmodsi_core
	lda	0xf0
	and	#0x1
	beq	.Lmodsi_keep
	; remainder = 0 - remainder (borrow ripples from lo into hi).
	sec
	lda	#0x0
	sbc	0xec
	sta	0xec
	lda	#0x0
	sbc	0xee
	sta	0xee
.Lmodsi_keep:
	ldx	0xee
	lda	0xec
	rtl

; --------------------------------------------------------------------
; __divmodsi_setup — common prologue for __divsi3 / __modsi3.
; Reads A=a_lo, X=a_hi (i32-first-arg ABI), (4,s)=b_lo, (6,s)=b_hi.
; Writes |a| to $e0/$e2, |b| to $e4/$e6, sign bits to $f0.  JSR/RTS.
; After JSR's 2-byte ret push, callee-relative offsets are (6,s)=b_lo,
; (8,s)=b_hi.
; --------------------------------------------------------------------
__divmodsi_setup:
	; In:  A = a_lo, X = a_hi; b sits at (6,S)/(8,S) here, i.e. the
	;      public entry's (4,S)/(6,S) plus our 2-byte JSR return.
	; Out: $e0/$e2 = |a|, $e4/$e6 = |b|, $f0 = sign tracker.
	sta	0xe0		; provisional |a|
	stx	0xe2
	lda	#0x0
	sta	0xf0		; tracker starts out clear
	lda	0xe2
	bpl	.Lsetsi_b
	; a negative: bits 0 and 1 both set, then a = 0 - a.
	lda	#0x3
	sta	0xf0
	sec
	lda	#0x0
	sbc	0xe0
	sta	0xe0
	lda	#0x0
	sbc	0xe2
	sta	0xe2
.Lsetsi_b:
	lda	0x6, s
	sta	0xe4		; provisional |b|
	lda	0x8, s
	sta	0xe6
	bpl	.Lsetsi_done
	; b negative: toggle the quotient-sign bit, then b = 0 - b.
	lda	0xf0
	eor	#0x2
	sta	0xf0
	sec
	lda	#0x0
	sbc	0xe4
	sta	0xe4
	lda	#0x0
	sbc	0xe6
	sta	0xe6
.Lsetsi_done:
	rts

; ====================================================================
; i64 (long long) helpers.
;
; Calling convention (i64 first arg is split via i32-first-arg path):
;   A   = arg0_lo[0..15]    (lowest word)
;   X   = arg0_lo[16..31]
;   4,S = arg0_hi[0..15]
;   6,S = arg0_hi[16..31]   (highest word)
;   For binary ops (mul/div/mod), arg1 follows on the stack:
;   8,S = arg1_lo[0..15]
;   10,S = arg1_lo[16..31]
;   12,S = arg1_hi[0..15]
;   14,S = arg1_hi[16..31]
;   For shift ops, the count occupies a single i16 at 8,S.
;
; Return ABI (matches LowerReturn for i64):
;   A   = result_lo[0..15]
;   X   = result_lo[16..31]
;   Y   = result_hi[0..15]
;   DP $F0..$F1 = result_hi[16..31]
;
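; Scratch DP layout (per-libcall, no overlap between concurrent calls):
;   $E0..$E7 = a, later the result handed to __retdi (8 bytes; 4 16-bit words)
;   $E8..$EF = b (8 bytes)
;   $F2..$F9 = product (__muldi3) or remainder (__udivmoddi_core)
;   $FA..$FF = trial-subtraction scratch (__udivmoddi_core)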
;
; All routines run with REP #$30 (M=0, X=0).
; ====================================================================

; --------------------------------------------------------------------
; __divmoddi4_stash — common entry point.  Stashes a -> $E0..$E7,
; b -> $E8..$EF.  Used by __udivdi3 / __umoddi3 / __divdi3 / __moddi3
; setup; signed variants flip signs around it.
; --------------------------------------------------------------------
__divmoddi4_stash:
	; In:  A = a word 0, X = a word 1; the remaining six words are the
	;      public entry's stack arguments.
	; Out: $E0..$E7 = a, $E8..$EF = b.
	;
	; This helper is reached with JSR from a routine that was itself
	; entered with JSL, so the stack holds our 2-byte return address
	; below the 3-byte JSL return address.  The public entry's 4,S is
	; therefore 6,S here, and every argument offset is shifted by 2
	; (same adjustment as __divmod_setup / __divmodsi_setup).
	sta	0xe0			; a_lo_lo
	stx	0xe2			; a_lo_hi
	lda	0x6, s
	sta	0xe4			; a_hi_lo
	lda	0x8, s
	sta	0xe6			; a_hi_hi
	lda	0xa, s
	sta	0xe8			; b_lo_lo
	lda	0xc, s
	sta	0xea			; b_lo_hi
	lda	0xe, s
	sta	0xec			; b_hi_lo
	lda	0x10, s
	sta	0xee			; b_hi_hi
	rts

; --------------------------------------------------------------------
; Helper: pack the result at $E0..$E7 into the i64 return ABI (A, X, Y
; and DP $F0) and execute the RTL on behalf of the libcall.  Reached by
; BRL from the public entry points, never by JSR.
; --------------------------------------------------------------------
__retdi:
	; Loads the four result words from $E0..$E6 into the return
	; locations, lowest word in A, then returns with RTL.
	lda	0xe4
	tay			; Y   = word 2
	lda	0xe6
	sta	0xf0		; $F0 = word 3
	ldx	0xe2		; X   = word 1
	lda	0xe0		; A   = word 0
	rtl

; --------------------------------------------------------------------
; __ashldi3 — i64 left shift by n.  Per-bit loop.  Y holds count.
; --------------------------------------------------------------------
	.globl __ashldi3
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count.
; Out:  value << count, handed back through __retdi.
; Uses: $E0..$E7 as the working value; Y as the counter.
__ashldi3:
	stx	0xe2
	sta	0xe0
	lda	0x6, s
	sta	0xe6
	lda	0x4, s
	sta	0xe4
	lda	0x8, s
	tay
	bra	.Lashldi_test
.Lashldi_step:
	asl	0xe0		; carry ripples upward through all four words
	rol	0xe2
	rol	0xe4
	rol	0xe6
	dey
.Lashldi_test:
	cpy	#0x0
	bne	.Lashldi_step
	brl	__retdi

; --------------------------------------------------------------------
; __lshrdi3 — i64 logical right shift.  LSR top word, ROR rest.
; --------------------------------------------------------------------
	.globl __lshrdi3
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count.
; Out:  value >> count (zeros shifted in), handed back through
;       __retdi.
; Uses: $E0..$E7 as the working value; Y as the counter.
__lshrdi3:
	stx	0xe2
	sta	0xe0
	lda	0x6, s
	sta	0xe6
	lda	0x4, s
	sta	0xe4
	lda	0x8, s
	tay
	bra	.Llshrdi_test
.Llshrdi_step:
	lsr	0xe6		; carry ripples downward from the top word
	ror	0xe4
	ror	0xe2
	ror	0xe0
	dey
.Llshrdi_test:
	cpy	#0x0
	bne	.Llshrdi_step
	brl	__retdi

; --------------------------------------------------------------------
; __ashrdi3 — i64 arithmetic right shift.  Same as __lshrdi3 except
; that the top bit replicates: before each shift, bit 15 of the top
; word ($E6) is copied into carry, so the ROR chain shifts the sign
; back in.
; --------------------------------------------------------------------
	.globl __ashrdi3
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count.
; Out:  value >> count with the sign replicated, handed back through
;       __retdi.
; Uses: $E0..$E7 as the working value; Y as the counter.
__ashrdi3:
	stx	0xe2
	sta	0xe0
	lda	0x6, s
	sta	0xe6
	lda	0x4, s
	sta	0xe4
	lda	0x8, s
	tay
	bra	.Lashrdi_test
.Lashrdi_step:
	lda	0xe6
	cmp	#0x8000		; C = bit 15 of the top word
	ror	0xe6		; top word >>= 1 with the sign re-inserted
	ror	0xe4
	ror	0xe2
	ror	0xe0
	dey
.Lashrdi_test:
	cpy	#0x0
	bne	.Lashrdi_step
	brl	__retdi

; --------------------------------------------------------------------
; __muldi3 — i64 multiply (low 64 bits of 64x64 product).
; Shift-and-add over a (64 bits).  Product accumulates at $F2..$F9
; (above the return DP slot, scratch).  Need a fresh 8-byte product
; slot since $E0..$EF holds operands.
; --------------------------------------------------------------------
	.globl __muldi3
; In:   operands a and b, copied to $E0..$E7 / $E8..$EF by
;       __divmoddi4_stash.
; Out:  low 64 bits of a * b, handed back through __retdi.
; Uses: $F2..$F9 as the product accumulator; Y counts the 64 rounds.
__muldi3:
	jsr	__divmoddi4_stash
	; Clear the product words at $F2, $F4, $F6, $F8.
	lda	#0x0
	sta	0xf2
	sta	0xf4
	sta	0xf6
	sta	0xf8
	; One round per bit of a.
	ldy	#0x40
.Lmuldi_loop:
	; 64-bit logical right shift of a.  It has to start at the most
	; significant word: each ROR then receives the bit dropped by the
	; word above it, and the carry left at the end is a's old bit 0.
	; (Starting at $E0 would push bits toward the top word instead
	; and test bit 0 of $E6.)
	lsr	0xe6
	ror	0xe4
	ror	0xe2
	ror	0xe0
	bcc	.Lmuldi_noadd
	; Bit was set: product ($F2..$F8) += b ($E8..$EE).
	clc
	lda	0xf2
	adc	0xe8
	sta	0xf2
	lda	0xf4
	adc	0xea
	sta	0xf4
	lda	0xf6
	adc	0xec
	sta	0xf6
	lda	0xf8
	adc	0xee
	sta	0xf8
.Lmuldi_noadd:
	; b <<= 1 so the next round adds it at the next bit position.
	asl	0xe8
	rol	0xea
	rol	0xec
	rol	0xee
	dey
	bne	.Lmuldi_loop
	; Move the product into the return slots ($E0..$E7) for __retdi.
	lda	0xf2
	sta	0xe0
	lda	0xf4
	sta	0xe2
	lda	0xf6
	sta	0xe4
	lda	0xf8
	sta	0xe6
	brl	__retdi

; --------------------------------------------------------------------
; __ucmpdi2 — unsigned i64 compare.  Returns 0 if a<b, 1 if a==b,
; 2 if a>b (libgcc convention).  We emit i16 result in A (with the
; high bytes don't-care).
; --------------------------------------------------------------------
	.globl __ucmpdi2
; In:   operands a and b, copied to $E0..$E7 / $E8..$EF by
;       __divmoddi4_stash.
; Out:  A = 0 if a < b, 1 if a == b, 2 if a > b (unsigned).
__ucmpdi2:
	jsr	__divmoddi4_stash
	; Walk from the most significant word down; the first pair that
	; differs decides, and its CMP leaves C = (a word >= b word).
	lda	0xe6
	cmp	0xee
	bne	.Lucmpdi_differ
	lda	0xe4
	cmp	0xec
	bne	.Lucmpdi_differ
	lda	0xe2
	cmp	0xea
	bne	.Lucmpdi_differ
	lda	0xe0
	cmp	0xe8
	beq	.Lucmpdi_same
.Lucmpdi_differ:
	bcc	.Lucmpdi_less
	lda	#0x2
	rtl
.Lucmpdi_less:
	lda	#0x0
	rtl
.Lucmpdi_same:
	lda	#0x1
	rtl

; --------------------------------------------------------------------
; __cmpdi2 — signed i64 compare.  Same {0,1,2} return convention.
; Implemented by flipping the high-word sign bits before doing an
; unsigned compare ($N XOR $8000 swaps the signed-int order to
; unsigned-int order).
; --------------------------------------------------------------------
	.globl __cmpdi2
; In:   operands a and b, copied to $E0..$E7 / $E8..$EF by
;       __divmoddi4_stash.
; Out:  A = 0 if a < b, 1 if a == b, 2 if a > b (signed).
__cmpdi2:
	jsr	__divmoddi4_stash
	; Toggle the sign bit of both top words so that an unsigned
	; word-by-word compare yields the signed ordering.
	lda	0xee
	eor	#0x8000
	sta	0xee
	lda	0xe6
	eor	#0x8000
	sta	0xe6
	cmp	0xee		; A still holds the adjusted top word of a
	bne	.Lcmpdi_differ
	lda	0xe4
	cmp	0xec
	bne	.Lcmpdi_differ
	lda	0xe2
	cmp	0xea
	bne	.Lcmpdi_differ
	lda	0xe0
	cmp	0xe8
	beq	.Lcmpdi_same
.Lcmpdi_differ:
	bcc	.Lcmpdi_less
	lda	#0x2
	rtl
.Lcmpdi_less:
	lda	#0x0
	rtl
.Lcmpdi_same:
	lda	#0x1
	rtl

; --------------------------------------------------------------------
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo.  Restoring
; division: shift dividend left into a remainder register, conditionally
; subtract the divisor.  The two libcalls share the core; quotient
; lands at $E0..$E7, remainder at $F2..$F9.  __udivdi3 returns the
; quotient as-is; __umoddi3 copies the remainder over $E0..$E7 first.
; --------------------------------------------------------------------
	.globl __udivdi3
; Out:  the quotient that __udivmoddi_core leaves in $E0..$E7,
;       handed back through __retdi.
__udivdi3:
	jsr	__divmoddi4_stash
	jsr	__udivmoddi_core
	brl	__retdi

	.globl __umoddi3
; Out:  the remainder that __udivmoddi_core leaves in $F2..$F8,
;       copied into the $E0..$E7 return slots for __retdi.
__umoddi3:
	jsr	__divmoddi4_stash
	jsr	__udivmoddi_core
	lda	0xf8
	sta	0xe6
	lda	0xf6
	sta	0xe4
	lda	0xf4
	sta	0xe2
	lda	0xf2
	sta	0xe0
	brl	__retdi

; Unsigned 64-bit restoring divide (JSR/RTS helper).
; In:   $E0..$E6 = dividend, $E8..$EE = divisor.
; Out:  $E0..$E6 = quotient, $F2..$F8 = remainder.
; Uses: $FA/$FC/$FE for the low words of the trial subtraction;
;       Y counts the 64 rounds.
__udivmoddi_core:
	lda	#0x0
	sta	0xf8
	sta	0xf6
	sta	0xf4
	sta	0xf2
	ldy	#0x40
.Ludivmoddi_next:
	; Treat remainder:dividend as one 128-bit register and shift it
	; left: the top dividend bit enters the remainder, and bit 0 of
	; the dividend opens up for this round's quotient bit.
	asl	0xe0
	rol	0xe2
	rol	0xe4
	rol	0xe6
	rol	0xf2
	rol	0xf4
	rol	0xf6
	rol	0xf8
	; Trial subtraction remainder - divisor; the top word stays in A.
	sec
	lda	0xf2
	sbc	0xe8
	sta	0xfa
	lda	0xf4
	sbc	0xea
	sta	0xfc
	lda	0xf6
	sbc	0xec
	sta	0xfe
	lda	0xf8
	sbc	0xee
	bcc	.Ludivmoddi_keep	; borrow: remainder < divisor
	; No borrow: commit the difference and record a 1 bit.
	sta	0xf8
	lda	0xfa
	sta	0xf2
	lda	0xfc
	sta	0xf4
	lda	0xfe
	sta	0xf6
	inc	0xe0			; bit 0 was cleared by the ASL above
.Ludivmoddi_keep:
	dey
	bne	.Ludivmoddi_next
	rts

; --------------------------------------------------------------------
; __divdi3 / __moddi3 — signed 64-bit divide / modulo.  Take absolute
; values, run the unsigned core, fix up the sign.
;   div: sign(quotient) = sign(a) XOR sign(b)
;   mod: sign(remainder) = sign(a)
; --------------------------------------------------------------------
	.globl __divdi3
; In:   operands a and b, copied to $E0..$E7 / $E8..$EF by
;       __divmoddi4_stash.
; Out:  signed quotient, handed back through __retdi.
__divdi3:
	jsr	__divmoddi4_stash
	; Quotient sign = bit 15 of (top word of a XOR top word of b).
	; It is kept on the stack rather than at $FA, because
	; __udivmoddi_core stores its trial-subtraction words at
	; $FA/$FC/$FE on every round and would overwrite it.
	lda	0xe6
	eor	0xee
	and	#0x8000
	pha
	jsr	__absdi_a
	jsr	__absdi_b
	jsr	__udivmoddi_core
	pla			; Z set when the quotient stays non-negative
	beq	.Ldivdi_pos
	jsr	__negdi_a	; negate the quotient in $E0..$E6
.Ldivdi_pos:
	brl	__retdi

	.globl __moddi3
; In:   operands a and b, copied to $E0..$E7 / $E8..$EF by
;       __divmoddi4_stash.
; Out:  signed remainder (sign follows a), handed back through
;       __retdi.
__moddi3:
	jsr	__divmoddi4_stash
	; Remainder sign = bit 15 of the top word of a.  It is kept on
	; the stack rather than at $FA, because __udivmoddi_core stores
	; its trial-subtraction words at $FA/$FC/$FE on every round and
	; would overwrite it.
	lda	0xe6
	and	#0x8000
	pha
	jsr	__absdi_a
	jsr	__absdi_b
	jsr	__udivmoddi_core
	; Move the remainder ($F2..$F8) into the return slots.
	lda	0xf2
	sta	0xe0
	lda	0xf4
	sta	0xe2
	lda	0xf6
	sta	0xe4
	lda	0xf8
	sta	0xe6
	pla			; Z set when the remainder stays non-negative
	beq	.Lmoddi_pos
	jsr	__negdi_a	; negate the remainder in $E0..$E6
.Lmoddi_pos:
	brl	__retdi

; --- subroutines used by signed div/mod ---

; __absdi_a: replace the 64-bit value at $E0..$E6 with its negation
; when bit 15 of $E6 is set; otherwise leave it untouched.
__absdi_a:
	lda	0xe6
	bpl	.Labsdi_a_keep
	brl	__negdi_a	; tail branch: its RTS returns to our caller
.Labsdi_a_keep:
	rts

; __absdi_b: replace the 64-bit value at $E8..$EE with its negation
; when bit 15 of $EE is set; otherwise leave it untouched.
__absdi_b:
	lda	0xee
	bpl	.Labsdi_b_keep
	brl	__negdi_b	; tail branch: its RTS returns to our caller
.Labsdi_b_keep:
	rts

; __negdi_a: two's-complement negate the 64-bit value at $E0..$E6
; (invert every word, add 1, let the carry ripple upward).
__negdi_a:
	clc
	lda	0xe0
	eor	#0xffff
	adc	#0x1
	sta	0xe0
	lda	0xe2
	eor	#0xffff
	adc	#0x0
	sta	0xe2
	lda	0xe4
	eor	#0xffff
	adc	#0x0
	sta	0xe4
	lda	0xe6
	eor	#0xffff
	adc	#0x0
	sta	0xe6
	rts

; __negdi_b: two's-complement negate the 64-bit value at $E8..$EE
; (invert every word, add 1, let the carry ripple upward).
__negdi_b:
	clc
	lda	0xe8
	eor	#0xffff
	adc	#0x1
	sta	0xe8
	lda	0xea
	eor	#0xffff
	adc	#0x0
	sta	0xea
	lda	0xec
	eor	#0xffff
	adc	#0x0
	sta	0xec
	lda	0xee
	eor	#0xffff
	adc	#0x0
	sta	0xee
	rts

; --------------------------------------------------------------------
; setjmp(jmp_buf env) - save calling environment, return 0
; longjmp(jmp_buf env, int val) - restore environment, return val (or 1 if val == 0)
;
; jmp_buf layout (8 bytes):
;   [0..1]  = caller's stack pointer (SP+3 at entry to setjmp)
;   [2..3]  = return address PC lo:hi (16 bits)
;   [4]     = return address bank (1 byte)
;   [5..6]  = direct page register (DP)
;   [7]     = reserved / padding
;
; Caller-save convention: longjmp doesn't restore X / Y / A — caller's
; setjmp returned 0 with all-callee-savable regs already preserved by
; setjmp's caller.
; --------------------------------------------------------------------
	.globl setjmp
; In:   A = address of the jmp_buf.
; Out:  A = 0.  Bytes 0..6 of the buffer are written; byte 7 is left
;       untouched.
; Uses: DP $e0 as the buffer pointer; Y as the store offset.
setjmp:
	sta	0xe0
	tsc
	clc
	adc	#0x3		; step over the 3-byte JSL return address
	ldy	#0
	sta	(0xe0), y	; env[0..1] = caller's SP
	; The 3-byte return address is copied with two overlapping
	; 16-bit stores, so M never has to leave 16-bit mode.
	lda	0x2, s		; return address hi byte + bank
	ldy	#3
	sta	(0xe0), y	; env[3..4]
	lda	0x1, s		; return address lo:hi
	ldy	#2
	sta	(0xe0), y	; env[2..3]
	tdc
	ldy	#5
	sta	(0xe0), y	; env[5..6] = DP
	lda	#0
	rtl

	.globl longjmp
; In:   A = address of the jmp_buf, (4,S) = val.
; Out:  does not return to its caller.  Control resumes at the return
;       address recorded by setjmp, with SP and DP as recorded and
;       A = val (or 1 when val is 0).
; Uses: DP $e0 (buffer pointer) and $e2 (val) under the DP in effect
;       on entry; Y as the load offset.
longjmp:
	sta	0xe0		; jmp_buf addr -> DP scratch
	lda	0x4, s		; val must be fetched before SP moves
	sta	0xe2
	; Restore SP to the recorded value itself.  The two pushes below
	; lower it by exactly the 3 bytes that the final RTL pulls again,
	; so SP ends up where it was when setjmp returned.  (Subtracting
	; 3 here as well would leave SP 3 bytes too low afterwards.)
	ldy	#0
	lda	(0xe0), y
	tcs
	; Rebuild the JSL frame: bank first, then the 16-bit address
	; (RTL pulls lo, hi, bank).
	sep	#0x20
	ldy	#4
	lda	(0xe0), y	; bank
	pha
	rep	#0x20
	ldy	#2
	lda	(0xe0), y	; lo:hi
	pha
	; Carry val across the DP switch on the stack: $e2 was written
	; under the current DP and might not be reachable after TCD.
	lda	0xe2
	pha
	ldy	#5
	lda	(0xe0), y
	tcd			; DP = recorded value
	pla			; A = val, Z set when it is 0
	bne	.Llj_done
	lda	#1
.Llj_done:
	rtl