65816-llvm-mos/runtime/src/libgcc.s
Scott Duensing 6d7eae0356 Checkpoint.
2026-04-30 01:29:16 -05:00

1180 lines
27 KiB
ArmAsm

; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs.
; Provides the helpers that the LLVM backend lowers integer multiply,
; shift, divide, and modulo operations to. Implementations are
; correct-but-unoptimised; they exist to unblock end-to-end testing,
; not to compete with hand-tuned 65816 math libraries.
;
; Calling convention (matches W65816ISelLowering::LowerCall):
; - Arg 0 in A (16-bit M).
; - Arg 1 pushed via PHA before the JSL. Reads as (4,S) inside the
; callee (3-byte JSL return address sits at 1..3,S).
; - Return value in A. Caller releases pushed args.
; - Routines run in 16-bit M, 16-bit X (REP #$30 by convention).
;
; Direct-page scratch lives at DP+$E0..DP+$FF (32 bytes: the 16-bit
; helpers stay within $E0..$EF, the 32- and 64-bit helpers also use
; $F0..$FF). Programs that use this runtime must keep DP=0 or remap
; accordingly.
;
; Assembled with: tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \
; -filetype=obj
; runtime/src/libgcc.s
; -o runtime/libgcc.o
.text
; --------------------------------------------------------------------
; Indirect-call trampoline. An indirect call (function pointer) stores
; the target's 16-bit address to __indirTarget before JSL'ing here.
; This routine does a JMP indirect through that variable: control
; transfers to the target with the original caller's JSL frame still
; on the stack, so target's RTL returns to the original caller (one
; frame, no double-RTL).
;
; Caller emit sequence in W65816ISelLowering::LowerCall:
; sta __indirTarget ; store ptr (must precede any A clobber for args)
; ... arg pushes ...
; jsl __jsl_indir
;
; Single-bank only (the IIgs convention assumes code in bank 0/1
; via JSL — JMP indirect is bank-local).
; --------------------------------------------------------------------
.globl __indirTarget
.bss
; 16-bit slot holding the address of the next indirect-call target.
; Callers store the function pointer here before `jsl __jsl_indir`.
__indirTarget:
.zero 2
.text
.globl __jsl_indir
; __jsl_indir — jump through __indirTarget without pushing anything, so
; the JSL frame pushed by the original caller is the one the target's
; RTL consumes.
; NOTE(review): on the 65816 the pointer operand of JMP (abs) is fetched
; from bank 0, so __indirTarget presumably has to be linked into bank 0
; — confirm against the linker layout.
__jsl_indir:
; Hand-encoded JMP (__indirTarget): 6C is "jmp (a)" — the assembler
; doesn't yet parse the `(abs)` syntax, so emit the bytes directly
; with a 16-bit relocation against the variable. Effective transfer:
; PC <- mem[__indirTarget].
.byte 0x6C
.word __indirTarget
; --------------------------------------------------------------------
; __mulhi3 — 16-bit multiply, low 16 bits of A * (4,S) returned in A.
; One routine serves signed and unsigned callers because the low half
; of the product does not depend on signedness.
;
; In:   A = multiplier, (4,S) = multiplicand
; Out:  A = product mod 2^16
; Uses: DP $e0 (multiplier), $e2 (multiplicand), $e4 (product)
;
; Shift-and-add: each pass consumes the lowest multiplier bit and
; doubles the multiplicand; the loop ends as soon as no multiplier
; bits remain.
; --------------------------------------------------------------------
        .globl  __mulhi3
__mulhi3:
        sta     0xe0            ; multiplier bits still to process
        lda     #0x0
        sta     0xe4            ; product accumulator = 0
        lda     0x4, s
        sta     0xe2            ; multiplicand, doubled every pass
.Lmulhi_next_bit:
        lda     0xe0
        beq     .Lmulhi_finish  ; multiplier exhausted
        lsr     0xe0            ; C = lowest remaining multiplier bit
        bcc     .Lmulhi_double
        lda     0xe4            ; bit was 1: product += multiplicand
        clc
        adc     0xe2
        sta     0xe4
.Lmulhi_double:
        asl     0xe2
        bra     .Lmulhi_next_bit
.Lmulhi_finish:
        lda     0xe4
        rtl
; --------------------------------------------------------------------
; __ashlhi3 — 16-bit left shift: A << (4,S) -> A.
;
; In:   A = value, (4,S) = shift count (i16)
; Out:  A = shifted value; X = 0
;
; The whole 16-bit count is used as the loop counter, one ASL per
; step. Counts >= 16 are undefined behaviour in C; here they simply
; shift every bit out.
; --------------------------------------------------------------------
        .globl  __ashlhi3
__ashlhi3:
        pha                     ; park the value: A is needed for the count
        lda     0x6, s          ; count is at (4,S) on entry, +2 after the PHA
        tax
        pla                     ; value back in A, stack as on entry
.Lashlhi_step:
        cpx     #0x0
        beq     .Lashlhi_ret
        asl     a
        dex
        bra     .Lashlhi_step
.Lashlhi_ret:
        rtl
; --------------------------------------------------------------------
; __lshrhi3 — 16-bit logical right shift: A >> (4,S) -> A, zero fill.
;
; In:   A = value, (4,S) = shift count (i16)
; Out:  A = shifted value; X = 0
;
; Same structure as __ashlhi3 with LSR as the per-step shift.
; --------------------------------------------------------------------
        .globl  __lshrhi3
__lshrhi3:
        pha                     ; park the value while A fetches the count
        lda     0x6, s          ; count: (4,S) on entry, +2 for the PHA
        tax
        pla
.Llshrhi_step:
        cpx     #0x0
        beq     .Llshrhi_ret
        lsr     a
        dex
        bra     .Llshrhi_step
.Llshrhi_ret:
        rtl
; --------------------------------------------------------------------
; __ashrhi3 — 16-bit arithmetic right shift: A >> (4,S) -> A with the
; sign bit replicated.
;
; In:   A = value, (4,S) = shift count (i16)
; Out:  A = shifted value; X = 0
;
; Each step first loads the sign into carry with CMP #$8000 (an
; unsigned compare that sets C exactly when bit 15 is 1) and then
; rotates that carry back in at the top with ROR.
; --------------------------------------------------------------------
        .globl  __ashrhi3
__ashrhi3:
        pha                     ; park the value while A fetches the count
        lda     0x6, s          ; count: (4,S) on entry, +2 for the PHA
        tax
        pla
.Lashrhi_step:
        cpx     #0x0
        beq     .Lashrhi_ret
        cmp     #0x8000         ; C = sign bit
        ror     a               ; shift right, sign re-enters at bit 15
        dex
        bra     .Lashrhi_step
.Lashrhi_ret:
        rtl
; --------------------------------------------------------------------
; __udivhi3 — unsigned 16-bit divide: A / (4,S) -> A.
;
; Thin wrapper: places the operands in the scratch slots read by
; __udivmod_core, runs it, and returns the quotient slot. __umodhi3
; is the same wrapper returning the remainder slot.
; Scratch: $e6 = numerator,  $e8 = denominator,
;          $ea = quotient,   $ec = remainder.
; --------------------------------------------------------------------
        .globl  __udivhi3
__udivhi3:
        sta     0xe6            ; dividend
        lda     0x4, s
        sta     0xe8            ; divisor
        jsr     __udivmod_core  ; near call; the JSL frame is intact afterwards
        lda     0xea            ; return the quotient
        rtl
; __umodhi3 — unsigned 16-bit modulo: A % (4,S) -> A.
; Loads $e6/$e8 exactly as __udivhi3 does, runs the shared core, and
; returns the remainder slot ($ec).
        .globl  __umodhi3
__umodhi3:
        sta     0xe6            ; dividend
        lda     0x4, s
        sta     0xe8            ; divisor
        jsr     __udivmod_core
        lda     0xec            ; return the remainder
        rtl
; --------------------------------------------------------------------
; __divhi3 / __modhi3 — signed 16-bit divide and modulo.
;
; Both routines share one plan:
;   1. __divmod_setup stores |a| in $e6, |b| in $e8 and records signs
;      in $ee: bit 0 = dividend negative, bit 1 = signs differ.
;   2. __udivmod_core divides the magnitudes.
;   3. The selected result is negated when its tracker bit is set.
; This gives C99 semantics: the quotient truncates toward zero and the
; remainder carries the sign of the dividend.
;
; __divhi3: A / (4,S) -> A.
; --------------------------------------------------------------------
        .globl  __divhi3
__divhi3:
        jsr     __divmod_setup
        jsr     __udivmod_core  ; unsigned quotient -> $ea
        lda     0xee
        and     #0x2            ; bit 1: operand signs differed
        beq     .Ldivhi_keep
        lda     0xea            ; negative result: two's-complement negate
        eor     #0xffff
        clc
        adc     #0x1
        rtl
.Ldivhi_keep:
        lda     0xea
        rtl
; __modhi3 — signed 16-bit modulo: A % (4,S) -> A.
; Divides the magnitudes, then negates the remainder ($ec) when bit 0
; of the sign tracker $ee says the dividend was negative.
        .globl  __modhi3
__modhi3:
        jsr     __divmod_setup
        jsr     __udivmod_core  ; unsigned remainder -> $ec
        lda     0xee
        and     #0x1            ; bit 0: dividend was negative
        beq     .Lmodhi_keep
        lda     0xec            ; two's-complement negate
        eor     #0xffff
        clc
        adc     #0x1
        rtl
.Lmodhi_keep:
        lda     0xec
        rtl
; --------------------------------------------------------------------
; __divmod_setup — shared prologue of __divhi3 / __modhi3 (JSR/RTS,
; same bank).
;
; In:   A = dividend a; divisor b in the public entry's (4,S) slot,
;       which is (6,S) here because the JSR pushed a 2-byte return.
; Out:  $e6 = |a|, $e8 = |b|
;       $ee bit 0 = 1 if a was negative        (sign of the remainder)
;       $ee bit 1 = 1 if a and b signs differ  (sign of the quotient)
;
; "Negative" is tested with CMP #$8000: carry set means bit 15 is 1.
; --------------------------------------------------------------------
__divmod_setup:
        sta     0xe6            ; provisional |a|; rewritten below if a < 0
        lda     #0x0
        sta     0xee            ; sign tracker starts clear
        lda     0xe6
        cmp     #0x8000
        bcc     .Ldms_divisor   ; a >= 0: $e6 already holds |a|
        eor     #0xffff         ; a < 0: negate to get the magnitude
        clc
        adc     #0x1
        sta     0xe6
        lda     #0x3            ; bits 0 and 1: remainder and (so far)
        sta     0xee            ; quotient are negative
.Ldms_divisor:
        lda     0x6, s          ; divisor
        cmp     #0x8000
        bcc     .Ldms_store_b   ; b >= 0
        lda     0xee            ; b < 0: toggle the quotient-sign bit
        eor     #0x2
        sta     0xee
        lda     0x6, s          ; reload b and negate it
        eor     #0xffff
        clc
        adc     #0x1
.Ldms_store_b:
        sta     0xe8
        rts
; --------------------------------------------------------------------
; __udivmod_core — 16-bit unsigned restoring division (JSR/RTS local
; helper).
;
; In:   $e6 = dividend (destroyed), $e8 = divisor
; Out:  $ea = quotient, $ec = remainder; X = 0
;
; Sixteen passes, most significant dividend bit first: the bit is
; shifted into the remainder, and whenever the remainder reaches the
; divisor the divisor is subtracted and a 1 is recorded in the
; quotient.
; --------------------------------------------------------------------
__udivmod_core:
        lda     #0x0
        sta     0xec            ; remainder = 0
        sta     0xea            ; quotient = 0
        ldx     #0x10           ; one pass per dividend bit
.Ludm16_bit:
        asl     0xea            ; open quotient bit 0 for this pass
        asl     0xe6            ; C = next dividend bit
        rol     0xec            ; remainder = remainder * 2 + bit
        lda     0xec
        cmp     0xe8
        bcc     .Ludm16_next    ; remainder < divisor: quotient bit stays 0
        sbc     0xe8            ; C is already 1 after the CMP, no SEC needed
        sta     0xec
        inc     0xea            ; quotient bit = 1
.Ludm16_next:
        dex
        bne     .Ludm16_bit
        rts
; ====================================================================
; 32-bit (long / si) helpers.
;
; ABI for these is the natural extension of the i16 libcalls:
; - arg0_lo in A, arg0_hi in X
; - arg1_lo at (4,s) (or the i16 shift count, for the shift helpers)
; - arg1_hi at (6,s)
; - return: result_lo in A, result_hi in X
;
; All are correct-but-unoptimised; goal is unblocking end-to-end builds,
; not winning a 65816 codegolf.
;
; Direct-page scratch for these:
; $e0..$e3 = a (lo, hi) [renamed from $e0/$e2 for the i16 ones]
; $e4..$e7 = b (lo, hi)
; $e8..$eb = result / quotient (lo, hi)
; $ec..$ef = remainder (lo, hi)
; ====================================================================
; --------------------------------------------------------------------
; __mulsi3 — 32-bit multiply: (a * b) mod 2^32.
;
; In:   A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi
; Out:  A = product_lo, X = product_hi; Y = 0
; Uses: DP $e0/$e2 (multiplier a), $e4/$e6 (multiplicand b),
;       $e8/$ea (running product)
;
; Shift-and-add over all 32 multiplier bits: every pass shifts the
; multiplier right by one, adds the multiplicand to the product when
; the bit shifted out was 1, and then doubles the multiplicand.
; --------------------------------------------------------------------
        .globl  __mulsi3
__mulsi3:
        sta     0xe0            ; multiplier lo
        stx     0xe2            ; multiplier hi
        lda     0x4, s
        sta     0xe4            ; multiplicand lo
        lda     0x6, s
        sta     0xe6            ; multiplicand hi
        lda     #0x0
        sta     0xe8            ; product = 0
        sta     0xea
        ldy     #0x20           ; 32 multiplier bits
.Lmulsi_bit:
        lsr     0xe2            ; 32-bit shift right of the multiplier:
        ror     0xe0            ; hi bit 0 -> lo bit 15, C = old bit 0
        bcc     .Lmulsi_double
        clc                     ; product += multiplicand (32-bit)
        lda     0xe8
        adc     0xe4
        sta     0xe8
        lda     0xea
        adc     0xe6
        sta     0xea
.Lmulsi_double:
        asl     0xe4            ; multiplicand <<= 1 (32-bit)
        rol     0xe6
        dey
        bne     .Lmulsi_bit
        ldx     0xea            ; product hi
        lda     0xe8            ; product lo
        rtl
; --------------------------------------------------------------------
; __ashlsi3 — 32-bit left shift: (X:A) << (4,s) -> X:A.
;
; In:   A = value lo, X = value hi, (4,s) = shift count (i16)
; Out:  A = result lo, X = result hi; Y = 0
; Uses: DP $e0 (lo), $e2 (hi)
;
; One ASL lo / ROL hi pair per count step. Counts >= 32 are undefined
; behaviour in C.
; --------------------------------------------------------------------
        .globl  __ashlsi3
__ashlsi3:
        sta     0xe0            ; value lo
        stx     0xe2            ; value hi
        lda     0x4, s
        tay                     ; Y = remaining shift steps
.Lashlsi_step:
        cpy     #0x0
        beq     .Lashlsi_ret
        asl     0xe0            ; bit 15 of lo -> C
        rol     0xe2            ; C -> bit 0 of hi
        dey
        bra     .Lashlsi_step
.Lashlsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
; --------------------------------------------------------------------
; __lshrsi3 — 32-bit logical right shift: (X:A) >> (4,s) -> X:A.
;
; In:   A = value lo, X = value hi, (4,s) = shift count (i16)
; Out:  A = result lo, X = result hi; Y = 0
; Uses: DP $e0 (lo), $e2 (hi)
;
; Per step: LSR hi feeds a 0 in at the top and drops hi's bit 0 into
; carry; ROR lo then takes that carry as its new bit 15.
; --------------------------------------------------------------------
        .globl  __lshrsi3
__lshrsi3:
        sta     0xe0            ; value lo
        stx     0xe2            ; value hi
        lda     0x4, s
        tay                     ; Y = remaining shift steps
.Llshrsi_step:
        cpy     #0x0
        beq     .Llshrsi_ret
        lsr     0xe2
        ror     0xe0
        dey
        bra     .Llshrsi_step
.Llshrsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
; --------------------------------------------------------------------
; __ashrsi3 — 32-bit arithmetic right shift: (X:A) >> (4,s) -> X:A
; with the sign bit replicated.
;
; In:   A = value lo, X = value hi, (4,s) = shift count (i16)
; Out:  A = result lo, X = result hi; Y = 0
; Uses: DP $e0 (lo), $e2 (hi)
;
; Per step the sign (bit 15 of hi) is copied into carry and rotated
; back in at the top, so the top bit never changes.
; --------------------------------------------------------------------
        .globl  __ashrsi3
__ashrsi3:
        sta     0xe0            ; value lo
        stx     0xe2            ; value hi
        lda     0x4, s
        tay                     ; Y = remaining shift steps
.Lashrsi_step:
        cpy     #0x0
        beq     .Lashrsi_ret
        lda     0xe2
        cmp     #0x8000         ; unsigned >= $8000 <=> bit 15 set: C = sign
        ror     0xe2            ; sign re-enters hi at bit 15
        ror     0xe0
        dey
        bra     .Lashrsi_step
.Lashrsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
; --------------------------------------------------------------------
; __udivmodsi_core — 32-bit unsigned restoring division (JSR/RTS local
; helper).
;
; In:   $e0/$e2 = numerator lo/hi (destroyed)
;       $e4/$e6 = denominator lo/hi
; Out:  $e8/$ea = quotient lo/hi, $ec/$ee = remainder lo/hi; Y = 0
;
; Thirty-two passes, most significant numerator bit first.
; --------------------------------------------------------------------
__udivmodsi_core:
        lda     #0x0
        sta     0xec            ; remainder = 0
        sta     0xee
        sta     0xe8            ; quotient = 0
        sta     0xea
        ldy     #0x20
.Ludm32_bit:
        asl     0xe8            ; quotient <<= 1, bit 0 open for this pass
        rol     0xea
        asl     0xe0            ; top numerator bit -> C ...
        rol     0xe2
        rol     0xec            ; ... -> bottom of the remainder
        rol     0xee
        ; 32-bit unsigned compare remainder : denominator, high word first.
        lda     0xee
        cmp     0xe6
        bcc     .Ludm32_next    ; high word smaller: remainder < denominator
        bne     .Ludm32_sub     ; high word larger:  remainder > denominator
        lda     0xec            ; high words equal: low words decide
        cmp     0xe4
        bcc     .Ludm32_next
.Ludm32_sub:
        sec                     ; remainder -= denominator
        lda     0xec
        sbc     0xe4
        sta     0xec
        lda     0xee
        sbc     0xe6
        sta     0xee
        inc     0xe8            ; quotient bit = 1
.Ludm32_next:
        dey
        bne     .Ludm32_bit
        rts
; --------------------------------------------------------------------
; __udivsi3 — unsigned 32-bit divide: a / b.
;
; In:   A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi
; Out:  A = quotient lo, X = quotient hi
;
; Loads the core's input slots, runs __udivmodsi_core, and returns the
; quotient slots $e8/$ea.
; --------------------------------------------------------------------
        .globl  __udivsi3
__udivsi3:
        sta     0xe0            ; numerator lo
        stx     0xe2            ; numerator hi
        lda     0x4, s
        sta     0xe4            ; denominator lo
        lda     0x6, s
        sta     0xe6            ; denominator hi
        jsr     __udivmodsi_core
        ldx     0xea
        lda     0xe8
        rtl
; --------------------------------------------------------------------
; __umodsi3 — unsigned 32-bit modulo: a % b.
;
; In:   A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi
; Out:  A = remainder lo, X = remainder hi
;
; Same setup as __udivsi3; returns the remainder slots $ec/$ee.
; --------------------------------------------------------------------
        .globl  __umodsi3
__umodsi3:
        sta     0xe0            ; numerator lo
        stx     0xe2            ; numerator hi
        lda     0x4, s
        sta     0xe4            ; denominator lo
        lda     0x6, s
        sta     0xe6            ; denominator hi
        jsr     __udivmodsi_core
        ldx     0xee
        lda     0xec
        rtl
; --------------------------------------------------------------------
; __divsi3 / __modsi3 — signed 32-bit divide and modulo, built like
; the 16-bit pair: record signs, divide the magnitudes with the
; unsigned core, negate the selected result if required.
; Sign tracker lives in $f0:
;   bit 0 = dividend was negative      (sign of the remainder)
;   bit 1 = sign(a) XOR sign(b)        (sign of the quotient)
;
; __divsi3: In  A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi
;           Out A = quotient lo, X = quotient hi
; --------------------------------------------------------------------
        .globl  __divsi3
__divsi3:
        jsr     __divmodsi_setup
        jsr     __udivmodsi_core ; |a| / |b| -> $e8/$ea
        lda     0xf0
        and     #0x2
        beq     .Ldivsi_ret     ; signs agree: quotient is already right
        sec                     ; quotient = 0 - quotient (32-bit)
        lda     #0x0
        sbc     0xe8
        sta     0xe8
        lda     #0x0
        sbc     0xea
        sta     0xea
.Ldivsi_ret:
        ldx     0xea
        lda     0xe8
        rtl
; __modsi3 — signed 32-bit modulo.
; In:  A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi
; Out: A = remainder lo, X = remainder hi
; The remainder ($ec/$ee) is negated when bit 0 of $f0 is set, i.e.
; when the dividend was negative (C99: remainder takes the dividend's
; sign).
        .globl  __modsi3
__modsi3:
        jsr     __divmodsi_setup
        jsr     __udivmodsi_core ; |a| % |b| -> $ec/$ee
        lda     0xf0
        and     #0x1
        beq     .Lmodsi_ret
        sec                     ; remainder = 0 - remainder (32-bit)
        lda     #0x0
        sbc     0xec
        sta     0xec
        lda     #0x0
        sbc     0xee
        sta     0xee
.Lmodsi_ret:
        ldx     0xee
        lda     0xec
        rtl
; --------------------------------------------------------------------
; __divmodsi_setup — shared prologue of __divsi3 / __modsi3 (JSR/RTS).
;
; In:   A = a_lo, X = a_hi; b in the public entry's (4,s)/(6,s) slots,
;       which are (6,s)/(8,s) here because of the JSR's 2-byte return
;       address.
; Out:  $e0/$e2 = |a|, $e4/$e6 = |b|
;       $f0 bit 0 = a negative, bit 1 = signs of a and b differ
; --------------------------------------------------------------------
__divmodsi_setup:
        sta     0xe0            ; a as given; negated below if a < 0
        stx     0xe2
        lda     #0x0
        sta     0xf0            ; sign tracker starts clear
        cpx     #0x8000         ; C set <=> bit 15 of a_hi set <=> a < 0
        bcc     .Ldmsi_divisor
        lda     #0x3            ; remainder negative, quotient negative so far
        sta     0xf0
        sec                     ; |a| = 0 - a (32-bit)
        lda     #0x0
        sbc     0xe0
        sta     0xe0
        lda     #0x0
        sbc     0xe2
        sta     0xe2
.Ldmsi_divisor:
        lda     0x6, s
        sta     0xe4            ; b_lo
        lda     0x8, s
        sta     0xe6            ; b_hi
        cmp     #0x8000
        bcc     .Ldmsi_ret      ; b >= 0
        lda     0xf0            ; b < 0: toggle the quotient-sign bit
        eor     #0x2
        sta     0xf0
        sec                     ; |b| = 0 - b (32-bit)
        lda     #0x0
        sbc     0xe4
        sta     0xe4
        lda     #0x0
        sbc     0xe6
        sta     0xe6
.Ldmsi_ret:
        rts
; ====================================================================
; i64 (long long) helpers.
;
; Calling convention (i64 first arg is split via i32-first-arg path):
; A = arg0_lo[0..15] (lowest word)
; X = arg0_lo[16..31]
; 4,S = arg0_hi[0..15]
; 6,S = arg0_hi[16..31] (highest word)
; For binary ops (mul/div/mod), arg1 follows on the stack:
; 8,S = arg1_lo[0..15]
; 10,S = arg1_lo[16..31]
; 12,S = arg1_hi[0..15]
; 14,S = arg1_hi[16..31]
; For shift ops, the count occupies a single i16 at 8,S.
;
; Return ABI (matches LowerReturn for i64):
; A = result_lo[0..15]
; X = result_lo[16..31]
; Y = result_hi[0..15]
; DP $F0..$F1 = result_hi[16..31]
;
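; Scratch DP layout (fixed addresses shared by every i64 libcall):
; $E0..$E7 = a, later the result / quotient (8 bytes; 4 16-bit words)
; $E8..$EF = b (8 bytes)
; $F2..$F9 = product (__muldi3) or remainder (divide core)
; $FA..$FF = trial-subtraction scratch (divide core)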
;
; All routines run with REP #$30 (M=0, X=0).
; ====================================================================
; --------------------------------------------------------------------
; __divmoddi4_stash — copy both i64 operands into DP scratch.
; Shared by __muldi3, the i64 compares and the i64 divide / modulo
; entries.
;
; Must be reached with a JSR issued directly from a public (JSL) entry
; point, before that entry pushes anything: the stack offsets below
; account for exactly one 2-byte near return address on top of the
; 3-byte JSL return address.
;
; In:   A = a word 0 (lowest), X = a word 1
;       public-entry (4,S)..(14,S) = a word 2, a word 3, b words 0..3,
;       seen here at (6,S)..(16,S)
; Out:  $E0..$E7 = a, $E8..$EF = b (little-endian 16-bit words)
; --------------------------------------------------------------------
__divmoddi4_stash:
        sta     0xe0            ; a_lo_lo
        stx     0xe2            ; a_lo_hi
        lda     0x6, s          ; every stack slot is +2 vs the public entry
        sta     0xe4            ; a_hi_lo
        lda     0x8, s
        sta     0xe6            ; a_hi_hi
        lda     0xa, s
        sta     0xe8            ; b_lo_lo
        lda     0xc, s
        sta     0xea            ; b_lo_hi
        lda     0xe, s
        sta     0xec            ; b_hi_lo
        lda     0x10, s
        sta     0xee            ; b_hi_hi
        rts
; --------------------------------------------------------------------
; __retdi — common exit for the i64 helpers: load the value held in
; $E0..$E7 into the i64 return registers and return to the caller.
; Public entries branch here (BRL) with their JSL frame on top of the
; stack; the RTL below is the one that leaves the libcall.
;
; Out:  A = word 0, X = word 1, Y = word 2, DP $F0 = word 3
; --------------------------------------------------------------------
__retdi:
        ldx     0xe2            ; result_lo[16..31]
        lda     0xe4
        tay                     ; result_hi[0..15]
        lda     0xe6
        sta     0xf0            ; result_hi[16..31] goes back through DP
        lda     0xe0            ; result_lo[0..15]
        rtl
; --------------------------------------------------------------------
; __ashldi3 — i64 left shift.
;
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count
; Out:  via __retdi (A, X, Y, DP $F0)
;
; One 64-bit shift (ASL on word 0, ROL through words 1..3) per count
; step, with Y as the step counter.
; --------------------------------------------------------------------
        .globl  __ashldi3
__ashldi3:
        sta     0xe0            ; word 0
        stx     0xe2            ; word 1
        lda     0x4, s
        sta     0xe4            ; word 2
        lda     0x6, s
        sta     0xe6            ; word 3
        lda     0x8, s
        tay                     ; Y = remaining shift steps
.Lashldi_step:
        cpy     #0x0
        beq     .Lashldi_ret
        asl     0xe0
        rol     0xe2
        rol     0xe4
        rol     0xe6
        dey
        bra     .Lashldi_step
.Lashldi_ret:
        brl     __retdi
; --------------------------------------------------------------------
; __lshrdi3 — i64 logical right shift.
;
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count
; Out:  via __retdi (A, X, Y, DP $F0)
;
; Per step: LSR on the top word shifts a 0 in, then ROR carries each
; dropped bit down through words 2, 1 and 0.
; --------------------------------------------------------------------
        .globl  __lshrdi3
__lshrdi3:
        sta     0xe0            ; word 0
        stx     0xe2            ; word 1
        lda     0x4, s
        sta     0xe4            ; word 2
        lda     0x6, s
        sta     0xe6            ; word 3
        lda     0x8, s
        tay                     ; Y = remaining shift steps
.Llshrdi_step:
        cpy     #0x0
        beq     .Llshrdi_ret
        lsr     0xe6
        ror     0xe4
        ror     0xe2
        ror     0xe0
        dey
        bra     .Llshrdi_step
.Llshrdi_ret:
        brl     __retdi
; --------------------------------------------------------------------
; __ashrdi3 — i64 arithmetic right shift (sign bit replicated).
;
; In:   A, X, (4,S), (6,S) = value words 0..3; (8,S) = shift count
; Out:  via __retdi (A, X, Y, DP $F0)
;
; Identical to __lshrdi3 except that the bit entering the top word is
; a copy of the sign: bit 15 of word 3 ($E6) is moved into carry
; before the ROR chain starts.
; --------------------------------------------------------------------
        .globl  __ashrdi3
__ashrdi3:
        sta     0xe0            ; word 0
        stx     0xe2            ; word 1
        lda     0x4, s
        sta     0xe4            ; word 2
        lda     0x6, s
        sta     0xe6            ; word 3 (holds the sign)
        lda     0x8, s
        tay                     ; Y = remaining shift steps
.Lashrdi_step:
        cpy     #0x0
        beq     .Lashrdi_ret
        lda     0xe6
        cmp     #0x8000         ; C = sign bit (unsigned >= $8000)
        ror     0xe6            ; sign re-enters at bit 15
        ror     0xe4
        ror     0xe2
        ror     0xe0
        dey
        bra     .Lashrdi_step
.Lashrdi_ret:
        brl     __retdi
; --------------------------------------------------------------------
; __muldi3 — i64 multiply: low 64 bits of a * b.
;
; In:   a and b per the i64 calling convention (copied to $E0..$E7 and
;       $E8..$EF by __divmoddi4_stash)
; Out:  via __retdi (A, X, Y, DP $F0)
; Uses: $F2..$F9 as the product accumulator, because $E0..$EF hold the
;       operands for the whole loop.
;
; Shift-and-add over the 64 bits of a. Each pass shifts a right by one
; as a single 64-bit quantity — starting at the TOP word so that each
; word's dropped bit enters the word below it — which leaves the old
; bit 0 of a in carry. b is added to the product when that bit is 1,
; and b is doubled for the next pass.
; --------------------------------------------------------------------
        .globl  __muldi3
__muldi3:
        jsr     __divmoddi4_stash
        lda     #0x0            ; product = 0
        sta     0xf2
        sta     0xf4
        sta     0xf6
        sta     0xf8
        ldy     #0x40           ; 64 multiplier bits
.Lmuldi_bit:
        lsr     0xe6            ; a >>= 1 (64-bit), high word first
        ror     0xe4
        ror     0xe2
        ror     0xe0            ; C = bit 0 of a before the shift
        bcc     .Lmuldi_double
        clc                     ; product += b (64-bit carry chain)
        lda     0xf2
        adc     0xe8
        sta     0xf2
        lda     0xf4
        adc     0xea
        sta     0xf4
        lda     0xf6
        adc     0xec
        sta     0xf6
        lda     0xf8
        adc     0xee
        sta     0xf8
.Lmuldi_double:
        asl     0xe8            ; b <<= 1 so it lines up with the next bit of a
        rol     0xea
        rol     0xec
        rol     0xee
        dey
        bne     .Lmuldi_bit
        ; __retdi returns whatever is in $E0..$E7, so move the product there.
        lda     0xf2
        sta     0xe0
        lda     0xf4
        sta     0xe2
        lda     0xf6
        sta     0xe4
        lda     0xf8
        sta     0xe6
        brl     __retdi
; --------------------------------------------------------------------
; __ucmpdi2 — unsigned i64 three-way compare (libgcc convention).
;
; In:   a and b per the i64 calling convention
; Out:  A = 0 if a < b, 1 if a == b, 2 if a > b
;
; The operands are copied to $E0..$E7 / $E8..$EF and compared word by
; word from the most significant down; the first unequal pair decides
; and its CMP carry gives the direction.
; --------------------------------------------------------------------
        .globl  __ucmpdi2
__ucmpdi2:
        jsr     __divmoddi4_stash
        lda     0xe6            ; word 3
        cmp     0xee
        bne     .Lucmp_differ
        lda     0xe4            ; word 2
        cmp     0xec
        bne     .Lucmp_differ
        lda     0xe2            ; word 1
        cmp     0xea
        bne     .Lucmp_differ
        lda     0xe0            ; word 0
        cmp     0xe8
        bne     .Lucmp_differ
        lda     #0x1            ; all four words equal
        rtl
.Lucmp_differ:
        bcc     .Lucmp_less     ; C clear after CMP: a's word < b's word
        lda     #0x2            ; a > b
        rtl
.Lucmp_less:
        lda     #0x0            ; a < b
        rtl
; --------------------------------------------------------------------
; __cmpdi2 — signed i64 three-way compare, same {0,1,2} results as
; __ucmpdi2.
;
; In:   a and b per the i64 calling convention
; Out:  A = 0 if a < b, 1 if a == b, 2 if a > b
;
; Toggling bit 15 of both top words maps signed order onto unsigned
; order, after which the comparison is the unsigned word-by-word one.
; --------------------------------------------------------------------
        .globl  __cmpdi2
__cmpdi2:
        jsr     __divmoddi4_stash
        lda     0xee
        eor     #0x8000
        sta     0xee            ; b top word, sign bit toggled
        lda     0xe6
        eor     #0x8000
        sta     0xe6            ; a top word, sign bit toggled (still in A)
        cmp     0xee            ; word 3
        bne     .Lscmp_differ
        lda     0xe4            ; word 2
        cmp     0xec
        bne     .Lscmp_differ
        lda     0xe2            ; word 1
        cmp     0xea
        bne     .Lscmp_differ
        lda     0xe0            ; word 0
        cmp     0xe8
        bne     .Lscmp_differ
        lda     #0x1            ; equal
        rtl
.Lscmp_differ:
        bcc     .Lscmp_less
        lda     #0x2            ; a > b
        rtl
.Lscmp_less:
        lda     #0x0            ; a < b
        rtl
; --------------------------------------------------------------------
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo.
;
; Both entries stash the operands and run __udivmoddi_core, a restoring
; divide that leaves the quotient in $E0..$E7 and the remainder in
; $F2..$F9. __udivdi3 returns the quotient as it stands; __umoddi3
; first copies the remainder into $E0..$E7.
;
; In:   a and b per the i64 calling convention
; Out:  via __retdi (A, X, Y, DP $F0)
; --------------------------------------------------------------------
        .globl  __udivdi3
__udivdi3:
        jsr     __divmoddi4_stash
        jsr     __udivmoddi_core
        brl     __retdi         ; quotient is already in $E0..$E7
; __umoddi3 — unsigned 64-bit modulo. Runs the shared divide core and
; returns the remainder by copying it from $F2..$F9 into the slots
; that __retdi reads ($E0..$E7).
        .globl  __umoddi3
__umoddi3:
        jsr     __divmoddi4_stash
        jsr     __udivmoddi_core
        lda     0xf2            ; remainder word 0
        sta     0xe0
        lda     0xf4            ; remainder word 1
        sta     0xe2
        lda     0xf6            ; remainder word 2
        sta     0xe4
        lda     0xf8            ; remainder word 3
        sta     0xe6
        brl     __retdi
; __udivmoddi_core — 64-bit unsigned restoring division (JSR/RTS).
;
; In:   $E0..$E7 = dividend, $E8..$EF = divisor
; Out:  $E0..$E7 = quotient, $F2..$F9 = remainder; Y = 0
; Uses: $FA..$FF for the low three words of the trial subtraction, so
;       anything a caller keeps there is overwritten.
;
; Dividend and remainder form one 128-bit register that is shifted
; left once per pass: the dividend's top bit moves into the remainder
; and the vacated dividend bit 0 receives the new quotient bit.
__udivmoddi_core:
        lda     #0x0            ; remainder = 0
        sta     0xf8
        sta     0xf6
        sta     0xf4
        sta     0xf2
        ldy     #0x40           ; 64 dividend bits
.Ludm64_bit:
        asl     0xe0            ; 128-bit shift left of remainder:dividend
        rol     0xe2
        rol     0xe4
        rol     0xe6
        rol     0xf2
        rol     0xf4
        rol     0xf6
        rol     0xf8
        ; Trial: remainder - divisor. The low three result words are
        ; parked in $FA/$FC/$FE, the top word stays in A.
        sec
        lda     0xf2
        sbc     0xe8
        sta     0xfa
        lda     0xf4
        sbc     0xea
        sta     0xfc
        lda     0xf6
        sbc     0xec
        sta     0xfe
        lda     0xf8
        sbc     0xee
        bcc     .Ludm64_next    ; borrow: remainder < divisor, discard the trial
        sta     0xf8            ; no borrow: commit the difference
        lda     0xfa
        sta     0xf2
        lda     0xfc
        sta     0xf4
        lda     0xfe
        sta     0xf6
        lda     0xe0            ; quotient bit = 1 (bit 0 is free after the ASL)
        ora     #0x1
        sta     0xe0
.Ludm64_next:
        dey
        bne     .Ludm64_bit
        rts
; --------------------------------------------------------------------
; __divdi3 / __moddi3 — signed 64-bit divide / modulo. Take absolute
; values, run the unsigned core, fix up the sign.
;   div: sign(quotient)  = sign(a) XOR sign(b)
;   mod: sign(remainder) = sign(a)
;
; __divdi3
; In:   a and b per the i64 calling convention
; Out:  via __retdi (A, X, Y, DP $F0)
;
; The sign word is kept on the stack while the core runs: the core
; uses $FA..$FF as trial-subtraction scratch, so a sign parked in that
; range would not survive the divide.
; --------------------------------------------------------------------
        .globl  __divdi3
__divdi3:
        jsr     __divmoddi4_stash ; reads the stack frame, so it runs before the PHA
        lda     0xe6
        eor     0xee
        and     #0x8000         ; non-zero <=> operand signs differ
        pha                     ; quotient sign, held across the core
        jsr     __absdi_a
        jsr     __absdi_b
        jsr     __udivmoddi_core ; |a| / |b| -> $E0..$E7
        pla                     ; Z reflects the saved sign word
        beq     .Ldivdi_ret
        jsr     __negdi_a       ; signs differed: negate the quotient
.Ldivdi_ret:
        brl     __retdi
; __moddi3 — signed 64-bit modulo; the remainder takes the sign of the
; dividend.
; In:   a and b per the i64 calling convention
; Out:  via __retdi (A, X, Y, DP $F0)
;
; The dividend's sign is kept on the stack while the core runs,
; because the core uses $FA..$FF as trial-subtraction scratch.
        .globl  __moddi3
__moddi3:
        jsr     __divmoddi4_stash ; reads the stack frame, so it runs before the PHA
        lda     0xe6
        and     #0x8000         ; non-zero <=> a is negative
        pha                     ; remainder sign, held across the core
        jsr     __absdi_a
        jsr     __absdi_b
        jsr     __udivmoddi_core ; |a| % |b| -> $F2..$F9
        ; __negdi_a and __retdi work on $E0..$E7, so move the remainder there.
        lda     0xf2
        sta     0xe0
        lda     0xf4
        sta     0xe2
        lda     0xf6
        sta     0xe4
        lda     0xf8
        sta     0xe6
        pla                     ; Z reflects the saved sign word
        beq     .Lmoddi_ret
        jsr     __negdi_a       ; dividend was negative: negate the remainder
.Lmoddi_ret:
        brl     __retdi
; --- subroutines used by signed div/mod ---
; __absdi_a — replace the i64 at $E0..$E7 with its absolute value:
; negate it when bit 15 of the top word ($E6) is set. JSR/RTS helper.
__absdi_a:
        lda     0xe6
        bpl     .Labsdi_a_ret   ; sign clear: nothing to do
        brl     __negdi_a       ; tail call; its RTS returns to our caller
.Labsdi_a_ret:
        rts
; __absdi_b — replace the i64 at $E8..$EF with its absolute value:
; negate it when bit 15 of the top word ($EE) is set. JSR/RTS helper.
__absdi_b:
        lda     0xee
        bpl     .Labsdi_b_ret   ; sign clear: nothing to do
        brl     __negdi_b       ; tail call; its RTS returns to our caller
.Labsdi_b_ret:
        rts
; __negdi_a — two's-complement negate the i64 at $E0..$E7 in place.
; Inverts each word and adds 1 at the bottom, letting the carry ripple
; up through the four words. JSR/RTS helper; A ends as the new top word.
__negdi_a:
        lda     0xe0
        eor     #0xffff
        clc
        adc     #0x1            ; +1 enters at word 0
        sta     0xe0
        lda     0xe2
        eor     #0xffff
        adc     #0x0            ; propagate the carry
        sta     0xe2
        lda     0xe4
        eor     #0xffff
        adc     #0x0
        sta     0xe4
        lda     0xe6
        eor     #0xffff
        adc     #0x0
        sta     0xe6
        rts
; __negdi_b — two's-complement negate the i64 at $E8..$EF in place.
; Inverts each word and adds 1 at the bottom, letting the carry ripple
; up through the four words. JSR/RTS helper; A ends as the new top word.
__negdi_b:
        lda     0xe8
        eor     #0xffff
        clc
        adc     #0x1            ; +1 enters at word 0
        sta     0xe8
        lda     0xea
        eor     #0xffff
        adc     #0x0            ; propagate the carry
        sta     0xea
        lda     0xec
        eor     #0xffff
        adc     #0x0
        sta     0xec
        lda     0xee
        eor     #0xffff
        adc     #0x0
        sta     0xee
        rts
; --------------------------------------------------------------------
; setjmp(jmp_buf env)           — record the calling environment, return 0
; longjmp(jmp_buf env, int val) — resume at the recorded point, making
;                                 setjmp appear to return val (1 if val == 0)
;
; jmp_buf layout (8 bytes):
;   [0..1] = stack pointer of setjmp's caller (SP at setjmp entry + 3)
;   [2..3] = 16-bit return address as pushed by the JSL
;   [4]    = return address bank
;   [5..6] = direct page register
;   [7]    = unused
;
; A, X and Y are not part of the saved state.
;
; setjmp
; In:   A = env
; Out:  A = 0; Y = 5
; Uses: DP $e0 as the pointer for the (dp),y stores
; --------------------------------------------------------------------
        .globl  setjmp
setjmp:
        sta     0xe0            ; env pointer for the indirect stores
        lda     0x1, s          ; 16-bit return address, top of stack
        ldy     #2
        sta     (0xe0), y       ; env[2..3]
        tsc
        clc
        adc     #0x3            ; skip the 3-byte JSL frame: the caller's SP
        ldy     #0
        sta     (0xe0), y       ; env[0..1]
        sep     #0x20           ; 8-bit accumulator for the single bank byte
        lda     0x3, s
        ldy     #4
        sta     (0xe0), y       ; env[4]
        rep     #0x20           ; back to the 16-bit accumulator
        tdc
        ldy     #5
        sta     (0xe0), y       ; env[5..6]
        lda     #0              ; the direct call returns 0
        rtl
; longjmp(jmp_buf env, int val) — resume execution just after the
; setjmp call that filled env.
;
; In:   A = env, (4,S) = val
; Out:  does not return to its own caller; control continues after the
;       recorded setjmp call with A = val (or 1 when val == 0), SP and
;       DP as they were when that setjmp returned.
; Uses: DP $e0 (env pointer) and $e2 (val); $e2 is written in both the
;       current and the restored direct page.
;
; env[0..1] holds the SP the setjmp caller had AFTER setjmp's RTL. SP
; is set to exactly that value: pushing the 3-byte return address
; below then lowers SP by 3, and the final RTL raises it by 3 again,
; so the caller resumes with the SP it recorded.
        .globl  longjmp
longjmp:
        sta     0xe0            ; env pointer for the indirect loads
        lda     0x4, s          ; val, read before SP moves
        sta     0xe2
        ldy     #0
        lda     (0xe0), y       ; recorded caller SP
        tcs
        ; Rebuild the JSL frame that RTL expects: bank first, then the
        ; 16-bit address (RTL pulls address, then bank).
        sep     #0x20           ; 8-bit accumulator for the bank byte
        ldy     #4
        lda     (0xe0), y
        pha
        rep     #0x20
        ldy     #2
        lda     (0xe0), y
        pha
        ; val lives in the CURRENT direct page; carry it across the DP
        ; switch in X, since $e2 may name different memory afterwards.
        ldx     0xe2
        ldy     #5
        lda     (0xe0), y
        tcd                     ; DP restored
        stx     0xe2            ; X -> A via scratch in the restored page
        lda     0xe2
        bne     .Llongjmp_ret   ; non-zero val is returned unchanged
        lda     #1              ; setjmp must not appear to return 0 here
.Llongjmp_ret:
        rtl