; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs.
; Provides the helpers that the LLVM backend lowers integer multiply,
; shift, divide, and modulo operations to.  Implementations are
; correct-but-unoptimised; they exist to unblock end-to-end testing,
; not to compete with hand-tuned 65816 math libraries.
;
; Calling convention (matches W65816ISelLowering::LowerCall):
;   - Arg 0 in A (16-bit M).
;   - Arg 1 pushed via PHA before the JSL.  Reads as (4,S) inside the
;     callee (3-byte JSL return address sits at 1..3,S).
;   - Return value in A.  Caller releases pushed args.
;   - Routines run in 16-bit M, 16-bit X (REP #$30 by convention).
;
; Direct-page scratch: the i16 helpers below use DP+$E0..DP+$EF.  The
; signed i32 divide and the i64 helpers further down this file also
; touch DP+$F0..DP+$FF.  Programs that use this runtime must keep DP=0
; or remap accordingly.
;
; Assembled with:
;   tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \
;       -filetype=obj \
;       runtime/src/libgcc.s \
;       -o runtime/libgcc.o

    .text

; --------------------------------------------------------------------
; Indirect-call trampoline.  An indirect call (function pointer) stores
; the target's 16-bit address to __indirTarget before JSL'ing here.
; This routine does a JMP indirect through that variable: control
; transfers to the target with the original caller's JSL frame still
; on the stack, so the target's RTL returns to the original caller
; (one frame, no double-RTL).
;
; Caller emit sequence in W65816ISelLowering::LowerCall:
;     sta __indirTarget   ; store ptr (must precede any A clobber for args)
;     ... arg pushes ...
;     jsl __jsl_indir
;
; Single-bank only (the IIgs convention assumes code in bank 0/1
; via JSL — JMP indirect is bank-local).
; --------------------------------------------------------------------
    .globl  __indirTarget
    .bss
__indirTarget:
    .zero   2                   ; 16-bit slot holding the call target

    .text
    .globl  __jsl_indir
__jsl_indir:
    ; Hand-encoded JMP (__indirTarget): 6C is "jmp (a)".  The assembler
    ; doesn't yet parse the `(abs)` syntax, so the opcode byte is
    ; emitted directly, followed by a 16-bit relocation against the
    ; variable.  Effective transfer: PC <- mem[__indirTarget].
    .byte   0x6C
    .word   __indirTarget

; --------------------------------------------------------------------
; __mulhi3 — 16-bit multiply.  A * (4,S) -> A.
; Signed and unsigned share an implementation: only the low 16 bits of
; the product are returned, which is identical for both.  Shift-and-add
; over the multiplier bits; the loop stops as soon as no multiplier
; bits remain.
; Scratch: $e0 = multiplier, $e2 = multiplicand, $e4 = running product.
; X and Y are not touched.
; --------------------------------------------------------------------
    .globl  __mulhi3
__mulhi3:
    sta     0xe0                ; multiplier (arg 0)
    lda     0x4, s
    sta     0xe2                ; multiplicand (arg 1)
    lda     #0x0
    sta     0xe4                ; product = 0
    bra     .Lmulhi_test
.Lmulhi_step:
    lsr     0xe0                ; C = current low multiplier bit
    bcc     .Lmulhi_shift
    clc
    lda     0xe4
    adc     0xe2                ; product += multiplicand
    sta     0xe4
.Lmulhi_shift:
    asl     0xe2                ; multiplicand <<= 1 for the next bit
.Lmulhi_test:
    lda     0xe0
    bne     .Lmulhi_step        ; keep going while multiplier != 0
    lda     0xe4                ; return the product
    rtl

; --------------------------------------------------------------------
; __ashlhi3 — A << (4,S) -> A.  Shift count is i16 but only the low 4
; bits are meaningful (counts >=16 are undefined behaviour in C); the
; loop itself runs for the full 16-bit count.  Clobbers X.
; --------------------------------------------------------------------
    .globl  __ashlhi3
__ashlhi3:
    pha                         ; park the value so A can fetch the count
    lda     0x6, s              ; arg 1 sits at 6,s now (PHA shifted it by 2)
    tax                         ; X = remaining shifts
    pla                         ; A = value again
    bra     .Lashlhi_test
.Lashlhi_step:
    asl     a
    dex
.Lashlhi_test:
    cpx     #0x0
    bne     .Lashlhi_step
    rtl

; --------------------------------------------------------------------
; __lshrhi3 — A logical >> (4,S) -> A.  Same shape as __ashlhi3 with
; LSR instead of ASL.  Clobbers X.
; --------------------------------------------------------------------
    .globl  __lshrhi3
__lshrhi3:
    pha
    lda     0x6, s              ; count (offset bumped by the PHA)
    tax
    pla
    bra     .Llshrhi_test
.Llshrhi_step:
    lsr     a
    dex
.Llshrhi_test:
    cpx     #0x0
    bne     .Llshrhi_step
    rtl

; --------------------------------------------------------------------
; __ashrhi3 — A arithmetic >> (4,S) -> A.  Sign bit is preserved by
; copying it into carry before each ROR via CMP #$8000 (which sets
; carry exactly when the sign bit is set on a 16-bit unsigned compare).
; --------------------------------------------------------------------
    .globl  __ashrhi3
__ashrhi3:
    pha                         ; park the value while the count is fetched
    lda     0x6, s              ; count (offset bumped by the PHA)
    tax
    pla
    bra     .Lashrhi_test
.Lashrhi_step:
    cmp     #0x8000             ; C = sign bit of A
    ror     a                   ; shift right, re-inserting the sign
    dex
.Lashrhi_test:
    cpx     #0x0
    bne     .Lashrhi_step
    rtl

; --------------------------------------------------------------------
; __udivhi3 — A unsigned / (4,S) -> A.
; __umodhi3 — A unsigned % (4,S) -> A.
; Both load the scratch slots and run the shared restoring
; shift-subtract core (__udivmod_core); they differ only in which
; result word they return.
; Scratch: $e6 = numerator, $e8 = denominator,
;          $ea = quotient,  $ec = remainder.
; The core uses X as its bit counter, so X is clobbered.
; --------------------------------------------------------------------
    .globl  __udivhi3
__udivhi3:
    sta     0xe6                ; dividend (arg 0)
    lda     0x4, s
    sta     0xe8                ; divisor (arg 1)
    jsr     __udivmod_core
    lda     0xea                ; return the quotient
    rtl

    .globl  __umodhi3
__umodhi3:
    sta     0xe6                ; dividend (arg 0)
    lda     0x4, s
    sta     0xe8                ; divisor (arg 1)
    jsr     __udivmod_core
    lda     0xec                ; return the remainder
    rtl

; --------------------------------------------------------------------
; __divhi3 / __modhi3 — signed 16-bit divide and modulo.  Strategy:
;   - __divmod_setup records the signs in $ee and stores |a| / |b|:
;       bit 0 = dividend was negative          (sign of the remainder)
;       bit 1 = sign(a) XOR sign(b)            (sign of the quotient)
;   - The unsigned core runs on the absolute values.
;   - The selected result is negated when its sign bit in $ee is set.
; C99: quotient truncates toward zero; remainder takes the sign of the
; dividend.
; --------------------------------------------------------------------
    .globl  __divhi3
__divhi3:
    jsr     __divmod_setup
    jsr     __udivmod_core
    lda     0xee
    and     #0x2                ; quotient-sign bit
    beq     .Ldivhi_keep
    lda     #0x0                ; negative result: A = 0 - quotient
    sec
    sbc     0xea
    rtl
.Ldivhi_keep:
    lda     0xea
    rtl

    .globl  __modhi3
__modhi3:
    jsr     __divmod_setup
    jsr     __udivmod_core
    lda     0xee
    and     #0x1                ; dividend-sign bit
    beq     .Lmodhi_keep
    lda     #0x0                ; negative dividend: A = 0 - remainder
    sec
    sbc     0xec
    rtl
.Lmodhi_keep:
    lda     0xec
    rtl

; --------------------------------------------------------------------
; __divmod_setup — common prologue for __divhi3/__modhi3.
; In:  A = dividend; the divisor is the caller's pushed arg 1.  We are
;      entered with JSR, whose 2-byte return address moves that arg
;      from (4,S) at the public entry to (6,S) here.
; Out: $e6 = |a|, $e8 = |b|, $ee = sign tracker:
;        bit 0 = 1 if the dividend was negative
;        bit 1 = 1 if dividend and divisor signs differ
; Uses JSR/RTS, same bank.  X and Y are preserved.
; --------------------------------------------------------------------
__divmod_setup:
    sta     0xe6                ; park the dividend
    lda     #0x0
    sta     0xee                ; sign tracker = 0 (no STZ available yet)
    lda     0xe6                ; reload: N flag = dividend sign
    bpl     .Lsetup_a_done
    lda     #0x3                ; negative: set bits 0 and 1
    sta     0xee
    lda     #0x0
    sec
    sbc     0xe6                ; A = -dividend
    sta     0xe6
.Lsetup_a_done:
    lda     0x6, s              ; divisor; N flag = its sign
    bpl     .Lsetup_b_store
    sta     0xe8                ; park it while the tracker is updated
    lda     0xee
    eor     #0x2                ; flip the quotient-sign bit
    sta     0xee
    lda     #0x0
    sec
    sbc     0xe8                ; A = -divisor
.Lsetup_b_store:
    sta     0xe8
    rts

; --------------------------------------------------------------------
; __udivmod_core — internal restoring divide.  Inputs at $e6/$e8,
; outputs quotient at $ea, remainder at $ec.  The numerator in $e6 is
; consumed (shifted out).  JSR/RTS local helper; clobbers A and X.
; --------------------------------------------------------------------
__udivmod_core:
    lda     #0x0
    sta     0xea                ; quotient = 0
    sta     0xec                ; remainder = 0
    ldx     #0x10               ; 16 bits to process
.Ludiv16_bit:
    asl     0xea                ; open the next quotient bit
    asl     0xe6                ; top numerator bit -> C
    rol     0xec                ; ... and into the remainder
    lda     0xec
    cmp     0xe8
    bcc     .Ludiv16_next       ; remainder < divisor: bit stays 0
    sbc     0xe8                ; C is already set by the CMP
    sta     0xec
    inc     0xea                ; quotient bit = 1
.Ludiv16_next:
    dex
    bne     .Ludiv16_bit
    rts

; ====================================================================
; 32-bit (long / si) helpers.
;
; ABI used by every routine in this section:
;   - arg0_lo in A, arg0_hi in X  (the i32-first-arg A:X convention)
;   - arg1_lo at (4,s), arg1_hi at (6,s)
;     (the shift helpers take a single i16 count at (4,s) instead)
;   - return: result_lo in A, result_hi in X
;
; All are correct-but-unoptimised; goal is unblocking end-to-end builds,
; not winning a 65816 codegolf.
;
; Direct-page scratch for these:
;   $e0..$e3 = a (lo, hi)        [renamed from $e0/$e2 for the i16 ones]
;   $e4..$e7 = b (lo, hi)
;   $e8..$eb = result / quotient (lo, hi)
;   $ec..$ef = remainder (lo, hi)
; ====================================================================

; --------------------------------------------------------------------
; __mulsi3 — 32-bit multiply.  Shift-and-add over 32 bits of the
; multiplier.  Result = (a * b) mod 2^32.
;
; ABI: A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi.
; Result returned in A:X (lo:hi).  Y is used as the bit counter.
; --------------------------------------------------------------------
    .globl  __mulsi3
__mulsi3:
    sta     0xe0                ; multiplier lo
    stx     0xe2                ; multiplier hi
    lda     0x4, s
    sta     0xe4                ; multiplicand lo
    lda     0x6, s
    sta     0xe6                ; multiplicand hi
    lda     #0x0
    sta     0xe8                ; product lo = 0
    sta     0xea                ; product hi = 0
    ldy     #0x20               ; always 32 iterations
.Lmulsi_bit:
    ; 32-bit shift right of the multiplier; the bit that falls out of
    ; the low word (old bit 0) lands in C.
    lsr     0xe2
    ror     0xe0
    bcc     .Lmulsi_shift
    clc                         ; product += multiplicand (32-bit)
    lda     0xe8
    adc     0xe4
    sta     0xe8
    lda     0xea
    adc     0xe6
    sta     0xea
.Lmulsi_shift:
    asl     0xe4                ; multiplicand <<= 1 (32-bit)
    rol     0xe6
    dey
    bne     .Lmulsi_bit
    ldx     0xea                ; result hi
    lda     0xe8                ; result lo
    rtl

; --------------------------------------------------------------------
; __ashlsi3 — (A:X) << (4,s) -> A:X.  Shift count is i16 in low byte;
; counts >= 32 are UB in C.  Uses a per-bit loop (cheap on 65816 — one
; ASL + ROL per bit).
;
; ABI: A = a_lo, X = a_hi, (4,s) = count.  Y is the loop counter.
; --------------------------------------------------------------------
    .globl  __ashlsi3
__ashlsi3:
    sta     0xe0                ; lo
    stx     0xe2                ; hi
    lda     0x4, s
    tay                         ; Y = remaining shifts
    bra     .Lashlsi_test
.Lashlsi_step:
    asl     0xe0
    rol     0xe2
    dey
.Lashlsi_test:
    cpy     #0x0
    bne     .Lashlsi_step
    ldx     0xe2
    lda     0xe0
    rtl

; --------------------------------------------------------------------
; __lshrsi3 — logical >> shift.  LSR hi, ROR lo: hi gets a 0, lo gets
; hi's old bit 0.  Per-bit loop.  Same ABI as __ashlsi3.
; --------------------------------------------------------------------
    .globl  __lshrsi3
__lshrsi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    tay                         ; Y = remaining shifts
    bra     .Llshrsi_test
.Llshrsi_step:
    lsr     0xe2
    ror     0xe0
    dey
.Llshrsi_test:
    cpy     #0x0
    bne     .Llshrsi_step
    ldx     0xe2
    lda     0xe0
    rtl

; --------------------------------------------------------------------
; __ashrsi3 — arithmetic >> shift.  The sign bit must survive every
; iteration: bit 15 of the hi word is copied into carry (ASL on a copy
; held in A), then ROR hi, ROR lo.  Per-bit loop.  Same ABI as
; __ashlsi3.
; --------------------------------------------------------------------
    .globl  __ashrsi3
__ashrsi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    tay                         ; Y = remaining shifts
    bra     .Lashrsi_test
.Lashrsi_step:
    lda     0xe2
    asl     a                   ; C = sign bit (only the copy in A changes)
    ror     0xe2                ; sign re-enters at bit 15
    ror     0xe0
    dey
.Lashrsi_test:
    cpy     #0x0
    bne     .Lashrsi_step
    ldx     0xe2
    lda     0xe0
    rtl

; --------------------------------------------------------------------
; __udivmodsi_core — internal 32-bit unsigned divide.  Inputs in
; $e0/$e2 (numerator) and $e4/$e6 (denominator); outputs quotient in
; $e8/$ea and remainder in $ec/$ee.  32-iteration restoring divide.
; The numerator is consumed.  JSR/RTS local helper; clobbers A and Y.
; --------------------------------------------------------------------
__udivmodsi_core:
    lda     #0x0
    sta     0xe8                ; quotient = 0
    sta     0xea
    sta     0xec                ; remainder = 0
    sta     0xee
    ldy     #0x20
.Ludiv32_bit:
    asl     0xe8                ; open the next quotient bit
    rol     0xea
    asl     0xe0                ; numerator's top bit moves into the
    rol     0xe2                ; remainder through one 64-bit chain
    rol     0xec
    rol     0xee
    lda     0xee                ; compare hi words first
    cmp     0xe6
    bcc     .Ludiv32_next       ; remainder is smaller
    bne     .Ludiv32_take       ; remainder is larger
    lda     0xec                ; hi words equal: lo words decide
    cmp     0xe4
    bcc     .Ludiv32_next
.Ludiv32_take:
    sec                         ; remainder -= denominator (32-bit)
    lda     0xec
    sbc     0xe4
    sta     0xec
    lda     0xee
    sbc     0xe6
    sta     0xee
    inc     0xe8                ; quotient bit = 1
.Ludiv32_next:
    dey
    bne     .Ludiv32_bit
    rts

; --------------------------------------------------------------------
; __udivsi3 — unsigned 32/32 -> 32 divide.
; ABI: A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi.  Result in A:X.
; --------------------------------------------------------------------
    .globl  __udivsi3
__udivsi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    sta     0xe4
    lda     0x6, s
    sta     0xe6
    jsr     __udivmodsi_core
    ldx     0xea                ; quotient hi
    lda     0xe8                ; quotient lo
    rtl

; --------------------------------------------------------------------
; __umodsi3 — unsigned 32/32 -> 32 modulo.  Same ABI as __udivsi3.
; --------------------------------------------------------------------
    .globl  __umodsi3
__umodsi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    sta     0xe4
    lda     0x6, s
    sta     0xe6
    jsr     __udivmodsi_core
    ldx     0xee                ; remainder hi
    lda     0xec                ; remainder lo
    rtl

; --------------------------------------------------------------------
; __divsi3 / __modsi3 — signed 32-bit divide / modulo.  Strategy mirrors
; the i16 helpers: stash signs, take abs, run unsigned core, negate
; result(s) as needed.
; Sign tracker bits in $f0:
;   bit 0 = dividend was negative (modulo result sign)
;   bit 1 = quotient sign (sign(a) XOR sign(b))
; --------------------------------------------------------------------
    .globl  __divsi3
__divsi3:
    jsr     __divmodsi_setup
    jsr     __udivmodsi_core
    ; Quotient at $e8/$ea.  Negate if bit 1 of $f0 is set.
    lda     0xf0
    and     #0x2
    beq     .Ldivsi_pos
    ; 32-bit two's complement of the quotient: invert, add 1 with the
    ; carry rippling into the hi word (STA/LDA/EOR leave C untouched).
    lda     0xe8
    eor     #0xffff
    clc
    adc     #0x1
    sta     0xe8
    lda     0xea
    eor     #0xffff
    adc     #0x0
    sta     0xea
.Ldivsi_pos:
    ldx     0xea
    lda     0xe8
    rtl

    .globl  __modsi3
__modsi3:
    jsr     __divmodsi_setup
    jsr     __udivmodsi_core
    ; Remainder at $ec/$ee.  Negate if bit 0 of $f0 is set (dividend
    ; was negative — C99 remainder takes the dividend's sign).
    lda     0xf0
    and     #0x1
    beq     .Lmodsi_pos
    lda     0xec
    eor     #0xffff
    clc
    adc     #0x1
    sta     0xec
    lda     0xee
    eor     #0xffff
    adc     #0x0
    sta     0xee
.Lmodsi_pos:
    ldx     0xee
    lda     0xec
    rtl

; --------------------------------------------------------------------
; __divmodsi_setup — common prologue for __divsi3 / __modsi3.
; In:  A = a_lo, X = a_hi (i32-first-arg ABI); b is the caller's pushed
;      arg at (4,s)/(6,s) of the public entry.  This helper is entered
;      with JSR, so its 2-byte return address moves b to
;      (6,s) = b_lo, (8,s) = b_hi here.
; Out: |a| in $e0/$e2, |b| in $e4/$e6, sign bits in $f0.  JSR/RTS.
; --------------------------------------------------------------------
__divmodsi_setup:
    ; Clear the sign tracker without losing a_lo in A.
    pha
    lda     #0x0
    sta     0xf0
    pla
    ; |a|: store first, then negate in place if a_hi has bit 15 set.
    sta     0xe0                ; tentative a_lo (may negate below)
    stx     0xe2                ; tentative a_hi
    cpx     #0x8000
    bcc     .Lsetsi_a_pos
    ; a is negative.  Set sign tracker bits 0+1 and negate.
    lda     0xf0
    ora     #0x3
    sta     0xf0
    lda     0xe0
    eor     #0xffff
    clc
    adc     #0x1
    sta     0xe0
    lda     0xe2
    eor     #0xffff
    adc     #0x0
    sta     0xe2
.Lsetsi_a_pos:
    ; |b|.  Offsets are the public ones plus 2 (the JSR return push).
    lda     0x6, s
    sta     0xe4
    lda     0x8, s
    sta     0xe6
    cmp     #0x8000
    bcc     .Lsetsi_b_pos
    ; b is negative.  Flip bit 1 of $f0 and negate.
    lda     0xf0
    eor     #0x2
    sta     0xf0
    lda     0xe4
    eor     #0xffff
    clc
    adc     #0x1
    sta     0xe4
    lda     0xe6
    eor     #0xffff
    adc     #0x0
    sta     0xe6
.Lsetsi_b_pos:
    rts

; ====================================================================
; i64 (long long) helpers.
;
; Calling convention (i64 first arg is split via i32-first-arg path),
; offsets as seen at each public (JSL) entry point:
;   A      = arg0_lo[0..15]   (lowest word)
;   X      = arg0_lo[16..31]
;   4,S    = arg0_hi[0..15]
;   6,S    = arg0_hi[16..31]  (highest word)
; For binary ops (mul/div/mod), arg1 follows on the stack:
;   8,S    = arg1_lo[0..15]
;   10,S   = arg1_lo[16..31]
;   12,S   = arg1_hi[0..15]
;   14,S   = arg1_hi[16..31]
; For shift ops, the count occupies a single i16 at 8,S.
;
; Return ABI (matches LowerReturn for i64):
;   A = result_lo[0..15]
;   X = result_lo[16..31]
;   Y = result_hi[0..15]
;   DP $F0..$F1 = result_hi[16..31]
;
; Scratch DP layout (per-libcall, no overlap between concurrent calls):
;   $E0..$E7 = a, later the value handed to __retdi (4 16-bit words)
;   $E8..$EF = b (8 bytes)
;   $F2..$F9 = product (multiply) / remainder (divide)
;   $FA..$FF = trial-subtraction scratch of the 64-bit divide core
;
; All routines run with REP #$30 (M=0, X=0).
; ====================================================================

; --------------------------------------------------------------------
; __divmoddi4_stash — common operand loader for the binary i64
; helpers.  Copies a -> $E0..$E7 and b -> $E8..$EF.
;
; In:  A = a word 0, X = a word 1, remaining words on the caller's
;      stack.  MUST be entered with JSR as the first instruction of a
;      public JSL entry point, before anything else is pushed: the
;      2-byte JSR return address sits on top of the 3-byte JSL return
;      address, so every stack operand is 2 bytes further away than in
;      the public-entry table above (4,S there is 6,S here, and so on
;      up to 14,S -> 16,S).
; Out: operands in DP scratch.  Clobbers A; X and Y are preserved.
; --------------------------------------------------------------------
__divmoddi4_stash:
    sta     0xe0                ; a word 0
    stx     0xe2                ; a word 1
    lda     0x6, s
    sta     0xe4                ; a word 2   (public 4,S)
    lda     0x8, s
    sta     0xe6                ; a word 3   (public 6,S)
    lda     0xa, s
    sta     0xe8                ; b word 0   (public 8,S)
    lda     0xc, s
    sta     0xea                ; b word 1   (public 10,S)
    lda     0xe, s
    sta     0xec                ; b word 2   (public 12,S)
    lda     0x10, s
    sta     0xee                ; b word 3   (public 14,S)
    rts

; --------------------------------------------------------------------
; Helper: pack the result at $E0..$E7 into the i64 return ABI.
; Loads A, X, Y and DP $F0 from scratch.  Entry points branch here as
; their last step; the helper's own RTL returns to the JSL caller.
; --------------------------------------------------------------------
__retdi:
    lda     0xe6
    sta     0xf0                ; result word 3 -> DP $F0..$F1
    lda     0xe4
    tay                         ; result word 2 -> Y
    lda     0xe2
    tax                         ; result word 1 -> X
    lda     0xe0                ; result word 0 -> A
    rtl

; --------------------------------------------------------------------
; __ashldi3 — i64 left shift by n.  Per-bit loop.  Y holds count.
; In: A/X/(4,S)/(6,S) = value words 0..3, (8,S) = count.
; Out: i64 return ABI via __retdi.
; --------------------------------------------------------------------
    .globl  __ashldi3
__ashldi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    sta     0xe4
    lda     0x6, s
    sta     0xe6
    lda     0x8, s
    tay                         ; Y = count
.Lashldi_loop:
    cpy     #0x0
    beq     .Lashldi_done
    asl     0xe0                ; carry ripples from word 0 up to word 3
    rol     0xe2
    rol     0xe4
    rol     0xe6
    dey
    bra     .Lashldi_loop
.Lashldi_done:
    brl     __retdi

; --------------------------------------------------------------------
; __lshrdi3 — i64 logical right shift.  LSR top word, ROR rest.
; Same argument layout as __ashldi3.
; --------------------------------------------------------------------
    .globl  __lshrdi3
__lshrdi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    sta     0xe4
    lda     0x6, s
    sta     0xe6
    lda     0x8, s
    tay                         ; Y = count
.Llshrdi_loop:
    cpy     #0x0
    beq     .Llshrdi_done
    lsr     0xe6                ; carry ripples from word 3 down to word 0
    ror     0xe4
    ror     0xe2
    ror     0xe0
    dey
    bra     .Llshrdi_loop
.Llshrdi_done:
    brl     __retdi

; --------------------------------------------------------------------
; __ashrdi3 — i64 arithmetic right shift.  Same as __lshrdi3 except
; that the top bit replicates: before each step the sign (bit 15 of
; the word at $E6) is copied into C, so the first ROR re-inserts it.
; Same argument layout as __ashldi3.
; --------------------------------------------------------------------
    .globl  __ashrdi3
__ashrdi3:
    sta     0xe0
    stx     0xe2
    lda     0x4, s
    sta     0xe4
    lda     0x6, s
    sta     0xe6
    lda     0x8, s
    tay                         ; Y = count
.Lashrdi_loop:
    cpy     #0x0
    beq     .Lashrdi_done
    lda     0xe6
    asl     a                   ; C = sign bit; only the copy in A changes
    ror     0xe6                ; $E6 = (sign << 15) | ($E6 >> 1)
    ror     0xe4
    ror     0xe2
    ror     0xe0
    dey
    bra     .Lashrdi_loop
.Lashrdi_done:
    brl     __retdi

; --------------------------------------------------------------------
; __muldi3 — i64 multiply (low 64 bits of 64x64 product).
; Shift-and-add over a (64 bits).  Product accumulates at $F2..$F9
; (above the return DP slot), because $E0..$EF holds the operands.
; In: i64 binary-op convention (a in A/X/4,S/6,S; b at 8,S..14,S).
; Out: i64 return ABI via __retdi.
; --------------------------------------------------------------------
    .globl  __muldi3
__muldi3:
    jsr     __divmoddi4_stash
    ; Clear product P0..P3 at $F2..$F8.
    lda     #0x0
    sta     0xf2
    sta     0xf4
    sta     0xf6
    sta     0xf8
    ; Loop 64 times on a's bits.
    ldy     #0x40
.Lmuldi_loop:
    ; 64-bit shift right of a.  The chain must start at the TOP word
    ; so that each carry moves down into the next lower word; the bit
    ; left in C at the end is a's old bit 0, which selects the add.
    lsr     0xe6
    ror     0xe4
    ror     0xe2
    ror     0xe0
    bcc     .Lmuldi_noadd
    ; Add b ($E8..$EE) to product ($F2..$F8).
    clc
    lda     0xf2
    adc     0xe8
    sta     0xf2
    lda     0xf4
    adc     0xea
    sta     0xf4
    lda     0xf6
    adc     0xec
    sta     0xf6
    lda     0xf8
    adc     0xee
    sta     0xf8
.Lmuldi_noadd:
    ; Shift b left by 1 (so each iteration uses next bit position).
    asl     0xe8
    rol     0xea
    rol     0xec
    rol     0xee
    dey
    bne     .Lmuldi_loop
    ; Move product into return slots ($E0..$E7) and tail-call __retdi.
    lda     0xf2
    sta     0xe0
    lda     0xf4
    sta     0xe2
    lda     0xf6
    sta     0xe4
    lda     0xf8
    sta     0xe6
    brl     __retdi

; --------------------------------------------------------------------
; __ucmpdi2 — unsigned i64 compare.  Returns 0 if a is below b, 1 if
; they are equal, 2 if a is above b (libgcc convention).  The i16
; result is returned in A (X, Y and $F0 are don't-care).
; In: i64 binary-op convention.
; --------------------------------------------------------------------
    .globl  __ucmpdi2
__ucmpdi2:
    ; Stash a/b first so we have a stable layout, then compare from
    ; the most significant word downwards.
    jsr     __divmoddi4_stash
    lda     0xe6
    cmp     0xee
    bne     .Lucmpdi_decided
    lda     0xe4
    cmp     0xec
    bne     .Lucmpdi_decided
    lda     0xe2
    cmp     0xea
    bne     .Lucmpdi_decided
    lda     0xe0
    cmp     0xe8
    bne     .Lucmpdi_decided
    ; Equal.
    lda     #0x1
    rtl
.Lucmpdi_decided:
    ; The first differing word decides, using the flags of its CMP:
    ; carry clear -> a < b -> return 0; carry set -> a > b -> return 2.
    bcc     .Lucmpdi_lt
    lda     #0x2
    rtl
.Lucmpdi_lt:
    lda     #0x0
    rtl

; --------------------------------------------------------------------
; __cmpdi2 — signed i64 compare.  Same {0,1,2} return convention.
; Implemented by flipping the sign bit of both top words and then
; doing an unsigned compare (XOR with $8000 maps signed order onto
; unsigned order).
; --------------------------------------------------------------------
    .globl  __cmpdi2
__cmpdi2:
    jsr     __divmoddi4_stash
    lda     0xe6
    eor     #0x8000
    sta     0xe6
    lda     0xee
    eor     #0x8000
    sta     0xee
    ; Unsigned compare on the rewritten values.
    lda     0xe6
    cmp     0xee
    bne     .Lcmpdi_decided
    lda     0xe4
    cmp     0xec
    bne     .Lcmpdi_decided
    lda     0xe2
    cmp     0xea
    bne     .Lcmpdi_decided
    lda     0xe0
    cmp     0xe8
    bne     .Lcmpdi_decided
    lda     #0x1
    rtl
.Lcmpdi_decided:
    bcc     .Lcmpdi_lt
    lda     #0x2
    rtl
.Lcmpdi_lt:
    lda     #0x0
    rtl

; --------------------------------------------------------------------
; __udivdi3 / __umoddi3 — unsigned 64-bit divide / modulo.  Restoring
; division: shift dividend left into a remainder register, conditionally
; subtract the divisor.  The two libcalls share the core; the quotient
; lands at $E0..$E7 and the remainder at $F2..$F9.  __udivdi3 returns
; the quotient in place; __umoddi3 first copies the remainder into
; $E0..$E7.
; --------------------------------------------------------------------
    .globl  __udivdi3
__udivdi3:
    jsr     __divmoddi4_stash
    jsr     __udivmoddi_core
    brl     __retdi

    .globl  __umoddi3
__umoddi3:
    jsr     __divmoddi4_stash
    jsr     __udivmoddi_core
    ; Move remainder ($F2..$F8) -> $E0..$E7 for return.
    lda     0xf2
    sta     0xe0
    lda     0xf4
    sta     0xe2
    lda     0xf6
    sta     0xe4
    lda     0xf8
    sta     0xe6
    brl     __retdi

; Core: dividend at $E0..$E6, divisor at $E8..$EE.
; Output: quotient at $E0..$E6, remainder at $F2..$F8.
; Scratch: $FA..$FF hold the trial subtraction and are overwritten on
; every iteration, so callers must not keep live data there.
; Clobbers A and Y.  JSR/RTS local helper.
__udivmoddi_core:
    ; Clear remainder $F2..$F8.
    lda     #0x0
    sta     0xf2
    sta     0xf4
    sta     0xf6
    sta     0xf8
    ldy     #0x40
.Ludivmoddi_loop:
    ; Shift left: dividend (becomes quotient) and remainder together
    ; as a 128-bit register.  The bit shifted out of the dividend's
    ; top word becomes the remainder's LSB.
    asl     0xe0
    rol     0xe2
    rol     0xe4
    rol     0xe6
    rol     0xf2
    rol     0xf4
    rol     0xf6
    rol     0xf8
    ; Try remainder - divisor.  If no borrow, accept and set quotient bit.
    sec
    lda     0xf2
    sbc     0xe8
    sta     0xfa                ; tentative result, words 0..2 at $FA..$FF
    lda     0xf4
    sbc     0xea
    sta     0xfc
    lda     0xf6
    sbc     0xec
    sta     0xfe
    lda     0xf8
    sbc     0xee                ; A holds the new top word.  C = !borrow.
    bcc     .Ludivmoddi_skip
    ; Accept: remainder = remainder - divisor, quotient bit 0 = 1.
    sta     0xf8
    lda     0xfe
    sta     0xf6
    lda     0xfc
    sta     0xf4
    lda     0xfa
    sta     0xf2
    ; Set bit 0 of dividend (which we shifted left, so position is open).
    lda     0xe0
    ora     #0x1
    sta     0xe0
.Ludivmoddi_skip:
    dey
    bne     .Ludivmoddi_loop
    rts

; --------------------------------------------------------------------
; __divdi3 / __moddi3 — signed 64-bit divide / modulo.  Take absolute
; values, run the unsigned core, fix up the sign.
;   div: sign(quotient)  = sign(a) XOR sign(b)
;   mod: sign(remainder) = sign(a)
; The sign flag is kept on the stack across the core call: every DP
; byte from $E0 to $FF is in use by the core ($FA..$FF is its
; trial-subtraction scratch), so a DP slot would be overwritten.
; --------------------------------------------------------------------
    .globl  __divdi3
__divdi3:
    jsr     __divmoddi4_stash   ; must come before any push (reads n,S)
    ; Quotient sign = bit 15 of (a top word XOR b top word).
    lda     0xe6
    eor     0xee
    and     #0x8000
    pha                         ; nonzero => quotient must be negated
    jsr     __absdi_a
    jsr     __absdi_b
    jsr     __udivmoddi_core
    pla                         ; Z flag reflects the saved sign
    beq     .Ldivdi_pos
    jsr     __negdi_a
.Ldivdi_pos:
    brl     __retdi

    .globl  __moddi3
__moddi3:
    jsr     __divmoddi4_stash   ; must come before any push (reads n,S)
    ; Remainder sign = sign of a.
    lda     0xe6
    and     #0x8000
    pha                         ; nonzero => remainder must be negated
    jsr     __absdi_a
    jsr     __absdi_b
    jsr     __udivmoddi_core
    ; Move remainder to $E0..$E6.
    lda     0xf2
    sta     0xe0
    lda     0xf4
    sta     0xe2
    lda     0xf6
    sta     0xe4
    lda     0xf8
    sta     0xe6
    ; Apply sign.
    pla                         ; Z flag reflects the saved sign
    beq     .Lmoddi_pos
    jsr     __negdi_a
.Lmoddi_pos:
    brl     __retdi

; --- subroutines used by signed div/mod ---

; __absdi_a: if $E6 has sign bit set, negate $E0..$E6.
__absdi_a:
    lda     0xe6
    bpl     .Labsdi_a_done
    jsr     __negdi_a
.Labsdi_a_done:
    rts

; __absdi_b: if $EE has sign bit set, negate $E8..$EE.
__absdi_b:
    lda     0xee
    bpl     .Labsdi_b_done
    jsr     __negdi_b
.Labsdi_b_done:
    rts

; __negdi_a: 2's complement negate $E0..$E6.
__negdi_a:
    ; Computes 0 - value one word at a time; the borrow ripples upward
    ; through C (LDA/STA leave it untouched).
    sec
    lda     #0x0
    sbc     0xe0
    sta     0xe0
    lda     #0x0
    sbc     0xe2
    sta     0xe2
    lda     #0x0
    sbc     0xe4
    sta     0xe4
    lda     #0x0
    sbc     0xe6
    sta     0xe6
    rts

; __negdi_b: 2's complement negate $E8..$EE.
__negdi_b:
    sec
    lda     #0x0
    sbc     0xe8
    sta     0xe8
    lda     #0x0
    sbc     0xea
    sta     0xea
    lda     #0x0
    sbc     0xec
    sta     0xec
    lda     #0x0
    sbc     0xee
    sta     0xee
    rts

; --------------------------------------------------------------------
; setjmp(jmp_buf env)           - save calling environment, return 0
; longjmp(jmp_buf env, int val) - restore environment, return val
;                                 (or 1 if val == 0)
;
; jmp_buf layout (8 bytes):
;   [0..1] = caller's stack pointer (SP+3 at entry to setjmp, i.e. the
;            value SP has right after setjmp's RTL)
;   [2..3] = return address PC lo:hi (16 bits, exactly as JSL pushed it)
;   [4]    = return address bank (1 byte)
;   [5..6] = direct page register (DP)
;   [7]    = reserved / padding
;
; Caller-save convention: longjmp doesn't restore X / Y / A — caller's
; setjmp returned 0 with all-callee-savable regs already preserved by
; setjmp's caller.
;
; Both routines use DP $E0 as the pointer to env (and longjmp uses $E2
; for val) and clobber Y.
; --------------------------------------------------------------------
    .globl  setjmp
setjmp:
    sta     0xe0                ; jmp_buf addr -> DP scratch
    tsc                         ; A = current SP
    clc
    adc     #0x3                ; A = caller's SP (undo JSL push)
    ldy     #0
    sta     (0xe0), y           ; env[0..1] = caller SP
    lda     0x1, s              ; A = retaddr lo:hi
    ldy     #2
    sta     (0xe0), y           ; env[2..3] = retaddr lo:hi
    sep     #0x20               ; 8-bit A for the single bank byte
    lda     0x3, s              ; A_lo = bank
    ldy     #4
    sta     (0xe0), y           ; env[4] = bank
    rep     #0x20
    tdc                         ; A = DP
    ldy     #5
    sta     (0xe0), y           ; env[5..6] = DP
    lda     #0                  ; setjmp returns 0
    rtl

    .globl  longjmp
longjmp:
    sta     0xe0                ; jmp_buf addr -> DP scratch
    lda     0x4, s              ; A = val (2nd arg, on stack)
    sta     0xe2                ; save val before SP moves
    ; Restore SP to the value saved by setjmp (the SP its caller had
    ; right after setjmp returned).  The three return-address bytes
    ; pushed next bring SP down by 3, and the final RTL pops them
    ; again, so the caller resumes with exactly the saved SP.
    ldy     #0
    lda     (0xe0), y           ; A = saved SP
    tcs
    ; Push retaddr: bank, then 16-bit lo:hi.  RTL pulls lo, hi, bank.
    sep     #0x20
    ldy     #4
    lda     (0xe0), y           ; bank
    pha
    rep     #0x20
    ldy     #2
    lda     (0xe0), y           ; lo:hi
    pha
    ; Park val on the stack: $E2 is addressed through the *current* DP
    ; and would be unreachable once DP is switched below.
    lda     0xe2
    pha
    ; Restore DP (env is still read through the old DP's $E0 pointer).
    ldy     #5
    lda     (0xe0), y
    tcd
    ; Return value: val if nonzero, else 1.  PLA sets Z from val and
    ; rebalances the stack so RTL sees the return address on top.
    pla
    bne     .Llj_done
    lda     #1
.Llj_done:
    rtl