1180 lines
27 KiB
ArmAsm
1180 lines
27 KiB
ArmAsm
; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs.
|
|
; Provides the helpers that the LLVM backend lowers integer multiply,
|
|
; shift, divide, and modulo operations to. Implementations are
|
|
; correct-but-unoptimised; they exist to unblock end-to-end testing,
|
|
; not to compete with hand-tuned 65816 math libraries.
|
|
;
|
|
; Calling convention (matches W65816ISelLowering::LowerCall):
|
|
; - Arg 0 in A (16-bit M).
|
|
; - Arg 1 pushed via PHA before the JSL. Reads as (4,S) inside the
|
|
; callee (3-byte JSL return address sits at 1..3,S).
|
|
; - Return value in A. Caller releases pushed args.
|
|
; - Routines run in 16-bit M, 16-bit X (REP #$30 by convention).
|
|
;
|
|
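; Direct-page scratch lives at DP+$E0..DP+$FF (32 bytes): the i16 helpers
; stay within $E0..$EF, while the signed i32 divide/modulo and the i64
; helpers also use $F0..$FF. Programs that use this runtime must keep DP=0
; or remap accordingly.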
|
|
;
|
|
; Assembled with: tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \
|
|
; -filetype=obj
|
|
; runtime/src/libgcc.s
|
|
; -o runtime/libgcc.o
|
|
|
|
.text
|
|
|
|
; --------------------------------------------------------------------
; Indirect-call trampoline (__jsl_indir) and its target slot
; (__indirTarget).
;
; For a call through a function pointer the caller stores the target's
; 16-bit address in __indirTarget and then JSLs here. The trampoline is
; a single JMP indirect through that slot, so the target runs with the
; original caller's JSL frame still on the stack and the target's RTL
; returns to the original caller (one frame, no double-RTL).
;
; Caller emit sequence in W65816ISelLowering::LowerCall:
;     sta __indirTarget   ; store ptr (must precede any A clobber for args)
;     ... arg pushes ...
;     jsl __jsl_indir
;
; Single-bank only (the IIgs convention assumes code in bank 0/1 via
; JSL; JMP indirect is bank-local).
; --------------------------------------------------------------------
        .globl  __indirTarget
        .bss
__indirTarget:
        .zero   2                       ; 16-bit target address slot

        .text
        .globl  __jsl_indir
__jsl_indir:
        ; JMP (__indirTarget), emitted as raw bytes because the assembler
        ; does not yet parse the `(abs)` operand syntax: opcode $6C
        ; followed by a 16-bit relocation against the slot.
        ; Effect: PC <- mem[__indirTarget].
        .byte   0x6C
        .word   __indirTarget
|
|
|
|
; --------------------------------------------------------------------
; __mulhi3: 16-bit multiply, A * (4,S) -> A.
; Only the low 16 bits of the product are returned, so one routine
; serves both signed and unsigned multiplies.
; Method: shift-and-add, consuming the multiplier from its low bit and
; stopping as soon as no set bits remain.
; Scratch: $e0 = multiplier, $e2 = multiplicand, $e4 = product.
; --------------------------------------------------------------------
        .globl  __mulhi3
__mulhi3:
        sta     0xe0                    ; multiplier (arg 0)
        lda     0x4, s
        sta     0xe2                    ; multiplicand (arg 1)
        lda     #0
        sta     0xe4                    ; product starts at zero
.Lmulhi_bit:
        lda     0xe0
        beq     .Lmulhi_ret             ; multiplier exhausted
        lsr     a                       ; C = lowest remaining bit
        sta     0xe0
        bcc     .Lmulhi_shift
        lda     0xe4                    ; bit set: product += multiplicand
        clc
        adc     0xe2
        sta     0xe4
.Lmulhi_shift:
        asl     0xe2                    ; multiplicand *= 2 for the next bit
        bra     .Lmulhi_bit
.Lmulhi_ret:
        lda     0xe4
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __ashlhi3: A << (4,S) -> A.
; The count is an i16; the loop shifts one bit per count, and counts
; of 16 or more are undefined behaviour in C anyway.
; --------------------------------------------------------------------
        .globl  __ashlhi3
__ashlhi3:
        pha                             ; park the value so A can fetch the count
        lda     0x6, s                  ; arg 1: (4,S) moved to (6,S) by the PHA
        tax                             ; X = shift count
        pla                             ; A = value again
.Lashlhi_step:
        cpx     #0
        beq     .Lashlhi_ret
        asl     a
        dex
        bra     .Lashlhi_step
.Lashlhi_ret:
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __lshrhi3: A logical >> (4,S) -> A.
; Mirror image of __ashlhi3: one LSR per count, zeros enter at the top.
; --------------------------------------------------------------------
        .globl  __lshrhi3
__lshrhi3:
        pha                             ; park the value
        lda     0x6, s                  ; count (shifted to 6,S by the PHA)
        tax
        pla
.Llshrhi_step:
        cpx     #0
        beq     .Llshrhi_ret
        lsr     a
        dex
        bra     .Llshrhi_step
.Llshrhi_ret:
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __ashrhi3: A arithmetic >> (4,S) -> A.
; Before every ROR the sign bit is copied into carry with CMP #$8000
; (carry is set exactly when the unsigned value is >= $8000, i.e. when
; bit 15 is set), so the ROR re-inserts the sign at the top.
; --------------------------------------------------------------------
        .globl  __ashrhi3
__ashrhi3:
        pha                             ; park the value
        lda     0x6, s                  ; count (shifted to 6,S by the PHA)
        tax
        pla
.Lashrhi_step:
        cpx     #0
        beq     .Lashrhi_ret
        cmp     #0x8000                 ; C = sign bit of A
        ror     a                       ; shift right, sign re-enters bit 15
        dex
        bra     .Lashrhi_step
.Lashrhi_ret:
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __udivhi3: unsigned A / (4,S) -> A.
; Thin wrapper over __udivmod_core (restoring shift-subtract divide);
; __umodhi3 runs the same core and returns the remainder instead.
; Scratch: $e6 = numerator, $e8 = denominator,
;          $ea = quotient,  $ec = remainder.
; --------------------------------------------------------------------
        .globl  __udivhi3
__udivhi3:
        ; Load the core's inputs: dividend from A, divisor from the stack.
        sta     0xe6
        lda     0x4, s
        sta     0xe8
        jsr     __udivmod_core
        lda     0xea                    ; return the quotient
        rtl
|
|
|
|
; __umodhi3: unsigned A % (4,S) -> A. Same setup as __udivhi3, but the
; remainder slot ($ec) is returned.
        .globl  __umodhi3
__umodhi3:
        sta     0xe6                    ; dividend
        lda     0x4, s
        sta     0xe8                    ; divisor
        jsr     __udivmod_core
        lda     0xec                    ; return the remainder
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __divhi3 / __modhi3: signed 16-bit divide and modulo.
;   - __divmod_setup records the signs in $ee and stores |a|, |b|:
;       bit 0 = dividend was negative         (used by modulo)
;       bit 1 = sign(a) XOR sign(b)           (used by divide)
;   - the unsigned core runs on the absolute values;
;   - the selected result is negated when its tracker bit is set.
; C99 semantics: the quotient truncates toward zero and the remainder
; takes the sign of the dividend.
; --------------------------------------------------------------------
        .globl  __divhi3
__divhi3:
        jsr     __divmod_setup
        jsr     __udivmod_core
        ; Unsigned quotient is in $ea; bit 1 of $ee says whether to negate.
        lda     0xea
        pha                             ; keep the quotient while testing $ee
        lda     0xee
        and     #2
        beq     .Ldivhi_keep
        pla                             ; negative result: two's complement
        eor     #0xffff
        clc
        adc     #1
        rtl
.Ldivhi_keep:
        pla
        rtl
|
|
|
|
; __modhi3: signed A % (4,S) -> A. The remainder carries the dividend's
; sign, recorded in bit 0 of $ee by __divmod_setup.
        .globl  __modhi3
__modhi3:
        jsr     __divmod_setup
        jsr     __udivmod_core
        ; Unsigned remainder is in $ec; negate it if the dividend was negative.
        lda     0xec
        pha                             ; keep the remainder while testing $ee
        lda     0xee
        and     #1
        beq     .Lmodhi_keep
        pla                             ; two's complement
        eor     #0xffff
        clc
        adc     #1
        rtl
.Lmodhi_keep:
        pla
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __divmod_setup: shared prologue of __divhi3 / __modhi3 (JSR/RTS, same
; bank).
; In:  A = dividend; divisor in the public entry's arg-1 slot. That slot
;      is (4,S) in the public routine and (6,S) here, because the JSR
;      that got us here pushed a 2-byte return address on top of it.
; Out: $e6 = |a|, $e8 = |b|, $ee = sign tracker:
;        bit 0 = 1 if the dividend was negative   (modulo result sign)
;        bit 1 = 1 if the operand signs differ    (quotient sign)
; --------------------------------------------------------------------
__divmod_setup:
        ; Zero the tracker without losing A (no STZ in our instruction
        ; set yet, hence the PHA / LDA #0 / STA / PLA dance).
        pha
        lda     #0
        sta     0xee
        pla
        ; Dividend: unsigned compare against $8000 tests the sign bit.
        cmp     #0x8000
        bcc     .Ldms_a_abs
        ; Negative dividend: set bits 0 and 1, then negate A.
        pha
        lda     0xee
        ora     #3
        sta     0xee
        pla
        eor     #0xffff
        clc
        adc     #1
.Ldms_a_abs:
        sta     0xe6
        ; Divisor: fetched from (6,S), see the header note.
        lda     0x6, s
        cmp     #0x8000
        bcc     .Ldms_b_abs
        ; Negative divisor: toggle bit 1 (XOR with the dividend's sign),
        ; then negate A.
        pha
        lda     0xee
        eor     #2
        sta     0xee
        pla
        eor     #0xffff
        clc
        adc     #1
.Ldms_b_abs:
        sta     0xe8
        rts
|
|
|
|
; --------------------------------------------------------------------
; __udivmod_core: internal 16-bit restoring divide (JSR/RTS helper).
; In:  $e6 = numerator (consumed), $e8 = denominator.
; Out: $ea = quotient, $ec = remainder.
; Runs 16 iterations with X as the counter; clobbers A and X.
; --------------------------------------------------------------------
__udivmod_core:
        lda     #0
        sta     0xea                    ; quotient  = 0
        sta     0xec                    ; remainder = 0
        ldx     #16
.Ludm16_bit:
        asl     0xe6                    ; next numerator bit -> C
        rol     0xec                    ; ... -> low bit of the remainder
        asl     0xea                    ; make room for the new quotient bit
        lda     0xec
        cmp     0xe8
        bcc     .Ludm16_next            ; remainder < denominator: bit stays 0
        sec
        sbc     0xe8                    ; remainder -= denominator
        sta     0xec
        inc     0xea                    ; quotient bit = 1
.Ludm16_next:
        dex
        bne     .Ludm16_bit
        rts
|
|
|
|
; ====================================================================
|
|
; 32-bit (long / si) helpers.
|
|
;
|
|
; ABI for these is the natural extension of the i16 libcalls:
; - arg0_lo in A, arg0_hi in X
; - arg1_lo at (4,s) (or shift count, for the shift helpers)
; - arg1_hi at (6,s)
; - return: result_lo in A, result_hi in X
|
|
;
|
|
; All are correct-but-unoptimised; goal is unblocking end-to-end builds,
|
|
; not winning a 65816 codegolf.
|
|
;
|
|
; Direct-page scratch for these:
|
|
; $e0..$e3 = a (lo, hi) [renamed from $e0/$e2 for the i16 ones]
|
|
; $e4..$e7 = b (lo, hi)
|
|
; $e8..$eb = result / quotient (lo, hi)
|
|
; $ec..$ef = remainder (lo, hi)
|
|
; ====================================================================
|
|
|
|
; --------------------------------------------------------------------
; __mulsi3: 32-bit multiply, result = (a * b) mod 2^32.
; Shift-and-add over all 32 multiplier bits.
;
; ABI: A = a_lo, X = a_hi (the i32-first-arg in A:X convention),
;      (4,s) = b_lo, (6,s) = b_hi. Result returned in A:X (lo:hi).
; Scratch: $e0/$e2 = multiplier a, $e4/$e6 = multiplicand b,
;          $e8/$ea = running product. Y counts the iterations.
; --------------------------------------------------------------------
        .globl  __mulsi3
__mulsi3:
        sta     0xe0                    ; a_lo
        stx     0xe2                    ; a_hi
        lda     0x4, s
        sta     0xe4                    ; b_lo
        lda     0x6, s
        sta     0xe6                    ; b_hi
        lda     #0
        sta     0xe8                    ; product_lo = 0
        sta     0xea                    ; product_hi = 0
        ldy     #32                     ; one pass per multiplier bit
.Lmulsi_bit:
        ; Shift the low multiplier word right; C receives bit 0.
        lda     0xe0
        lsr     a
        sta     0xe0
        bcc     .Lmulsi_shift
        ; Bit was set: product += multiplicand (32-bit add).
        clc
        lda     0xe8
        adc     0xe4
        sta     0xe8
        lda     0xea
        adc     0xe6
        sta     0xea
.Lmulsi_shift:
        ; Multiplicand <<= 1 across both words.
        asl     0xe4
        rol     0xe6
        ; Finish the 32-bit right shift of the multiplier: the low word
        ; was already shifted above (leaving bit 15 clear), so shift the
        ; high word and move the bit that falls out into bit 15 of the
        ; low word.
        lsr     0xe2
        bcc     .Lmulsi_count
        lda     0xe0
        ora     #0x8000
        sta     0xe0
.Lmulsi_count:
        dey
        bne     .Lmulsi_bit
        ldx     0xea                    ; X = product_hi
        lda     0xe8                    ; A = product_lo
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __ashlsi3: (A:X) << (4,s) -> A:X.
; The count is an i16; counts >= 32 are UB in C. One ASL + ROL pair is
; executed per counted bit.
;
; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = count.
; Scratch: $e0 = lo word, $e2 = hi word; Y = remaining count.
; --------------------------------------------------------------------
        .globl  __ashlsi3
__ashlsi3:
        sta     0xe0                    ; lo word
        stx     0xe2                    ; hi word
        lda     0x4, s
        tay                             ; Y = count
.Lashlsi_step:
        cpy     #0
        beq     .Lashlsi_ret
        asl     0xe0                    ; lo << 1, top bit -> C
        rol     0xe2                    ; C -> bottom of hi
        dey
        bra     .Lashlsi_step
.Lashlsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __lshrsi3: (A:X) logical >> (4,s) -> A:X.
; Per bit: LSR the hi word (a zero enters at the top), then ROR the lo
; word so that hi's old bit 0 arrives in lo's bit 15.
; Scratch: $e0 = lo word, $e2 = hi word; Y = remaining count.
; --------------------------------------------------------------------
        .globl  __lshrsi3
__lshrsi3:
        sta     0xe0                    ; lo word
        stx     0xe2                    ; hi word
        lda     0x4, s
        tay                             ; Y = count
.Llshrsi_step:
        cpy     #0
        beq     .Llshrsi_ret
        lsr     0xe2
        ror     0xe0
        dey
        bra     .Llshrsi_step
.Llshrsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __ashrsi3: (A:X) arithmetic >> (4,s) -> A:X.
; Per bit: copy bit 15 of the hi word into carry (CMP #$8000), then
; ROR hi and ROR lo, so the sign is re-inserted on every step.
; Scratch: $e0 = lo word, $e2 = hi word; Y = remaining count.
; --------------------------------------------------------------------
        .globl  __ashrsi3
__ashrsi3:
        sta     0xe0                    ; lo word
        stx     0xe2                    ; hi word
        lda     0x4, s
        tay                             ; Y = count
.Lashrsi_step:
        cpy     #0
        beq     .Lashrsi_ret
        ; C is set iff the unsigned hi word is >= $8000, which is
        ; precisely "bit 15 set".
        lda     0xe2
        cmp     #0x8000
        ror     0xe2
        ror     0xe0
        dey
        bra     .Lashrsi_step
.Lashrsi_ret:
        ldx     0xe2
        lda     0xe0
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __udivmodsi_core: internal 32-bit restoring divide (JSR/RTS helper).
; In:  $e0/$e2 = numerator (consumed), $e4/$e6 = denominator.
; Out: $e8/$ea = quotient, $ec/$ee = remainder.
; Runs 32 iterations with Y as the counter; clobbers A and Y.
; --------------------------------------------------------------------
__udivmodsi_core:
        lda     #0
        sta     0xe8                    ; quotient  = 0
        sta     0xea
        sta     0xec                    ; remainder = 0
        sta     0xee
        ldy     #32
.Ludm32_bit:
        ; Move the numerator's top bit into the bottom of the remainder.
        asl     0xe0
        rol     0xe2
        rol     0xec
        rol     0xee
        ; Open the next quotient bit position.
        asl     0xe8
        rol     0xea
        ; 32-bit unsigned compare, high word first.
        lda     0xee
        cmp     0xe6
        bcc     .Ludm32_next            ; high word smaller: remainder < denom
        bne     .Ludm32_sub             ; high word larger: remainder > denom
        lda     0xec                    ; high words equal: low word decides
        cmp     0xe4
        bcc     .Ludm32_next
.Ludm32_sub:
        ; remainder >= denominator: subtract and record a 1 bit.
        sec
        lda     0xec
        sbc     0xe4
        sta     0xec
        lda     0xee
        sbc     0xe6
        sta     0xee
        inc     0xe8
.Ludm32_next:
        dey
        bne     .Ludm32_bit
        rts
|
|
|
|
; --------------------------------------------------------------------
; __udivsi3: unsigned 32 / 32 -> 32 divide.
; ABI: A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi; result in A:X.
; --------------------------------------------------------------------
        .globl  __udivsi3
__udivsi3:
        sta     0xe0                    ; numerator lo
        stx     0xe2                    ; numerator hi
        lda     0x4, s
        sta     0xe4                    ; denominator lo
        lda     0x6, s
        sta     0xe6                    ; denominator hi
        jsr     __udivmodsi_core
        ldx     0xea                    ; quotient hi
        lda     0xe8                    ; quotient lo
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __umodsi3: unsigned 32 % 32 -> 32 modulo.
; Same argument layout as __udivsi3 (A:X = a, (4,s)/(6,s) = b); returns
; the core's remainder in A:X.
; --------------------------------------------------------------------
        .globl  __umodsi3
__umodsi3:
        sta     0xe0                    ; numerator lo
        stx     0xe2                    ; numerator hi
        lda     0x4, s
        sta     0xe4                    ; denominator lo
        lda     0x6, s
        sta     0xe6                    ; denominator hi
        jsr     __udivmodsi_core
        ldx     0xee                    ; remainder hi
        lda     0xec                    ; remainder lo
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __divsi3 / __modsi3: signed 32-bit divide / modulo. Same plan as the
; i16 helpers: record signs, divide the absolute values with the
; unsigned core, negate the selected result if required.
; Sign tracker lives in $f0:
;   bit 0 = dividend was negative          (modulo result sign)
;   bit 1 = sign(a) XOR sign(b)            (quotient sign)
; --------------------------------------------------------------------
        .globl  __divsi3
__divsi3:
        jsr     __divmodsi_setup
        jsr     __udivmodsi_core
        ; Unsigned quotient is at $e8/$ea; bit 1 of $f0 requests negation.
        lda     0xf0
        and     #2
        beq     .Ldivsi_ret
        ; 32-bit two's complement: invert both words, add 1 with carry.
        lda     0xe8
        eor     #0xffff
        clc
        adc     #1
        sta     0xe8
        lda     0xea
        eor     #0xffff
        adc     #0
        sta     0xea
.Ldivsi_ret:
        ldx     0xea
        lda     0xe8
        rtl
|
|
|
|
; __modsi3: signed 32-bit modulo. The remainder takes the dividend's
; sign (C99), which __divmodsi_setup recorded in bit 0 of $f0.
        .globl  __modsi3
__modsi3:
        jsr     __divmodsi_setup
        jsr     __udivmodsi_core
        ; Unsigned remainder is at $ec/$ee.
        lda     0xf0
        and     #1
        beq     .Lmodsi_ret
        ; Dividend was negative: 32-bit two's complement of the remainder.
        lda     0xec
        eor     #0xffff
        clc
        adc     #1
        sta     0xec
        lda     0xee
        eor     #0xffff
        adc     #0
        sta     0xee
.Lmodsi_ret:
        ldx     0xee
        lda     0xec
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __divmodsi_setup: shared prologue of __divsi3 / __modsi3 (JSR/RTS).
; In:  A = a_lo, X = a_hi (i32-first-arg ABI); b in the public entry's
;      (4,s)/(6,s) slots, which are (6,s)/(8,s) here because of the
;      2-byte JSR return address pushed on the way in.
; Out: $e0/$e2 = |a|, $e4/$e6 = |b|, $f0 = sign tracker
;      (bit 0 = a negative, bit 1 = signs differ).
; --------------------------------------------------------------------
__divmodsi_setup:
        ; Zero the tracker while keeping a_lo in A.
        pha
        lda     #0
        sta     0xf0
        pla
        ; Store a as-is first; the sign test only needs a_hi (in X).
        sta     0xe0                    ; a_lo (negated below if needed)
        stx     0xe2                    ; a_hi
        cpx     #0x8000
        bcc     .Ldmsi_a_abs
        ; a < 0: record bits 0 and 1 ...
        lda     0xf0
        ora     #3
        sta     0xf0
        ; ... and replace a by -a (invert, then add 1 with carry).
        lda     0xe0
        eor     #0xffff
        clc
        adc     #1
        sta     0xe0
        lda     0xe2
        eor     #0xffff
        adc     #0
        sta     0xe2
.Ldmsi_a_abs:
        ; Fetch b from the shifted stack slots.
        lda     0x6, s
        sta     0xe4
        lda     0x8, s
        sta     0xe6
        cmp     #0x8000                 ; A still holds b_hi
        bcc     .Ldmsi_b_abs
        ; b < 0: toggle the quotient-sign bit ...
        lda     0xf0
        eor     #2
        sta     0xf0
        ; ... and replace b by -b.
        lda     0xe4
        eor     #0xffff
        clc
        adc     #1
        sta     0xe4
        lda     0xe6
        eor     #0xffff
        adc     #0
        sta     0xe6
.Ldmsi_b_abs:
        rts
|
|
|
|
; ====================================================================
|
|
; i64 (long long) helpers.
|
|
;
|
|
; Calling convention (i64 first arg is split via i32-first-arg path):
|
|
; A = arg0_lo[0..15] (lowest word)
|
|
; X = arg0_lo[16..31]
|
|
; 4,S = arg0_hi[0..15]
|
|
; 6,S = arg0_hi[16..31] (highest word)
|
|
; For binary ops (mul/div/mod), arg1 follows on the stack:
|
|
; 8,S = arg1_lo[0..15]
|
|
; 10,S = arg1_lo[16..31]
|
|
; 12,S = arg1_hi[0..15]
|
|
; 14,S = arg1_hi[16..31]
|
|
; For shift ops, the count occupies a single i16 at 8,S.
|
|
;
|
|
; Return ABI (matches LowerReturn for i64):
|
|
; A = result_lo[0..15]
|
|
; X = result_lo[16..31]
|
|
; Y = result_hi[0..15]
|
|
; DP $F0..$F1 = result_hi[16..31]
|
|
;
|
|
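; Scratch DP layout (fixed addresses, reused by every libcall):
; $E0..$E7 = a, later the result / quotient (8 bytes; 4 16-bit words)
; $E8..$EF = b (8 bytes)
; $F2..$F9 = product (__muldi3) or remainder (divide core)
; $FA..$FF = trial-subtraction temporaries (divide core)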
|
|
;
|
|
; All routines run with REP #$30 (M=0, X=0).
|
|
; ====================================================================
|
|
|
|
; --------------------------------------------------------------------
; __divmoddi4_stash: common operand loader for the two-operand i64
; helpers (JSR/RTS). Copies a -> $E0..$E7 and b -> $E8..$EF.
;
; In:  A = a word 0, X = a word 1; the remaining six words sit in the
;      public entry's stack frame at (4,S)..(14,S).
; Out: $E0/$E2/$E4/$E6 = a (lowest word first),
;      $E8/$EA/$EC/$EE = b (lowest word first). Clobbers A.
;
; This routine is entered with JSR, which pushes a 2-byte return
; address on top of the public entry's frame, so every stack argument
; is 2 bytes further away here than in the public routine:
; (4,S)..(14,S) there is (6,S)..(16,S) here. Reading the unshifted
; offsets would pick up part of the JSL return address as a word 2 and
; slide every later word down by one slot.
; --------------------------------------------------------------------
__divmoddi4_stash:
        sta     0xe0                    ; a word 0 (bits 0..15)
        stx     0xe2                    ; a word 1 (bits 16..31)
        lda     0x6, s
        sta     0xe4                    ; a word 2 (public 4,S)
        lda     0x8, s
        sta     0xe6                    ; a word 3 (public 6,S)
        lda     0xa, s
        sta     0xe8                    ; b word 0 (public 8,S)
        lda     0xc, s
        sta     0xea                    ; b word 1 (public 10,S)
        lda     0xe, s
        sta     0xec                    ; b word 2 (public 12,S)
        lda     0x10, s
        sta     0xee                    ; b word 3 (public 14,S)
        rts
|
|
|
|
; --------------------------------------------------------------------
; __retdi: shared epilogue that moves the i64 result at $E0..$E7 into
; the return registers and executes the RTL on behalf of the public
; routine that branched here.
;   $E6 -> DP $F0 (highest word), $E4 -> Y, $E2 -> X, $E0 -> A.
; --------------------------------------------------------------------
__retdi:
        lda     0xe6
        sta     0xf0                    ; result bits 48..63
        lda     0xe4
        tay                             ; result bits 32..47
        lda     0xe2
        tax                             ; result bits 16..31
        lda     0xe0                    ; result bits 0..15
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __ashldi3: i64 left shift by the count at (8,S).
; One ASL + three ROLs per counted bit; Y holds the remaining count.
; The value is assembled in $E0..$E7 and returned through __retdi.
; --------------------------------------------------------------------
        .globl  __ashldi3
__ashldi3:
        sta     0xe0                    ; word 0
        stx     0xe2                    ; word 1
        lda     0x4, s
        sta     0xe4                    ; word 2
        lda     0x6, s
        sta     0xe6                    ; word 3
        lda     0x8, s
        tay                             ; Y = count
.Lashldi_step:
        cpy     #0
        beq     .Lashldi_ret
        asl     0xe0                    ; carry ripples upward through the words
        rol     0xe2
        rol     0xe4
        rol     0xe6
        dey
        bra     .Lashldi_step
.Lashldi_ret:
        brl     __retdi
|
|
|
|
; --------------------------------------------------------------------
; __lshrdi3: i64 logical right shift by the count at (8,S).
; Per bit: LSR the top word, then ROR the three lower words so the
; carry ripples downward. Result is returned through __retdi.
; --------------------------------------------------------------------
        .globl  __lshrdi3
__lshrdi3:
        sta     0xe0                    ; word 0
        stx     0xe2                    ; word 1
        lda     0x4, s
        sta     0xe4                    ; word 2
        lda     0x6, s
        sta     0xe6                    ; word 3
        lda     0x8, s
        tay                             ; Y = count
.Llshrdi_step:
        cpy     #0
        beq     .Llshrdi_ret
        lsr     0xe6
        ror     0xe4
        ror     0xe2
        ror     0xe0
        dey
        bra     .Llshrdi_step
.Llshrdi_ret:
        brl     __retdi
|
|
|
|
; --------------------------------------------------------------------
; __ashrdi3: i64 arithmetic right shift by the count at (8,S).
; Same word chain as __lshrdi3, except that the top word is rotated
; with the sign bit preloaded into carry, so the sign is replicated
; instead of being replaced by zero.
; --------------------------------------------------------------------
        .globl  __ashrdi3
__ashrdi3:
        sta     0xe0                    ; word 0
        stx     0xe2                    ; word 1
        lda     0x4, s
        sta     0xe4                    ; word 2
        lda     0x6, s
        sta     0xe6                    ; word 3
        lda     0x8, s
        tay                             ; Y = count
.Lashrdi_step:
        cpy     #0
        beq     .Lashrdi_ret
        ; ASL on a copy of the top word moves its bit 15 (the sign) into
        ; C without changing $E6; the ROR then shifts $E6 right with
        ; that sign entering bit 15, and the bits falling out ripple
        ; down through $E4, $E2 and $E0.
        lda     0xe6
        asl     a
        ror     0xe6
        ror     0xe4
        ror     0xe2
        ror     0xe0
        dey
        bra     .Lashrdi_step
.Lashrdi_ret:
        brl     __retdi
|
|
|
|
; --------------------------------------------------------------------
; __muldi3: i64 multiply (low 64 bits of the 64x64 product).
; Shift-and-add over the 64 bits of a. Operands occupy $E0..$EF (a at
; $E0..$E7, b at $E8..$EF, loaded by __divmoddi4_stash), so the product
; accumulates in its own slot at $F2..$F9 and is copied to $E0..$E7 at
; the end for __retdi. Y counts the iterations.
;
; The multiplier is shifted right as one 64-bit quantity, which means
; starting at the TOP word ($E6) and rotating downward so that the bit
; that ends up in carry is bit 0 of the whole multiplier. Starting at
; $E0 instead would push bits upward between words and test the wrong
; bit.
; --------------------------------------------------------------------
        .globl  __muldi3
__muldi3:
        jsr     __divmoddi4_stash
        ; Clear the product words at $F2, $F4, $F6, $F8.
        lda     #0x0
        sta     0xf2
        sta     0xf4
        sta     0xf6
        sta     0xf8
        ldy     #0x40                   ; 64 multiplier bits
.Lmuldi_loop:
        ; a >>= 1 (64-bit logical); C = the bit shifted out of word 0.
        lsr     0xe6
        ror     0xe4
        ror     0xe2
        ror     0xe0
        bcc     .Lmuldi_noadd
        ; Bit was set: product += b (64-bit add, carry chained).
        clc
        lda     0xf2
        adc     0xe8
        sta     0xf2
        lda     0xf4
        adc     0xea
        sta     0xf4
        lda     0xf6
        adc     0xec
        sta     0xf6
        lda     0xf8
        adc     0xee
        sta     0xf8
.Lmuldi_noadd:
        ; b <<= 1 so it lines up with the next multiplier bit.
        asl     0xe8
        rol     0xea
        rol     0xec
        rol     0xee
        dey
        bne     .Lmuldi_loop
        ; Copy the product into the return slots and let __retdi return.
        lda     0xf2
        sta     0xe0
        lda     0xf4
        sta     0xe2
        lda     0xf6
        sta     0xe4
        lda     0xf8
        sta     0xe6
        brl     __retdi
|
|
|
|
; --------------------------------------------------------------------
; __ucmpdi2: unsigned i64 compare (libgcc convention).
; Returns 0 if a < b, 1 if a == b, 2 if a > b, as an i16 in A.
; --------------------------------------------------------------------
        .globl  __ucmpdi2
__ucmpdi2:
        ; Load both operands into DP, then compare word by word from the
        ; most significant pair down; the first unequal pair decides.
        jsr     __divmoddi4_stash
        lda     0xe6
        cmp     0xee
        bne     .Lucmpdi_differ
        lda     0xe4
        cmp     0xec
        bne     .Lucmpdi_differ
        lda     0xe2
        cmp     0xea
        bne     .Lucmpdi_differ
        lda     0xe0
        cmp     0xe8
        bne     .Lucmpdi_differ
        lda     #1                      ; all four words equal
        rtl
.Lucmpdi_differ:
        ; Flags are from the deciding CMP: C clear means a < b, C set
        ; (with Z clear) means a > b.
        bcc     .Lucmpdi_below
        lda     #2
        rtl
.Lucmpdi_below:
        lda     #0
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __cmpdi2: signed i64 compare, same {0, 1, 2} result as __ucmpdi2.
; Bit 15 of each operand's top word is inverted first (XOR $8000 maps
; signed order onto unsigned order); after that an ordinary unsigned
; word-by-word compare gives the signed answer.
; --------------------------------------------------------------------
        .globl  __cmpdi2
__cmpdi2:
        jsr     __divmoddi4_stash
        lda     0xe6
        eor     #0x8000
        sta     0xe6                    ; a: sign bit flipped
        lda     0xee
        eor     #0x8000
        sta     0xee                    ; b: sign bit flipped
        ; Unsigned compare, most significant word first.
        lda     0xe6
        cmp     0xee
        bne     .Lcmpdi_differ
        lda     0xe4
        cmp     0xec
        bne     .Lcmpdi_differ
        lda     0xe2
        cmp     0xea
        bne     .Lcmpdi_differ
        lda     0xe0
        cmp     0xe8
        bne     .Lcmpdi_differ
        lda     #1                      ; equal
        rtl
.Lcmpdi_differ:
        bcc     .Lcmpdi_below           ; C clear: a < b
        lda     #2                      ; otherwise a > b
        rtl
.Lcmpdi_below:
        lda     #0
        rtl
|
|
|
|
; --------------------------------------------------------------------
; __udivdi3 / __umoddi3: unsigned 64-bit divide / modulo.
; Both load their operands with __divmoddi4_stash and run the shared
; restoring-division core, which leaves the quotient at $E0..$E7 and
; the remainder at $F2..$F9. __udivdi3 returns the quotient directly;
; __umoddi3 copies the remainder into $E0..$E7 first.
; --------------------------------------------------------------------
        .globl  __udivdi3
__udivdi3:
        jsr     __divmoddi4_stash
        jsr     __udivmoddi_core
        brl     __retdi                 ; quotient is already at $E0..$E7
|
|
|
|
; __umoddi3: unsigned 64-bit modulo. Runs the shared divide core, then
; moves the remainder into the slots __retdi returns from.
        .globl  __umoddi3
__umoddi3:
        jsr     __divmoddi4_stash
        jsr     __udivmoddi_core
        ; Remainder words $F2/$F4/$F6/$F8 -> $E0/$E2/$E4/$E6.
        lda     0xf2
        sta     0xe0
        lda     0xf4
        sta     0xe2
        lda     0xf6
        sta     0xe4
        lda     0xf8
        sta     0xe6
        brl     __retdi
|
|
|
|
; __udivmoddi_core: 64-bit restoring divide (JSR/RTS helper).
; In:  dividend at $E0..$E7, divisor at $E8..$EF.
; Out: quotient at $E0..$E7 (it replaces the dividend bit by bit),
;      remainder at $F2..$F9.
; Uses $FA..$FF for the trial subtraction; clobbers A and Y.
__udivmoddi_core:
        ; remainder = 0
        lda     #0
        sta     0xf2
        sta     0xf4
        sta     0xf6
        sta     0xf8
        ldy     #64
.Ludm64_bit:
        ; Treat remainder:dividend as one 128-bit register and shift it
        ; left: the dividend's top bit becomes the remainder's low bit
        ; and bit 0 of the dividend is freed for the next quotient bit.
        asl     0xe0
        rol     0xe2
        rol     0xe4
        rol     0xe6
        rol     0xf2
        rol     0xf4
        rol     0xf6
        rol     0xf8
        ; Trial subtraction remainder - divisor: low three words go to
        ; $FA/$FC/$FE, the top word stays in A.
        sec
        lda     0xf2
        sbc     0xe8
        sta     0xfa
        lda     0xf4
        sbc     0xea
        sta     0xfc
        lda     0xf6
        sbc     0xec
        sta     0xfe
        lda     0xf8
        sbc     0xee
        ; C clear means the subtraction borrowed: keep the old remainder.
        bcc     .Ludm64_next
        ; No borrow: commit the difference as the new remainder ...
        sta     0xf8
        lda     0xfe
        sta     0xf6
        lda     0xfc
        sta     0xf4
        lda     0xfa
        sta     0xf2
        ; ... and set the freshly opened quotient bit.
        lda     0xe0
        ora     #1
        sta     0xe0
.Ludm64_next:
        dey
        bne     .Ludm64_bit
        rts
|
|
|
|
; --------------------------------------------------------------------
; __divdi3 / __moddi3: signed 64-bit divide / modulo. Take absolute
; values, run the unsigned core, fix up the sign.
;   div: sign(quotient)  = sign(a) XOR sign(b)
;   mod: sign(remainder) = sign(a)
; --------------------------------------------------------------------
        .globl  __divdi3
__divdi3:
        jsr     __divmoddi4_stash
        ; Quotient sign = bit 15 of (a top word XOR b top word).
        lda     0xe6
        eor     0xee
        and     #0x8000
        ; Keep the sign on the stack across the divide. It must not live
        ; in $FA: __udivmoddi_core stores its trial-subtraction words at
        ; $FA..$FF on every iteration and would overwrite it. None of
        ; the helpers called below read stack arguments, so the extra
        ; word on the stack is harmless.
        pha
        jsr     __absdi_a               ; a = |a|
        jsr     __absdi_b               ; b = |b|
        jsr     __udivmoddi_core        ; quotient -> $E0..$E7
        pla                             ; A = saved sign, Z set if positive
        beq     .Ldivdi_pos
        jsr     __negdi_a               ; negative quotient: negate $E0..$E7
.Ldivdi_pos:
        brl     __retdi
|
|
|
|
; __moddi3: signed 64-bit modulo. The remainder takes the sign of the
; dividend a.
        .globl  __moddi3
__moddi3:
        jsr     __divmoddi4_stash
        ; Remainder sign = bit 15 of a's top word.
        lda     0xe6
        and     #0x8000
        ; Keep the sign on the stack across the divide. It must not live
        ; in $FA: __udivmoddi_core stores its trial-subtraction words at
        ; $FA..$FF on every iteration and would overwrite it. None of
        ; the helpers called below read stack arguments, so the extra
        ; word on the stack is harmless.
        pha
        jsr     __absdi_a               ; a = |a|
        jsr     __absdi_b               ; b = |b|
        jsr     __udivmoddi_core        ; remainder -> $F2..$F9
        ; Move the remainder to $E0..$E7 for the sign fix-up and return.
        lda     0xf2
        sta     0xe0
        lda     0xf4
        sta     0xe2
        lda     0xf6
        sta     0xe4
        lda     0xf8
        sta     0xe6
        pla                             ; A = saved sign, Z set if positive
        beq     .Lmoddi_pos
        jsr     __negdi_a               ; negative dividend: negate remainder
.Lmoddi_pos:
        brl     __retdi
|
|
|
|
; --- subroutines used by signed div/mod ---
|
|
|
|
; __absdi_a: replace the i64 at $E0..$E7 by its absolute value, i.e.
; negate it when bit 15 of the top word ($E6) is set. JSR/RTS helper.
__absdi_a:
        lda     0xe6
        bpl     .Labsdi_a_ret           ; N clear: already non-negative
        jsr     __negdi_a
.Labsdi_a_ret:
        rts
|
|
|
|
; __absdi_b: replace the i64 at $E8..$EF by its absolute value, i.e.
; negate it when bit 15 of the top word ($EE) is set. JSR/RTS helper.
__absdi_b:
        lda     0xee
        bpl     .Labsdi_b_ret           ; N clear: already non-negative
        jsr     __negdi_b
.Labsdi_b_ret:
        rts
|
|
|
|
; __negdi_a: two's-complement negate the i64 at $E0..$E7, computed as
; 0 - value with the borrow chained across the four words. Clobbers A.
__negdi_a:
        sec                             ; no borrow going into the lowest word
        lda     #0
        sbc     0xe0
        sta     0xe0
        lda     #0
        sbc     0xe2
        sta     0xe2
        lda     #0
        sbc     0xe4
        sta     0xe4
        lda     #0
        sbc     0xe6
        sta     0xe6
        rts
|
|
|
|
; __negdi_b: two's-complement negate the i64 at $E8..$EF, computed as
; 0 - value with the borrow chained across the four words. Clobbers A.
__negdi_b:
        sec                             ; no borrow going into the lowest word
        lda     #0
        sbc     0xe8
        sta     0xe8
        lda     #0
        sbc     0xea
        sta     0xea
        lda     #0
        sbc     0xec
        sta     0xec
        lda     #0
        sbc     0xee
        sta     0xee
        rts
|
|
|
|
; --------------------------------------------------------------------
; setjmp(jmp_buf env)           - save calling environment, return 0
; longjmp(jmp_buf env, int val) - restore environment, return val
;                                 (or 1 if val == 0)
;
; jmp_buf layout (8 bytes):
;   [0..1] = caller's stack pointer (SP+3 at entry to setjmp)
;   [2..3] = return address PC lo:hi (16 bits)
;   [4]    = return address bank (1 byte)
;   [5..6] = direct page register (DP)
;   [7]    = reserved / padding
;
; Caller-save convention: longjmp doesn't restore X / Y / A; whatever
; must survive is expected to have been preserved by setjmp's caller
; before the call that returned 0.
; --------------------------------------------------------------------
        .globl  setjmp
setjmp:
        sta     0xe0                    ; $e0 = env pointer for the (dp),y stores
        ; env[0..1]: SP as it was before the caller's JSL pushed 3 bytes.
        tsc
        clc
        adc     #0x3
        ldy     #0
        sta     (0xe0), y
        ; env[2..3]: 16-bit return PC, read from the top of the stack.
        lda     0x1, s
        ldy     #2
        sta     (0xe0), y
        ; env[4]: return bank, copied with an 8-bit accumulator so only
        ; one byte is written.
        sep     #0x20
        lda     0x3, s
        ldy     #4
        sta     (0xe0), y
        rep     #0x20
        ; env[5..6]: direct-page register.
        tdc
        ldy     #5
        sta     (0xe0), y
        lda     #0                      ; direct call returns 0
        rtl
|
|
|
|
; longjmp(env, val): resume at the point where setjmp(env) returned.
; In:  A = env pointer, (4,S) = val.
; Out: does not return to its caller; control continues after the
;      matching setjmp call with A = val, or 1 when val is 0, and with
;      SP and DP as recorded in env.
; Scratch: $e0 = env pointer, $e2 = val (both under the DP in effect on
; entry).
        .globl  longjmp
longjmp:
        sta     0xe0                    ; env pointer
        lda     0x4, s
        sta     0xe2                    ; val (read before SP is replaced)
        ; Restore SP to exactly the value setjmp recorded (its caller's
        ; SP before the JSL). The three pushes below then rebuild the
        ; JSL return frame in the same bytes the original JSL used, and
        ; the final RTL pops them, leaving SP equal to the recorded
        ; value. Lowering SP by 3 first would leave it 3 bytes too low
        ; after the RTL.
        ldy     #0
        lda     (0xe0), y
        tcs
        ; Rebuild the return frame: bank first (8-bit push), then the
        ; 16-bit PC, so RTL pulls lo, hi, bank.
        sep     #0x20
        ldy     #4
        lda     (0xe0), y               ; bank
        pha
        rep     #0x20
        ldy     #2
        lda     (0xe0), y               ; PC lo:hi
        pha
        ; Work out the return value while the entry-time DP is still in
        ; effect, since $e2 and the env pointer at $e0 are addressed
        ; through it.
        lda     0xe2
        bne     .Llj_have_val
        lda     #1                      ; setjmp must not appear to return 0
.Llj_have_val:
        pha                             ; park it: A is needed to load DP
        ldy     #5
        lda     (0xe0), y
        tcd                             ; restore DP
        pla                             ; A = return value; frame is on top again
        rtl
|