From 55c1ae1c3ee398ad2ad561579557d2d72088656f Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Tue, 28 Apr 2026 16:49:41 -0500 Subject: [PATCH] Initial check in. Lots of work yet to do. --- .gitignore | 4 + SESSION_STATE.md | 146 +++ patches/0006-runtime-libcalls-w65816.patch | 20 + runtime/build.sh | 18 + runtime/src/libgcc.s | 640 ++++++++++++ scripts/safeCC.sh | 33 + scripts/smokeTest.sh | 567 ++++++++++- .../W65816/AsmParser/W65816AsmParser.cpp | 8 +- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + .../W65816/MCTargetDesc/W65816AsmBackend.cpp | 16 + src/llvm/lib/Target/W65816/W65816.h | 22 +- .../lib/Target/W65816/W65816AsmPrinter.cpp | 153 +++ .../lib/Target/W65816/W65816CallingConv.td | 6 +- .../lib/Target/W65816/W65816FrameLowering.cpp | 206 +++- .../lib/Target/W65816/W65816ISelDAGToDAG.cpp | 39 +- .../lib/Target/W65816/W65816ISelLowering.cpp | 924 ++++++++++++++++-- .../lib/Target/W65816/W65816ISelLowering.h | 41 + .../lib/Target/W65816/W65816InstrFormats.td | 17 + .../lib/Target/W65816/W65816InstrInfo.cpp | 71 +- src/llvm/lib/Target/W65816/W65816InstrInfo.h | 23 + src/llvm/lib/Target/W65816/W65816InstrInfo.td | 503 +++++++++- .../lib/Target/W65816/W65816RegisterInfo.cpp | 81 +- .../Target/W65816/W65816StackSlotCleanup.cpp | 355 +++++++ .../lib/Target/W65816/W65816TargetMachine.cpp | 6 + 24 files changed, 3776 insertions(+), 124 deletions(-) create mode 100644 patches/0006-runtime-libcalls-w65816.patch create mode 100755 runtime/build.sh create mode 100644 runtime/src/libgcc.s create mode 100755 scripts/safeCC.sh create mode 100644 src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp diff --git a/.gitignore b/.gitignore index cdd6ac6..9abef7d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,10 @@ tools/ # Claude Code tool state .claude/ +# Runtime build artifacts: regenerable via runtime/build.sh from +# runtime/src/*.s. The source files (.s, build.sh) are tracked. +runtime/*.o + # Editor / OS *.swp *.swo diff --git a/SESSION_STATE.md b/SESSION_STATE.md index 1953623..9af65c4 100644 --- a/SESSION_STATE.md +++ b/SESSION_STATE.md @@ -219,6 +219,152 @@ Design doc section 7 lists a 12-step implementation order. We are at: scheduling pass.** The prologue `REP #$30` is unconditional; the REP/SEP pass will remove it when redundant. +### Where we actually got to (current state, 2026-04-27) + +The "open codegen gaps" list above is mostly resolved. Status of the +seven sub-items at line 192: + +1. **Multi-arg call lowering (caller side)** — done. `LowerCall` + pushes args 1..N-1 right-to-left via `W65816ISD::PUSH`, + `ADJCALLSTACKUP` unwinds with `tsc;clc;adc #N;tcs`. +2. **Frame-reserved scratch space** — done. `emitPrologue` / + `emitEpilogue` use `tsc;sec;sbc #N;tcs` and the inverse. +3. **Mixed-mode i8/i16** — partial. Per-function mode based on IR + scan; full REP/SEP scheduling pass still TODO (Step 4). +4. **Signed `(a - b)` overflow in compares** — handled for i8/i16 + via the signed-CC promote-to-i16 path. Still has the BMI/BPL + correctness caveat at INT16_MIN/MAX boundaries. +5. **`mul var, var` and friends** — done via libcalls; runtime stubs + live in `runtime/src/libgcc.s` (__mulhi3, __mulsi3, __ashlhi3, + __ashrhi3, __lshrhi3, __ashlsi3, __ashrsi3, __lshrsi3, __udivhi3, + __divhi3, __umodhi3, __modhi3, __udivsi3, __divsi3, __umodsi3, + __modsi3). +6. **SETCC and SELECT_CC i16** — done via custom inserter and the + `W65816cmp + W65816selectcc` SDNode pair. +7. **Library functions** — done; see #5 above. 
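+
+A minimal illustration of the item-4 caveat (not the backend's actual
+lowering, just the failure mode it has to dodge): deciding `a < b` from
+the sign of `a - b` goes wrong exactly when the subtraction overflows
+16 bits. The helper name below is illustrative only.
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+
+/* Naive BMI/BPL-style compare: test the sign of (a - b). */
+static int lt_via_sub_sign(int16_t a, int16_t b) {
+    int16_t diff = (int16_t)((uint16_t)a - (uint16_t)b); /* wraps mod 2^16 */
+    return diff < 0;
+}
+
+int main(void) {
+    /* INT16_MIN - 1 wraps to +32767, so the sign bit reads "not less". */
+    printf("%d\n", lt_via_sub_sign(-32768, 1)); /* prints 0 */
+    printf("%d\n", -32768 < 1);                 /* prints 1 (the true answer) */
+    return 0;
+}
+```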
+ +### i32 (long) support — landed (2026-04-26..28) + +- Type legalization splits i32 into two i16 halves. +- ABI: i32 first-arg lives in A:X (lo:hi), matching the return + ABI; subsequent i32 args go on stack 2 bytes per half. + `RetCC_W65816` assigns `[A, X]` for two i16 returns so + `__mulsi3` / `__divsi3` libcall returns work. +- ADD/SUB use the native ADC carry chain via ISD::ADDC/ADDE/SUBC/ + SUBE Legal: `ADCi16imm` etc. mark `Defs = [P]` and pattern-match + `addc`; new `ADCEi16imm` / `ADCEabs` / `ADCEfi` (and SBC/E + variants) mark `Uses = [P], Defs = [P]` for `adde`/`sube`. + `ADDE_RR` / `SUBE_RR` have the inserter equivalent for two-Acc16 + chains (e.g. fib32's loop). Net: an i32 add went from ~25 insns + (manual UADDO + SETCC + add-of-bool) to ~17 incl. prologue/epilogue, + with the core 8 being the optimal `clc;adc;sta;lda;adc;tax;lda;rtl`. +- NEGC16 / NEGE16 lower `(subc/sube 0, x)` for i32 negate via the + ADD chain (`EOR #$FFFF; CLC; ADC #1` lo, `EOR #$FFFF; ADC #0` hi). +- MUL/DIV/MOD/SHL/SHR/USHR all libcalled; preferredShift­Legalization­ + Strategy returns `LowerToLibcall` for i32 to keep LLVM from emitting + SHL_PARTS we'd have no pattern for. +- `BuildSDIVPow2` / `BuildSREMPow2` overrides return SDValue() to + block the magic-constant pow2 expansion that emits unsupported + BUILD_VECTOR. + +### Other recent work + +- `i1` `sext_inreg` lowered as `(sub 0, (and x, 1))`. +- `i8` `sext_inreg` and `sextload-i8` go through the existing + branchless `((x & 0xFF) ^ 0x80) - 0x80` sequence (SEXTLOAD i8 set + to Expand, sext_inreg pattern added). +- `extloadi8` from an `Acc16` register pointer maps to `LDAptr` (16- + bit load; consumer ignores high byte). +- Bare `ISD::FrameIndex` selected as `ADDframe (FI, 0)` for + alloca'd-array address-of; `eliminateFrameIndex` expands ADDframe + into `tsc;clc;adc #disp` (LEA equivalent). +- **Indirect calls** (function pointers): `LowerCall` redirects + through `__jsl_indir` in `runtime/src/libgcc.s` — caller stores + the dynamic target to global `__indirTarget` then JSLs the + trampoline, which does `JMP (__indirTarget)`. Target's RTL pops + the original JSL frame and returns directly to the caller. + Single-bank only (JMP indirect is bank-local). +- **Code-quality cleanup pass** (`W65816StackSlotCleanup`, + addPostRegAlloc): + - Removes redundant `LDAfi slot` after `STAfi reg, slot` when the + LDA's destination matches and nothing in between clobbers + either reg or slot. Catches the regalloc spill+reload cycle + around COPY $a → vreg. + - Removes dead `STAfi reg, slot` when a subsequent `STAfi` + overwrites the same slot before any read, OR when the function + returns without reading the slot (catches result-spill-before- + return that the libcall return ABI makes redundant). + - Combined with `isReMaterializable` on LDAfi from fixed FIs, the + i32 add went from 17 → 11 instructions. +- **i32 shift-by-1 inline** (task #59). The type-legalizer's + SHL_PARTS / SRL_PARTS expansion of `i32 << 1` / `>> 1` emits a + `(srl x, 15)` or `(shl x, 15)` for the carry-cross-halves slot. + Previously routed through __lshrhi3 / __ashlhi3 libcalls. Added + SRL15A pseudo (`ASL A; LDA #0; ROL A`, 3 bytes) and SHL15A + (`LSR A; LDA #0; ROR A`). i32 shl-by-1 went 33 → 26 insns; + shr-by-1 29 → 23. +- **i16 shift-by-8 inline** (task #60). Same idea for `(srl x, 8)` + and `(shl x, 8)` — used by i32 shift-by-8 type-legalization. + XBA swaps the two bytes of A in 16-bit M; AND clears the half + we don't want. 4 bytes per shift. 
i32 shl/shr-by-8 went + 39/35 → 27/24 insns. +- **PUSH16X for direct X-push** (task #61). When LowerCall sees + an outgoing arg whose SDValue is `CopyFromReg` of a vreg that's + live-in from $x (i.e. the i32-first-arg-in-A:X hi half), emit + `phx` directly instead of `txa; pha` (which also requires + spilling $a to preserve it). mul32 went 19 → 13 insns. +- **Dead frame-slot trimming** (task #62). Extended W65816Stack­ + SlotCleanup to scan MIR for unreferenced (post-cleanup) local + frame indices and zero-size them so PrologueEpilogue trims the + prologue PHA/TSC reservation. Combined with the spill cleanup, + shrinks frames in many functions by 2-4 bytes (one fewer + PHA + PLY pair). +- **i32 first-arg in A:X (task #50)**. When the first original + argument is i32 (LowerFormalArguments / LowerCall detect via + `Outs[0..1].OrigArgIndex == 0` on i16 halves), pass it lo:hi in + A:X — matching the i32 return ABI. Saves one stack slot per + i32 arg. Required updating libgcc.s helpers (`__mulsi3`, + `__udivsi3`, `__umodsi3`, `__divsi3`, `__modsi3`, `__ashlsi3`, + `__lshrsi3`, `__ashrsi3`, `__divmodsi_setup`) to read arg0_hi + from X (and shifted arg1 offsets). +- **Implicit Defs/Uses on stack-rel MC instructions**: was a + pre-existing latent bug — `eliminateFrameIndex` strips the + implicit A/P def/use info when it converts ADCfi/STAfi/etc. to + the MC form (ADC d,S, STA d,S etc.). Machine Copy Propagation + then sees stale dataflow and elides necessary TAX/TXA copies. + Fixed by re-attaching `RegState::Implicit` operands on each + expanded MC instruction in W65816RegisterInfo::eliminateFrame­ + Index. Without this, the i32-A:X ABI miscompiles return values + (TAX gets elided, X retains arg0_hi instead of result_hi). + The fix also benefits the existing single-A path; before it, + certain Machine Copy Propagation choices were unsafe but + happened not to trigger. Now they're also safe. + +### Currently still pending + +- **REP/SEP scheduling pass** (Step 4) — per-function mode only; + mixed-mode functions don't work. +- **Vararg functions** — `LowerFormalArguments` reports a fatal + error. +- **i32 comparison** — uses SETCC+ADD-of-bool instead of a CMP+SBC + chain (analogous to the ADC chain we landed for add/sub). +- **Regalloc** (#56) — heapify-style functions with 4+ live i16 + values run out of A. + +### Smoke-test coverage (31 checks as of 2026-04-28) + +`scripts/smokeTest.sh` covers: target registration, llvm-mc encode/ +disassemble, end-to-end IR→ELF, multi-pattern function, single-arg +call, 3-arg stack reads, pure-i8 SEP prologue, multi-branch SETCC, +SELECT_CC, two-Acc16 spill, libcall emission (__mulhi3/__ashlhi3), +pointer load/store, runtime/build.sh, real-world program, +libcall-symbol coverage, signed/eq i8 compare, -O2 tiny C, i32 add +end-to-end, i32 carry-chain shape (1 clc + 2 adc + 0 bcc), i32 +A:X first-arg ABI (1 txa), 32-bit fib loop (ADDE_RR inserter), +__mulsi3 libcall, alloca'd-array LEA, signed-byte strcmp +(sextload + sext_inreg + extload-via-ptr), indirect call via +__jsl_indir trampoline, i32 shift-by-1 inline (no hi3 libcall). + ## 3. 
What is installed and where All under `/home/scott/claude/llvm816/tools/`: diff --git a/patches/0006-runtime-libcalls-w65816.patch b/patches/0006-runtime-libcalls-w65816.patch new file mode 100644 index 0000000..8df049c --- /dev/null +++ b/patches/0006-runtime-libcalls-w65816.patch @@ -0,0 +1,20 @@ +diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td +index 0000000..0000000 100644 +--- a/llvm/include/llvm/IR/RuntimeLibcalls.td ++++ b/llvm/include/llvm/IR/RuntimeLibcalls.td +@@ -3620,6 +3620,15 @@ def MOSSystemLibrary + __memset, + abort)>; + ++// W65816 (WDC 65816) - integer libcalls only. Multiply, divide, modulo ++// and shifts go through the standard compiler-rt names (__mulhi3, ++// __divhi3 etc.). No floating point yet. ++def isW65816 : RuntimeLibcallPredicate<"TT.getArch() == Triple::w65816">; ++ ++def W65816SystemLibrary ++ : SystemRuntimeLibrary; ++ + //===----------------------------------------------------------------------===// + // Legacy Default Runtime Libcalls + //===----------------------------------------------------------------------===// diff --git a/runtime/build.sh b/runtime/build.sh new file mode 100755 index 0000000..11f2747 --- /dev/null +++ b/runtime/build.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Assemble the W65816 runtime library to runtime/libgcc.o. +# Run after editing runtime/src/*.s. + +set -euo pipefail +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" + +[ -x "$LLVM_MC" ] || { + echo "llvm-mc not found at $LLVM_MC" >&2 + exit 1 +} + +"$LLVM_MC" -arch=w65816 -filetype=obj \ + "$PROJECT_ROOT/runtime/src/libgcc.s" \ + -o "$PROJECT_ROOT/runtime/libgcc.o" + +echo "built runtime/libgcc.o" diff --git a/runtime/src/libgcc.s b/runtime/src/libgcc.s new file mode 100644 index 0000000..ad6a680 --- /dev/null +++ b/runtime/src/libgcc.s @@ -0,0 +1,640 @@ +; Minimal libgcc-equivalent runtime for the W65816 / Apple IIgs. +; Provides the helpers that the LLVM backend lowers integer multiply, +; shift, divide, and modulo operations to. Implementations are +; correct-but-unoptimised; they exist to unblock end-to-end testing, +; not to compete with hand-tuned 65816 math libraries. +; +; Calling convention (matches W65816ISelLowering::LowerCall): +; - Arg 0 in A (16-bit M). +; - Arg 1 pushed via PHA before the JSL. Reads as (4,S) inside the +; callee (3-byte JSL return address sits at 1..3,S). +; - Return value in A. Caller releases pushed args. +; - Routines run in 16-bit M, 16-bit X (REP #$30 by convention). +; +; Direct-page scratch lives at DP+$E0..DP+$EF (16 bytes). Programs +; that use this runtime must keep DP=0 or remap accordingly. +; +; Assembled with: tools/llvm-mos-build/bin/llvm-mc -arch=w65816 \ +; -filetype=obj +; runtime/src/libgcc.s +; -o runtime/libgcc.o + + .text + +; -------------------------------------------------------------------- +; Indirect-call trampoline. An indirect call (function pointer) stores +; the target's 16-bit address to __indirTarget before JSL'ing here. +; This routine does a JMP indirect through that variable: control +; transfers to the target with the original caller's JSL frame still +; on the stack, so target's RTL returns to the original caller (one +; frame, no double-RTL). +; +; Caller emit sequence in W65816ISelLowering::LowerCall: +; sta __indirTarget ; store ptr (must precede any A clobber for args) +; ... arg pushes ... 
+; jsl __jsl_indir +; +; Single-bank only (the IIgs convention assumes code in bank 0/1 +; via JSL — JMP indirect is bank-local). +; -------------------------------------------------------------------- + .globl __indirTarget + .bss +__indirTarget: + .zero 2 + + .text + .globl __jsl_indir +__jsl_indir: + ; Hand-encoded JMP (__indirTarget): 6C is "jmp (a)" — the assembler + ; doesn't yet parse the `(abs)` syntax, so emit the bytes directly + ; with a 16-bit relocation against the variable. Effective transfer: + ; PC <- mem[__indirTarget]. + .byte 0x6C + .word __indirTarget + +; -------------------------------------------------------------------- +; __mulhi3 — 16-bit multiply. A * (4,S) -> A. +; Signed and unsigned share an implementation: only the low 16 bits of +; the product are returned, which is identical for both. Uses +; shift-and-add over the multiplier bits. +; -------------------------------------------------------------------- + .globl __mulhi3 +__mulhi3: + sta 0xe0 ; multiplier + lda 0x4, s + sta 0xe2 ; multiplicand + lda #0x0 + sta 0xe4 ; running product +.Lmul_loop: + lda 0xe0 + beq .Lmul_done + lsr a + sta 0xe0 + bcc .Lmul_skip + lda 0xe4 + clc + adc 0xe2 + sta 0xe4 +.Lmul_skip: + asl 0xe2 + bra .Lmul_loop +.Lmul_done: + lda 0xe4 + rtl + +; -------------------------------------------------------------------- +; __ashlhi3 — A << (4,S) -> A. Shift count is i16 but only the low 4 +; bits are meaningful (counts >=16 are undefined behaviour in C). +; -------------------------------------------------------------------- + .globl __ashlhi3 +__ashlhi3: + pha ; save value on stack so we can free A + lda 0x6, s ; arg 1 sits at 6,s now (PHA shifted by 2) + tax + pla ; restore value +.Lashl_loop: + cpx #0x0 + beq .Lashl_done + asl a + dex + bra .Lashl_loop +.Lashl_done: + rtl + +; -------------------------------------------------------------------- +; __lshrhi3 — A logical >> (4,S) -> A. Same shape as __ashlhi3 with +; LSR instead of ASL. +; -------------------------------------------------------------------- + .globl __lshrhi3 +__lshrhi3: + pha + lda 0x6, s + tax + pla +.Llshr_loop: + cpx #0x0 + beq .Llshr_done + lsr a + dex + bra .Llshr_loop +.Llshr_done: + rtl + +; -------------------------------------------------------------------- +; __ashrhi3 — A arithmetic >> (4,S) -> A. Sign bit is preserved by +; copying it into carry before each ROR via CMP #$8000 (which sets +; carry exactly when the sign bit is set on a 16-bit unsigned compare). +; -------------------------------------------------------------------- + .globl __ashrhi3 +__ashrhi3: + pha + lda 0x6, s + tax + pla +.Lashr_loop: + cpx #0x0 + beq .Lashr_done + cmp #0x8000 + ror a + dex + bra .Lashr_loop +.Lashr_done: + rtl + +; -------------------------------------------------------------------- +; __udivhi3 — A unsigned / (4,S) -> A. +; Restoring shift-subtract division. Common helper; __umodhi3 reuses +; the algorithm and returns the remainder instead. +; Scratch: $e6 = numerator, $e8 = denominator, +; $ea = quotient, $ec = remainder. +; -------------------------------------------------------------------- + .globl __udivhi3 +__udivhi3: + ; Public entry: A=dividend, (4,S)=divisor. Set up scratch and + ; call the same JSR-based core used by signed divide. 
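+	; Note: divide-by-zero is not trapped anywhere in this file; the
+	; restoring core below always runs its 16 iterations regardless, so
+	; callers that can see a zero divisor must guard it themselves.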
+ sta 0xe6 + lda 0x4, s + sta 0xe8 + jsr __udivmod_core + lda 0xea + rtl + + .globl __umodhi3 +__umodhi3: + sta 0xe6 + lda 0x4, s + sta 0xe8 + jsr __udivmod_core + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divhi3 / __modhi3 — signed 16-bit divide and modulo. Strategy: +; - Stash sign of dividend in $ee bit 0 (used by modulo). +; - Stash result sign of quotient (sign(a) XOR sign(b)) in $ee bit 1 +; (used by divide). +; - Take absolute values, run the unsigned core, then negate the +; appropriate result if its sign bit is set. +; C99: quotient truncates toward zero; remainder takes the sign of the +; dividend. +; -------------------------------------------------------------------- + .globl __divhi3 +__divhi3: + jsr __divmod_setup + jsr __udivmod_core + ; Quotient is in $ea. Negate if bit 1 of $ee is set. + lda 0xea + pha + lda 0xee + and #0x2 + beq .Ldiv_pos + pla + eor #0xffff + clc + adc #0x1 + rtl +.Ldiv_pos: + pla + rtl + + .globl __modhi3 +__modhi3: + jsr __divmod_setup + jsr __udivmod_core + ; Remainder is in $ec. Negate if bit 0 of $ee is set (dividend + ; was negative). + lda 0xec + pha + lda 0xee + and #0x1 + beq .Lmod_pos + pla + eor #0xffff + clc + adc #0x1 + rtl +.Lmod_pos: + pla + rtl + +; -------------------------------------------------------------------- +; __divmod_setup — common prologue for __divhi3/__modhi3. Reads +; A=dividend and (4,S)=divisor (the public-entry stack frame is intact +; because we used JSR not JSL, so (4,S) still points to the user's +; pushed arg1 relative to the original JSL). Computes |a| -> $e6, +; |b| -> $e8, and sign tracker -> $ee: +; bit 0 = 1 if dividend was negative (modulo result sign) +; bit 1 = 1 if dividend XOR divisor signs differ (quotient sign) +; Uses JSR/RTS, same bank. +; -------------------------------------------------------------------- +__divmod_setup: + ; Sign tracker. We don't have STZ in our instruction set yet, so + ; clear via PHA/LDA #0/STA/PLA to avoid trashing A. + pha + lda #0x0 + sta 0xee + pla + ; Dividend sign + abs value. + cmp #0x8000 + bcc .Lset_a_pos + ; Negative: set bits 0 and 1 (dividend sign, result sign so far). + pha + lda 0xee + ora #0x3 + sta 0xee + pla + eor #0xffff + clc + adc #0x1 +.Lset_a_pos: + sta 0xe6 + ; Divisor sign + abs value. After our JSR (pushed 2 bytes of + ; near-return), the user's arg1 has shifted up by 2 from (4,S) + ; to (6,S). + lda 0x6, s + cmp #0x8000 + bcc .Lset_b_pos + ; Negative: flip bit 1 of $ee (XOR with sign of dividend). + pha + lda 0xee + eor #0x2 + sta 0xee + pla + eor #0xffff + clc + adc #0x1 +.Lset_b_pos: + sta 0xe8 + rts + +; -------------------------------------------------------------------- +; __udivmod_core — internal restoring divide. Inputs at $e6/$e8, +; outputs quotient at $ea, remainder at $ec. JSR/RTS local helper. +; -------------------------------------------------------------------- +__udivmod_core: + lda #0x0 + sta 0xea + sta 0xec + ldx #0x10 +.Lcore_loop: + asl 0xe6 + rol 0xec + asl 0xea + lda 0xec + cmp 0xe8 + bcc .Lcore_skip + sec + sbc 0xe8 + sta 0xec + inc 0xea +.Lcore_skip: + dex + bne .Lcore_loop + rts + +; ==================================================================== +; 32-bit (long / si) helpers. 
+; +; ABI for these is the natural extension of the i16 libcalls: +; - arg0_lo in A +; - arg0_hi at (4,s) +; - arg1_lo at (6,s) (or shift count, for the shift helpers) +; - arg1_hi at (8,s) +; - return: result_lo in A, result_hi in X +; +; All are correct-but-unoptimised; goal is unblocking end-to-end builds, +; not winning a 65816 codegolf. +; +; Direct-page scratch for these: +; $e0..$e3 = a (lo, hi) [renamed from $e0/$e2 for the i16 ones] +; $e4..$e7 = b (lo, hi) +; $e8..$eb = result / quotient (lo, hi) +; $ec..$ef = remainder (lo, hi) +; ==================================================================== + +; -------------------------------------------------------------------- +; __mulsi3 — 32-bit multiply. Shift-and-add over 32 bits of the +; multiplier. Result = (a * b) mod 2^32. +; +; ABI: A = a_lo, X = a_hi (the i32-first-arg in A:X convention), +; (4,s) = b_lo, (6,s) = b_hi. Result returned in A:X (lo:hi). +; -------------------------------------------------------------------- + .globl __mulsi3 +__mulsi3: + ; Stash a (multiplier) into $e0/$e2. + sta 0xe0 + stx 0xe2 + ; Stash b (multiplicand) into $e4/$e6. + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + ; Clear running product at $e8/$ea. + lda #0x0 + sta 0xe8 + sta 0xea + ; Loop 32 times: examine LSB of multiplier, conditionally add + ; multiplicand to product, then shift multiplier right and + ; multiplicand left. Use Y as a 16-bit counter (X mode = 16). + ldy #0x20 +.Lmulsi_loop: + ; Test bit 0 of multiplier (lo word). + lda 0xe0 + lsr a + sta 0xe0 + bcc .Lmulsi_noadd + ; Add multiplicand to product (32-bit). + clc + lda 0xe8 + adc 0xe4 + sta 0xe8 + lda 0xea + adc 0xe6 + sta 0xea +.Lmulsi_noadd: + ; Shift multiplier right (32-bit, hi-into-lo) — we already shifted + ; the lo half above, but the bit shifted out went to carry. We + ; need to also bring the lo bit of the hi half into bit 15 of lo, + ; and shift hi right. Simpler: do a full 32-bit shift right + ; before the LSR. Restructure: + ; + ; Shift multiplicand left (32-bit, carry chain). + asl 0xe4 + rol 0xe6 + ; Bring multiplier hi into multiplier lo's high bit. Multiplier + ; has been shifted lo>>1 already; we need to also put hi's lo bit + ; into lo's hi bit and shift hi right. + lsr 0xe2 + bcc .Lmulsi_no_borrow + ; Carry from hi >> 1 needs to land in bit 15 of lo. ORA #$8000. + lda 0xe0 + ora #0x8000 + sta 0xe0 +.Lmulsi_no_borrow: + dey + bne .Lmulsi_loop + ; Result is in $e8 (lo) / $ea (hi). + ldx 0xea + lda 0xe8 + rtl + +; -------------------------------------------------------------------- +; __ashlsi3 — (A:X) << (4,s) -> A:X. Shift count is i16 in low byte; +; counts >= 32 are UB in C. Uses a per-bit loop (cheap on 65816 — one +; ASL + ROL per bit). +; +; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = count. +; -------------------------------------------------------------------- + .globl __ashlsi3 +__ashlsi3: + sta 0xe0 ; lo + stx 0xe2 ; hi + lda 0x4, s + tay ; count -> Y +.Lashlsi_loop: + cpy #0x0 + beq .Lashlsi_done + asl 0xe0 + rol 0xe2 + dey + bra .Lashlsi_loop +.Lashlsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __lshrsi3 — logical >> shift. LSR hi, ROR lo: hi gets a 0, lo gets +; hi's old bit 0. Per-bit loop. 
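+; ABI: A = a_lo, X = a_hi (i32-first-arg in A:X), (4,s) = shift count.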
+; -------------------------------------------------------------------- + .globl __lshrsi3 +__lshrsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + tay +.Llshrsi_loop: + cpy #0x0 + beq .Llshrsi_done + lsr 0xe2 + ror 0xe0 + dey + bra .Llshrsi_loop +.Llshrsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __ashrsi3 — arithmetic >> shift. Sign bit must be preserved on each +; iteration: copy bit 15 of hi into carry (via CMP #$8000), then ROR +; hi, ROR lo. Per-bit loop. +; -------------------------------------------------------------------- + .globl __ashrsi3 +__ashrsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + tay +.Lashrsi_loop: + cpy #0x0 + beq .Lashrsi_done + ; CMP #$8000 sets C iff the unsigned value >= 0x8000, i.e. bit 15 + ; is set — exactly the sign bit. + lda 0xe2 + cmp #0x8000 + ror 0xe2 + ror 0xe0 + dey + bra .Lashrsi_loop +.Lashrsi_done: + ldx 0xe2 + lda 0xe0 + rtl + +; -------------------------------------------------------------------- +; __udivmodsi_core — internal 32-bit unsigned divide. Inputs in +; $e0/$e2 (numerator) and $e4/$e6 (denominator); outputs quotient in +; $e8/$ea and remainder in $ec/$ee. 32-iteration restoring divide. +; JSR/RTS local helper. +; -------------------------------------------------------------------- +__udivmodsi_core: + lda #0x0 + sta 0xe8 + sta 0xea + sta 0xec + sta 0xee + ldy #0x20 +.Lcoresi_loop: + ; Shift numerator left through remainder. + asl 0xe0 + rol 0xe2 + rol 0xec + rol 0xee + ; Shift quotient left. + asl 0xe8 + rol 0xea + ; Compare remainder to denominator (32-bit). + lda 0xee + cmp 0xe6 + bcc .Lcoresi_skip + bne .Lcoresi_take + lda 0xec + cmp 0xe4 + bcc .Lcoresi_skip +.Lcoresi_take: + ; Remainder >= denominator: subtract and set quotient bit 0. + sec + lda 0xec + sbc 0xe4 + sta 0xec + lda 0xee + sbc 0xe6 + sta 0xee + inc 0xe8 +.Lcoresi_skip: + dey + bne .Lcoresi_loop + rts + +; -------------------------------------------------------------------- +; __udivsi3 — unsigned 32/32 -> 32 divide. +; -------------------------------------------------------------------- + .globl __udivsi3 +__udivsi3: + ; ABI: A = a_lo, X = a_hi, (4,s) = b_lo, (6,s) = b_hi. + sta 0xe0 + stx 0xe2 + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + jsr __udivmodsi_core + ldx 0xea + lda 0xe8 + rtl + +; -------------------------------------------------------------------- +; __umodsi3 — unsigned 32/32 -> 32 modulo. +; -------------------------------------------------------------------- + .globl __umodsi3 +__umodsi3: + sta 0xe0 + stx 0xe2 + lda 0x4, s + sta 0xe4 + lda 0x6, s + sta 0xe6 + jsr __udivmodsi_core + ldx 0xee + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divsi3 / __modsi3 — signed 32-bit divide / modulo. Strategy mirrors +; the i16 helpers: stash signs, take abs, run unsigned core, negate +; result(s) as needed. Sign tracker bits in $f0: +; bit 0 = dividend was negative (modulo result sign) +; bit 1 = quotient sign (sign(a) XOR sign(b)) +; -------------------------------------------------------------------- + .globl __divsi3 +__divsi3: + jsr __divmodsi_setup + jsr __udivmodsi_core + ; Quotient at $e8/$ea. Negate if bit 1 of $f0 is set. + lda 0xf0 + and #0x2 + beq .Ldivsi_pos + ; 32-bit two's complement of quotient. + lda 0xe8 + eor #0xffff + clc + adc #0x1 + sta 0xe8 + lda 0xea + eor #0xffff + adc #0x0 + sta 0xea +.Ldivsi_pos: + ldx 0xea + lda 0xe8 + rtl + + .globl __modsi3 +__modsi3: + jsr __divmodsi_setup + jsr __udivmodsi_core + ; Remainder at $ec/$ee. 
Negate if bit 0 of $f0 set (dividend + ; was negative — C99 remainder takes dividend's sign). + lda 0xf0 + and #0x1 + beq .Lmodsi_pos + lda 0xec + eor #0xffff + clc + adc #0x1 + sta 0xec + lda 0xee + eor #0xffff + adc #0x0 + sta 0xee +.Lmodsi_pos: + ldx 0xee + lda 0xec + rtl + +; -------------------------------------------------------------------- +; __divmodsi_setup — common prologue for __divsi3 / __modsi3. +; Reads A=a_lo, X=a_hi (i32-first-arg ABI), (4,s)=b_lo, (6,s)=b_hi. +; Writes |a| to $e0/$e2, |b| to $e4/$e6, sign bits to $f0. JSR/RTS. +; After JSR's 2-byte ret push, callee-relative offsets are (6,s)=b_lo, +; (8,s)=b_hi. +; -------------------------------------------------------------------- +__divmodsi_setup: + ; Clear sign tracker. + pha + lda #0x0 + sta 0xf0 + pla + ; |a|: A=a_lo, X=a_hi. Save them first (we need a_hi for sign test). + sta 0xe0 ; tentative a_lo (may negate below) + stx 0xe2 ; tentative a_hi + cpx #0x8000 + bcc .Lsetsi_a_pos + ; a is negative. Set sign tracker bits 0+1 and negate. + lda 0xf0 + ora #0x3 + sta 0xf0 + ; 32-bit negate: invert + 1. + lda 0xe0 + eor #0xffff + clc + adc #0x1 + sta 0xe0 + lda 0xe2 + eor #0xffff + adc #0x0 + sta 0xe2 +.Lsetsi_a_pos: + ; |b|. Args shifted by 2 (the JSR ret push). + lda 0x6, s + sta 0xe4 + lda 0x8, s + sta 0xe6 + cmp #0x8000 + bcc .Lsetsi_b_pos + ; b is negative. Flip bit 1 of $f0. + lda 0xf0 + eor #0x2 + sta 0xf0 + lda 0xe4 + eor #0xffff + clc + adc #0x1 + sta 0xe4 + lda 0xe6 + eor #0xffff + adc #0x0 + sta 0xe6 +.Lsetsi_b_pos: + rts diff --git a/scripts/safeCC.sh b/scripts/safeCC.sh new file mode 100755 index 0000000..bc3344b --- /dev/null +++ b/scripts/safeCC.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Wrapper for ad-hoc invocations of the W65816 cross-compiler toolchain. +# Applies the same memory/CPU caps as smokeTest.sh so a runaway backend +# bug (infinite combine, runaway inserter) can't OOM-kill the whole tmux +# scope and take Claude Code down with it. +# +# Usage: +# scripts/safeCC.sh clang --target=w65816 -O2 -S foo.c -o foo.s +# scripts/safeCC.sh llc -march=w65816 foo.ll -o foo.s +# +# The first arg is resolved against tools/llvm-mos-build/bin/ if it isn't +# already an absolute or relative path containing a slash. + +set -euo pipefail + +ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory +ulimit -t 90 # 90 CPU-seconds + +if [ $# -lt 1 ]; then + printf 'usage: %s [args...]\n' "$0" >&2 + exit 2 +fi + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +BIN_DIR="$PROJECT_ROOT/tools/llvm-mos-build/bin" + +tool="$1" +shift + +case "$tool" in + /*|./*|*/*) exec "$tool" "$@" ;; + *) exec "$BIN_DIR/$tool" "$@" ;; +esac diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 8de958d..0b3c20d 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -11,6 +11,18 @@ set -euo pipefail source "$(dirname "$0")/common.sh" +# Resource caps for child compilers. A bug in the W65816 backend can send +# clang/llc into a runaway combine/inserter loop that allocates tens of GB +# of RAM. When that happens the kernel OOM-killer takes down the entire +# tmux scope (bash, the compiler, and the parent Claude Code session with +# it). Bounding virtual memory and CPU time here turns "OOM kills the +# terminal" into "compiler dies with SIGSEGV / SIGXCPU and we get a clean +# error." Numbers are well above what a healthy compile of these tiny +# test inputs needs (~200 MB / a few seconds), so legitimate work is +# unaffected. 
+ulimit -v $((4 * 1024 * 1024)) # 4 GB virtual memory ceiling +ulimit -t 90 # 90 CPU-seconds per process + BUILD_DIR="$TOOLS_DIR/llvm-mos-build" LLC="$BUILD_DIR/bin/llc" LLVM_MC="$BUILD_DIR/bin/llvm-mc" @@ -249,7 +261,344 @@ EOF done fi -# 11. Real C through clang. Uses the clang front-end if it has been +# 11a. SETCC via clang: a > b returns 0/1. Exercises the multi-branch +# CC path (BEQ + BPL diamond, since SETGT can't be a single Bxx). +CLANG="$BUILD_DIR/bin/clang" +if [ -x "$CLANG" ]; then + log "check: clang compiles a > b via multi-branch SETCC" + cFile="$(mktemp --suffix=.c)" + sCmpFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile"' EXIT + cat > "$cFile" <<'EOF' +int gt(int a, int b) { return a > b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile" -o "$sCmpFile" + # Expect a CMP, then BEQ + BPL forming the multi-branch diamond. + for expect in "cmp 0x4, s" "lda #0x1" "beq" "bpl" "lda #0x0"; do + if ! grep -qF "$expect" "$sCmpFile"; then + warn "setcc gt test missing: $expect" + cat "$sCmpFile" >&2 + die "setcc gt test failed" + fi + done +fi + +# 11b. SELECT via clang: c ? a : b returns one of two constants. +if [ -x "$CLANG" ]; then + log "check: clang compiles c ? 100 : 200 via SELECT_CC" + cFile2="$(mktemp --suffix=.c)" + sSelFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile"' EXIT + cat > "$cFile2" <<'EOF' +int sel(int c) { return c ? 100 : 200; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile2" -o "$sSelFile" + for expect in "cmp #0x0" "lda #0xc8" "beq" "lda #0x64"; do + if ! grep -qF "$expect" "$sSelFile"; then + warn "select test missing: $expect" + cat "$sSelFile" >&2 + die "select test failed" + fi + done +fi + +# 11c. Two-Acc16 op via clang: a - b where both are non-foldable Acc16. +# Caller-side b lives in memory (FI), so this matches via SBCfi without +# the spill — but a + b + c chains through a true two-Acc16 add. +if [ -x "$CLANG" ]; then + log "check: clang compiles two-Acc16 ops via spill (chained add)" + cFile3="$(mktemp --suffix=.c)" + sChainFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile"' EXIT + cat > "$cFile3" <<'EOF' +// max3 forces two-Acc16: outer SELECT_CC compares one Acc16 PHI value +// to another Acc16 PHI value (m vs c, both computed values). +int max3(int a, int b, int c) { + int m = a > b ? a : b; + return m > c ? m : c; +} +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile3" -o "$sChainFile" + # Expect at least one sta-spill paired with cmp to a stack-relative + # slot - the signature of the two-Acc16 CMP_RR custom inserter. + if ! grep -qE 'sta 0x[0-9a-f]+, s' "$sChainFile" \ + || ! grep -qE 'cmp 0x[0-9a-f]+, s' "$sChainFile"; then + cat "$sChainFile" >&2 + die "two-Acc16 (max3) didn't spill+cmp via stack-relative" + fi +fi + +# 11d. Multiply via libcall. 
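+# (Callee is __mulhi3 in runtime/src/libgcc.s: arg0 in A, arg1 pushed
+# before the JSL, result returned in A.)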
+if [ -x "$CLANG" ]; then + log "check: clang emits __mulhi3 libcall for i16 multiply" + cFile4="$(mktemp --suffix=.c)" + sMulFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile"' EXIT + cat > "$cFile4" <<'EOF' +int mul(int a, int b) { return a * b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile4" -o "$sMulFile" + if ! grep -qF "jsl __mulhi3" "$sMulFile"; then + cat "$sMulFile" >&2 + die "expected jsl __mulhi3" + fi +fi + +# 11e. Variable shift via libcall. +if [ -x "$CLANG" ]; then + log "check: clang emits __ashlhi3 libcall for variable i16 shift" + cFile5="$(mktemp --suffix=.c)" + sShfFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile"' EXIT + cat > "$cFile5" <<'EOF' +int shf(int x, int n) { return x << n; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile5" -o "$sShfFile" + if ! grep -qF "jsl __ashlhi3" "$sShfFile"; then + cat "$sShfFile" >&2 + die "expected jsl __ashlhi3" + fi +fi + +# 11f. Pointer deref: *p loads via stack-relative-indirect-Y. +if [ -x "$CLANG" ]; then + log "check: clang compiles *p via LDA (slot,s),y" + cFile6="$(mktemp --suffix=.c)" + sPtrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile"' EXIT + cat > "$cFile6" <<'EOF' +int load_ptr(const int *p) { return *p; } +void store_ptr(int *p, int v) { *p = v; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile6" -o "$sPtrFile" + for expect in "ldy #0x0" "lda (0x" "sta (0x"; do + if ! grep -qF "$expect" "$sPtrFile"; then + warn "ptr-deref test missing: $expect" + cat "$sPtrFile" >&2 + die "ptr-deref test failed" + fi + done +fi + +# 11g. i8 store via pointer: *p = v wraps the STA in SEP/REP so only +# 1 byte is written. Both load_byte and store_byte must compile. +if [ -x "$CLANG" ]; then + log "check: clang compiles *p = v with SEP/REP-wrapped STA (i8 store)" + cFile7="$(mktemp --suffix=.c)" + sBptrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile"' EXIT + cat > "$cFile7" <<'EOF' +unsigned char loadb(const unsigned char *p) { return *p; } +void storeb(unsigned char *p, unsigned char v) { *p = v; } +unsigned char incb(unsigned char *p) { return ++*p; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile7" -o "$sBptrFile" + # storeb body should contain SEP #$20 ... STA (slot,s),y ... REP #$20. + if ! grep -qF "sep #0x20" "$sBptrFile" \ + || ! grep -qF "rep #0x20" "$sBptrFile" \ + || ! grep -qE 'sta \(0x[0-9a-f]+, s\), y' "$sBptrFile"; then + cat "$sBptrFile" >&2 + die "i8 ptr-store test missing SEP/STA/REP sequence" + fi + # All three functions must produce labels. + for sym in loadb storeb incb; do + if ! grep -qE "^${sym}:" "$sBptrFile"; then + cat "$sBptrFile" >&2 + die "i8 ptr test: missing function ${sym}" + fi + done + # Correctness check: storeb's prologue must NOT clobber A. 
A holds + # the pointer arg on entry; the first body op must spill A intact. + # The fixed prologue uses N/2 PHAs (small N) or TAY/TSC/.../TYA + # (large N). Either way, the first non-prologue op should be a + # `sta NN,s` that captures arg0=p. If we see TSC anywhere in the + # prologue WITHOUT a TAY before it, that's the broken form (A + # clobbered by TSC, then the spill stores garbage SP value as if + # it were the pointer). + storeb_body="$(sed -n '/^storeb:/,/^\.Lfunc_end/p' "$sBptrFile")" + if printf '%s\n' "$storeb_body" | grep -qE '^ tsc$' \ + && ! printf '%s\n' "$storeb_body" | grep -qE '^ tay$'; then + cat "$sBptrFile" >&2 + die "storeb prologue uses bare TSC without TAY — A (the pointer arg) gets clobbered before being spilled. Byte store writes to the wrong address. Use PHA-based prologue or TAY/TSC/.../TYA bracket." + fi + # Also: there must be at least one `sta NN,s` in the body (the spill + # of the pointer arg). + if ! printf '%s\n' "$storeb_body" | grep -qE '^ sta 0x[0-9a-f]+, s$'; then + cat "$sBptrFile" >&2 + die "storeb missing pointer-arg spill (sta NN,s)" + fi +fi + +# 11h. i8 global access stays in 8-bit M (no over-read). bump_gb must +# get the SEP #$20 prologue and emit a single-byte lda/inc/sta sequence. +if [ -x "$CLANG" ]; then + log "check: clang keeps pure-i8 global access in 8-bit M (no wide-read regression)" + cFile8="$(mktemp --suffix=.c)" + sGbFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile"' EXIT + cat > "$cFile8" <<'EOF' +unsigned char gb; +void bump_gb(void) { gb++; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile8" -o "$sGbFile" + # Must use 8-bit M prologue (sep #$20), not the 16-bit one. + if ! grep -qF "sep #0x20" "$sGbFile"; then + cat "$sGbFile" >&2 + die "bump_gb test: expected sep #\$20 prologue (got 16-bit M)" + fi +fi + +# 11j. Runtime library assembles and exports all expected libcalls. +# This is the destination of every __mulhi3/__ashlhi3/etc. that clang +# emits — without it, generated code links to nothing. +RUNTIME_SH="$PROJECT_ROOT/runtime/build.sh" +RUNTIME_OBJ="$PROJECT_ROOT/runtime/libgcc.o" +if [ -x "$RUNTIME_SH" ]; then + log "check: runtime/build.sh assembles libgcc.o with all libcall symbols" + "$RUNTIME_SH" >/dev/null + if [ ! -f "$RUNTIME_OBJ" ]; then + die "runtime/build.sh did not produce libgcc.o" + fi + syms="$("$BUILD_DIR/bin/llvm-objdump" -t "$RUNTIME_OBJ" 2>&1 | awk '{print $NF}')" + for need in __mulhi3 __ashlhi3 __ashrhi3 __lshrhi3 __divhi3 __udivhi3 __modhi3 __umodhi3; do + if ! printf '%s\n' "$syms" | grep -qx "$need"; then + printf '%s\n' "$syms" >&2 + die "runtime missing symbol: $need" + fi + done +fi + +# 11m. Real-world surface area: a non-trivial program that exercises +# struct-field deref, char* iteration, multiply, shift, and a bit-twiddle +# function. Validates the backend compiles a realistic C input +# end-to-end without crashing. Doesn't assert specific asm; just +# success and that the function bodies are non-empty. 
+if [ -x "$CLANG" ]; then + log "check: clang compiles a real-world multi-function program" + cFile12="$(mktemp --suffix=.c)" + sBigFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile" "$cFile11" "$sCallsFile" "$cFile12" "$sBigFile"' EXIT + cat > "$cFile12" <<'EOF' +typedef unsigned char u8; +typedef unsigned int u16; +struct Node { u16 data; struct Node *next; }; +u16 list_sum(const struct Node *h) { + u16 s=0; while(h){ s+=h->data; h=h->next; } return s; +} +int strcmp_test(const char *a, const char *b) { + while (*a && *a == *b) { a++; b++; } + return (unsigned char)*a - (unsigned char)*b; +} +u16 fnv16(const u8 *p, u16 n) { + u16 h=0x811C; for (u16 i=0;i>=8; } + if (!(x & 0x0F)) { n+=4; x>>=4; } + if (!(x & 0x03)) { n+=2; x>>=2; } + if (!(x & 0x01)) n+=1; + return n; +} +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile12" -o "$sBigFile" + for sym in list_sum strcmp_test fnv16 ctz16; do + if ! grep -qE "^${sym}:" "$sBigFile"; then + cat "$sBigFile" >&2 + die "real-world test missing function: $sym" + fi + done +fi + +# 11l. Linkage contract: every libcall clang generates from arithmetic +# ops must match a symbol provided by runtime/libgcc.o. We can't run a +# real link yet (no w65816-aware linker), but we can verify the symbol +# names line up — drift here would be a silent runtime crash. +if [ -x "$CLANG" ] && [ -f "$RUNTIME_OBJ" ]; then + log "check: every libcall clang emits has a matching definition in libgcc.o" + cFile11="$(mktemp --suffix=.c)" + sCallsFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile" "$cFile11" "$sCallsFile"' EXIT + cat > "$cFile11" <<'EOF' +int m1(int a, int b) { return a * b; } +unsigned int m2(unsigned int a, unsigned int b) { return a * b; } +int s1(int x, int n) { return x << n; } +unsigned int s2(unsigned int x, int n) { return x >> n; } +int s3(int x, int n) { return x >> n; } +int d1(int a, int b) { return a / b; } +unsigned int d2(unsigned int a, unsigned int b) { return a / b; } +int r1(int a, int b) { return a % b; } +unsigned int r2(unsigned int a, unsigned int b) { return a % b; } +long m3(long a, long b) { return a * b; } +unsigned long m4(unsigned long a, unsigned long b) { return a * b; } +long s4(long x, int n) { return x << n; } +long s5(long x, int n) { return x >> n; } +unsigned long s6(unsigned long x, int n) { return x >> n; } +long d3(long a, long b) { return a / b; } +unsigned long d4(unsigned long a, unsigned long b) { return a / b; } +long r3(long a, long b) { return a % b; } +unsigned long r4(unsigned long a, unsigned long b) { return a % b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile11" -o "$sCallsFile" + runtime_syms="$("$BUILD_DIR/bin/llvm-objdump" -t "$RUNTIME_OBJ" 2>&1 | awk '$2 == "g" {print $NF}')" + emitted="$(grep -oE 'jsl __[a-z0-9]+' "$sCallsFile" | awk '{print $2}' | sort -u)" + for sym in $emitted; do + if ! 
printf '%s\n' "$runtime_syms" | grep -qx "$sym"; then + warn "clang emitted libcall $sym but runtime/libgcc.o has no such symbol" + printf 'runtime exports:\n%s\n' "$runtime_syms" >&2 + printf 'clang emitted:\n%s\n' "$emitted" >&2 + die "libcall name drift: $sym missing from runtime" + fi + done +fi + +# 11k. signed i8 compare: forces 16-bit M prologue (instrLowersToWide) +# because the SEXT lowering needs i16 ops. Verifies both that the +# code compiles AND that the prologue is REP #$30 (not the 8-bit M +# fast path, which would silently corrupt the SEXT mask). +if [ -x "$CLANG" ]; then + log "check: signed i8 compare gets 16-bit M prologue + emits cmp" + cFile10="$(mktemp --suffix=.c)" + sSgnFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile" "$cFile10" "$sSgnFile"' EXIT + cat > "$cFile10" <<'EOF' +signed char sgnlt(signed char a, signed char b) { return a < b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile10" -o "$sSgnFile" + # Must use 16-bit M (rep #$30), not the 8-bit fast path. + if ! grep -qF "rep #0x30" "$sSgnFile"; then + cat "$sSgnFile" >&2 + die "sgnlt: expected rep #\$30 prologue (i8 signed cmp needs 16-bit M)" + fi + # Must NOT contain the 8-bit prologue, which would mean we never + # transitioned (the SEXT injection's ora #\$ff00 would silently + # truncate to ora #\$00 in 8-bit M). + if grep -qF "rep #0x10" "$sSgnFile" && ! grep -qF "rep #0x30" "$sSgnFile"; then + cat "$sSgnFile" >&2 + die "sgnlt: only saw 8-bit M prologue, SEXT high-byte mask would be dropped" + fi +fi + +# 11i. i8 equality compare on two stack args (eqbyte): exercises i8 +# SETCC promotion through Lower*CC. +if [ -x "$CLANG" ]; then + log "check: clang lowers i8 == i8 via promoted i16 cmp" + cFile9="$(mktemp --suffix=.c)" + sEqbFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$sCmpFile" "$cFile2" "$sSelFile" "$cFile3" "$sChainFile" "$cFile4" "$sMulFile" "$cFile5" "$sShfFile" "$cFile6" "$sPtrFile" "$cFile7" "$sBptrFile" "$cFile8" "$sGbFile" "$cFile9" "$sEqbFile"' EXIT + cat > "$cFile9" <<'EOF' +unsigned char eqbyte(unsigned char a, unsigned char b) { return a == b; } +EOF + "$CLANG" --target=w65816 -O2 -S "$cFile9" -o "$sEqbFile" + # Must produce a cmp + beq (the eq diamond). + if ! grep -qE 'cmp ' "$sEqbFile" || ! grep -qF "beq" "$sEqbFile"; then + cat "$sEqbFile" >&2 + die "eqbyte test: expected cmp + beq sequence" + fi +fi + +# 12. Real C through clang. Uses the clang front-end if it has been # built; skipped otherwise (clang takes 15-30 minutes to build the # first time; afterwards rebuilds are fast). CLANG="$BUILD_DIR/bin/clang" @@ -270,6 +619,222 @@ EOF die "clang end-to-end test failed" fi done + + # 13. i32 (long) compile path. Type legalization splits i32 into + # two i16 halves; the high half flows through the (add FrameIndex, + # 2) shape, which previously crashed ISel with "Cannot select + # FrameIndex<-2>". SelectFrameIndex now folds (add FI, const) so + # the split loads land on a stack-relative addressing mode. + # Return ABI: low->A, high->X (TAX in the epilogue). 
+ # Also asserts the native ADC carry chain (CLC + ADC + ADC) is in + # place — task #49 replaced the bloated SETCC-based carry detect + # (lda;cmp;bcc;lda) with a direct ADDC/ADDE-pattern lowering that + # uses the C flag in P as a Glue-modeled physreg. + log "check: clang compiles a long add (i32 split + A:X return)" + cI32File="$(mktemp --suffix=.c)" + oI32File="$(mktemp --suffix=.o)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File"' EXIT + cat > "$cI32File" <<'EOF' +long add32(long a, long b) { return a + b; } +EOF + "$CLANG" --target=w65816 -O2 -c "$cI32File" -o "$oI32File" + disasmI32="$("$OBJDUMP" --triple=w65816 -d "$oI32File" 2>&1)" + # TAX confirms the high-half-into-X part of the return ABI fired. + # Without it, both halves would pile into A and one would be lost. + # Exactly one CLC and exactly two ADCs prove the native carry chain + # is wired (one CLC for lo, ADC lo, ADC hi-with-carry); a regression + # to the SETCC path would show two CLCs and a bcc/cmp. + for expect in "tax" "rtl" "clc" "adc"; do + if ! printf '%s\n' "$disasmI32" | grep -qF "$expect"; then + warn "i32 add test missing: $expect" + printf '%s\n' "$disasmI32" >&2 + die "i32 add end-to-end test failed" + fi + done + nClc="$(printf '%s\n' "$disasmI32" | grep -cE '\bclc\b' || true)" + nAdc="$(printf '%s\n' "$disasmI32" | grep -cE '\badc\b' || true)" + nBcc="$(printf '%s\n' "$disasmI32" | grep -cE '\bbcc\b' || true)" + if [ "$nClc" != "1" ] || [ "$nAdc" != "2" ] || [ "$nBcc" != "0" ]; then + warn "i32 add carry-chain shape wrong (clc=$nClc adc=$nAdc bcc=$nBcc, want 1/2/0)" + printf '%s\n' "$disasmI32" >&2 + die "i32 add carry-chain regression" + fi + # Lock the post-StackSlotCleanup instruction count: should be ~11 for + # add32 (rep + pha + clc + adc + sta + txa + adc + tax + lda + ply + rtl + # — i32-first-arg in A:X means arg0_hi loads as TXA, no LDAfi). If + # this regresses meaningfully (say >14) the cleanup pass, the + # rematerialization flag, or the A:X first-arg ABI has been broken. + nInsns="$(printf '%s\n' "$disasmI32" | grep -cE '^[0-9a-f]+:' || true)" + if [ "$nInsns" -gt 14 ]; then + warn "i32 add bloat (got $nInsns insns, want <=14 — was 25 pre-cleanup, 11 post)" + printf '%s\n' "$disasmI32" >&2 + die "i32 add code-quality regression" + fi + # The A:X arg0 ABI moves arg0_hi out of the stack slot, so the + # asm should contain TXA (X→A for the hi-half ADC tied input) + # exactly once. A regression to "load arg0_hi from stack" would + # remove the TXA and add an extra LDA. + nTxa="$(printf '%s\n' "$disasmI32" | grep -cE '\btxa\b' || true)" + if [ "$nTxa" != "1" ]; then + warn "i32 add: expected exactly 1 txa (i32-first-arg-in-A:X path); got $nTxa" + printf '%s\n' "$disasmI32" >&2 + die "i32 add A:X first-arg ABI regression" + fi + + # i32 carry chain on two-Acc16 (no foldable load): exercises the + # ADD_RR + ADDE_RR custom-inserter path. fib32 has live a/b values + # the inserter must spill to a fresh slot; pre-fix this crashed at + # ISel with "Cannot select: adde reg, reg". 
+ log "check: clang compiles a 32-bit fib loop (ADDE_RR inserter path)" + cFibFile="$(mktemp --suffix=.c)" + sFibFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile"' EXIT + cat > "$cFibFile" <<'EOF' +unsigned long fib32(unsigned long n) { + unsigned long a = 0, b = 1, t; + while (n > 0) { t = a + b; a = b; b = t; n--; } + return a; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cFibFile" -o "$sFibFile" 2>&1 >/dev/null; then + die "i32 fib (ADDE_RR inserter) failed to compile" + fi + if ! grep -qE '\bclc\b' "$sFibFile" || ! grep -qE '\badc\b' "$sFibFile"; then + warn "i32 fib output missing clc/adc" + die "i32 fib carry-chain regression" + fi + + # i32 multiply via __mulsi3 libcall: tests the multi-i16-return path + # (RetCC_W65816 assigning A then X for 2 i16 returns) plus the i32 + # arg push side. Pre-fix this hit "multi-return calls not yet + # supported (Ins.size=4)" when LowerCallTo split the i32 return. + log "check: clang compiles a long multiply via __mulsi3 libcall" + cMulFile="$(mktemp --suffix=.c)" + sMulFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile"' EXIT + cat > "$cMulFile" <<'EOF' +unsigned long mul32(unsigned long a, unsigned long b) { return a * b; } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cMulFile" -o "$sMulFile" 2>&1 >/dev/null; then + die "i32 mul via __mulsi3 failed to compile" + fi + if ! grep -q '__mulsi3' "$sMulFile"; then + die "i32 mul did not emit __mulsi3 libcall" + fi + + # i32 shift-by-1 (SHL/SRL): the type-legalizer's SHL_PARTS / SRL_PARTS + # expansion needs `(srl x, 15)` or `(shl x, 15)` for the carry-cross- + # halves slot. Without inline patterns those fall to __lshrhi3 / + # __ashlhi3 libcalls (~10 byte overhead per shift). SRL15A and + # SHL15A pseudos handle them inline (`ASL/LSR; LDA #0; ROL/ROR`, + # 3 bytes). Verify the shift-by-1 output doesn't contain a hi3 + # libcall. + log "check: clang i32 shift-by-1 stays inline (no __lshrhi3 / __ashlhi3 libcall)" + cSh1File="$(mktemp --suffix=.c)" + sSh1File="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cSh1File" "$sSh1File"' EXIT + cat > "$cSh1File" <<'EOF' +unsigned long shl1(unsigned long a) { return a << 1; } +unsigned long shr1(unsigned long a) { return a >> 1; } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cSh1File" -o "$sSh1File" 2>&1 >/dev/null; then + die "i32 shift-by-1 failed to compile" + fi + if grep -qE '__lshrhi3|__ashlhi3' "$sSh1File"; then + warn "i32 shift-by-1 still calling i16 shift libcall — SRL15A/SHL15A pattern not firing" + die "i32 shift-by-1 regression" + fi + + # Varargs (): LowerFormalArguments creates a fixed FI + # for the first vararg slot when IsVarArg; LowerVASTART stores + # its address to the va_list pointer. VAARG/VACOPY/VAEND use + # default LLVM expansions. Pre-fix this hit + # "vararg functions not yet supported" fatal error. 
+ log "check: clang compiles a vararg function ()" + cVaFile="$(mktemp --suffix=.c)" + sVaFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cSh1File" "$sSh1File" "$cVaFile" "$sVaFile"' EXIT + cat > "$cVaFile" <<'EOF' +#include +int sumArgs(int n, ...) { + va_list args; + va_start(args, n); + int sum = 0; + for (int i = 0; i < n; i++) sum += va_arg(args, int); + va_end(args); + return sum; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cVaFile" -o "$sVaFile" 2>&1 >/dev/null; then + die "vararg function failed to compile" + fi + + # Stack-array LEA: `char arr[16]; arr[i] = ...` needs the address + # of an alloca'd object as an i16 value. Pre-fix this hit "Cannot + # select: FrameIndex<0>" because addr_fi only matches in load/store + # contexts. W65816DAGToDAGISel::Select now lowers a bare + # ISD::FrameIndex to ADDframe (FI, 0); eliminateFrameIndex expands + # ADDframe into TSC + CLC + ADC #disp. + log "check: clang takes the address of a stack-allocated array" + cAllocaFile="$(mktemp --suffix=.c)" + sAllocaFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile"' EXIT + cat > "$cAllocaFile" <<'EOF' +void writeBytes(char *out, char v) { + char tmp[8]; + for (int i = 0; i < 8; i++) tmp[i] = v + i; + for (int i = 0; i < 8; i++) out[i] = tmp[i]; +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cAllocaFile" -o "$sAllocaFile" 2>&1 >/dev/null; then + die "alloca'd-array address failed to compile" + fi + # The TSC; CLC; ADC #disp triple is the LEA expansion of ADDframe; + # at least one occurrence proves the pseudo wired through. + if ! grep -qE '^\s*tsc' "$sAllocaFile"; then + die "alloca'd-array LEA missing TSC (ADDframe expansion broken)" + fi + + # signed-byte arithmetic (`(int)(*p) - (int)(*q)` style — strcmp). + # Exercises three formerly-missing patterns: SEXTLOAD i16 from i8 + # (we Expand it to (sext (load))), sext_inreg i16 from i8 (the + # `((x & 0xFF) ^ 0x80) - 0x80` tablegen Pat), and extloadi8 from + # an Acc16 register pointer (LDAptr / "high byte don't care"). + log "check: clang compiles a signed-byte strcmp (sextload + sext_inreg + extload-via-ptr)" + cStrFile="$(mktemp --suffix=.c)" + sStrFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile"' EXIT + cat > "$cStrFile" <<'EOF' +int strcmp32(const char *a, const char *b) { + while (*a && *a == *b) { a++; b++; } + return (int)(*a) - (int)(*b); +} +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cStrFile" -o "$sStrFile" 2>&1 >/dev/null; then + die "signed-byte strcmp failed to compile" + fi + + # Indirect calls (function pointers). Lowered via the runtime + # trampoline at runtime/src/libgcc.s::__jsl_indir, which does + # JMP (__indirTarget) — caller stores target to __indirTarget then + # JSL __jsl_indir. Pre-fix, LowerCall reported a fatal error. 
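+  # Expected shape (per runtime/src/libgcc.s): `sta __indirTarget` to
+  # stash the pointer, the usual arg pushes, then `jsl __jsl_indir`;
+  # the two greps below check for both symbols.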
+ log "check: clang compiles an indirect call (via __jsl_indir trampoline)" + cIndFile="$(mktemp --suffix=.c)" + sIndFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile" "$cIndFile" "$sIndFile"' EXIT + cat > "$cIndFile" <<'EOF' +typedef int (*BinOp)(int, int); +int doOp(BinOp op, int x, int y) { return op(x, y); } +EOF + if ! "$CLANG" --target=w65816 -O2 -S "$cIndFile" -o "$sIndFile" 2>&1 >/dev/null; then + die "indirect call failed to compile" + fi + if ! grep -q '__indirTarget' "$sIndFile"; then + die "indirect call missing __indirTarget store" + fi + if ! grep -q '__jsl_indir' "$sIndFile"; then + die "indirect call missing JSL to __jsl_indir trampoline" + fi fi log "all smoke checks passed" diff --git a/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp b/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp index 24c28bd..45d2d2b 100644 --- a/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp +++ b/src/llvm/lib/Target/W65816/AsmParser/W65816AsmParser.cpp @@ -200,7 +200,13 @@ public: } bool isPCRel8() const { - return Kind == k_Addr && isConstant(Addr) && constFitsUnsigned(Addr, 8); + // Branch targets are typically symbols (resolved by the assembler / + // linker into the final 8-bit signed offset). Accept any address + // expression — constant in-range, or symbolic. Constants outside + // 8 bits are rejected so they fall through to PCRel16 / longer + // forms instead of silently overflowing. + return Kind == k_Addr && + (!isConstant(Addr) || constFitsUnsigned(Addr, 8)); } bool isPCRel16() const { return Kind == k_Addr && diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index 2000020..dea260c 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_target(W65816CodeGen W65816RegisterInfo.cpp W65816SelectionDAGInfo.cpp W65816Subtarget.cpp + W65816StackSlotCleanup.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp index a2cf6a6..a637fd5 100644 --- a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp +++ b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp @@ -66,6 +66,22 @@ public: return; } + // PCRel8 (Bxx / BRA) takes a signed 8-bit offset. If the resolved + // displacement won't fit, the encoded byte is meaningless — the + // branch would land somewhere unintended. Diagnose explicitly + // instead of silently truncating. + if (Fixup.getKind() == W65816::fixup_8_pcrel) { + int64_t Signed = static_cast(Value); + if (Signed < -128 || Signed > 127) { + getContext().reportError( + Fixup.getLoc(), + "branch target out of range for 8-bit PC-relative branch " + "(offset " + Twine(Signed) + " bytes); use a long branch (BRL) " + "or restructure the code"); + return; // don't patch — leave zero, error already issued + } + } + // Little-endian patch. 
for (unsigned i = 0; i < Width; ++i) { Data[Offset + i] = static_cast((Value >> (8 * i)) & 0xff); diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index ae90fc2..6a3bed6 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -20,15 +20,26 @@ namespace W65816CC { // 65816 branch condition codes. Encoded as i8 immediate operands in // the BR_CC SDNode and tablegen patterns. +// +// 0..7 map to single Bxx instructions. 8..11 are pseudo codes that +// expand to a two-branch sequence — needed for SETGT/SETLE/SETUGT/ +// SETULE when the operand we'd swap to LHS is a load (no +// pattern-match for load on LHS without spilling A). Only used in +// SELECT_CC16's custom inserter; never reaches a single Bxx. enum CondCode { COND_EQ = 0, // BEQ COND_NE = 1, // BNE COND_HS = 2, // BCS (unsigned >=) COND_LO = 3, // BCC (unsigned <) - COND_MI = 4, // BMI (negative) - COND_PL = 5, // BPL (non-negative) + COND_MI = 4, // BMI (negative, signed <) + COND_PL = 5, // BPL (non-negative, signed >=) COND_VS = 6, // BVS (overflow) COND_VC = 7, // BVC (no overflow) + // Multi-branch pseudo codes (handled by SELECT_CC16 inserter): + COND_GT_MB = 8, // signed > : take if (PL && NE) + COND_LE_MB = 9, // signed <= : take if (MI || EQ) + COND_HI_MB = 10, // unsigned > : take if (HS && NE) + COND_LS_MB = 11, // unsigned <=: take if (LO || EQ) COND_INVALID = -1 }; } // namespace W65816CC @@ -42,8 +53,15 @@ class PassRegistry; FunctionPass *createW65816ISelDag(W65816TargetMachine &TM, CodeGenOptLevel OptLevel); +// Post-RA cleanup: removes redundant STAfi+LDAfi same-slot pairs that +// the greedy allocator emits when materialising a COPY $a -> vreg as +// a spill/reload cycle, even though A still holds the value. See +// W65816StackSlotCleanup.cpp. +FunctionPass *createW65816StackSlotCleanup(); + void initializeW65816AsmPrinterPass(PassRegistry &); void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &); +void initializeW65816StackSlotCleanupPass(PassRegistry &); } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index e5c9a4f..1cdcfdc 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -82,6 +82,13 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { switch (MI->getOpcode()) { default: break; + case W65816::LDXi16imm: { + MCInst Ldx; + Ldx.setOpcode(W65816::LDX_Imm16); + Ldx.addOperand(lowerOperand(MI->getOperand(1), MCInstLowering)); + EmitToStreamer(*OutStreamer, Ldx); + return; + } case W65816::LDAi16imm: { MCInst Lda; Lda.setOpcode(W65816::LDA_Imm16); @@ -126,6 +133,18 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Op); return; } + case W65816::ADCEi16imm: + case W65816::SBCEi16imm: { + // Chained ADC/SBC: no CLC/SEC prefix — the carry/borrow from the + // previous addc/adde/subc/sube is already in P. See ADCi16imm + // comment in W65816InstrInfo.td. + bool IsSub = MI->getOpcode() == W65816::SBCEi16imm; + MCInst Op; + Op.setOpcode(IsSub ? 
W65816::SBC_Imm16 : W65816::ADC_Imm16); + Op.addOperand(lowerOperand(MI->getOperand(2), MCInstLowering)); + EmitToStreamer(*OutStreamer, Op); + return; + } case W65816::ADCi8imm: case W65816::SBCi8imm: { bool IsSub = MI->getOpcode() == W65816::SBCi8imm; @@ -185,6 +204,16 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Op); return; } + case W65816::ADCEabs: + case W65816::SBCEabs: { + // Chained variant — no CLC/SEC prefix. + bool IsSub = MI->getOpcode() == W65816::SBCEabs; + MCInst Op; + Op.setOpcode(IsSub ? W65816::SBC_Abs : W65816::ADC_Abs); + Op.addOperand(lowerOperand(MI->getOperand(2), MCInstLowering)); + EmitToStreamer(*OutStreamer, Op); + return; + } case W65816::CMPi16imm: { // CMPi16imm has (outs), (ins Acc16:$lhs, i16imm:$rhs); MC needs only // the immediate. @@ -248,6 +277,18 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Jsl); return; } + case W65816::PUSH16: { + MCInst Pha; + Pha.setOpcode(W65816::PHA); + EmitToStreamer(*OutStreamer, Pha); + return; + } + case W65816::PUSH16X: { + MCInst Phx; + Phx.setOpcode(W65816::PHX); + EmitToStreamer(*OutStreamer, Phx); + return; + } case W65816::ASLA16: { MCInst Asl; Asl.setOpcode(W65816::ASL_A); @@ -275,6 +316,12 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { MCInst ror; ror.setOpcode(W65816::ROR_A); EmitToStreamer(*OutStreamer, ror); return; } + case W65816::XBA16: { + MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + return; + } case W65816::INA_PSEUDO: { MCInst In; In.setOpcode(W65816::INA); @@ -305,6 +352,112 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Inc); return; } + case W65816::NEGC16: { + // (subc 0, x) — lo half of multi-precision negate. + // EOR #$FFFF; CLC; ADC #1. C-out = 1 iff result = 0 (i.e. x was 0), + // matching SBC's "no borrow" convention. + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + MCInst Clc; + Clc.setOpcode(W65816::CLC); + EmitToStreamer(*OutStreamer, Clc); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(1)); + EmitToStreamer(*OutStreamer, Adc); + return; + } + case W65816::SRL15A: { + // ASL A; LDA #0; ROL A — extract bit 15 to bit 0. + MCInst Asl; + Asl.setOpcode(W65816::ASL_A); + EmitToStreamer(*OutStreamer, Asl); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Rol; + Rol.setOpcode(W65816::ROL_A); + EmitToStreamer(*OutStreamer, Rol); + return; + } + case W65816::SHL15A: { + // LSR A; LDA #0; ROR A — move bit 0 to bit 15. + MCInst Lsr; + Lsr.setOpcode(W65816::LSR_A); + EmitToStreamer(*OutStreamer, Lsr); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Ror; + Ror.setOpcode(W65816::ROR_A); + EmitToStreamer(*OutStreamer, Ror); + return; + } + case W65816::SRL8A: { + // XBA; AND #$00FF — high byte to low byte, zero high. + MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + MCInst And; + And.setOpcode(W65816::AND_Imm16); + And.addOperand(MCOperand::createImm(0x00FF)); + EmitToStreamer(*OutStreamer, And); + return; + } + case W65816::SHL8A: { + // XBA; AND #$FF00 — low byte to high byte, zero low. 
+ MCInst Xba; + Xba.setOpcode(W65816::XBA); + EmitToStreamer(*OutStreamer, Xba); + MCInst And; + And.setOpcode(W65816::AND_Imm16); + And.addOperand(MCOperand::createImm(0xFF00)); + EmitToStreamer(*OutStreamer, And); + return; + } + case W65816::SRA15A: { + // ASL A; LDA #0; ADC #-1; EOR #-1 — sign-fill from bit 15. + // ASL: C = bit 15 of input (the sign). + // LDA #0: A = 0, C unchanged. + // ADC #-1: A = 0 + (-1) + C = -1 + C. If C=1 (neg): A = 0; if + // C=0 (pos): A = -1. Inverted from what we want. + // EOR #-1: flip bits — A = -1 (neg) or 0 (pos), correct. + MCInst Asl; + Asl.setOpcode(W65816::ASL_A); + EmitToStreamer(*OutStreamer, Asl); + MCInst Lda; + Lda.setOpcode(W65816::LDA_Imm16); + Lda.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Lda); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Adc); + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + return; + } + case W65816::NEGE16: { + // (sube 0, x) — hi half of multi-precision negate. + // EOR #$FFFF; ADC #0. Carry-in from the previous subc/sube is + // already in P; ADC #0 propagates it as ~x + C, which matches + // 0 - x - !C in two's complement. + MCInst Eor; + Eor.setOpcode(W65816::EOR_Imm16); + Eor.addOperand(MCOperand::createImm(0xFFFF)); + EmitToStreamer(*OutStreamer, Eor); + MCInst Adc; + Adc.setOpcode(W65816::ADC_Imm16); + Adc.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, Adc); + return; + } } MCInst TmpInst; diff --git a/src/llvm/lib/Target/W65816/W65816CallingConv.td b/src/llvm/lib/Target/W65816/W65816CallingConv.td index 7bf96fb..82fc165 100644 --- a/src/llvm/lib/Target/W65816/W65816CallingConv.td +++ b/src/llvm/lib/Target/W65816/W65816CallingConv.td @@ -18,8 +18,10 @@ def RetCC_W65816 : CallingConv<[ // i8 values are returned in the 8-bit accumulator. CCIfType<[i8], CCAssignToReg<[A]>>, - // i16 values are returned in the 16-bit accumulator (same physical reg). - CCIfType<[i16], CCAssignToReg<[A]>> + // i16 values are returned in A; for a split i32 (legalizer produces + // two i16 returns), the second slot lands in X. LowerReturn / + // LowerCall hardcode the same A,X order — keep them in sync. + CCIfType<[i16], CCAssignToReg<[A, X]>> ]>; //===----------------------------------------------------------------------===// diff --git a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp index c4346d1..cffa52f 100644 --- a/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp +++ b/src/llvm/lib/Target/W65816/W65816FrameLowering.cpp @@ -19,11 +19,52 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; +// "Wide" = needs to live in a 16-bit register at some point during the +// function body. i8 and i1 are fine in 8-bit M. Pointer operands that +// are constant addresses (globals, externs) are fine too — they're +// immediate operands of LDA/STA, not values held in A. A non-constant +// pointer (function arg, computed value) does need to sit in A as 16 +// bits for stack-relative-indirect addressing. 
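+// For illustration (not exhaustive): `char add8(char a, char b)` has no
+// wide values and can stay in 8-bit M, while `int deref(int *p)` (a
+// non-constant pointer that must sit in A) or any i16 arithmetic forces
+// the 16-bit-M prologue.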
+static bool isWideTyForMode(Type *T, const llvm::Value *V) {
+  if (!T || T->isVoidTy()) return false;
+  if (T->isIntegerTy(8) || T->isIntegerTy(1)) return false;
+  if (T->isPointerTy() && V && (isa<GlobalValue>(V) || isa<ConstantExpr>(V)))
+    return false;
+  return true;
+}
+
+// Some IR ops, even when their visible types are all i8, lower to
+// sequences that need 16-bit M during execution: signed compares (via
+// SEXT to i16 + cmp), variable shifts (libcall via i16-promoted args),
+// constant shifts > 4 (also routed through i16 via LowerShift), and
+// any sext of an i8 (synthesized via i16 mask-and-subtract ops).
+// Detect those here so the prologue picks 16-bit M up front.
+static bool instrLowersToWide(const Instruction &I) {
+  if (auto *Cmp = dyn_cast<ICmpInst>(&I)) {
+    if (Cmp->isSigned() &&
+        Cmp->getOperand(0)->getType()->isIntegerTy(8))
+      return true;
+  }
+  if (isa<SExtInst>(&I) &&
+      I.getOperand(0)->getType()->isIntegerTy(8))
+    return true;
+  unsigned Op = I.getOpcode();
+  if ((Op == Instruction::Shl || Op == Instruction::LShr ||
+       Op == Instruction::AShr) &&
+      I.getType()->isIntegerTy(8))
+    return true;
+  return false;
+}
+
 W65816FrameLowering::W65816FrameLowering(const W65816Subtarget &STI)
     : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), 0,
                           Align(1)) {}
@@ -54,39 +95,33 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
-  // Heuristic: scan the function body for any value with i8 type.
-  // Captures both signature types and internal i8 ops (e.g. a void
-  // function that loads / stores bytes). An eventual full
-  // mode-dependence analysis (the REP/SEP pass) will replace this.
-  bool UsesAcc8 = false;
+  // Heuristic: choose 8-bit M (REP #$10 + SEP #$20) only for "pure-i8"
+  // functions — those whose signature and body use no type wider than
+  // i8 (no i16 ops, no pointers). Any wider type forces 16-bit M
+  // (REP #$30) since pointer dereferences and stack-relative addressing
+  // need M=1 to load/store 16 bits at a time. In 16-bit M functions,
+  // individual i8 ops are wrapped with SEP/REP at the pseudo level.
+  // A future REP/SEP scheduling pass (design doc 3.3) will replace
+  // this whole-function decision with a per-region one.
   const Function &F = MF.getFunction();
-  auto isI8 = [](Type *T) { return T && T->isIntegerTy(8); };
-  if (isI8(F.getReturnType()))
-    UsesAcc8 = true;
+  bool HasWide = isWideTyForMode(F.getReturnType(), nullptr);
   for (const Argument &Arg : F.args()) {
-    if (isI8(Arg.getType())) {
-      UsesAcc8 = true;
-      break;
-    }
+    if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; }
   }
-  if (!UsesAcc8) {
+  if (!HasWide) {
     for (const BasicBlock &BB : F) {
-      if (UsesAcc8) break;
+      if (HasWide) break;
       for (const Instruction &I : BB) {
-        if (isI8(I.getType())) {
-          UsesAcc8 = true;
-          break;
-        }
+        if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; }
+        if (instrLowersToWide(I)) { HasWide = true; break; }
         for (const Value *Op : I.operands()) {
-          if (isI8(Op->getType())) {
-            UsesAcc8 = true;
-            break;
-          }
+          if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; }
         }
-        if (UsesAcc8) break;
+        if (HasWide) break;
      }
    }
  }
+  bool UsesAcc8 = !HasWide;
   (void)MRI;
 
   if (UsesAcc8) {
@@ -96,17 +131,47 @@ void W65816FrameLowering::emitPrologue(MachineFunction &MF,
     BuildMI(MBB, MBBI, DL, TII.get(W65816::REP)).addImm(0x30);
   }
 
-  // Reserve stack space for locals/spills if any. Sequence is
-  // `TSC ; SEC ; SBC #N ; TCS` to subtract N from S in 16-bit mode.
- // Skipped for i8 functions for now since the stack adjustment uses - // the 16-bit accumulator (would need a save/restore around it). + // Reserve stack space for locals/spills. + // + // Critical: arg0 lives in A on entry, so the prologue MUST NOT + // clobber A. The naive `TSC; SEC; SBC #N; TCS` sequence destroys A + // (TSC overwrites A with SP) — used to silently corrupt arg0 in + // every function with a stack frame, until this fix. + // + // Strategy (16-bit M): + // - Small frames (N <= 14 bytes): use N/2 `PHA` instructions. PHA + // pushes A's value (whatever it is — including arg0) and only + // decrements S. A is not modified. N/2 bytes of code per call. + // Side-effect: the bytes pushed contain copies of arg0; the body's + // regalloc-inserted spills may overwrite them, which is fine. + // - Larger frames: TAY/TSC/.../TYA — 8 bytes total, preserves A + // through Y as a temporary. Y is caller-saved by our (loose) ABI. + // + // Strategy (8-bit M): PHA in 8-bit M pushes 1 byte, so N PHAs for + // N bytes. Without this, spills land on top of the return address + // and corrupt it (was a latent silent crash for 8-bit M functions + // that needed any spilling). uint64_t StackSize = MF.getFrameInfo().getStackSize(); - if (StackSize > 0 && !UsesAcc8) { - BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); - BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC)); - BuildMI(MBB, MBBI, DL, TII.get(W65816::SBC_Imm16)) - .addImm(StackSize); - BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + if (StackSize > 0) { + if (UsesAcc8) { + // 8-bit M: 1 PHA per byte. Preserves A. + for (uint64_t i = 0; i < StackSize; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA)); + } else if (StackSize <= 14 && (StackSize % 2) == 0) { + // 16-bit M, small frame: N/2 PHAs. Preserves A. + for (uint64_t i = 0; i < StackSize / 2; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PHA)); + } else { + // 16-bit M, larger frame: TAY/TSC/.../TYA bracket. Preserves A + // via Y as a temp. + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::SEC)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::SBC_Imm16)) + .addImm(StackSize); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); + } } } @@ -124,25 +189,90 @@ void W65816FrameLowering::emitEpilogue(MachineFunction &MF, // Insert before the terminator (the return). DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Mirror the prologue's pure-i8 detection: skip the 16-bit stack + // adjustment only if the function ran in 8-bit M (no wide types + // anywhere). const Function &F = MF.getFunction(); - bool UsesAcc8 = F.getReturnType()->isIntegerTy(8); - if (!UsesAcc8) { + bool HasWide = isWideTyForMode(F.getReturnType(), nullptr); + if (!HasWide) { for (const Argument &Arg : F.args()) { - if (Arg.getType()->isIntegerTy(8)) { UsesAcc8 = true; break; } + if (isWideTyForMode(Arg.getType(), &Arg)) { HasWide = true; break; } } } - if (UsesAcc8) return; // Cannot 16-bit math while in 8-bit mode. + if (!HasWide) { + for (const BasicBlock &BB : F) { + if (HasWide) break; + for (const Instruction &I : BB) { + if (isWideTyForMode(I.getType(), &I)) { HasWide = true; break; } + if (instrLowersToWide(I)) { HasWide = true; break; } + for (const Value *Op : I.operands()) { + if (isWideTyForMode(Op->getType(), Op)) { HasWide = true; break; } + } + if (HasWide) break; + } + } + } + // 8-bit M epilogue. 
Save A in Y(low) via TAY, pop N bytes via N + // PLAs (each pops 1 byte in 8-bit M), restore A via TYA. Y is + // caller-saved by our ABI so we can use it freely. Total cost: + // N + 2 bytes per epilogue. + if (!HasWide) { + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); // save A in Y + for (uint64_t i = 0; i < StackSize; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PLA)); // pop frame bytes + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); // restore A from Y + return; + } + // 16-bit M epilogue. Mirror the prologue: A holds the return value + // at this point and MUST be preserved. Small frames release via + // N/2 PLY (pop into Y, discard); larger frames use + // TAY/TSC/CLC/ADC #N/TCS/TYA. + if (StackSize <= 14 && (StackSize % 2) == 0) { + for (uint64_t i = 0; i < StackSize / 2; ++i) + BuildMI(MBB, MBBI, DL, TII.get(W65816::PLY)); + return; + } + BuildMI(MBB, MBBI, DL, TII.get(W65816::TAY)); BuildMI(MBB, MBBI, DL, TII.get(W65816::TSC)); BuildMI(MBB, MBBI, DL, TII.get(W65816::CLC)); BuildMI(MBB, MBBI, DL, TII.get(W65816::ADC_Imm16)) .addImm(StackSize); BuildMI(MBB, MBBI, DL, TII.get(W65816::TCS)); + BuildMI(MBB, MBBI, DL, TII.get(W65816::TYA)); } MachineBasicBlock::iterator W65816FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - // Drop ADJCALLSTACKDOWN/UP with no replacement for now. + const W65816Subtarget &STI = MF.getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + + // ADJCALLSTACKDOWN does nothing — we push args via PUSH16/PHA which + // implicitly decrements SP, so no separate adjustment is needed. + // ADJCALLSTACKUP releases all the pushed bytes after a call. + // + // Critical: A holds the callee's return value here, so this MUST NOT + // clobber A. The naive `tsc;clc;adc #N;tcs` does (TSC overwrites A), + // which silently corrupts every call's return value. Same fix as the + // epilogue: small N via PLY (clobbers Y, preserves A); larger N via + // TAY/.../TYA bracket. + if (I->getOpcode() == W65816::ADJCALLSTACKUP) { + int N = I->getOperand(0).getImm(); + if (N > 0) { + DebugLoc DL = I->getDebugLoc(); + if (N <= 14 && (N % 2) == 0) { + for (int i = 0; i < N / 2; ++i) + BuildMI(MBB, I, DL, TII.get(W65816::PLY)); + } else { + BuildMI(MBB, I, DL, TII.get(W65816::TAY)); + BuildMI(MBB, I, DL, TII.get(W65816::TSC)); + BuildMI(MBB, I, DL, TII.get(W65816::CLC)); + BuildMI(MBB, I, DL, TII.get(W65816::ADC_Imm16)).addImm(N); + BuildMI(MBB, I, DL, TII.get(W65816::TCS)); + BuildMI(MBB, I, DL, TII.get(W65816::TYA)); + } + } + } return MBB.erase(I); } diff --git a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp index 108a36c..84c8bfe 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp +++ b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp @@ -71,17 +71,52 @@ void W65816DAGToDAGISel::Select(SDNode *Node) { return; } - // Defer to the auto-generated selector for everything else. Custom - // selection paths (frame-index, wrapper, etc.) will land here later. + // Custom selection: bare FrameIndex SDValue used as an i16 pointer + // value (e.g. `&arr[0]` for a stack-allocated array). The + // auto-generated selector has no pattern for `(i16 frameindex)` + // because tablegen doesn't expose FrameIndex as a leaf type — so + // ISel fails with "Cannot select: FrameIndex" before ever reaching + // a load/store-context fold. 
Convert it to ADDframe (FI, 0); the
+  // frame-index elimination pass turns ADDframe into TSC + CLC + ADC
+  // #(offset+stackSize), producing SP+offset in A.
+  if (Node->getOpcode() == ISD::FrameIndex) {
+    SDLoc DL(Node);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero);
+    return;
+  }
+
+  // Defer to the auto-generated selector for everything else.
   SelectCode(Node);
 }
 
 bool W65816DAGToDAGISel::SelectFrameIndex(SDValue N, SDValue &Base,
                                           SDValue &Offset) {
+  // Bare FrameIndex: offset 0.
   if (auto *FIN = dyn_cast<FrameIndexSDNode>(N)) {
     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
     Offset = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i16);
     return true;
   }
+  // (add FrameIndex, const): fold the const into the memfi offset.
+  // Type legalization emits this shape when splitting a multi-byte
+  // load/store at a stack slot into multiple smaller loads (e.g. an
+  // i32 spill becomes two i16 loads, with the high load at FI+2).
+  // Without this, the bare FrameIndex inside the add is left as an
+  // unmatched i16 leaf and ISel reports "Cannot select FrameIndex".
+  if (N.getOpcode() == ISD::ADD) {
+    SDValue LHS = N.getOperand(0);
+    SDValue RHS = N.getOperand(1);
+    if (auto *FIN = dyn_cast<FrameIndexSDNode>(LHS)) {
+      if (auto *CN = dyn_cast<ConstantSDNode>(RHS)) {
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i16);
+        Offset = CurDAG->getTargetConstant(CN->getSExtValue(),
+                                           SDLoc(N), MVT::i16);
+        return true;
+      }
+    }
+  }
   return false;
 }
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
index 7c5de40..7a7f379 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
@@ -12,10 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "W65816ISelLowering.h"
+#include "W65816InstrInfo.h"
+#include "W65816MachineFunctionInfo.h"
 #include "W65816SelectionDAGInfo.h"
 #include "W65816Subtarget.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
@@ -53,20 +56,121 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
 
-  // SELECT / SELECT_CC: leave as default for now. Expanding either
-  // currently infinite-loops because they cross-expand into each other
-  // without a base case. Custom lowering to a Bxx + branch + phi
-  // pattern is the right fix; tracked separately.
+  // SETCC and SELECT_CC: custom-lowered to a CMP + W65816ISD::SELECT_CC
+  // pseudo (with usesCustomInserter=1) that EmitInstrWithCustomInserter
+  // expands into a Bxx + diamond CFG + PHI. SETCC funnels through the
+  // same path with TVal=1 / FVal=0. SELECT (no condition operand) is
+  // expanded to SELECT_CC by the legalizer using SETNE against zero.
+ setOperationAction(ISD::SETCC, MVT::i16, Custom); + setOperationAction(ISD::SETCC, MVT::i8, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i16, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i8, Custom); + setOperationAction(ISD::SELECT, MVT::i16, Expand); + setOperationAction(ISD::SELECT, MVT::i8, Expand); + // 65816 has no inline sign-extend instruction; synthesize i8 -> i16 + // via a bit-7 test and SELECT_CC (see LowerSignExtend). + setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Custom); + + // We have zextload-i8 and extload-i8 patterns (LDA + AND #$FF / bare + // LDA for the anyext case). No native sextload; mark it Expand so + // LLVM rewrites `sextload i16, i8` into `(sign_extend (load i8))`, + // which then flows through LowerSignExtend's branchless 3-insn + // sequence (AND #$00FF; EOR #$0080; SEC; SBC #$0080). + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand); + + // Vararg support: VASTART writes the address of the first vararg slot + // to the va_list pointer. VAARG/VACOPY/VAEND use the default + // expansions that load through that pointer and bump it. This makes + // -style functions (e.g. printf-likes) compile cleanly. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); // The 65816 has no hardware multiplier or divider. Multiply by a // power-of-two constant is auto-rewritten to shifts by the DAG - // combiner; arbitrary multiply / divide / mod fail to select today. - // Real support needs (a) library functions (`__mulhi3` etc.) and - // (b) multi-arg call lowering — both are tracked separately. + // combiner; arbitrary multiply / divide / mod go through libcalls + // (`__mulhi3` for i16 multiply etc.). The libcall expander emits a + // standard CALL node which flows through LowerCall, so multi-arg + // call lowering must be working first (it is, see task #26). setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); + setOperationAction(ISD::MUL, MVT::i16, LibCall); + setOperationAction(ISD::SDIV, MVT::i16, LibCall); + setOperationAction(ISD::UDIV, MVT::i16, LibCall); + setOperationAction(ISD::SREM, MVT::i16, LibCall); + setOperationAction(ISD::UREM, MVT::i16, LibCall); + setOperationAction(ISD::SDIVREM, MVT::i16, Expand); + setOperationAction(ISD::UDIVREM, MVT::i16, Expand); + + // Variable-amount and large-constant shifts. We have inline + // patterns for shift-by-1..4; everything else goes through + // __ashlhi3 / __lshrhi3 / __ashrhi3. Setting the action to Custom + // lets us return SDValue() for the fast cases and route everything + // else through the libcall lowering helper. + setOperationAction(ISD::SHL, MVT::i16, Custom); + setOperationAction(ISD::SRL, MVT::i16, Custom); + setOperationAction(ISD::SRA, MVT::i16, Custom); + // i8 shifts go through Custom too — LowerShift detects the i8 result + // and routes through trunc(i16-shift(zext_or_sext(lhs), amount)). + // Avoids needing a parallel set of qi3 libcalls. 
+ setOperationAction(ISD::SHL, MVT::i8, Custom); + setOperationAction(ISD::SRL, MVT::i8, Custom); + setOperationAction(ISD::SRA, MVT::i8, Custom); + + // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying + // the carry/borrow flag between the two halves of a multi-precision add or + // sub. Setting them Legal triggers the type legalizer's carry-chain split + // for i32 ADD/SUB, which lowers to native ADC/SBC pairs (~7 instructions) + // instead of the default UADDO+SETCC+ADD-of-bool path (~25 instructions). + // The matching tablegen pseudos add Defs/Uses on the P register, which + // tablegen wires up to the SDNode's SDNPInGlue/SDNPOutGlue automatically. + setOperationAction(ISD::ADDC, MVT::i16, Legal); + setOperationAction(ISD::ADDE, MVT::i16, Legal); + setOperationAction(ISD::SUBC, MVT::i16, Legal); + setOperationAction(ISD::SUBE, MVT::i16, Legal); + + // i32 (long). Type legalization splits i32 into two i16 halves; with + // ADDC/ADDE Legal (above), ADD/SUB go through the native carry chain. + // AND/OR/XOR split cleanly into per-half ops with no carry to track. + // Multiply/divide/shift go through libcall stubs whose + // implementations live in runtime/src/libgcc.s. SHL_PARTS / SRL_PARTS + // / SRA_PARTS are the SDNodes the type legalizer emits when splitting + // a variable-amount shift; without an action they get "Cannot select". + // LibCall on the parent node routes the whole shift through one + // __ashlsi3 / __lshrsi3 / __ashrsi3 call, which is both smaller and + // simpler than implementing a 32-bit shift in 65816 assembly inline. + for (MVT VT : {MVT::i32}) { + setOperationAction(ISD::MUL, VT, LibCall); + setOperationAction(ISD::SDIV, VT, LibCall); + setOperationAction(ISD::UDIV, VT, LibCall); + setOperationAction(ISD::SREM, VT, LibCall); + setOperationAction(ISD::UREM, VT, LibCall); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + // i32 shifts route through a libcall via the + // preferredShiftLegalizationStrategy override (see header). No + // explicit SHL/SHL_PARTS action needed — the override forces the + // type-legalizer's libcall path before SHL_PARTS would be emitted. + } + + // Disable jump tables. Generating them costs us BRIND (indirect + // branch via 16-bit pointer load), which we don't have. A long + // if-else chain compiles fine without them. Setting the threshold + // to UINT_MAX makes LLVM never form a jump table. + setMinimumJumpTableEntries(UINT_MAX); + + // Opt into PerformDAGCombine on LOAD nodes — needed for the + // address-select reverse combine (see W65816TargetLowering:: + // PerformDAGCombine). + setTargetDAGCombine(ISD::LOAD); } // Map an LLVM SETCC condition to a W65816 branch. Returns the condition @@ -94,14 +198,32 @@ static W65816CC::CondCode mapCC(ISD::CondCode CC) { } } -SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc DL(Op); +// If both compare operands are i8, widen them to i16 so the existing +// i16 CMP path can handle them. Use ZEXT for unsigned/eq/ne CCs and +// SEXT for signed CCs — picking the wrong extension would invert the +// answer (e.g. 
-1i8 sext to 0xFFFF compares < 1 signed; zext to 0x00FF
+// compares > 1 unsigned, which would flip a signed less-than).
+static void promoteI8Cmp(SDValue &LHS, SDValue &RHS, ISD::CondCode CC,
+                         SelectionDAG &DAG, const SDLoc &DL) {
+  if (LHS.getValueType() != MVT::i8) return;
+  unsigned Ext;
+  switch (CC) {
+  case ISD::SETLT: case ISD::SETLE: case ISD::SETGT: case ISD::SETGE:
+    Ext = ISD::SIGN_EXTEND; break;
+  default:
+    Ext = ISD::ZERO_EXTEND; break; // unsigned + eq/ne
+  }
+  LHS = DAG.getNode(Ext, DL, MVT::i16, LHS);
+  RHS = DAG.getNode(Ext, DL, MVT::i16, RHS);
+}
+
+// Normalize a (LHS, RHS, CC) triple so the result is something we can
+// emit with one CMP + Bxx. Returns the W65816 condition code; updates
+// LHS/RHS/CC in place. Returns COND_INVALID on failure.
+static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
+                                      ISD::CondCode &CC, SelectionDAG &DAG,
+                                      const SDLoc &DL) {
+  promoteI8Cmp(LHS, RHS, CC, DAG, DL);
   // CMP wants the comparand (constant or memory) on the right. If a DAG
   // pre-pass put the constant on the left, swap and flip the condition.
   if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
@@ -110,11 +232,10 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Rewrite SETULE / SETUGT / SETLE / SETGT to SETULT / SETUGE / SETLT /
-  // SETGE with constant +/- 1. This keeps the variable on the LHS (so
-  // our pattern matches) and lets us use the BCS / BCC / BMI / BPL
-  // mnemonics natively. Only valid when the constant is not at its
-  // signed/unsigned boundary; for now we just bail in that pathological
-  // case.
+  // SETGE with constant +/- 1. Keeps the variable on the LHS and lets
+  // us use BCS / BCC / BMI / BPL natively. Only valid when the constant
+  // is not at its signed/unsigned boundary; we bail in that pathological
+  // case for now.
   if (auto *RhsConst = dyn_cast<ConstantSDNode>(RHS)) {
     int64_t V = RhsConst->getSExtValue();
     if (CC == ISD::SETULE && (uint64_t)V < 0xffff) {
@@ -132,35 +253,214 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  // Final fallback: any condition we didn't handle yet might still be
-  // representable by swapping operands (e.g. SETUGT b a → SETULT a b).
-  // Try once if the direct map doesn't recognise it.
   W65816CC::CondCode TCC = mapCC(CC);
   if (TCC == W65816CC::COND_INVALID) {
-    std::swap(LHS, RHS);
-    CC = ISD::getSetCCSwappedOperands(CC);
-    TCC = mapCC(CC);
+    // Try swapping operands first — preferable since it leaves us with
+    // a single-Bxx form. But reject the swap if it would put a load on
+    // the LHS (we can't pattern-match cmp(load,reg) without spilling A).
+    bool RhsIsLoad = isa<LoadSDNode>(RHS.getNode());
+    bool LhsIsLoad = isa<LoadSDNode>(LHS.getNode());
+    bool SwapWouldHurt = RhsIsLoad && !LhsIsLoad;
+    if (!SwapWouldHurt) {
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+      TCC = mapCC(CC);
+    }
   }
+  // Final fallback: GT/LE/UGT/ULE without a useful swap target. Use a
+  // multi-branch pseudo CC; the SELECT_CC16 custom inserter expands it
+  // into a 3-BB diamond. Only valid for SELECT_CC, not for BR_CC —
+  // LowerBR_CC re-routes those through SETCC + BR_CC NE.
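+  // Rough example of when this fires: for `a > b` with both sides loaded
+  // from memory, the swap above is rejected (it would put a load on the
+  // LHS), so SETGT falls through to COND_GT_MB and the SELECT_CC16
+  // inserter emits the BEQ-then-BPL pair listed in getMultiBranch().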
+  if (TCC == W65816CC::COND_INVALID) {
+    switch (CC) {
+    case ISD::SETGT:  TCC = W65816CC::COND_GT_MB; break;
+    case ISD::SETLE:  TCC = W65816CC::COND_LE_MB; break;
+    case ISD::SETUGT: TCC = W65816CC::COND_HI_MB; break;
+    case ISD::SETULE: TCC = W65816CC::COND_LS_MB; break;
+    default: break;
+    }
+  }
+  return TCC;
+}
+
+SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  SDLoc DL(Op);
+  EVT VT = LHS.getValueType();
+
+  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
     report_fatal_error("W65816: branch condition not yet implemented");
 
+  // Multi-branch CCs only have inserter support via SELECT_CC16. For
+  // BR_CC, reroute through SETCC: materialise the boolean to A, then
+  // branch on NE-vs-zero. One extra LDA but always works.
+  if (TCC >= W65816CC::COND_GT_MB) {
+    SDValue Bool = DAG.getNode(ISD::SETCC, DL, VT, LHS, RHS,
+                               DAG.getCondCode(CC));
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
+                       DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest);
+  }
+
   SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
   SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
   return DAG.getNode(W65816ISD::BR_CC, DL, MVT::Other, Chain, Dest, CCOp,
                      Glue);
 }
 
+SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  // setcc lhs, rhs, cc -> select_cc lhs, rhs, 1, 0, cc.
+  // The SELECT_CC then re-enters LowerOperation and we lower it via the
+  // diamond-CFG path. setBooleanContents(ZeroOrOne) means callers see
+  // the result as a clean 0/1 value.
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero,
+                     DAG.getCondCode(CC));
+}
+
+SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDLoc DL(Op);
+
+  W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
+  if (TCC == W65816CC::COND_INVALID)
+    report_fatal_error("W65816: select_cc condition not yet implemented");
+
+  SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
+  SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = {TVal, FVal, CCOp, Glue};
+  return DAG.getNode(W65816ISD::SELECT_CC, DL, VTs, Ops);
+}
+
+// i8 -> i16 sign extend. Branchless trick:
+//   sext(x) = ((x & 0xFF) ^ 0x80) - 0x80
+// Verify: x=0x00 -> 0x80 - 0x80 = 0x0000. x=0x7F -> 0xFF - 0x80 = 0x7F.
+// x=0x80 -> 0x00 - 0x80 = 0xFF80 (-128). x=0xFF -> 0x7F - 0x80
+// = 0xFFFF (-1).
+// Lowers to: AND #$00FF; EOR #$0080; SEC; SBC #$0080 (10 bytes total,
+// no branches, no temp slots — much cheaper than the SELECT_CC diamond
+// version that produced ~14 instructions plus stack spills).
+SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDValue X = Op.getOperand(0);
+  if (X.getValueType() != MVT::i8 || Op.getValueType() != MVT::i16)
+    return SDValue();
+  SDLoc DL(Op);
+  SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, X);
+  SDValue Sign = DAG.getConstant(0x0080, DL, MVT::i16);
+  SDValue Xor = DAG.getNode(ISD::XOR, DL, MVT::i16, Z, Sign);
+  return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
+}
+
+// VASTART: store the address of the first vararg slot (recorded by
+// LowerFormalArguments via VarArgsFrameIndex) to the va_list pointer.
+// va_list is just `i16 *next` here — minimum implementation.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
+  SDLoc DL(Op);
+  // Address of the first vararg slot.
+  SDValue VAFI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                   MVT::i16);
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAListPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Chain, DL, VAFI, VAListPtr, MachinePointerInfo(SV));
+}
+
 SDValue W65816TargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   case ISD::GlobalAddress:  return LowerGlobalAddress(Op, DAG);
   case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
   case ISD::BR_CC:          return LowerBR_CC(Op, DAG);
+  case ISD::SETCC:          return LowerSETCC(Op, DAG);
+  case ISD::SELECT_CC:      return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:    return LowerSignExtend(Op, DAG);
+  case ISD::VASTART:        return LowerVASTART(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:            return LowerShift(Op, DAG);
   default:
     llvm_unreachable("W65816: unexpected operation in LowerOperation");
   }
 }
 
+SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+  // i8 shifts: promote to i16, shift, truncate. SRA promotes via SEXT
+  // (preserves sign for arithmetic right shift); SHL/SRL via ZEXT
+  // (logical / left shifts don't care about high bits). This routes
+  // i8 shifts through the same i16 fast paths and libcalls — no
+  // parallel qi3 libcall set needed.
+  if (Op.getValueType() == MVT::i8) {
+    SDLoc DL(Op);
+    SDValue X = Op.getOperand(0);
+    SDValue N = Op.getOperand(1);
+    unsigned Ext = (Op.getOpcode() == ISD::SRA) ? ISD::SIGN_EXTEND
+                                                : ISD::ZERO_EXTEND;
+    SDValue X16 = DAG.getNode(Ext, DL, MVT::i16, X);
+    SDValue N16 = N.getValueType() == MVT::i16
+                      ? N
+                      : DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, N);
+    SDValue R16 = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X16, N16);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, R16);
+  }
+  // Fast path: shift-by-{1,2,3,4} have inline tablegen patterns. Return
+  // Op (the unchanged node) so the legalizer leaves it alone — the
+  // pattern matcher catches it later. Returning SDValue() instead
+  // would fall through to the generic Expand path, which generates a
+  // BUILD_VECTOR-based magic-constant rewrite that we can't lower.
+  // Also allow `(srl x, 15)` through — pattern SRL15A handles it as
+  // `ASL A; LDA #0; ROL A` (3 instructions), much shorter than the
+  // libcall. The type-legalizer's i32-shift-by-1 expansion emits this
+  // exact node for the high-half "bit-from-low" slot.
+  // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3).
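+  // For example (assuming the inline patterns fire as described): `x << 3`
+  // is expected to stay inline as repeated ASL A, while `x << 5` or a
+  // variable-amount shift becomes an __ashlhi3 call.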
+ SDValue Amount = Op.getOperand(1); + if (auto *C = dyn_cast(Amount)) { + uint64_t N = C->getZExtValue(); + if (N >= 1 && N <= 4) + return Op; + if ((N == 15 || N == 8) && + (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL)) + return Op; + if (N == 15 && Op.getOpcode() == ISD::SRA) + return Op; + } + + RTLIB::Libcall LC; + switch (Op.getOpcode()) { + case ISD::SHL: LC = RTLIB::SHL_I16; break; + case ISD::SRL: LC = RTLIB::SRL_I16; break; + case ISD::SRA: LC = RTLIB::SRA_I16; break; + default: llvm_unreachable("not a shift"); + } + + // makeLibCall wants the args as TargetLowering::ArgListEntry; the + // simpler getNode form is to manually build the call. But the + // makeLibCall helper handles the calling convention. + SmallVector Args = {Op.getOperand(0), Op.getOperand(1)}; + TargetLowering::MakeLibCallOptions Opts; + Opts.setIsSigned(Op.getOpcode() == ISD::SRA); + return makeLibCall(DAG, LC, Op.getValueType(), Args, Opts, SDLoc(Op)).first; +} + SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { auto *GA = cast(Op); @@ -195,13 +495,19 @@ SDValue W65816TargetLowering::LowerFormalArguments( // (low addr) <- (1,S) // // Each i16 stack arg occupies 2 bytes. arg 1 lives at (4,S). - if (IsVarArg) - report_fatal_error("W65816: vararg functions not yet supported"); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + // i32 first-arg ABI: if the first original argument is i32 (the + // type legalizer split it into two i16 InputArgs both with + // OrigArgIndex == 0), pass it in A:X (lo:hi) — matching the i32 + // return ABI (also A:X). Saves one stack slot for the i32 arg. + bool I32FirstArg = + Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 && + Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0; + unsigned ArgIdx = 0; // Stack offset is measured from S+1 (the WDC convention) and grows // upward as we walk through the stack-passed args. @@ -217,31 +523,52 @@ SDValue W65816TargetLowering::LowerFormalArguments( VT == MVT::i16 ? &W65816::Acc16RegClass : &W65816::Acc8RegClass); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); + } else if (ArgIdx == 1 && I32FirstArg) { + // i32 first-arg hi half: in X. + Register VReg = MRI.createVirtualRegister(&W65816::Idx16RegClass); + MRI.addLiveIn(W65816::X, VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); } else { - // Subsequent args are loaded from the stack. Use a fixed frame - // object positioned at the absolute stack offset; the - // eliminateFrameIndex pass turns it into LDA d,S. - unsigned ObjSize = (VT == MVT::i16) ? 2 : 1; + // Subsequent args are loaded from the stack. i8 args are + // promoted to i16 slots (matching CC_W65816's CCPromoteToType) + // so the load can run in the function's default 16-bit M mode + // without needing a per-byte SEP/REP wrap; we then truncate the + // i16 back to i8 for the IR. i16 args are loaded directly. 
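+      // Illustrative layout (following the (4,S) convention above): for
+      // `int f(int a, char b, int c)`, a arrives in A, b occupies a
+      // promoted 2-byte slot at (4,S), and c follows at (6,S).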
+      unsigned ObjSize = 2;
       int FI = MFI.CreateFixedObject(ObjSize, StackOffset,
                                      /*Immutable*/true);
       StackOffset += ObjSize;
       SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
-      InVals.push_back(DAG.getLoad(
-          VT, DL, Chain, FIN,
-          MachinePointerInfo::getFixedStack(MF, FI)));
+      SDValue Val = DAG.getLoad(
+          MVT::i16, DL, Chain, FIN,
+          MachinePointerInfo::getFixedStack(MF, FI));
+      if (VT == MVT::i8)
+        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
+      InVals.push_back(Val);
     }
     ++ArgIdx;
   }
+
+  // Vararg support: stash the FrameIndex of the next stack-arg slot
+  // (where the caller's first vararg lives) so VASTART can use it
+  // as the va_list start. StackOffset has been advanced past every
+  // named stack arg; the first vararg sits at SP + StackOffset.
+  if (IsVarArg) {
+    int FI = MFI.CreateFixedObject(2, StackOffset, /*Immutable=*/true);
+    auto *FuncInfo = MF.getInfo<W65816MachineFunctionInfo>();
+    FuncInfo->setVarArgsFrameIndex(FI);
+  }
+
   return Chain;
 }
 
 SDValue
 W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
-  // Single-arg version: arg 0 in A; LowerFormalArguments accepts
-  // additional args via the stack, but this side doesn't yet emit the
-  // pushes. Multi-arg call lowering wants a PUSHA pseudo with proper
-  // SP unwinding via TSC/ADC #N/TCS in the ADJCALLSTACKUP pseudo —
-  // tracked separately.
+  // Multi-arg ABI: arg 0 in A, args 1..N-1 pushed in REVERSE order via
+  // PUSH16 (PHA) so the callee's `(4,S)` reads pick up arg 1, `(6,S)`
+  // gets arg 2, etc. CALLSEQ_START records the byte count;
+  // ADJCALLSTACKUP after the call releases the pushed bytes without
+  // clobbering A (PLY for small counts, a TAY/TSC/CLC/ADC/TCS/TYA
+  // bracket otherwise; see eliminateCallFramePseudoInstr).
   SelectionDAG &DAG = CLI.DAG;
   SDLoc &DL = CLI.DL;
   SDValue Chain = CLI.Chain;
@@ -252,16 +579,116 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (CLI.IsTailCall)
     CLI.IsTailCall = false;
 
-  if (Outs.size() > 1)
-    report_fatal_error("W65816: multi-argument calls not yet supported");
-  if (Ins.size() > 1)
+  // Up to 2 return values: i8/i16 in A, or split i32 in A:X. The
+  // result-read loop at the end of this function honors the same
+  // ordering as LowerReturn.
+  if (Ins.size() > 2)
     report_fatal_error("W65816: multi-return calls not yet supported");
 
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+  // Indirect calls (function pointers): redirect through the runtime
+  // trampoline `__jsl_indir`. The 65816 has no JSL-indirect; instead,
+  // we store the dynamic target to a global (`__indirTarget`), then
+  // JSL the trampoline, which immediately does `JMP (__indirTarget)`.
+  // The target's RTL pops the original JSL's return frame and returns
+  // straight back to the caller — no double-RTL or extra frame.
+  // Caveat: single-bank only (JMP indirect is bank-local).
+  bool IsIndirect = !isa<GlobalAddressSDNode>(Callee) &&
+                    !isa<ExternalSymbolSDNode>(Callee);
+  if (IsIndirect) {
+    // Store the dynamic target to __indirTarget *before* any other
+    // setup, since pushing args clobbers A. STAabs takes an
+    // ExternalSymbol-wrapped address operand.
+    SDValue TargetSym = DAG.getTargetExternalSymbol("__indirTarget",
+                                                    MVT::i16);
+    SDValue WrappedSym = DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16,
                                      TargetSym);
+    Chain = DAG.getStore(Chain, DL, Callee, WrappedSym,
+                         MachinePointerInfo());
+    // Replace the callee with __jsl_indir for the actual JSL.
+ Callee = DAG.getExternalSymbol("__jsl_indir", MVT::i16); + } + for (const ISD::OutputArg &O : Outs) { + if (O.VT != MVT::i16 && O.VT != MVT::i8) + report_fatal_error("W65816: argument type not yet supported"); + } + + // i32 first-arg ABI: if Outs[0] and Outs[1] are halves of the same + // original i32 first arg (OrigArgIndex == 0), pass them in A:X. + bool I32FirstArg = + Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 && + Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0; + unsigned FirstStackArg = I32FirstArg ? 2 : 1; + + // i8 stack args are promoted to i16 (2-byte slots) so the callee can + // read them with a 16-bit M load — matches LowerFormalArguments and + // CC_W65816's CCPromoteToType. Arg 0 stays in A in its native + // width; only stack-passed args promote. + unsigned StackBytes = 2 * (Outs.size() > FirstStackArg + ? Outs.size() - FirstStackArg : 0); + + Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL); + + // Push stack-passed args in reverse so arg FirstStackArg ends up at + // the lowest post-JSL stack-relative offset (4,S). Each push uses A + // by default; if the value being pushed is already a `CopyFromReg X` + // (e.g. forwarding the i32-first-arg-in-A:X hi half), push directly + // from X via PHX — saves the TXA + A-spill round-trip that would + // otherwise be required. SDValue Glue; + for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) { + SDValue V = OutVals[i]; + if (Outs[i].VT == MVT::i8) + V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V); + // Detect "value is already in X" — either as a physreg + // CopyFromReg($x), or as a vreg in the Idx16 class that's + // live-in from $x. In the i32-first-arg-in-A:X path, + // LowerFormalArguments creates a vreg in Idx16 and addLiveIn's + // it to $x. + bool ViaX = false; + if (V.getOpcode() == ISD::CopyFromReg) { + auto *RegN = dyn_cast(V.getOperand(1).getNode()); + if (RegN) { + Register R = RegN->getReg(); + if (R.isPhysical() && R == W65816::X) { + ViaX = true; + } else if (R.isVirtual()) { + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + if (MRI.getRegClass(R) == &W65816::Idx16RegClass) { + for (auto &LI : MRI.liveins()) + if (LI.second == R && LI.first == W65816::X) { + ViaX = true; + break; + } + } + } + } + } + if (ViaX) { + // CopyToReg(X, X) is a no-op but it threads the Glue chain so the + // PUSH_X can be sequenced correctly relative to other pushes. + Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue); + Glue = Chain.getValue(1); + Chain = DAG.getNode(W65816ISD::PUSH_X, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); + } else { + Chain = DAG.getCopyToReg(Chain, DL, W65816::A, V, Glue); + Glue = Chain.getValue(1); + Chain = DAG.getNode(W65816ISD::PUSH, DL, + DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue); + } + Glue = Chain.getValue(1); + } + + // i32 first-arg hi half goes in X. Emit before the A copy so the + // CopyToReg for X is glued, then A's copy follows. + if (I32FirstArg) { + Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue); + Glue = Chain.getValue(1); + } + + // Arg 0 in A. 
if (!OutVals.empty()) { - MVT VT = Outs[0].VT; Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue); Glue = Chain.getValue(1); } @@ -274,6 +701,8 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector CallOps = {Chain, Callee}; if (!OutVals.empty()) CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT)); + if (I32FirstArg) + CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT)); if (Glue.getNode()) CallOps.push_back(Glue); @@ -281,14 +710,20 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getVTList(MVT::Other, MVT::Glue), CallOps); Glue = Chain.getValue(1); - Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Glue, DL); + Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL); Glue = Chain.getValue(1); - for (const ISD::InputArg &Arg : Ins) { - MVT VT = Arg.VT; + // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X. + if (Ins.size() > 2) + report_fatal_error("W65816: return type not yet supported"); + static constexpr Register RetRegs[2] = {W65816::A, W65816::X}; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + MVT VT = Ins[i].VT; if (VT != MVT::i16 && VT != MVT::i8) report_fatal_error("W65816: return type not yet supported"); - SDValue V = DAG.getCopyFromReg(Chain, DL, W65816::A, VT, Glue); + if (i == 1 && VT != MVT::i16) + report_fatal_error("W65816: split return must be i16"); + SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue); Chain = V.getValue(1); Glue = V.getValue(2); InVals.push_back(V); @@ -302,24 +737,39 @@ SDValue W65816TargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - // Copy scalar return values into A and emit a retglue chain. Supports - // one i16 return today; i8 would use the same A register in 8-bit mode, - // and larger returns (i32 A:X, structures via hidden pointer) are future - // work. - // Copy each scalar return value into A and reference A in the RET_GLUE - // operand list so the register allocator keeps the defining instructions - // alive (otherwise dead-MI elimination strips them — the physreg copy - // alone is not enough of a liveness signal). - SDValue Glue; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + // Return ABI: + // i8/i16: value in A. + // i32: low half (Outs[0]) in A, high half (Outs[1]) in X. + // wider: not yet supported. + // Type legalization splits an i32 return into 2 consecutive i16 Outs. + // Emission order matters: we copy the high half to X *first* so that + // the regalloc can place both halves through the only Acc16 reg (A) + // without conflict. The TAX in copyPhysReg preserves A, so the + // subsequent copy of the low half to A doesn't clobber the high. + // Emitting low->A first would force a spill since computing the high + // would overwrite A while the low is still live for RTL. 
+  if (Outs.size() > 2)
+    report_fatal_error("W65816: return type not yet supported");
+  for (unsigned i = 0; i != Outs.size(); ++i) {
     MVT VT = Outs[i].VT;
     if (VT != MVT::i16 && VT != MVT::i8)
       report_fatal_error("W65816: return type not yet supported");
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[i], Glue);
-    Glue = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(W65816::A, VT));
+    if (i == 1 && VT != MVT::i16)
+      report_fatal_error("W65816: split return must be i16");
   }
+  SDValue Glue;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  if (Outs.size() == 2) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
+    Glue = Chain.getValue(1);
+  }
+  if (!Outs.empty()) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
+  }
+  if (Outs.size() == 2)
+    RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
 
   RetOps[0] = Chain;
   if (Glue.getNode())
@@ -327,3 +777,353 @@ SDValue W65816TargetLowering::LowerReturn(
   return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
 }
+
+// DAG combine: undo clang's `load(SELECT_CC(fi, fi))` rewrite of
+// `c ? *p : *q` when both ptrs are FrameIndex. Without this, the
+// SELECT_CC matcher (which expects Acc16 inputs) fails to match the
+// FrameIndex tval/fval. We rewrite back to the original
+// `SELECT_CC(load(fi), load(fi))` shape — safe because both stack
+// slots are guaranteed valid memory. We deliberately do NOT do this
+// for arbitrary pointers, since reading from both branches could
+// touch invalid memory or memory-mapped IO with side effects.
+SDValue
+W65816TargetLowering::PerformDAGCombine(SDNode *N,
+                                        DAGCombinerInfo &DCI) const {
+  if (N->getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  if (!Ld->isSimple())
+    return SDValue();
+  SDValue Ptr = Ld->getBasePtr();
+
+  // Pre-legalize SELECT (cond, T, F): undo the address-select if both
+  // pointer operands are FrameIndex.
+  if (Ptr.getOpcode() == ISD::SELECT) {
+    SDValue T = Ptr.getOperand(1);
+    SDValue F = Ptr.getOperand(2);
+    if (T.getOpcode() != ISD::FrameIndex ||
+        F.getOpcode() != ISD::FrameIndex)
+      return SDValue();
+    SelectionDAG &DAG = DCI.DAG;
+    EVT VT = N->getValueType(0);
+    SDLoc DL(N);
+    SDValue Chain = Ld->getChain();
+    MachineFunction &MF = DAG.getMachineFunction();
+    int TFI = cast<FrameIndexSDNode>(T)->getIndex();
+    int FFI = cast<FrameIndexSDNode>(F)->getIndex();
+    SDValue LoadT = DAG.getLoad(VT, DL, Chain, T,
+                                MachinePointerInfo::getFixedStack(MF, TFI));
+    SDValue LoadF = DAG.getLoad(VT, DL, Chain, F,
+                                MachinePointerInfo::getFixedStack(MF, FFI));
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   LoadT.getValue(1), LoadF.getValue(1));
+    SDValue NewSel = DAG.getNode(ISD::SELECT, DL, VT,
+                                 Ptr.getOperand(0), LoadT, LoadF);
+    DCI.CombineTo(N, NewSel, NewChain);
+    return SDValue(N, 0);
+  }
+
+  // Match either pre-legalize ISD::SELECT_CC (LHS,RHS,T,F,CC) or our
+  // post-legalize W65816ISD::SELECT_CC (T,F,CC,glue). We only sink the
+  // load into both branches when both branch values are FrameIndex —
+  // safe because stack slots are guaranteed valid memory. For
+  // arbitrary pointers, side-effecting reads make this unsafe.
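+  // Hypothetical source shape that triggers this: `c ? x : y` where x and
+  // y are stack locals; clang forms load(select_cc(..., &x, &y)), which we
+  // rewrite back to select_cc(..., load(&x), load(&y)).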
+ if (Ptr.getOpcode() == ISD::SELECT_CC) { + SDValue T = Ptr.getOperand(2); + SDValue F = Ptr.getOperand(3); + if (T.getOpcode() != ISD::FrameIndex || + F.getOpcode() != ISD::FrameIndex) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc DL(N); + SDValue Chain = Ld->getChain(); + MachineFunction &MF = DAG.getMachineFunction(); + int TFI = cast(T)->getIndex(); + int FFI = cast(F)->getIndex(); + + SDValue LoadT = DAG.getLoad(VT, DL, Chain, T, + MachinePointerInfo::getFixedStack(MF, TFI)); + SDValue LoadF = DAG.getLoad(VT, DL, Chain, F, + MachinePointerInfo::getFixedStack(MF, FFI)); + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + LoadT.getValue(1), LoadF.getValue(1)); + + SDValue NewSel = DAG.getNode(ISD::SELECT_CC, DL, VT, + Ptr.getOperand(0), Ptr.getOperand(1), + LoadT, LoadF, Ptr.getOperand(4)); + DCI.CombineTo(N, NewSel, NewChain); + return SDValue(N, 0); + } + return SDValue(); +} + +// Map a W65816CC code to the matching Bxx opcode. +static unsigned getBranchOpcodeForCC(unsigned CC) { + switch (CC) { + case W65816CC::COND_EQ: return W65816::BEQ; + case W65816CC::COND_NE: return W65816::BNE; + case W65816CC::COND_HS: return W65816::BCS; + case W65816CC::COND_LO: return W65816::BCC; + case W65816CC::COND_MI: return W65816::BMI; + case W65816CC::COND_PL: return W65816::BPL; + case W65816CC::COND_VS: return W65816::BVS; + case W65816CC::COND_VC: return W65816::BVC; + } + llvm_unreachable("invalid W65816 condition code"); +} + +// For multi-branch CCs, return the (branchA, branchB, BothMeanTrue) tuple. +// branchA is tested first; if it takes, we go to TrueBB if BothMeanTrue +// (i.e. both branches are "take if true"), otherwise to FalseBB. branchB +// is tested next with the same semantic. +// +// GT : (BPL && BNE) → BEQ FalseBB; BPL TrueBB; fall-through FalseBB +// LE : (BMI || BEQ) → BEQ TrueBB; BMI TrueBB; fall-through FalseBB +// HI : (BCS && BNE) → BEQ FalseBB; BCS TrueBB; fall-through FalseBB +// LS : (BCC || BEQ) → BEQ TrueBB; BCC TrueBB; fall-through FalseBB +struct MultiBranch { + unsigned First, Second; + bool FirstToTrue, SecondToTrue; +}; +static MultiBranch getMultiBranch(unsigned CC) { + switch (CC) { + case W65816CC::COND_GT_MB: + return {W65816::BEQ, W65816::BPL, false, true}; + case W65816CC::COND_LE_MB: + return {W65816::BEQ, W65816::BMI, true, true}; + case W65816CC::COND_HI_MB: + return {W65816::BEQ, W65816::BCS, false, true}; + case W65816CC::COND_LS_MB: + return {W65816::BEQ, W65816::BCC, true, true}; + } + llvm_unreachable("not a multi-branch CC"); +} + +// Emit a two-Acc16 binary op as STAfi src2; OPfi dst, src1. Allocates +// a fresh 2-byte stack slot per call. For CMP (HasOut=false) there's +// no destination register, just the two src operands. Always spill +// the SECOND operand so non-commutative ops (sub, cmp) compute +// src1 OP src2 correctly via OPfi (which gives src1 OP load(spill)). +static MachineBasicBlock * +emitRROp(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOp, + unsigned OpFI, bool HasOut) { + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + + unsigned LhsIdx = HasOut ? 1 : 0; + unsigned RhsIdx = HasOut ? 2 : 1; + Register Src1 = MI.getOperand(LhsIdx).getReg(); + Register Src2 = MI.getOperand(RhsIdx).getReg(); + + // Spill src2 (the rhs). 
Then OPfi computes src1 OP load(spill). + BuildMI(*BB, MI.getIterator(), DL, TII.get(StoreOp)) + .addReg(Src2) + .addFrameIndex(FI) + .addImm(0); + + if (HasOut) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI), Dst) + .addReg(Src1) + .addFrameIndex(FI) + .addImm(0); + } else { + BuildMI(*BB, MI.getIterator(), DL, TII.get(OpFI)) + .addReg(Src1) + .addFrameIndex(FI) + .addImm(0); + } + + MI.eraseFromParent(); + return BB; +} + +MachineBasicBlock * +W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { + // The only opcode we currently emit with usesCustomInserter=1 is + // SELECT_CC16. Expand it into a diamond CFG with a PHI. For + // single-branch CCs: + // + // thisMBB: + // ... CMP already emitted ... + // Bxx sinkMBB ; branch to "true" path + // ; fall through to copy0MBB + // copy0MBB: + // ; (no instructions; PHI picks fval here) + // sinkMBB: + // dst = PHI [tval, thisMBB], [fval, copy0MBB] + // + // For multi-branch CCs (GT/LE/UGT/ULE without const RHS, where a + // single Bxx isn't enough), insert two branches. Both target either + // sinkMBB or copy0MBB depending on the condition. + switch (MI.getOpcode()) { + default: + llvm_unreachable("unexpected instruction in EmitInstrWithCustomInserter"); + case W65816::ADD_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ADCfi, /*HasOut=*/true); + case W65816::SUB_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::SBCfi, /*HasOut=*/true); + // Carry-chain variants for the hi half of an i32 split. STAfi doesn't + // touch P, so the carry from the previous addc/adde survives the + // spill and is consumed by ADCEfi/SBCEfi below. + case W65816::ADDE_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ADCEfi, /*HasOut=*/true); + case W65816::SUBE_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::SBCEfi, /*HasOut=*/true); + case W65816::AND_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ANDfi, /*HasOut=*/true); + case W65816::ORA_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::ORAfi, /*HasOut=*/true); + case W65816::EOR_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true); + case W65816::CMP_RR: + return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false); + case W65816::LDAptrOff: + case W65816::STAptrOff: + case W65816::STBptrOff: { + // Pointer access with a constant offset folded into Y. Saves a + // CLC/ADC #off pair plus a spill/reload over computing + // `ptr + off` then doing LDAptr/STAptr. Since Y is 16-bit, any + // i16 offset fits. 
Operand layout: + // LDAptrOff: 0=dst, 1=ptr, 2=off + // STAptrOff / STBptrOff: 0=val, 1=ptr, 2=off + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + bool IsLoad = MI.getOpcode() == W65816::LDAptrOff; + bool IsByteStore = MI.getOpcode() == W65816::STBptrOff; + Register Ptr = MI.getOperand(1).getReg(); + int64_t Off = MI.getOperand(2).getImm(); + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) + .addReg(Ptr).addFrameIndex(FI).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)) + .addImm(Off); + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY), Dst) + .addFrameIndex(FI).addImm(0); + } else { + Register Val = MI.getOperand(0).getReg(); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY)) + .addReg(Val).addFrameIndex(FI).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } + case W65816::LDAptr: + case W65816::STAptr: + case W65816::STBptr: { + // Spill the pointer to a fresh 2-byte stack slot. Then LDY #0 and + // emit LDAfi_indY / STAfi_indY against that slot. The (slot,S),Y + // addressing reads the pointer from the spill, adds Y (=0), and + // dereferences. STBptr (truncating i8 store) wraps the actual STA + // in SEP/REP so M=8 across the store and only one byte is written. + MachineFunction *MF = BB->getParent(); + const W65816Subtarget &STI = MF->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + bool IsLoad = MI.getOpcode() == W65816::LDAptr; + bool IsByteStore = MI.getOpcode() == W65816::STBptr; + + // Operand layout (explicit only; Defs=[Y] adds an implicit at the + // end which we don't read here): + // LDAptr: 0=dst, 1=ptr + // STAptr / STBptr: 0=val, 1=ptr + // The pointer operand is always at index 1. Earlier code reading + // operand 2 for stores hit the implicit Y def, not the pointer — + // which only "worked" because regalloc didn't notice and A + // happened to hold the right bytes by accident. + Register Ptr = MI.getOperand(1).getReg(); + int FI = MF->getFrameInfo().CreateStackObject(2, Align(2), + /*isSpillSlot=*/true); + + // Spill ptr. + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi)) + .addReg(Ptr).addFrameIndex(FI).addImm(0); + // LDY #0. LDY_Imm16 has no output operand; Y is defined implicitly + // via the pseudo's Defs=[Y] marking. 
+ BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDY_Imm16)) + .addImm(0); + + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi_indY), Dst) + .addFrameIndex(FI).addImm(0); + } else { + Register Val = MI.getOperand(0).getReg(); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi_indY)) + .addReg(Val).addFrameIndex(FI).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } + case W65816::SELECT_CC16: { + const W65816Subtarget &STI = BB->getParent()->getSubtarget(); + const W65816InstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = ++BB->getIterator(); + + MachineBasicBlock *thisMBB = BB; + MachineBasicBlock *copy0MBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, copy0MBB); + MF->insert(It, sinkMBB); + + // Move the rest of thisMBB after MI to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + std::next(MachineBasicBlock::iterator(MI)), BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + unsigned CC = MI.getOperand(3).getImm(); + if (CC < W65816CC::COND_GT_MB) { + // Single-branch: Bxx sinkMBB. + unsigned BrOp = getBranchOpcodeForCC(CC); + BuildMI(thisMBB, DL, TII.get(BrOp)).addMBB(sinkMBB); + } else { + // Multi-branch: two Bxx. Each may target sinkMBB (true) or + // copy0MBB (false). Fall-through is the OTHER block. + MultiBranch MB = getMultiBranch(CC); + MachineBasicBlock *Tgt1 = MB.FirstToTrue ? sinkMBB : copy0MBB; + MachineBasicBlock *Tgt2 = MB.SecondToTrue ? sinkMBB : copy0MBB; + BuildMI(thisMBB, DL, TII.get(MB.First)).addMBB(Tgt1); + BuildMI(thisMBB, DL, TII.get(MB.Second)).addMBB(Tgt2); + } + + // copy0MBB falls through to sinkMBB. + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: dst = PHI [tval, thisMBB], [fval, copy0MBB]. + BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII.get(W65816::PHI), + MI.getOperand(0).getReg()) + .addReg(MI.getOperand(1).getReg()).addMBB(thisMBB) + .addReg(MI.getOperand(2).getReg()).addMBB(copy0MBB); + + MI.eraseFromParent(); + return sinkMBB; + } + } +} diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h index 7755bf3..6c52639 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h @@ -46,6 +46,10 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *MBB) const override; + // The 65816 has no alignment requirement on memory access — any // address is fine. Telling LLVM this lets it emit single 16-bit // loads/stores even when the IR alignment is 1, instead of @@ -59,10 +63,47 @@ public: return true; } + // Disable LLVM's magic-constant expansion of sdiv/srem by power-of-2. + // The default expansion generates BUILD_VECTOR (used as a "splat shifter" + // intermediate) which we can't lower; without an override, every sdiv/srem + // by a pow2 constant crashes ISel. 
Returning the original node leaves it + // intact for the libcall lowering path (SDIV/SREM are LibCall in our + // ctor — see setOperationAction calls above). + SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const override { + return SDValue(N, 0); + } + SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, + SelectionDAG &DAG, + SmallVectorImpl &Created) const override { + return SDValue(N, 0); + } + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + // Force i32 / i64 shifts through a libcall (__ashlsi3 / __lshrsi3 / + // __ashrsi3) instead of LLVM's default ExpandToParts strategy, which + // emits an SHL_PARTS node we have no pattern for. ExpandToParts also + // produces a long select-based sequence; the libcall is both smaller + // and matches our existing libcall-based approach for i16 mul/div. + ShiftLegalizationStrategy + preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, + unsigned ExpansionFactor) const override { + if (N->getValueType(0).getSizeInBits() > 16) + return ShiftLegalizationStrategy::LowerToLibcall; + return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, + ExpansionFactor); + } + private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrFormats.td b/src/llvm/lib/Target/W65816/W65816InstrFormats.td index 30305f8..5083021 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrFormats.td +++ b/src/llvm/lib/Target/W65816/W65816InstrFormats.td @@ -258,6 +258,23 @@ class InstStackRel op, string mnem> let Inst{15-8} = off; } +// Stack-relative indirect indexed-Y: `LDA (off,S),Y`. Reads the 16-bit +// pointer stored at S+off, adds Y, then loads from that address. Used +// to dereference pointers spilled to a stack scratch slot — the only +// way the 65816 can deref a pointer not already in zero page. +// isCodeGenOnly because the asm-parser doesn't accept `(d,S),Y` syntax +// today; codegen builds these MIs directly. +class InstStackRelIndY op, string mnem> + : W65816Inst<(outs), (ins addrDP:$off), + !strconcat(mnem, "\t($off, s), y")> { + let Size = 2; + bits<8> off; + bits<16> Inst; + let Inst{7-0} = op; + let Inst{15-8} = off; + let isCodeGenOnly = 1; +} + class InstPCRel8 op, string mnem> : W65816Inst<(outs), (ins pcrel8:$dest), !strconcat(mnem, "\t$dest")> { let Size = 2; diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index d7708b1..607af09 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -14,6 +14,7 @@ #include "W65816InstrInfo.h" #include "W65816.h" #include "W65816Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/ErrorHandling.h" @@ -34,13 +35,28 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest, bool RenamableSrc) const { - // The only Acc16 register is A; copies between A and itself are no-ops. 
- // Cross-class copies (e.g. A → X) need TAX/TXA pairs which we don't - // need yet — bail loudly so we notice when the time comes. if (DestReg == SrcReg) return; - if (DestReg == W65816::A && SrcReg == W65816::A) + // A → X / X → A via TAX / TXA. Used by i32 return ABI (lo in A, hi + // in X) and by callers reading split-i32 results. Both instructions + // are 16-bit when M=0/X=0; that matches our default mode. + if (DestReg == W65816::X && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TAX)); return; + } + if (DestReg == W65816::A && SrcReg == W65816::X) { + BuildMI(MBB, I, DL, get(W65816::TXA)); + return; + } + // A → Y / Y → A via TAY / TYA. Same M/X width caveat. + if (DestReg == W65816::Y && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TAY)); + return; + } + if (DestReg == W65816::A && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::TYA)); + return; + } llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented"); } @@ -71,3 +87,50 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIdx) .addImm(0); } + +Register W65816InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (MI.getOpcode() != W65816::LDAfi) + return 0; + // memfi packs (FrameIndex, offset). Treat only offset==0 as a true + // stack-slot load — non-zero offset means we're addressing within + // the slot (e.g. the high half of an i32 spill), which the generic + // peephole/CSE machinery doesn't model. + if (MI.getNumOperands() < 3 || !MI.getOperand(1).isFI() || + !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) + return 0; + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); +} + +Register W65816InstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + if (MI.getOpcode() != W65816::STAfi) + return 0; + // STAfi: (ins Acc16:$src, memfi:$addr) — op0 is src reg, op1 is + // FrameIndex, op2 is offset. + if (MI.getNumOperands() < 3 || !MI.getOperand(1).isFI() || + !MI.getOperand(2).isImm() || MI.getOperand(2).getImm() != 0) + return 0; + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); +} + +bool W65816InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const { + // Only LDAfi is gated on this hook. We declare it + // isReMaterializable=1 in tablegen so the framework will *consider* + // re-emitting it instead of spilling, then call back here to confirm. + // The instruction is safely rematerializable iff it loads from a + // *fixed* (immutable) frame index — i.e. an arg slot. Loads from a + // regular spill slot read a computed value that may not be available + // at the rematerialization point. + if (MI.getOpcode() != W65816::LDAfi) + return TargetInstrInfo::isReMaterializableImpl(MI); + + // Operand 1 is the FrameIndex (operand 0 is the def). 
+ const MachineOperand &FIOp = MI.getOperand(1); + if (!FIOp.isFI()) + return false; + const MachineFrameInfo &MFI = MI.getMF()->getFrameInfo(); + return MFI.isFixedObjectIndex(FIOp.getIndex()); +} diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.h b/src/llvm/lib/Target/W65816/W65816InstrInfo.h index 19fc860..8a3ba39 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.h +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.h @@ -46,6 +46,29 @@ public: int FrameIdx, const TargetRegisterClass *RC, Register VReg, unsigned SubReg = 0, MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override; + + // Override the default rematerializability check to recognise LDAfi + // from a *fixed* (immutable) frame index — i.e. an arg slot — as + // trivially rematerializable. Without this, the greedy allocator + // spills arg loads to a fresh local slot the moment A is needed for + // anything else, then reloads from the local slot at every use. + // With it, the allocator just re-emits `LDA arg_slot,S` at each use + // and the `STA local; LDA local; LDA local` cluster collapses to a + // single `LDA arg_slot,S`. Spill-slot LDAfi (regular FI) is *not* + // rematerializable — that loads a computed value. + bool isReMaterializableImpl(const MachineInstr &MI) const override; + + // Tell the framework which pseudos are direct stack-slot loads/stores. + // MachineCSE, machine-licm, and peephole-opt use these hooks to elide + // redundant store/load pairs and to hoist invariants. Without them, + // patterns like `STAfi A, slot; LDAfi slot, A` (introduced by the + // greedy allocator's COPY-of-physreg expansion) survive into final + // asm as `sta x,s; lda x,s` no-op pairs. + Register isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + Register isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 5db2373..db318c5 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -54,6 +54,31 @@ def W65816cmp : SDNode<"W65816ISD::CMP", SDT_W65816Cmp, [SDNPOutGlue]>; def W65816brcc : SDNode<"W65816ISD::BR_CC", SDT_W65816BrCC, [SDNPHasChain, SDNPInGlue]>; +// Push A onto the stack. Used by LowerCall to pass extra args. +// Takes Chain + Glue (with A pre-loaded via CopyToReg), produces +// Chain + Glue. Has a side effect (SP changes) and stores to +// memory. In 16-bit M mode, pushes 2 bytes and decrements SP by 2; +// the call's ADJCALLSTACKUP pseudo unwinds those bytes via +// tsc;clc;adc #N;tcs after the JSL returns. +def W65816push : SDNode<"W65816ISD::PUSH", SDTNone, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore]>; + +// Push X onto the stack. Same shape as W65816push but the value to +// push is glued from CopyToReg(X) instead of CopyToReg(A). +def W65816pushx : SDNode<"W65816ISD::PUSH_X", SDTNone, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPSideEffect, SDNPMayStore]>; + +// SELECT_CC: takes (TVal, FVal, CC) plus a glue value carrying the +// flags from a preceding W65816cmp. Lowered by EmitInstrWithCustomInserter +// into a CMP (already in the BB) + Bxx + diamond CFG + PHI. 
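+// Illustrative end-to-end result for `a < b ? x : y` (unsigned), once
+// the custom inserter has built the diamond (opcode and labels for
+// illustration only):
+//     cmp  <b>            ; W65816cmp sets the flags
+//     bcc  .Lsink         ; COND_LO: take tval
+//   .Lcopy0:              ; empty fall-through; PHI picks fval here
+//   .Lsink:
+//     ; dst = PHI [tval, entry], [fval, .Lcopy0]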
+def SDT_W65816SelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVT<3, i8>]>; +def W65816selectcc : SDNode<"W65816ISD::SELECT_CC", SDT_W65816SelectCC, + [SDNPInGlue]>; + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -71,14 +96,51 @@ def ADJCALLSTACKUP : W65816Pseudo<(outs), timm:$amt2)]>; } -let isReMaterializable = 1 in -def ADDframe : W65816Pseudo<(outs PtrRegs:$dst), +// LEA-equivalent: compute the address (SP + frame_offset + offset) of a +// stack slot and place it in A. Selected from a bare ISD::FrameIndex +// SDValue in W65816DAGToDAGISel::Select; expanded by eliminateFrameIndex +// into TSC + CLC + ADC #disp. Output is Acc16 because the address ends +// up in A; PtrRegs (which only contains SP) is the wrong class. +let isReMaterializable = 1, hasSideEffects = 0, + mayLoad = 0, mayStore = 0 in +def ADDframe : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$base, i16imm:$offset), "# ADDframe PSEUDO", []>; // The retglue node lowers directly to RTL (see Returns section below). // No separate RET pseudo — the real MC instruction handles the pattern. +// Push A onto the stack. Expanded in AsmPrinter to MC `PHA`. Used by +// LowerCall to pass extra args; the matching `tsc;clc;adc #N;tcs` SP +// unwind happens in eliminateCallFramePseudoInstr for ADJCALLSTACKUP. +let Defs = [SP], Uses = [A, SP], mayStore = 1, hasSideEffects = 1 in { +def PUSH16 : W65816Pseudo<(outs), (ins), "# PUSH16", + [(W65816push)]>; +} +// Push X onto the stack. Used by LowerCall when an outgoing arg's +// SDValue is already in X (e.g. forwarding the i32-first-arg-in-A:X +// hi half). Saves a TXA+spill round-trip. Expansion: PHX. +let Defs = [SP], Uses = [X, SP], mayStore = 1, hasSideEffects = 1 in { +def PUSH16X : W65816Pseudo<(outs), (ins), "# PUSH16X", + [(W65816pushx)]>; +} + +// SELECT_CC16: implements (set Acc16:$dst, (W65816selectcc tval, fval, cc)) +// where the CMP that produced the flags has already been emitted (its +// glue is implicit via the P register). EmitInstrWithCustomInserter +// expands this into a Bxx + 2 BBs + PHI. Marked usesCustomInserter so +// the codegen invokes our hook; Uses=[P] so MachineSched keeps the CMP +// adjacent. +let usesCustomInserter = 1, Uses = [P], hasSideEffects = 1 in { +def SELECT_CC16 : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$tval, Acc16:$fval, i8imm:$cc), + "# SELECT_CC16 $dst, $tval, $fval, $cc", + [(set Acc16:$dst, + (W65816selectcc Acc16:$tval, + Acc16:$fval, + timm:$cc))]>; +} + //===----------------------------------------------------------------------===// // Codegen pseudos that expand to MC instructions in the AsmPrinter. // @@ -94,6 +156,15 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1, def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm), "# LDAi16imm $dst, $imm", [(set Acc16:$dst, (i16 imm:$imm))]>; +// Materialise an i16 constant directly in X (Idx16). Useful when the +// constant's only consumer is `CopyToReg($x)` — saves an LDA+TAX +// round-trip (and the A-clobber that round-trip implies). Common for +// the high half of `(zext i16 to i32)` returns, where hi=const-zero. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, hasSideEffects = 0, + mayLoad = 0, mayStore = 0 in +def LDXi16imm : W65816Pseudo<(outs Idx16:$dst), (ins i16imm:$imm), + "# LDXi16imm $dst, $imm", + [(set Idx16:$dst, (i16 imm:$imm))]>; def LDAi8imm : W65816Pseudo<(outs Acc8:$dst), (ins i8imm:$imm), "# LDAi8imm $dst, $imm", [(set Acc8:$dst, (i8 imm:$imm))]>; @@ -177,8 +248,13 @@ def : Pat<(store Acc16:$src, (W65816Wrapper texternalsym:$s)), // source and dest to A — there is only one Acc16 register so this is // implicit, but stating it lets the register allocator coalesce // without needing a COPY. +// +// Defs = [P] models the C-flag side-effect. Required so tablegen can +// connect this instruction to the SDNode `addc` / `subc` (SDNPOutGlue), +// which is what the type legalizer emits as the lo half of a multi- +// precision add/sub when ADDC/SUBC is Legal (see W65816ISelLowering ctor). let Constraints = "$src = $dst", - hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { def ADCi16imm : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, i16imm:$imm), "# ADCi16imm $dst, $src, $imm", @@ -191,10 +267,19 @@ def SBCi16imm : W65816Pseudo<(outs Acc16:$dst), (sub Acc16:$src, imm:$imm))]>; } +// addc/subc: same as add/sub on this target (CLC then ADC, SEC then SBC), +// but the SDNode produces a Glue carrying the post-op carry into a +// subsequent adde/sube. Tablegen wires the Glue to the P register +// because the instruction has Defs = [P]. +def : Pat<(addc Acc16:$src, imm:$imm), + (ADCi16imm Acc16:$src, imm:$imm)>; +def : Pat<(subc Acc16:$src, imm:$imm), + (SBCi16imm Acc16:$src, imm:$imm)>; + // ADC/SBC from a 16-bit absolute address. Folds a load on the // right-hand side of an add/sub into the carry-arithmetic op. let Constraints = "$src = $dst", - hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + hasSideEffects = 0, mayLoad = 1, mayStore = 0, Defs = [P] in { def ADCabs : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, i32imm:$addr), "# ADCabs $dst, $src, $addr", []>; @@ -214,6 +299,61 @@ def : Pat<(sub Acc16:$src, def : Pat<(sub Acc16:$src, (i16 (load (W65816Wrapper texternalsym:$s)))), (SBCabs Acc16:$src, texternalsym:$s)>; +def : Pat<(addc Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (ADCabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(addc Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (ADCabs Acc16:$src, texternalsym:$s)>; +def : Pat<(subc Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (SBCabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(subc Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (SBCabs Acc16:$src, texternalsym:$s)>; + +// adde/sube: the chained ADC/SBC for the hi half of a multi-precision +// add/sub. Reads the C flag from the previous addc/adde (Uses = [P]), +// produces a fresh carry/borrow (Defs = [P]). AsmPrinter expansion +// emits a bare ADC/SBC with no preceding CLC/SEC; eliminateFrameIndex +// for ADCEfi/SBCEfi skips the carry-prefix step that the standalone +// ADCfi/SBCfi rely on. 
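+// Illustrative i32 add `r = x + y` after type legalization (stack-slot
+// operands are hypothetical):
+//     lda  x_lo,s
+//     clc
+//     adc  y_lo,s      ; ADCfi  <- addc: produces the carry
+//     sta  r_lo,s
+//     lda  x_hi,s
+//     adc  y_hi,s      ; ADCEfi <- adde: consumes it, no CLC prefix
+//     sta  r_hi,s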
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEi16imm : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i16imm:$imm), + "# ADCEi16imm $dst, $src, $imm", + [(set Acc16:$dst, + (adde Acc16:$src, imm:$imm))]>; +def SBCEi16imm : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i16imm:$imm), + "# SBCEi16imm $dst, $src, $imm", + [(set Acc16:$dst, + (sube Acc16:$src, imm:$imm))]>; +} +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 1, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEabs : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i32imm:$addr), + "# ADCEabs $dst, $src, $addr", []>; +def SBCEabs : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src, i32imm:$addr), + "# SBCEabs $dst, $src, $addr", []>; +} +def : Pat<(adde Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (ADCEabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(adde Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (ADCEabs Acc16:$src, texternalsym:$s)>; +def : Pat<(sube Acc16:$src, + (i16 (load (W65816Wrapper tglobaladdr:$g)))), + (SBCEabs Acc16:$src, tglobaladdr:$g)>; +def : Pat<(sube Acc16:$src, + (i16 (load (W65816Wrapper texternalsym:$s)))), + (SBCEabs Acc16:$src, texternalsym:$s)>; // (add Acc16, Acc16) — same value added to itself, equivalent to a 1-bit // left shift. Pattern needs a tied input so the result lands in A. @@ -293,6 +433,27 @@ def NEGA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), [(set Acc16:$dst, (sub (i16 0), Acc16:$src))]>; } +// Multi-precision negation: lo + hi halves of `-x` where x is i32. +// LLVM splits `0 - x` into `(subc 0, x_lo)` and `(sube 0, x_hi)`. +// We implement both via the ADD chain `~x + carry` since INC doesn't +// touch C; the bit pattern of C from `~x + 1` matches what `subc 0, x` +// would set (C=1 iff x was 0, i.e. no borrow). +// NEGC16 matches subc → "EOR #$FFFF; CLC; ADC #1" (5 bytes) +// NEGE16 matches sube → "EOR #$FFFF; ADC #0" (4 bytes, uses C-in) +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def NEGC16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# NEGC16 $dst, $src", + [(set Acc16:$dst, (subc (i16 0), Acc16:$src))]>; +} +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, + Uses = [P], Defs = [P] in { +def NEGE16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# NEGE16 $dst, $src", + [(set Acc16:$dst, (sube (i16 0), Acc16:$src))]>; +} + // Bitwise NOT pattern moved below EORi16imm definition. // 16-bit bitwise ops: AND / OR / XOR against an immediate or memory @@ -340,6 +501,71 @@ def : Pat<(xor Acc16:$src, (i16 (load (W65816Wrapper tglobaladdr:$g)))), def : Pat<(xor Acc16:$src, (i16 -1)), (EORi16imm Acc16:$src, 0xFFFF)>; +// (srl x, 15): extract bit 15 to bit 0 (yields 0 or 1). The +// type-legalizer's SHL_PARTS expansion of `i32 << 1` needs this for +// the high-half "carry from low" slot, and routing it through the +// __lshrhi3 libcall costs ~10 bytes per i32 shift-by-1. Inline as +// `ASL A; LDA #0; ROL A` (3 bytes): ASL puts bit 15 into C and +// trashes A; LDA #0 doesn't touch C; ROL A folds C into bit 0. +// +// (shl x, 15): move bit 0 to bit 15 (yields 0 or 0x8000). Used by +// SRL_PARTS / SRA_PARTS expansion of `i32 >> 1` for the low-half +// "carry from hi" slot. Mirror sequence: `LSR A; LDA #0; ROR A`. 
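+// In formula form: for an i32 split into lo/hi i16 halves,
+// SHL_PARTS(amt=1) computes hi' = (hi << 1) | (lo srl 15) and
+// lo' = lo << 1; SRL15A covers the `(lo srl 15)` term inline.
+// SRL_PARTS(amt=1) mirrors this with lo' = (lo srl 1) | (hi shl 15),
+// which is the `(hi shl 15)` term SHL15A covers.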
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def SRL15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRL15A $dst, $src", + [(set Acc16:$dst, (srl Acc16:$src, (i16 15)))]>; +def SHL15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SHL15A $dst, $src", + [(set Acc16:$dst, (shl Acc16:$src, (i16 15)))]>; +} +// (srl x, 8): high byte to low byte, zero high byte. XBA swaps the +// two bytes of A (in 16-bit M); AND #$00FF clears the new high byte. +// 4 bytes total — much shorter than the __lshrhi3 libcall path. Used +// by i32 shift-by-8 SHL_PARTS expansion for the cross-half slot. +// +// (shl x, 8): low byte to high byte, zero low byte. Mirror. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +def SRL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRL8A $dst, $src", + [(set Acc16:$dst, (srl Acc16:$src, (i16 8)))]>; +def SHL8A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SHL8A $dst, $src", + [(set Acc16:$dst, (shl Acc16:$src, (i16 8)))]>; +} +// (sra x, 15): sign-fill — yields $0000 if x is non-negative, $FFFF +// if negative. Used by i32 sext-from-i16 type-legalization for the +// hi half (avoids the __ashrhi3 libcall path). Sequence: +// `ASL A; LDA #0; SBC #0; EOR #-1` (when our SBCi16imm uses SEC + SBC, +// LDA #0; SBC #0 produces $FFFF if C=0, $0000 if C=1; EOR #-1 flips). +// Actually simpler since SBC sets carry differently: see AsmPrinter +// expansion for the exact 5-byte sequence. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0, Defs = [P] in { +def SRA15A : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# SRA15A $dst, $src", + [(set Acc16:$dst, (sra Acc16:$src, (i16 15)))]>; +} + +// sext_inreg from i1: broadcast bit 0 to all bits. LLVM emits this +// for `(c & 1) ? -1 : 0` patterns (e.g. CRC inner loops). The result +// is `-(x & 1)` — 0 if bit 0 was clear, 0xFFFF if set. Mask to bit +// 0 then two's-complement-negate. Three pseudos = ~7 bytes. +def : Pat<(sext_inreg Acc16:$src, i1), + (NEGA16 (ANDi16imm Acc16:$src, 1))>; + +// sext_inreg from i8: branchless `((x & 0xFF) ^ 0x80) - 0x80` trick +// (same sequence LowerSignExtend uses for ISD::SIGN_EXTEND i8->i16). +// LLVM emits this when expanding a sextload-i16-from-i8 (we set +// SEXTLOAD i8 to Expand in the lowering ctor) and for explicit +// `(int)(signed char)` casts. +def : Pat<(sext_inreg Acc16:$src, i8), + (SBCi16imm (EORi16imm + (ANDi16imm Acc16:$src, 0x00FF), 0x0080), + 0x0080)>; + // Frame-index loads/stores: take a FrameIndex + offset (packed into a // single MIOperandInfo) and expand (in eliminateFrameIndex) into an // LDA / STA d,S with the offset baked in. Used by LowerFormalArguments @@ -350,7 +576,12 @@ def memfi : Operand { let PrintMethod = "printFrameMem"; } -let mayLoad = 1, hasSideEffects = 0, mayStore = 0 in { +// LDAfi is rematerializable when the FI is a fixed (immutable) arg +// slot — see W65816InstrInfo::isReMaterializableImpl. Without this, +// greedy regalloc spills every arg load to a fresh local slot then +// reloads from there, ballooning every i32-arg function by 4-6 insns. +let mayLoad = 1, hasSideEffects = 0, mayStore = 0, + isReMaterializable = 1 in { def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), "# LDAfi $dst, $addr", []>; } @@ -369,14 +600,37 @@ def : Pat<(i16 (load addr_fi:$addr)), def : Pat<(store Acc16:$src, addr_fi:$addr), (STAfi Acc16:$src, addr_fi:$addr)>; +// i8 access to a FrameIndex slot. 
The slots holding i8 values are +// allocated as 2 bytes (CC_W65816 promotes i8 args to i16; spills also +// align), so reading 2 bytes is safe even for an i8 value — we just +// narrow to Acc8. Extending loads mask the high byte (zext) or leave +// it (anyext). Truncating store writes the full i16 (overwrites the +// 2-byte slot's high byte with whatever sits in A's high byte; safe +// since the slot holds an i8 and no other consumer reads that high +// byte). +def : Pat<(i8 (load addr_fi:$addr)), + (COPY_TO_REGCLASS (LDAfi addr_fi:$addr), Acc8)>; +def : Pat<(i16 (zextloadi8 addr_fi:$addr)), + (ANDi16imm (LDAfi addr_fi:$addr), 0xFF)>; +def : Pat<(i16 (extloadi8 addr_fi:$addr)), + (LDAfi addr_fi:$addr)>; +def : Pat<(store Acc8:$src, addr_fi:$addr), + (STAfi (COPY_TO_REGCLASS Acc8:$src, Acc16), addr_fi:$addr)>; +def : Pat<(truncstorei8 Acc16:$src, addr_fi:$addr), + (STAfi Acc16:$src, addr_fi:$addr)>; + // Frame-index folding into ADC / SBC / AND / ORA / EOR / CMP. Same // shape as the *abs variants but the second operand is a stack slot. +// ADCfi/SBCfi mark P as Def so they can match `addc`/`subc` (the lo +// half of a multi-precision split — see ADCi16imm comment above). let Constraints = "$src = $dst", hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { +let Defs = [P] in { def ADCfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# ADCfi $dst, $src, $addr", []>; def SBCfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# SBCfi $dst, $src, $addr", []>; +} def ANDfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# ANDfi $dst, $src, $addr", []>; def ORAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), @@ -384,6 +638,16 @@ def ORAfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), def EORfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), "# EORfi $dst, $src, $addr", []>; } +// ADCEfi / SBCEfi: chained ADC/SBC, hi half of a multi-precision split. +// Read carry from previous addc/adde/subc/sube via Uses = [P]. 
+let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 1, mayStore = 0, + Uses = [P], Defs = [P] in { +def ADCEfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), + "# ADCEfi $dst, $src, $addr", []>; +def SBCEfi : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src, memfi:$addr), + "# SBCEfi $dst, $src, $addr", []>; +} let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Defs = [P] in { def CMPfi : W65816Pseudo<(outs), (ins Acc16:$lhs, memfi:$addr), "# CMPfi $lhs, $addr", []>; @@ -392,6 +656,14 @@ def : Pat<(add Acc16:$src, (i16 (load addr_fi:$addr))), (ADCfi Acc16:$src, addr_fi:$addr)>; def : Pat<(sub Acc16:$src, (i16 (load addr_fi:$addr))), (SBCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(addc Acc16:$src, (i16 (load addr_fi:$addr))), + (ADCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(subc Acc16:$src, (i16 (load addr_fi:$addr))), + (SBCfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(adde Acc16:$src, (i16 (load addr_fi:$addr))), + (ADCEfi Acc16:$src, addr_fi:$addr)>; +def : Pat<(sube Acc16:$src, (i16 (load addr_fi:$addr))), + (SBCEfi Acc16:$src, addr_fi:$addr)>; def : Pat<(and Acc16:$src, (i16 (load addr_fi:$addr))), (ANDfi Acc16:$src, addr_fi:$addr)>; def : Pat<(or Acc16:$src, (i16 (load addr_fi:$addr))), @@ -433,11 +705,217 @@ def : Pat<(W65816cmp Acc16:$lhs, (i16 (load (W65816Wrapper texternalsym:$s)))), (CMPabs Acc16:$lhs, texternalsym:$s)>; -// Two-Acc16 ops: deferred — needs proper frame setup so the register -// allocator can spill one operand to a local stack slot. Without -// reserved frame space, the spill goes to a negative SP offset and -// eliminateFrameIndex bails. See SESSION_STATE §6 for the -// dependency chain. +// 16-bit byte swap: XBA exchanges A.high and A.low. Pattern matches +// the (bswap Acc16) SDNode emitted by clang for byte-reverse loops. +let Constraints = "$src = $dst", + hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +def XBA16 : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$src), + "# XBA16 $dst, $src", + [(set Acc16:$dst, (bswap Acc16:$src))]>; +} + +// Two-Acc16 binary ops. We have only one A register, so when both +// operands are computed values (neither a foldable load/imm/global) we +// must spill one to a stack slot. Each pseudo's custom inserter +// allocates a fresh slot and emits a STAfi+OPfi sequence; the +// register allocator handles the surrounding spills/reloads. +// hasSideEffects=1 tells the validator the pseudo may load/store +// without requiring a matching SDNode pattern (the stores are added +// by the inserter, not visible in the DAG pattern). +// +// Defs = [P] on ADD_RR/SUB_RR matches the C-flag side-effect of the +// underlying ADC/SBC, letting these pseudos serve `addc`/`subc` (the +// lo half of an i32 split) as well as plain `add`/`sub`. 
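+// Illustrative post-RA shape for ADD_RR (spill offset hypothetical;
+// regalloc placement of src1 may differ):
+//     sta  1,s        ; inserter spills src2 to a fresh slot
+//     ...             ; src1 ends up back in A
+//     clc
+//     adc  1,s        ; ADCfi: A = src1 + load(spill)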
+let usesCustomInserter = 1, hasSideEffects = 1 in { +let Defs = [P] in { +def ADD_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ADD_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (add Acc16:$src1, Acc16:$src2))]>; +def SUB_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# SUB_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (sub Acc16:$src1, Acc16:$src2))]>; +} +def AND_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# AND_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (and Acc16:$src1, Acc16:$src2))]>; +def ORA_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ORA_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (or Acc16:$src1, Acc16:$src2))]>; +def EOR_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# EOR_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (xor Acc16:$src1, Acc16:$src2))]>; +} +def : Pat<(addc Acc16:$src1, Acc16:$src2), + (ADD_RR Acc16:$src1, Acc16:$src2)>; +def : Pat<(subc Acc16:$src1, Acc16:$src2), + (SUB_RR Acc16:$src1, Acc16:$src2)>; + +// Chained-carry two-Acc16 add/sub for the hi half of i32 splits. +// Inserter mirrors ADD_RR (STAfi spill + ADCEfi load-fold) but emits +// the carry-chain pseudo so the previous addc/adde's C flag is +// consumed instead of overwritten by a CLC. Uses+Defs = [P] +// reflects the carry chain through the SDNode. +let usesCustomInserter = 1, hasSideEffects = 1, + Uses = [P], Defs = [P] in { +def ADDE_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# ADDE_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (adde Acc16:$src1, Acc16:$src2))]>; +def SUBE_RR : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$src1, Acc16:$src2), + "# SUBE_RR $dst, $src1, $src2", + [(set Acc16:$dst, + (sube Acc16:$src1, Acc16:$src2))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, Defs = [P] in { +def CMP_RR : W65816Pseudo<(outs), (ins Acc16:$lhs, Acc16:$rhs), + "# CMP_RR $lhs, $rhs", + [(W65816cmp Acc16:$lhs, Acc16:$rhs)]>; +} + +// Pointer dereference. The 65816 can't deref a register pointer +// directly — the indirect addressing modes all read the pointer from +// memory (DP or stack). These pseudos spill the Acc16 pointer to a +// fresh stack slot, set Y=0, and emit LDA/STA (slot,S),Y. Y gets +// clobbered as a side effect. hasSideEffects=1 covers the spill +// store the inserter adds, in addition to the deref. +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y] in { +def LDAptr : W65816Pseudo<(outs Acc16:$dst), (ins Acc16:$ptr), + "# LDAptr $dst, $ptr", + [(set Acc16:$dst, (load Acc16:$ptr))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STAptr : W65816Pseudo<(outs), (ins Acc16:$val, Acc16:$ptr), + "# STAptr $val, $ptr", + [(store Acc16:$val, Acc16:$ptr)]>; +} + +// i8 zero-extending pointer load: do a 16-bit LDA (slot,s),y and mask +// the high byte. Reads one byte past the source — fine for byte-array +// iteration where the buffer is at least 2 bytes long. A future +// SEP/REP-aware mode pass could switch to a true 8-bit LDA. +def : Pat<(i16 (zextloadi8 Acc16:$ptr)), + (ANDi16imm (LDAptr Acc16:$ptr), 0xFF)>; +// Anyext byte load via pointer: consumer doesn't care about the high +// byte, so just LDA (16-bit). Same 1-byte-past-buffer caveat as +// zextloadi8. +def : Pat<(i16 (extloadi8 Acc16:$ptr)), + (LDAptr Acc16:$ptr)>; +// And the equivalent for absolute addresses (byte loads via global ptr). 
+// (Already covered for Wrapper(global) above; this catches the case +// where the ptr is materialised as a value.) + +// Intermediate pseudos used by the LDAptr/STAptr inserters. Each takes +// a memfi describing the slot containing the pointer; eliminateFrameIndex +// resolves it to LDA_StackRelIndY / STA_StackRelIndY with the right d-byte. +// Y must hold 0 at the issue point (the inserter emits LDY #0 first). +let mayLoad = 1, hasSideEffects = 0, mayStore = 0, Uses = [Y] in { +def LDAfi_indY : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), + "# LDAfi_indY $dst, $addr", []>; +} +let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Uses = [Y] in { +def STAfi_indY : W65816Pseudo<(outs), (ins Acc16:$src, memfi:$addr), + "# STAfi_indY $src, $addr", []>; +} + +// i8 truncating store via Acc16 pointer. Same shape as STAptr but +// custom inserter wraps the actual STA in SEP/REP so the M-bit is 8 +// across the store and only one byte is written. Without the wrap the +// 16-bit STA would clobber the byte at ptr+1. Two patterns: the +// natural truncstorei8 from an i16 value (common with arg promotion), +// and a true i8 store (Acc8) that arises from i8-typed IR. +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STBptr : W65816Pseudo<(outs), (ins Acc16:$val, Acc16:$ptr), + "# STBptr $val, $ptr", + [(truncstorei8 Acc16:$val, Acc16:$ptr)]>; +} + +// Pointer access with constant offset. `(load (add ptr, $off))` and +// `(store val, (add ptr, $off))` come up for struct field access and +// array indexing with small constant offsets. Without these patterns, +// the offset becomes an explicit ADC #imm that has to spill A and +// recompute the pointer per access. With them, we just load Y with +// the offset in the inserter (Y is 16-bit so any i16 constant fits). +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y] in { +def LDAptrOff : W65816Pseudo<(outs Acc16:$dst), + (ins Acc16:$ptr, i16imm:$off), + "# LDAptrOff $dst, $ptr, $off", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y] in { +def STAptrOff : W65816Pseudo<(outs), + (ins Acc16:$val, Acc16:$ptr, i16imm:$off), + "# STAptrOff $val, $ptr, $off", []>; +def STBptrOff : W65816Pseudo<(outs), + (ins Acc16:$val, Acc16:$ptr, i16imm:$off), + "# STBptrOff $val, $ptr, $off", []>; +} +def : Pat<(i16 (load (add Acc16:$ptr, (i16 imm:$off)))), + (LDAptrOff Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc16:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STAptrOff Acc16:$val, Acc16:$ptr, imm:$off)>; +def : Pat<(truncstorei8 Acc16:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STBptrOff Acc16:$val, Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc8:$val, (add Acc16:$ptr, (i16 imm:$off))), + (STBptrOff (COPY_TO_REGCLASS Acc8:$val, Acc16), + Acc16:$ptr, imm:$off)>; +def : Pat<(store Acc8:$val, Acc16:$ptr), + (STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Acc16:$ptr)>; + +// i8 load via Acc16 pointer producing a true i8 (Acc8) result. Reuses +// the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask +// the high byte, then narrow to Acc8. COPY_TO_REGCLASS to Acc8 is a +// no-op at MC level (same physical A). Reads one byte past the source; +// fine for char-array iteration where the buffer is at least 2 bytes. +def : Pat<(i8 (load Acc16:$ptr)), + (COPY_TO_REGCLASS (ANDi16imm (LDAptr Acc16:$ptr), 0xFF), Acc8)>; + +// Acc8-to-Acc16 type conversions. Both Acc8 and Acc16 alias physical A, +// so COPY_TO_REGCLASS is a no-op at MC level. 
ZEXT additionally masks +// the high byte (which holds B from before any prior SEP). ANYEXT +// leaves the high byte untouched since the consumer doesn't care. +def : Pat<(i16 (anyext Acc8:$src)), + (COPY_TO_REGCLASS Acc8:$src, Acc16)>; +def : Pat<(i16 (zext Acc8:$src)), + (ANDi16imm (COPY_TO_REGCLASS Acc8:$src, Acc16), 0xFF)>; +def : Pat<(i8 (trunc Acc16:$src)), + (COPY_TO_REGCLASS Acc16:$src, Acc8)>; + +// Acc8 reg-reg arithmetic and bitwise ops, expanded through the Acc16 +// _RR pseudos. Cheap to do because Acc8 and Acc16 alias the same +// physical A — COPY_TO_REGCLASS is a no-op. Only the low byte +// matters; the high byte gets unrelated bits but is discarded by the +// final narrow-back to Acc8. This lets an i8 expression that wasn't +// promoted by legalization (e.g. an i8 XOR feeding only an i8 store) +// reuse the spill-and-OPfi inserter without needing dedicated Acc8 +// pseudos. +multiclass Acc8RR { + def : Pat<(i8 (op Acc8:$a, Acc8:$b)), + (COPY_TO_REGCLASS + (ri (COPY_TO_REGCLASS Acc8:$a, Acc16), + (COPY_TO_REGCLASS Acc8:$b, Acc16)), + Acc8)>; +} +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; +defm : Acc8RR; // (memory inc/dec patterns moved below INC_Abs/DEC_Abs defs.) @@ -728,6 +1206,11 @@ def AND_StackRel : InstStackRel<0x23, "and">; def ORA_StackRel : InstStackRel<0x03, "ora">; def EOR_StackRel : InstStackRel<0x43, "eor">; +//---------------------------------------------------------------- Stack-ind-Y +// Stack-relative indirect indexed-Y: deref a pointer spilled at S+off. +def LDA_StackRelIndY : InstStackRelIndY<0xB3, "lda">; +def STA_StackRelIndY : InstStackRelIndY<0x93, "sta">; + //===----------------------------------------------------------------------===// // Branch patterns (placed after the Bxx defs). // diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp index 4e8f7f9..3ab6346 100644 --- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp @@ -77,10 +77,46 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case W65816::STAfi: NewOpc = W65816::STA_StackRel; break; case W65816::ADCfi: NewOpc = W65816::ADC_StackRel; NeedsCarryPrefix = true; break; case W65816::SBCfi: NewOpc = W65816::SBC_StackRel; NeedsCarryPrefix = true; IsSub = true; break; + // ADCEfi / SBCEfi are the chained-carry variants used as the hi half of a + // multi-precision split. No CLC/SEC prefix — they read the carry left + // in P by the previous addc/adde/subc/sube. + case W65816::ADCEfi: NewOpc = W65816::ADC_StackRel; break; + case W65816::SBCEfi: NewOpc = W65816::SBC_StackRel; break; case W65816::ANDfi: NewOpc = W65816::AND_StackRel; break; case W65816::ORAfi: NewOpc = W65816::ORA_StackRel; break; case W65816::EORfi: NewOpc = W65816::EOR_StackRel; break; case W65816::CMPfi: NewOpc = W65816::CMP_StackRel; break; + case W65816::LDAfi_indY: NewOpc = W65816::LDA_StackRelIndY; break; + case W65816::STAfi_indY: NewOpc = W65816::STA_StackRelIndY; break; + case W65816::ADDframe: { + // LEA-equivalent: emit "TSC; CLC; ADC #disp" so A holds SP + disp, + // i.e. the address of the stack slot. TSC has no carry side-effect + // (it just transfers SP into A), so the CLC + ADC is needed for a + // clean unsigned add. Disp uses the same FrameOffset+ImmOffset+ + // StackSize formula as the load/store cases. 
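+    // Illustrative output for a (hypothetical) Disp of 5:
+    //     tsc             ; A = SP
+    //     clc
+    //     adc  #5         ; A = address of the stack slot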
+ int FI = MI.getOperand(FIOperandNum).getIndex(); + int FrameOffset = MFI.getObjectOffset(FI); + int ImmOffset = MI.getOperand(FIOperandNum + 1).getImm(); + int Disp = FrameOffset + ImmOffset + (int)MFI.getStackSize(); + if (Disp < 0 || Disp > 0xFFFF) + report_fatal_error("W65816: frame offset out of i16 LEA range"); + // TSC: A = SP (implicit def of A, use of SP). + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::TSC)) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::SP, RegState::Implicit); + // CLC: clears C. Models as P-def, P-use (preserves N/V/Z). + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::CLC)) + .addReg(W65816::P, RegState::ImplicitDefine); + // ADC #imm: reads A and P, writes A and P. + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::ADC_Imm16)) + .addImm(Disp) + .addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::P, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + MI.eraseFromParent(); + return true; + } default: llvm_unreachable("W65816: unhandled instruction in eliminateFrameIndex"); } @@ -108,8 +144,49 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(IsSub ? W65816::SEC : W65816::CLC)); } - BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(NewOpc)) - .addImm(Offset); + // The MC instructions (LDA_StackRel, STA_StackRel, ADC_StackRel, + // ADC_Imm16, etc.) don't have explicit Defs/Uses on the accumulator + // because that's an implicit hardware semantic of every 65816 + // arithmetic/load/store. Without an explicit Def/Use, post-RA + // passes (Machine Copy Propagation in particular) miss that an ADC + // d,S between a TXA and a TAX redefines $a, and elide the TAX as + // "redundant" — corrupting the return value. Add the implicit + // operands here so dataflow tracking is correct. Match the + // original pseudo's read/write semantics: LDA defs A only; STA uses + // A only; ADC/SBC/AND/ORA/EOR/CMP read A and write A (CMP only + // sets flags, but it still uses A — modelling it as Use is + // sufficient since it doesn't change A). 
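+  // Example of the miscompile this prevents (offsets illustrative):
+  //     txa
+  //     adc  3,s        ; redefines A, invisible without the implicit def
+  //     tax             ; wrongly judged a redundant copy and erased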
+ auto Builder = BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(NewOpc)).addImm(Offset); + switch (NewOpc) { + case W65816::LDA_StackRel: + case W65816::LDA_StackRelIndY: + Builder.addReg(W65816::A, RegState::ImplicitDefine); + break; + case W65816::STA_StackRel: + case W65816::STA_StackRelIndY: + Builder.addReg(W65816::A, RegState::Implicit); + break; + case W65816::ADC_StackRel: + case W65816::SBC_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine) + .addReg(W65816::P, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + break; + case W65816::AND_StackRel: + case W65816::ORA_StackRel: + case W65816::EOR_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::A, RegState::ImplicitDefine); + break; + case W65816::CMP_StackRel: + Builder.addReg(W65816::A, RegState::Implicit) + .addReg(W65816::P, RegState::ImplicitDefine); + break; + default: + break; + } MI.eraseFromParent(); return true; } diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp new file mode 100644 index 0000000..c9272c4 --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -0,0 +1,355 @@ +//===-- W65816StackSlotCleanup.cpp - Remove redundant spill/reload pairs --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Post-RA cleanup that erases redundant STAfi+LDAfi pairs to the same +// stack slot when no instruction in between writes A or that slot. +// +// The greedy register allocator routinely emits this pattern when +// materialising a COPY of $a into a vreg that gets allocated back to +// $a — the spill+reload cycle is a no-op since A already holds the +// stored value. The standard MachineLateInstrsCleanup pass only +// detects identical instructions; it doesn't recognise that +// `LDAfi slot` after `STAfi $a, slot` is a no-op. We do the +// simple per-block scan here. +// +// Conservative: only matches adjacent STAfi+LDAfi pairs (no scan for +// instructions in between). In practice the greedy-allocator-emitted +// pattern is always adjacent or near-adjacent, and the scheduler keeps +// it that way because the LDAfi feeds the next instruction. If +// future codegen breaks this assumption, generalise to a longer scan +// with explicit clobber tracking. 
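+// Example of the pattern removed (offset illustrative):
+//     sta  3,s        ; STAfi $a, slot
+//     lda  3,s        ; LDAfi slot -> $a   (erased: A already holds it)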
+// +//===----------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-stack-slot-cleanup" + +namespace { + +class W65816StackSlotCleanup : public MachineFunctionPass { +public: + static char ID; + + W65816StackSlotCleanup() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "W65816 redundant stack-slot spill/reload elimination"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // namespace + +char W65816StackSlotCleanup::ID = 0; + +INITIALIZE_PASS(W65816StackSlotCleanup, DEBUG_TYPE, + "W65816 redundant stack-slot spill/reload elimination", + false, false) + +FunctionPass *llvm::createW65816StackSlotCleanup() { + return new W65816StackSlotCleanup(); +} + +// Returns true if MI references frame index FI as one of its operands. +// Used to bail dead-store removal when an intervening instruction +// reads or writes the slot. +static bool referencesFrameIndex(const MachineInstr &MI, int FI) { + for (const MachineOperand &MO : MI.operands()) + if (MO.isFI() && MO.getIndex() == FI) + return true; + return false; +} + +// Match `STAfi reg1, FI, 0; ... ; STAfi reg2, FI, 0` (kill via overwrite) +// or `STAfi reg, FI, 0; ... ; (no read in between)` (dead store +// at function exit). Both mean the first STAfi is dead. Conservative: +// bails on anything that references the slot, calls, inline asm. The +// slot must be a *local* (non-fixed) FrameIndex — args live across the +// function so we can't kill stores to fixed slots. +static bool tryEliminateDeadStore(MachineBasicBlock &MBB, + MachineInstr &StaMI) { + if (StaMI.getOpcode() != W65816::STAfi) + return false; + if (StaMI.getNumOperands() < 3 || + !StaMI.getOperand(1).isFI() || + !StaMI.getOperand(2).isImm() || StaMI.getOperand(2).getImm() != 0) + return false; + int StoredFI = StaMI.getOperand(1).getIndex(); + + // Don't try to kill a store to a fixed (arg) slot — those are + // observable to the caller. Locals/spills are fair game. + const MachineFunction *MF = StaMI.getMF(); + if (MF->getFrameInfo().isFixedObjectIndex(StoredFI)) + return false; + + auto It = std::next(StaMI.getIterator()); + while (It != MBB.end()) { + MachineInstr &MI = *It; + if (MI.isDebugInstr()) { + ++It; + continue; + } + // A subsequent STAfi to the same slot, offset 0, kills our store. + if (MI.getOpcode() == W65816::STAfi && + MI.getNumOperands() >= 3 && + MI.getOperand(1).isFI() && + MI.getOperand(1).getIndex() == StoredFI && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + // Found the killing store. Erase the first. + StaMI.eraseFromParent(); + return true; + } + // A return that doesn't read the slot kills the store too — the + // local goes out of scope at function exit. + if (MI.isReturn() && !referencesFrameIndex(MI, StoredFI)) { + StaMI.eraseFromParent(); + return true; + } + // Anything else that touches the slot (load, ADC d,S, etc.) means + // the first store IS observed — bail. + if (referencesFrameIndex(MI, StoredFI)) + return false; + // Inline asm / branches: too tricky. 
+    // local (non-fixed) slots are addressed at offsets the callee
+    // can't reach (callee's S has been shifted down by JSL's
+    // 3-byte return frame and any of its own pha/tsc adjustments,
+    // so its `(4,s)` reads land above our locals). We've already
+    // bailed on fixed slots above, so reaching here means the slot
+    // is local and call-safe.
+    if (MI.isInlineAsm() || MI.isBranch())
+      return false;
+    ++It;
+  }
+  // Walked off the end of the BB without seeing a return/use. Bail
+  // (could fall through to a successor that reads the slot).
+  return false;
+}
+
+// Match `STAfi reg, FI, 0; ... ; LDAfi destReg, FI, 0` when reg == destReg
+// and nothing in between clobbers reg or the slot. Erase the LDAfi.
+static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB,
+                                       MachineInstr &StaMI,
+                                       const TargetRegisterInfo *TRI) {
+  if (StaMI.getOpcode() != W65816::STAfi)
+    return false;
+  if (StaMI.getNumOperands() < 3 ||
+      !StaMI.getOperand(0).isReg() ||
+      !StaMI.getOperand(1).isFI() ||
+      !StaMI.getOperand(2).isImm() || StaMI.getOperand(2).getImm() != 0)
+    return false;
+  Register StoredReg = StaMI.getOperand(0).getReg();
+  int StoredFI = StaMI.getOperand(1).getIndex();
+
+  // Walk forward looking for the matching LDAfi. Bail on any
+  // instruction that could clobber StoredReg or write the slot.
+  auto It = std::next(StaMI.getIterator());
+  while (It != MBB.end()) {
+    MachineInstr &MI = *It;
+    if (MI.isDebugInstr()) {
+      ++It;
+      continue;
+    }
+    if (MI.getOpcode() == W65816::LDAfi &&
+        MI.getNumOperands() >= 3 &&
+        MI.getOperand(1).isFI() &&
+        MI.getOperand(1).getIndex() == StoredFI &&
+        MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0 &&
+        MI.getOperand(0).isReg() &&
+        MI.getOperand(0).getReg() == StoredReg) {
+      MI.eraseFromParent();
+      return true;
+    }
+    // Calls clobber A — be safe.
+    if (MI.isCall())
+      return false;
+    // Any other instruction that defines StoredReg or stores to the
+    // slot invalidates the redundancy — bail.
+    if (MI.modifiesRegister(StoredReg, TRI))
+      return false;
+    if (MI.getOpcode() == W65816::STAfi &&
+        MI.getNumOperands() >= 2 && MI.getOperand(1).isFI() &&
+        MI.getOperand(1).getIndex() == StoredFI)
+      return false;
+    ++It;
+  }
+  return false;
+}
+
+bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  bool Changed = false;
+
+  // Pass 0: rewrite `LDAi16imm $a, imm` immediately followed by
+  // `COPY $x = $a` (with no intervening A clobber) into
+  // `LDXi16imm $x, imm`. Run BEFORE the spill/reload cleanups so
+  // the disappearing A clobber unblocks subsequent STAfi+LDAfi
+  // pair removal.
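+  //
+  // Sketch of the rewrite (hypothetical MIR; the asm column assumes the
+  // usual lowering of these pseudos and is illustrative only):
+  //
+  //   LDAi16imm $a, 0            ; lda #0
+  //   $x = COPY $a               ; tax
+  //     ==>
+  //   LDXi16imm $x, 0            ; ldx #0
+  //
+  // A's previous value is no longer clobbered, so any STAfi/LDAfi pair
+  // that existed only to preserve A around this sequence becomes
+  // removable by Pass 1.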
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Worklist;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::LDAi16imm)
+        Worklist.push_back(&MI);
+    for (MachineInstr *Lda : Worklist) {
+      if (Lda->getNumOperands() < 2 || !Lda->getOperand(0).isReg() ||
+          Lda->getOperand(0).getReg() != W65816::A)
+        continue;
+      auto It = std::next(Lda->getIterator());
+      while (It != MBB.end() && It->isDebugInstr())
+        ++It;
+      if (It == MBB.end())
+        continue;
+      MachineInstr &Next = *It;
+      if (!Next.isCopy())
+        continue;
+      Register DstReg = Next.getOperand(0).getReg();
+      Register SrcReg = Next.getOperand(1).getReg();
+      if (DstReg != W65816::X || SrcReg != W65816::A)
+        continue;
+      const MachineOperand &ImmMO = Lda->getOperand(1);
+      const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+      MachineInstrBuilder Mib =
+          BuildMI(MBB, Lda->getIterator(), Lda->getDebugLoc(),
+                  TII->get(W65816::LDXi16imm), W65816::X);
+      if (ImmMO.isImm())
+        Mib.addImm(ImmMO.getImm());
+      else
+        Mib.add(ImmMO);
+      Lda->eraseFromParent();
+      Next.eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  // Pass 1: redundant LDAfi after STAfi (load-after-same-store with
+  // matching register). Two-pass over Stores worklist to avoid
+  // iterator invalidation when we erase the LDAfi mid-walk.
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Stores;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::STAfi)
+        Stores.push_back(&MI);
+    for (MachineInstr *StaMI : Stores)
+      if (tryEliminateLoadAfterStore(MBB, *StaMI, TRI))
+        Changed = true;
+  }
+
+  // Pass 2: dead stores (STAfi to slot followed by another STAfi to
+  // the same slot with no intervening read). This catches the
+  // arg0_lo "preserve" spill that the regalloc emits even though the
+  // value is consumed by the very next instruction.
+  for (MachineBasicBlock &MBB : MF) {
+    SmallVector<MachineInstr *, 8> Stores;
+    for (MachineInstr &MI : MBB)
+      if (MI.getOpcode() == W65816::STAfi)
+        Stores.push_back(&MI);
+    for (MachineInstr *StaMI : Stores)
+      if (tryEliminateDeadStore(MBB, *StaMI))
+        Changed = true;
+  }
+
+  // Pass 2.5: deleted (logic moved to Pass 0 above). The rationale for
+  // that rewrite: without it, the regalloc materialises i16 constants
+  // via Acc16 (LDAi16imm) even when the only consumer is CopyToReg($x),
+  // forcing a TAX round-trip and (often) a spill+reload of A's previous
+  // value. Common case: the high half of `(zext i16 to i32)` returns,
+  // where hi = 0.
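+
+  // Worked example of what Passes 0-2 buy on a result spill before a
+  // return (hypothetical post-RA MIR; slot numbers made up, and "RTL"
+  // stands in for whatever return pseudo the function ends with):
+  //
+  //   STAfi $a, %stack.1, 0      ; spill the result
+  //   LDAfi $a, %stack.1, 0      ; Pass 1 erases this reload
+  //   RTL                        ; the return never reads the slot,
+  //                              ; so Pass 2 erases the STAfi as well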
+
+  // Pass 3: zero-size unused local frame objects so the
+  // PrologueEpilogue pass shrinks the prologue PHAs / TSC reservation.
+  // Walk the MIR collecting which FIs are still referenced; any
+  // *non-fixed* (local) FI with no remaining reference is dead. We
+  // can't safely remove it (RemoveStackObject can shift indexes); we
+  // just zero-size it via setObjectSize, which is enough for the
+  // frame layout pass to skip it.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  if (MFI.getNumObjects() > 0) {
+    BitVector Used(MFI.getObjectIndexEnd() - MFI.getObjectIndexBegin());
+    auto Mark = [&](int FI) {
+      int Idx = FI - MFI.getObjectIndexBegin();
+      if (Idx >= 0 && Idx < (int)Used.size())
+        Used.set(Idx);
+    };
+    for (MachineBasicBlock &MBB : MF)
+      for (MachineInstr &MI : MBB)
+        for (MachineOperand &MO : MI.operands())
+          if (MO.isFI())
+            Mark(MO.getIndex());
+    for (int FI = MFI.getObjectIndexBegin();
+         FI < MFI.getObjectIndexEnd(); ++FI) {
+      // Skip fixed (arg) slots — those are "owned" by the caller.
+      if (MFI.isFixedObjectIndex(FI))
+        continue;
+      int Idx = FI - MFI.getObjectIndexBegin();
+      if (Idx < 0 || Idx >= (int)Used.size() || Used.test(Idx))
+        continue;
+      // Already zero-sized? Skip.
+      if (MFI.getObjectSize(FI) == 0)
+        continue;
+      // Don't touch dead-stripped objects either.
+      if (MFI.isDeadObjectIndex(FI))
+        continue;
+      MFI.setObjectSize(FI, 0);
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index e24f832..f93d608 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -39,6 +39,7 @@ LLVMInitializeW65816Target() {
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeW65816AsmPrinterPass(PR);
   initializeW65816DAGToDAGISelLegacyPass(PR);
+  initializeW65816StackSlotCleanupPass(PR);
 }
 
 static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
@@ -74,6 +75,7 @@ public:
   }
 
   bool addInstSelector() override;
+  void addPostRegAlloc() override;
 };
 
 } // namespace
@@ -82,6 +84,10 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new W65816PassConfig(*this, PM);
 }
 
+void W65816PassConfig::addPostRegAlloc() {
+  addPass(createW65816StackSlotCleanup());
+}
+
 MachineFunctionInfo *W65816TargetMachine::createMachineFunctionInfo(
     BumpPtrAllocator &Allocator, const Function &F,
     const TargetSubtargetInfo *STI) const {