From 465f8ba94773e7e93ab9f774c5ae92dd544dacd4 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Tue, 5 May 2026 22:00:34 -0500 Subject: [PATCH] Checkpoint --- .gitignore | 1 + .../0002-triple-cpp-add-w65816-cases.patch | 11 +- patches/0005-target-data-layout-w65816.patch | 4 +- ...etlowering-virtual-gettypeconversion.patch | 13 + runtime/build.sh | 6 + runtime/src/crt0.s | 16 + runtime/src/crt0Gsos.s | 12 + runtime/src/snprintf.c | 11 +- runtime/src/softDouble.c | 39 +- runtime/src/timeExt.c | 84 +- scripts/runFileCheckTests.sh | 81 ++ scripts/smokeTest.sh | 53 + src/clang/lib/Basic/Targets/W65816.h | 2 +- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + src/llvm/lib/Target/W65816/W65816.h | 10 + .../lib/Target/W65816/W65816ISelDAGToDAG.cpp | 53 +- .../lib/Target/W65816/W65816ISelLowering.cpp | 1211 +++++++++++++++-- .../lib/Target/W65816/W65816ISelLowering.h | 62 + .../lib/Target/W65816/W65816InstrInfo.cpp | 145 ++ src/llvm/lib/Target/W65816/W65816InstrInfo.td | 129 +- .../lib/Target/W65816/W65816LowerWide32.cpp | 326 +++++ .../lib/Target/W65816/W65816RegisterInfo.td | 65 + .../lib/Target/W65816/W65816SepRepCleanup.cpp | 20 + .../Target/W65816/W65816StackSlotCleanup.cpp | 46 +- .../lib/Target/W65816/W65816TargetMachine.cpp | 18 + src/llvm/test/CodeGen/W65816/add-i16.ll | 12 + .../W65816/canmergestoresto-i16-cap.ll | 30 + .../CodeGen/W65816/extract-wide32-regseq.ll | 36 + .../CodeGen/W65816/i64-first-arg-img16.ll | 36 + .../CodeGen/W65816/img-copy-survives-mcp.ll | 32 + .../CodeGen/W65816/jslpseudo-caller-save.ll | 28 + src/llvm/test/CodeGen/W65816/lit.local.cfg | 2 + .../W65816/seprep-ldy-elision-kill-flag.ll | 29 + .../CodeGen/W65816/sign-extend-inreg-i32.ll | 41 + .../test/CodeGen/W65816/wide32-phi-split.ll | 32 + 35 files changed, 2496 insertions(+), 201 deletions(-) create mode 100644 patches/0007-targetlowering-virtual-gettypeconversion.patch create mode 100755 scripts/runFileCheckTests.sh create mode 100644 src/llvm/lib/Target/W65816/W65816LowerWide32.cpp create mode 100644 src/llvm/test/CodeGen/W65816/add-i16.ll create mode 100644 src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll create mode 100644 src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll create mode 100644 src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll create mode 100644 src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll create mode 100644 src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll create mode 100644 src/llvm/test/CodeGen/W65816/lit.local.cfg create mode 100644 src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll create mode 100644 src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll create mode 100644 src/llvm/test/CodeGen/W65816/wide32-phi-split.ll diff --git a/.gitignore b/.gitignore index 52b4a98..bf166c9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ tools/ # runtime/src/*.s. The source files (.s, build.sh) are tracked. 
runtime/*.o runtime/*.o.bak +runtime/*.o.tmp # Editor / OS *.swp diff --git a/patches/0002-triple-cpp-add-w65816-cases.patch b/patches/0002-triple-cpp-add-w65816-cases.patch index 4f8cc6c..65930a7 100644 --- a/patches/0002-triple-cpp-add-w65816-cases.patch +++ b/patches/0002-triple-cpp-add-w65816-cases.patch @@ -1,5 +1,5 @@ diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp -index 8aef55224..b6e467274 100644 +index 8aef55224..1ab00ce9f 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -80,6 +80,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) { @@ -75,3 +75,12 @@ index 8aef55224..b6e467274 100644 case Triple::nvptx64: case Triple::nvptx: case Triple::ppcle: +@@ -2704,6 +2714,8 @@ ExceptionHandling Triple::getDefaultExceptionHandling() const { + case Triple::xcore: + case Triple::xtensa: + return ExceptionHandling::DwarfCFI; ++ case Triple::w65816: ++ return ExceptionHandling::SjLj; + default: + break; + } diff --git a/patches/0005-target-data-layout-w65816.patch b/patches/0005-target-data-layout-w65816.patch index ecc8e11..ca3c6ec 100644 --- a/patches/0005-target-data-layout-w65816.patch +++ b/patches/0005-target-data-layout-w65816.patch @@ -1,5 +1,5 @@ diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp -index 8837d2f91..b796d9e86 100644 +index 8837d2f91..920b8ac8e 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -582,6 +582,8 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { @@ -7,7 +7,7 @@ index 8837d2f91..b796d9e86 100644 case Triple::msp430: return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; + case Triple::w65816: -+ return "e-m:e-p:16:8-i16:16-i32:16-n8:16-S16"; ++ return "e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"; case Triple::ppc: case Triple::ppcle: case Triple::ppc64: diff --git a/patches/0007-targetlowering-virtual-gettypeconversion.patch b/patches/0007-targetlowering-virtual-gettypeconversion.patch new file mode 100644 index 0000000..2c63316 --- /dev/null +++ b/patches/0007-targetlowering-virtual-gettypeconversion.patch @@ -0,0 +1,13 @@ +diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h +index 7c4c29fc3..7109a79fa 100644 +--- a/llvm/include/llvm/CodeGen/TargetLowering.h ++++ b/llvm/include/llvm/CodeGen/TargetLowering.h +@@ -1144,7 +1144,7 @@ public: + /// integer register, this contains one step in the expansion to get to the + /// smaller register. For illegal floating point types, this returns the + /// integer type to transform to. +- LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; ++ virtual LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; + + /// Return how we should legalize values of this type, either it is already + /// legal (return 'Legal') or we need to promote it to a larger type (return diff --git a/runtime/build.sh b/runtime/build.sh index 215e6ad..6a5aa8c 100755 --- a/runtime/build.sh +++ b/runtime/build.sh @@ -6,6 +6,12 @@ set -euo pipefail PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang" +# Apply CPU/memory caps so a runaway backend bug can't OOM-kill the +# entire tmux scope. Use `|| true` so when invoked from a parent that +# has already lowered the limit (e.g. 
smokeTest.sh sets 90s), we keep +# the parent's tighter cap rather than failing the build. +ulimit -v $((10 * 1024 * 1024)) 2>/dev/null || true +ulimit -t 1200 2>/dev/null || true [ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; } [ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; } diff --git a/runtime/src/crt0.s b/runtime/src/crt0.s index 6fff159..67f1ec2 100644 --- a/runtime/src/crt0.s +++ b/runtime/src/crt0.s @@ -57,6 +57,22 @@ __start: lda 0xc083 rep #0x20 + ; Persistent "current data bank" byte at DP $BE. The LDAptr/ + ; STAptr/STBptr inserters load this into $E2 before each [dp],Y + ; deref so pointer-deref lands in the user's bank, matching where + ; DBR-relative absolute stores go. Under MAME (no Loader), DBR=0 + ; and PBR=0 here, so $BE=0 — equivalent to the prior STZ $E2 + ; behavior. Under GS/OS Loader, crt0Gsos.s sets it to PBR. + ; + ; $BE chosen because it's outside both the libcall scratch range + ; ($E0..$FF used by libgcc.s for i64 ops) and the IMG slot range + ; ($C0..$DE). PHK pushes 1 byte; PLA in M=8 to pull just 1 byte. + sep #0x20 + phk + pla ; A's low byte = current PBR + sta 0xbe ; persistent data bank + rep #0x20 + ; Zero BSS. X iterates from __bss_start to __bss_end; each ; iteration writes one byte of zero at addr X (via DP=0 + ; offset 0 — which is just X). STZ in M=8 stores 1 byte and diff --git a/runtime/src/crt0Gsos.s b/runtime/src/crt0Gsos.s index 84e7fd6..6912139 100644 --- a/runtime/src/crt0Gsos.s +++ b/runtime/src/crt0Gsos.s @@ -41,6 +41,18 @@ __start: lda #0 tcd + ; Persistent "current data bank" byte at DP $BE. Set to PBR + ; (= our load bank) so the LDAptr/STAptr/STBptr inserters' + ; "LDA $BE; STA $E2" sequence puts pointer derefs in our bank, + ; matching DBR-relative absolute stores. $BE is outside the + ; libcall scratch range ($E0..$FF used by libgcc.s for i64 ops). + ; See crt0.s. + sep #0x20 + phk + pla + sta 0xbe + rep #0x20 + ; BSS zero-init. With DBR=our bank, `stz abs,X` writes to ; ourBank:X — correct as long as __bss_start/__bss_end fit in ; the segment's bank. diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index 6633870..8fd9fe7 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -103,6 +103,7 @@ static void emitDec(int n) { __attribute__((noinline)) +__attribute__((optnone)) static void emitULong(unsigned long n) { char buf[11]; int i = 0; @@ -122,7 +123,7 @@ static void emitULong(unsigned long n) { } -__attribute__((noinline)) +__attribute__((noinline,optnone)) static void emitSignedLong(long n) { // See emitDec: avoid the signed-overflow UB on LONG_MIN. if (n < 0) { @@ -221,6 +222,12 @@ static void emitDouble(double v, int prec) { // fmt is arg0 (A register); see banner comment for why the order matters. +// optnone: under ptr32 the regalloc reuses the same stack spill slot for +// both the va_list pointer `ap` and the fmt-walking pointer, so a `va_arg` +// after several fmt-character steps reads the wrong slot and gets 0 +// instead of the actual va_arg value. optnone forces fast regalloc which +// keeps each vreg in its own slot. See feedback_snprintf_va_arg_slot_alias.md. +__attribute__((optnone)) static int format(const char *fmt, va_list ap) { while (*fmt) { char c = *fmt++; @@ -295,6 +302,8 @@ static int format(const char *fmt, va_list ap) { } + +__attribute__((optnone)) int snprintf(char *buf, size_t n, const char *fmt, ...) { gCur = buf; // n == 0 must NOT touch the buffer (C99 7.19.6.5). 
Setting diff --git a/runtime/src/softDouble.c b/runtime/src/softDouble.c index 0b1e6a1..3e0885d 100644 --- a/runtime/src/softDouble.c +++ b/runtime/src/softDouble.c @@ -127,14 +127,23 @@ u64 __adddf3(u64 a, u64 b) { // Right-shift first to bring an over-wide sum back in range; then // left-shift if subtraction left the lead below 55. Reverse order // would shift an over-wide value out of u64 range entirely. - while (mr & ~((1ULL << 56) - 1)) { - u64 sticky = mr & 1; - mr = (mr >> 1) | sticky; - ea++; + // Use if + do-while because pure `while (cond) body` triggers a + // ptr32 backend bug: PHP/PLP wrap pass mis-identifies the loop's + // pre-test LDA reload as flag corruption and wraps the wrong + // range, so the BEQ tests stale flags and the loop body never + // fires. `do { } while (cond)` is unaffected (test-after-body). + if (mr & ~((1ULL << 56) - 1)) { + do { + u64 sticky_bit = mr & 1; + mr = (mr >> 1) | sticky_bit; + ea++; + } while (mr & ~((1ULL << 56) - 1)); } - while ((mr & (1ULL << 55)) == 0 && mr != 0) { - mr <<= 1; - ea--; + if ((mr & (1ULL << 55)) == 0 && mr != 0) { + do { + mr <<= 1; + ea--; + } while ((mr & (1ULL << 55)) == 0 && mr != 0); } // Round to nearest, ties to even. Bits 0/1 are sticky+round, bit 2 // is guard, bit 3 is mantissa LSB. @@ -259,14 +268,26 @@ u64 __divdf3(u64 a, u64 b) { // Handle the leading quotient bit explicitly. u64 q = DMANT_LEAD; u64 r = ma - mb; + // `volatile vmb`: forces mb to be re-read from memory inside the + // loop. Without this, the W65816 codegen miscompiles `r >= mb` and + // `r -= mb` when called as the 3rd+ chained `__divdf3` after prior + // softDouble libcalls (sqrt3 Newton iter — 3rd iter returned 0.0 + // instead of 1.41421). Adding `volatile` to either `r` or `mb` + // alone fixes it, suggesting the compiler is keeping one of them + // in registers across loop iterations and a JSL inside the loop + // (__ashlsi3 for `r <<= 1`) clobbers the held value. The real + // fix lives in the W65816 backend's u64-shift lowering; volatile + // here is the conservative workaround. + volatile u64 vmb = mb; // Compute 52 more fractional bits via standard shift-test-subtract. for (int i = 51; i >= 0; i--) { r <<= 1; - if (r >= mb) { - r -= mb; + if (r >= vmb) { + r -= vmb; q |= (1ULL << i); } } + mb = vmb; // resync in case below reads mb // Round to nearest, ties to even. Generate one extra bit (the // "guard"), examine the remainder for any non-zero "sticky" tail, // and round q up when guard=1 and (sticky || (q & 1)). Without diff --git a/runtime/src/timeExt.c b/runtime/src/timeExt.c index b7be5a4..d47c630 100644 --- a/runtime/src/timeExt.c +++ b/runtime/src/timeExt.c @@ -33,44 +33,20 @@ double difftime(time_t end, time_t start) { return (double)(end - start); } +struct tm *gmtime_r(const time_t *t, struct tm *out); + // gmtime / localtime: convert seconds-since-1970 to broken-down time. // "local" is identical to "gm" — no timezone support. // -// gmtime KNOWN-BROKEN under GS/OS Loader. The interface returns a -// pointer to a static global (`__gmtimeBuf`). User code reads -// `r->tm_field` which the W65816 backend lowers via [dp],Y with bank -// forced to 0 (DBR-independent — see W65816ISelLowering's LDAptr/STAptr -// inserter). But under Loader the buffer's IMM16 address gets cRELOC- -// patched to a runtime offset that's only valid in the user's bank, -// not bank 0 — so the user's reads land in unrelated bank-0 RAM. 
-// Even arranging for gmtime to write via [dp],y bank=0 makes both -// halves consistent at bank 0, but the cRELOC-patched address often -// falls in the Language Card area where bank-0 reads/writes aren't -// stable RAM. Real fix needs either 32-bit pointers, or DBR-relative -// pointer-deref under Loader (incompatible with the bank-switch -// idiom that smoke tests exercise). -// -// Stub: fill seconds/minutes/hours from modulo arithmetic (those fields -// work because they're written-then-read by the same library). Date -// fields stay at the 1970-01-01 sentinel. Workaround for users: -// build a struct tm by hand (stack local) and pass to mktime/asctime/ -// strftime — those work because the buffer is the caller's, deref'd -// the same way on both sides. +// Returns a pointer to a static global (`__gmtimeBuf`). Under GS/OS +// Loader (DBR != 0) caller-side pointer-deref reads need to land in +// the same bank where gmtime wrote; this requires the runtime build +// to enable `-mllvm -w65816-loader-bank-deref`, which makes +// LDAptr/STAptr load the bank byte from DP $BE (set by crt0 from +// PHK / current PBR). Without the flag, gmtime still works under +// MAME / non-Loader runs where DBR=0 throughout. struct tm *gmtime(const time_t *t) { - long secs = *t; - int sec = (int)(secs % 60L); secs /= 60L; - int min = (int)(secs % 60L); secs /= 60L; - int hour = (int)(secs % 24L); - __gmtimeBuf.tm_sec = sec; - __gmtimeBuf.tm_min = min; - __gmtimeBuf.tm_hour = hour; - __gmtimeBuf.tm_mday = 1; - __gmtimeBuf.tm_mon = 0; - __gmtimeBuf.tm_year = 70; // 1970 sentinel — date decomp KNOWN-BROKEN - __gmtimeBuf.tm_wday = 4; // Jan 1 1970 was Thursday - __gmtimeBuf.tm_yday = 0; - __gmtimeBuf.tm_isdst = -1; - return &__gmtimeBuf; + return gmtime_r(t, &__gmtimeBuf); } struct tm *localtime(const time_t *t) { @@ -82,13 +58,15 @@ struct tm *localtime(const time_t *t) { // is bank-0 in 65816 native mode regardless of DBR). This avoids the // bank-mismatch issue that breaks plain gmtime under Loader. // -// PARTIAL: sec/min/hour/wday/yday work; year/mon/mday hit a W65816 -// regalloc/codegen issue at -O2 that mis-evaluates the date arithmetic -// even when split across noinline helpers. Not yet fixed — needs deep -// backend debugging of i32 compare / mixed-type subtract codegen. -// -// Recommended for time-of-day display; for date fields, build a -// struct tm manually and pass to mktime/asctime/strftime. +// Full broken-down time computation. Marked optnone because at -O2 +// LLVM's combined IR optimizations (loop rotation + reassociation + +// induction-variable-simplify) mis-evaluate the year-increment loop's +// `days >= 365L + (__isLeap(...) ? 1 : 0)` comparison, leaving the +// loop body unexecuted and date fields stuck at the 1970 sentinel. +// optnone preserves the per-statement structure and the loop runs +// correctly. Verified end-to-end against 1710484245L → 2024-03-15 +// 06:30:45 UTC (Friday, day-of-year 74). +__attribute__((optnone)) struct tm *gmtime_r(const time_t *t, struct tm *out) { long secs = *t; int sec = (int)(secs % 60L); secs /= 60L; @@ -98,14 +76,30 @@ struct tm *gmtime_r(const time_t *t, struct tm *out) { int wday = (int)((days + 4L) % 7L); if (wday < 0) wday += 7; + int year = 70; // years since 1900 + while (days >= 365L + (__isLeap(1900 + year) ? 1 : 0)) { + days -= 365L + (__isLeap(1900 + year) ? 1 : 0); + year++; + } + int yday = (int)days; + int leap = __isLeap(1900 + year); + int mon = 11; + while (mon > 0) { + int firstDayOfMon = __monthDays[mon] + (leap && mon > 1 ? 
1 : 0); + if ((int)days >= firstDayOfMon) break; + mon--; + } + int firstDay = __monthDays[mon] + (leap && mon > 1 ? 1 : 0); + int mday = (int)days - firstDay + 1; + out->tm_sec = sec; out->tm_min = min; out->tm_hour = hour; - out->tm_mday = 1; // KNOWN-BROKEN — see header comment - out->tm_mon = 0; - out->tm_year = 70; + out->tm_mday = mday; + out->tm_mon = mon; + out->tm_year = year; out->tm_wday = wday; - out->tm_yday = 0; + out->tm_yday = yday; out->tm_isdst = -1; return out; } diff --git a/scripts/runFileCheckTests.sh b/scripts/runFileCheckTests.sh new file mode 100755 index 0000000..0c0e107 --- /dev/null +++ b/scripts/runFileCheckTests.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +# runFileCheckTests.sh - run W65816 backend regression tests. +# +# Walks src/llvm/test/CodeGen/W65816/*.ll and for each: +# - reads RUN: lines from the test header (lit-compatible syntax) +# - executes them with %s -> the test path +# - any non-zero exit fails the run. +# +# Why not lit: the in-tree llvm-mos build is configured with +# LLVM_INCLUDE_TESTS=OFF (saves ~5 min from incremental rebuilds and +# ~2 GB of test artifacts). These regression tests are codegen-shape +# pins, not full lit-harness sweeps; FileCheck alone covers our needs. +# +# Usage: +# scripts/runFileCheckTests.sh # run all +# scripts/runFileCheckTests.sh foo.ll bar.ll # run named (relative to dir) + +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +TEST_DIR="$PROJECT_ROOT/src/llvm/test/CodeGen/W65816" +LLC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llc" +FILECHECK="$PROJECT_ROOT/tools/llvm-mos-build/bin/FileCheck" +NOT="$PROJECT_ROOT/tools/llvm-mos-build/bin/not" + +[ -x "$LLC" ] || { echo "missing $LLC" >&2; exit 2; } +[ -x "$FILECHECK" ] || { echo "missing $FILECHECK; build with 'ninja FileCheck not'" >&2; exit 2; } + +if [ $# -gt 0 ]; then + files=() + for f in "$@"; do + files+=("$TEST_DIR/$f") + done +else + mapfile -t files < <(find "$TEST_DIR" -maxdepth 1 -name '*.ll' | sort) +fi + +pass=0 +fail=0 +failed=() +for f in "${files[@]}"; do + [ -f "$f" ] || { echo "skip missing: $f"; continue; } + name="$(basename "$f")" + + runs=$(grep -E '^[[:space:]]*;[[:space:]]*RUN:' "$f" | sed -E 's/^[[:space:]]*;[[:space:]]*RUN:[[:space:]]*//') + if [ -z "$runs" ]; then + echo "SKIP $name (no RUN: line)" + continue + fi + + ok=1 + while IFS= read -r line; do + [ -z "$line" ] && continue + cmd=${line//%s/$f} + cmd=${cmd//llc/$LLC} + cmd=${cmd//FileCheck/$FILECHECK} + cmd=${cmd//not /$NOT } + out=$(bash -c "$cmd" 2>&1) || { + ok=0 + echo "FAIL $name" + echo " cmd: $cmd" + echo "$out" | sed 's/^/ | /' + break + } + done <<< "$runs" + + if [ $ok -eq 1 ]; then + echo "PASS $name" + pass=$((pass + 1)) + else + fail=$((fail + 1)) + failed+=("$name") + fi +done + +echo +echo "==== W65816 FileCheck: $pass pass, $fail fail ====" +if [ $fail -gt 0 ]; then + printf ' - %s\n' "${failed[@]}" + exit 1 +fi diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index adf617d..b3d22a9 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -3160,6 +3160,50 @@ EOF fi rm -f "$cTrFile" "$oTrFile" "$binTrFile" + log "check: MAME runs gmtime(1710484245) -> 2024-03-15 06:30:45 Fri (date math via real impl)" + cGmFile="$(mktemp --suffix=.c)" + oGmFile="$(mktemp --suffix=.o)" + oGmTime="$(mktemp --suffix=.o)" + binGmFile="$(mktemp --suffix=.bin)" + cat > "$cGmFile" <<'EOF' +typedef long time_t; +struct tm { + int tm_sec, tm_min, tm_hour; + int tm_mday, tm_mon, tm_year; + int tm_wday, tm_yday, tm_isdst; +}; +extern struct tm 
*gmtime(const time_t *); +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + time_t t = 1710484245L; // 2024-03-15 06:30:45 UTC, Friday, day 74 + struct tm *r = gmtime(&t); + switchToBank2(); + *(volatile unsigned short *)0x5000 = r->tm_year; // 124 + *(volatile unsigned short *)0x5002 = r->tm_mon; // 2 + *(volatile unsigned short *)0x5004 = r->tm_mday; // 15 + *(volatile unsigned short *)0x5006 = r->tm_hour; // 6 + *(volatile unsigned short *)0x5008 = r->tm_min; // 30 + *(volatile unsigned short *)0x500a = r->tm_sec; // 45 + *(volatile unsigned short *)0x500c = r->tm_wday; // 5 + *(volatile unsigned short *)0x500e = r->tm_yday; // 74 + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cGmFile" -o "$oGmFile" + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -c "$PROJECT_ROOT/runtime/src/timeExt.c" -o "$oGmTime" + "$PROJECT_ROOT/tools/link816" -o "$binGmFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibgccFile" "$oGmTime" "$oGmFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binGmFile" --check \ + 0x025000=007c 0x025002=0002 0x025004=000f \ + 0x025006=0006 0x025008=001e 0x02500a=002d \ + 0x02500c=0005 0x02500e=004a >/dev/null 2>&1; then + die "MAME: gmtime(1710484245) returned wrong date fields" + fi + rm -f "$cGmFile" "$oGmFile" "$oGmTime" "$binGmFile" + log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)" cUdmFile="$(mktemp --suffix=.c)" oUdmFile="$(mktemp --suffix=.o)" @@ -5255,4 +5299,13 @@ print(f'OK: {nCreloc} cRELOC opcodes match sidecar') rm -f "$cR1" "$oR1" "$binR1" "$mapR1" "$relR1" "$omfR1" fi +# W65816 codegen-shape regression pins. Tiny FileCheck assertions on +# specific lowering behaviors that have broken before; runs in well +# under a second. See scripts/runFileCheckTests.sh. +log "check: W65816 FileCheck regressions pass" +"$PROJECT_ROOT/scripts/runFileCheckTests.sh" >/tmp/fcOut 2>&1 || { + cat /tmp/fcOut >&2 + die "W65816 FileCheck regressions failed" +} + log "all smoke checks passed" diff --git a/src/clang/lib/Basic/Targets/W65816.h b/src/clang/lib/Basic/Targets/W65816.h index 8cabf41..d9a728d 100644 --- a/src/clang/lib/Basic/Targets/W65816.h +++ b/src/clang/lib/Basic/Targets/W65816.h @@ -45,7 +45,7 @@ public: IntPtrType = SignedInt; PtrDiffType = SignedInt; SigAtomicType = SignedLong; - resetDataLayout("e-m:e-p:16:8-i16:16-i32:16-n8:16-S16"); + resetDataLayout("e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"); } void getTargetDefines(const LangOptions &Opts, diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index b3fe53f..d457117 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target(W65816CodeGen W65816NegYIndY.cpp W65816PreSpillCrossCall.cpp W65816SjLjFinalize.cpp + W65816LowerWide32.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index 1860bb2..2bf5a91 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -116,6 +116,15 @@ FunctionPass *createW65816PreSpillCrossCall(); // W65816SjLjFinalize.cpp. FunctionPass *createW65816SjLjFinalize(); +// Pre-RA pass that lowers Wide32 register pairs into pairs of i16 +// vregs. 
Without this, greedy/basic regalloc can't fit the pair-
+// pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy
+// functions (RegAllocBase crashes during allocatePhysRegs). After
+// this pass, only i16 vregs reach regalloc, and the pair structure
+// lives only in the LDAptr32S / STAptr32S / STBptr32S pseudos which
+// take 2 i16 ptr operands directly.
+FunctionPass *createW65816LowerWide32();
+
 void initializeW65816AsmPrinterPass(PassRegistry &);
 void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
 void initializeW65816StackSlotCleanupPass(PassRegistry &);
@@ -128,6 +137,7 @@ void initializeW65816SpillToXPass(PassRegistry &);
 void initializeW65816NegYIndYPass(PassRegistry &);
 void initializeW65816PreSpillCrossCallPass(PassRegistry &);
 void initializeW65816SjLjFinalizePass(PassRegistry &);
+void initializeW65816LowerWide32Pass(PassRegistry &);
 
 } // namespace llvm
 
diff --git a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
index 84c8bfe..271a338 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
@@ -71,21 +71,52 @@ void W65816DAGToDAGISel::Select(SDNode *Node) {
     return;
   }
 
-  // Custom selection: bare FrameIndex SDValue used as an i16 pointer
-  // value (e.g. `&arr[0]` for a stack-allocated array). The
-  // auto-generated selector has no pattern for `(i16 frameindex)`
-  // because tablegen doesn't expose FrameIndex as a leaf type — so
-  // ISel fails with "Cannot select: FrameIndex" before ever reaching
-  // a load/store-context fold. Convert it to ADDframe (FI, 0); the
-  // frame-index elimination pass turns ADDframe into TSC + CLC + ADC
-  // #(offset+stackSize), producing SP+offset in A.
+  // Custom selection: bare FrameIndex SDValue used as a pointer value
+  // (e.g. `&arr[0]` for a stack-allocated array). The auto-generated
+  // selector has no pattern for `(i16 frameindex)` because tablegen
+  // doesn't expose FrameIndex as a leaf type — so ISel fails with
+  // "Cannot select: FrameIndex" before ever reaching a load/store-
+  // context fold. Convert to ADDframe (FI, 0); the frame-index
+  // elimination pass turns ADDframe into TSC + CLC + ADC #(offset +
+  // stackSize), producing SP+offset in A.
+  //
+  // ptr32 mode: a `(i32 frameindex)` is `&local` typed as a 32-bit
+  // pointer (bank+addr). Lower as REG_SEQUENCE(ADDframe, sub_lo, 0,
+  // sub_hi). Hi=0 reflects the program-bank assumption (stack lives
+  // in bank 0 for our crt0 startup). Without this, ISel hits
+  // "Cannot select: t# = FrameIndex" and the pass crashes —
+  // observed for softDouble's __adddf3 calling dclass(a, &sa, &ea,
+  // &ma) where the latter three become i32 frameindex SDValues.
   if (Node->getOpcode() == ISD::FrameIndex) {
     SDLoc DL(Node);
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    EVT VT = Node->getValueType(0);
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
-    SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i16);
-    CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero);
-    return;
+    SDValue Zero16 = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    if (VT == MVT::i16) {
+      CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero16);
+      return;
+    }
+    if (VT == MVT::i32) {
+      // Build (REG_SEQUENCE Wide32RC, ADDframe(FI,0), sub_lo,
+      // LDAi16imm(0), sub_hi). ADDframe materialises lo as an i16
+      // SDValue; the hi half is the literal bank byte (0).
+      SDNode *Lo = CurDAG->getMachineNode(W65816::ADDframe, DL,
+                                          MVT::i16, TFI, Zero16);
+      SDValue HiC = CurDAG->getTargetConstant(0, DL, MVT::i16);
+      // For the high half, just materialise an i16 zero via LDAi16imm.
+      SDNode *Hi = CurDAG->getMachineNode(W65816::LDAi16imm, DL,
+                                          MVT::i16, HiC);
+      SDValue RC = CurDAG->getTargetConstant(W65816::Wide32RegClassID,
+                                             DL, MVT::i32);
+      SDValue SubLo = CurDAG->getTargetConstant(llvm::sub_lo, DL, MVT::i32);
+      SDValue SubHi = CurDAG->getTargetConstant(llvm::sub_hi, DL, MVT::i32);
+      CurDAG->SelectNodeTo(Node, TargetOpcode::REG_SEQUENCE, MVT::i32,
+                           {RC, SDValue(Lo, 0), SubLo, SDValue(Hi, 0),
+                            SubHi});
+      return;
+    }
+    report_fatal_error("W65816: FrameIndex selection: unsupported VT");
   }
 
   // Defer to the auto-generated selector for everything else.
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
index 5bc2a9f..f63d266 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
@@ -23,12 +23,30 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "w65816-lower"
 
+// Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters
+// load the bank byte from DP $BE (initialized by crt0 to PHK / current
+// PBR) instead of forcing it to 0 via STZ $E2. This makes pointer
+// derefs land in the user's bank — matching where DBR-relative
+// absolute stores go — so library functions like gmtime that store
+// into static buffers via DBR-relative paths are visible to caller-
+// side pointer-deref reads. Costs 2 extra bytes / 4 cycles per ptr-
+// deref (LDA dp + STA dp vs STZ dp). Default off to keep
+// size-sensitive builds (toolbox) under the $C000 IO-window ceiling.
+static cl::opt<bool> LoaderBankDeref(
+    "w65816-loader-bank-deref",
+    cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by "
+             "crt0 to PHK) instead of STZ $E2. Required for GS/OS "
+             "Loader compatibility; default off for size-sensitive "
+             "builds."),
+    cl::init(false), cl::Hidden);
+
 W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
                                            const W65816Subtarget &STI)
     : TargetLowering(TM, STI) {
@@ -37,6 +55,7 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // for ensuring the dynamic mode matches the selected class.
   addRegisterClass(MVT::i8, &W65816::Acc8RegClass);
   addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
+  addRegisterClass(MVT::i32, &W65816::Wide32RegClass);
 
   computeRegisterProperties(STI.getRegisterInfo());
 
@@ -79,6 +98,21 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   for (MVT VT : MVT::integer_valuetypes())
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
 
+  // Only register i32 ext-load / trunc-store and Custom actions when
+  // i32 is actually a legal type (ptr32 mode active). Otherwise the
+  // Custom-action calls intercept i16/i8 ops, and LowerTruncate's
+  // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same
+  // root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
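+  //
+  // Illustrative failure shape (a sketch, not a test from the suite):
+  // with these actions registered unconditionally, plain i16→i8 IR
+  // such as
+  //
+  //   %c = trunc i16 %x to i8
+  //   store i8 %c, ptr %p
+  //
+  // routes TRUNCATE into LowerTruncate, which bails with SDValue()
+  // for non-i32 sources, and the node then falls through to the
+  // generic expansion instead of the existing i16→i8 patterns.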
+ bool ptr32Active = isTypeLegal(MVT::i32); + if (ptr32Active) { + for (MVT MemVT : {MVT::i8, MVT::i16}) { + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand); + setTruncStoreAction(MVT::i32, MemVT, Expand); + } + } + // Vararg support: VASTART writes the address of the first vararg slot // to the va_list pointer. VAARG/VACOPY/VAEND use the default // expansions that load through that pointer and bump it. This makes @@ -164,6 +198,15 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); + // LOAD / STORE Custom-lowering for ptr32 mode is intentionally NOT + // wired here in ptr16 mode. Setting LOAD Custom and returning + // SDValue() from LowerLoad short-circuits the i16-result LDAptr/ + // STAptr selection paths (the Custom→empty→Legal fall-through doesn't + // re-enter pattern matching). When ptr32 is activated, this hook + // needs a different gating mechanism — likely an isel-time + // replacement triggered by addrspacecast or a target DAG combine. + // See LowerLoad / LowerStore — currently dead code. + // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying // the carry/borrow flag between the two halves of a multi-precision add or // sub. Setting them Legal triggers the type legalizer's carry-chain split @@ -203,6 +246,47 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, // explicit SHL/SHL_PARTS action needed — the override forces the // type-legalizer's libcall path before SHL_PARTS would be emitted. } + // i64 shifts — route to libcall before the type legalizer tries + // to split via the next-legal-type (which becomes i32 in ptr32 mode + // and triggers a SDAG combine loop on `i64 >> K` patterns). By + // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer + // picks up the libcall path even though i64 itself is illegal. + for (MVT VT : {MVT::i64}) { + setOperationAction(ISD::SHL, VT, LibCall); + setOperationAction(ISD::SRL, VT, LibCall); + setOperationAction(ISD::SRA, VT, LibCall); + } + + if (ptr32Active) { + for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR}) + setOperationAction(Op, MVT::i32, Custom); + setOperationAction(ISD::SHL, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i32, Custom); + setOperationAction(ISD::SRA, MVT::i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom); + // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16: + // the combiner emits this for `(int32_t)((int8_t)x)` and for + // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No + // tablegen pattern covers the i32 form; Custom-lower to per-half + // ops. IMPORTANT: LegalizeDAG looks up the action for + // SIGN_EXTEND_INREG using the INNER VT (the operand value type), + // not the result VT. 
+    // See LegalizeDAG.cpp:
+    //   Action = TLI.getOperationAction(Op, InnerType);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
+    setOperationAction(ISD::LOAD, MVT::i32, Custom);
+    setOperationAction(ISD::STORE, MVT::i32, Custom);
+    setOperationAction(ISD::SETCC, MVT::i32, Custom);
+    setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::i32, Custom);
+    setOperationAction(ISD::Constant, MVT::i32, Custom);
+  }
 
   // Disable jump tables. Generating them costs us BRIND (indirect
   // branch via 16-bit pointer load), which we don't have. A long
@@ -224,7 +308,8 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // address-select reverse combine (see W65816TargetLowering::
   // PerformDAGCombine).
   // setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
-  setTargetDAGCombine(ISD::SHL);
+  // SHL combine disabled while debugging the ptr32 i64-phi hang.
+  // setTargetDAGCombine(ISD::SHL);
 }
 
 // Map an LLVM SETCC condition to a W65816 branch. Returns the condition
@@ -371,6 +456,57 @@ static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
   return TCC;
 }
 
+// Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/
+// I32Bin/BR_CC to construct or destructure i32 SDValues across the
+// sub_lo / sub_hi halves of the Wide32 register class.
+static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
+                           SDValue Lo, SDValue Hi) {
+  SDValue RC = DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32);
+  SDValue SubLo = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32);
+  SDValue SubHi = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32);
+  SDNode *RS = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32,
+                                  {RC, Lo, SubLo, Hi, SubHi});
+  return SDValue(RS, 0);
+}
+// Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo,
+// Hi, sub_hi) pair: if X is exactly that machine node, return the
+// matching half operand directly. Avoids a TargetExtractSubreg that
+// would re-enter the SDAG combiner and re-build the i32 constant /
+// pair, looping forever (observed as OOM in the combiner on `*t = 0`).
+static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
+  if (!X.getNode() || !X.isMachineOpcode()) return SDValue();
+  if (X.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue();
+  // Layout: op0 = RC, then (Reg, SubIdx) pairs.
+  for (unsigned i = 1; i + 1 < X.getNumOperands(); i += 2) {
+    SDValue SubIdx = X.getOperand(i + 1);
+    auto *CIdx = dyn_cast<ConstantSDNode>(SubIdx);
+    if (!CIdx) continue;
+    if (CIdx->getZExtValue() == WantSub)
+      return X.getOperand(i);
+  }
+  return SDValue();
+}
+static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
+  // For constants, materialise the lo half as an i16 constant directly
+  // — getTargetExtractSubreg on a Constant SDNode produces a malformed
+  // MachineSDNode (constants don't carry sub-regs) and triggers
+  // SDAG combine loops downstream.
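+  //
+  // Worked example (illustrative only): for the i32 constant
+  // 0x00024000, the masks below yield lo = 0x4000 (bits 0..15) and
+  // hi = 0x0002 (bits 16..31), so a store like `*t = 0x24000`
+  // materialises two i16 constants instead of sub-reg-extracting a
+  // Constant node.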
+  if (auto *C = dyn_cast<ConstantSDNode>(X)) {
+    return DAG.getConstant(C->getZExtValue() & 0xFFFFu, DL, MVT::i16);
+  }
+  if (SDValue Half = lookThroughRegSeq(X, llvm::sub_lo))
+    return Half;
+  return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
+}
+static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
+  if (auto *C = dyn_cast<ConstantSDNode>(X)) {
+    return DAG.getConstant((C->getZExtValue() >> 16) & 0xFFFFu, DL, MVT::i16);
+  }
+  if (SDValue Half = lookThroughRegSeq(X, llvm::sub_hi))
+    return Half;
+  return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
+}
+
 SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -379,6 +515,52 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Dest = Op.getOperand(4);
   SDLoc DL(Op);
   EVT VT = LHS.getValueType();
+  // i32 BR_CC: synthesize an i16 boolean from per-half compares, then
+  // branch on (bool != 0). Avoids the legalizer's generic Expand that
+  // re-enters our SETCC/BR_CC custom paths in an infinite loop.
+  if (VT == MVT::i32) {
+    SDValue LL = extractWide32Lo(DAG, DL, LHS);
+    SDValue LH = extractWide32Hi(DAG, DL, LHS);
+    SDValue RL = extractWide32Lo(DAG, DL, RHS);
+    SDValue RH = extractWide32Hi(DAG, DL, RHS);
+    SDValue Bool;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      SDValue EqLo = DAG.getSetCC(DL, MVT::i16, LL, RL, ISD::SETEQ);
+      SDValue EqHi = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ);
+      Bool = DAG.getNode(ISD::AND, DL, MVT::i16, EqLo, EqHi);
+      if (CC == ISD::SETNE)
+        Bool = DAG.getNode(ISD::XOR, DL, MVT::i16, Bool,
+                           DAG.getConstant(1, DL, MVT::i16));
+    } else {
+      // (a CC b) where CC is ordered:
+      //   = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b)
+      // HiStrict is the strict variant of CC (LE -> LT etc.) so the
+      // tie-breaker (hi==hi && lo CC lo) handles the equality case
+      // properly. LoCC is always the unsigned variant of CC because
+      // the low half is unsigned (the high half carries the sign).
+      ISD::CondCode HiCC, LoCCu;
+      switch (CC) {
+      case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break;
+      case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break;
+      case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break;
+      case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break;
+      case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break;
+      case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break;
+      case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break;
+      case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break;
+      default:
+        report_fatal_error("W65816: unexpected i32 BR_CC condition");
+      }
+      SDValue HiOk = DAG.getSetCC(DL, MVT::i16, LH, RH, HiCC);
+      SDValue HiEq = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ);
+      SDValue LoOk = DAG.getSetCC(DL, MVT::i16, LL, RL, LoCCu);
+      SDValue Tie = DAG.getNode(ISD::AND, DL, MVT::i16, HiEq, LoOk);
+      Bool = DAG.getNode(ISD::OR, DL, MVT::i16, HiOk, Tie);
+    }
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i16);
+    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
+                       DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest);
+  }
 
   W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
@@ -411,6 +593,41 @@ SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
+  // i32 SETCC: split into per-half compares. Result type is i16 (the
+  // legalizer keeps the boolean result type narrow regardless of LHS
+  // width).
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue LL = extractWide32Lo(DAG, DL, LHS);
+    SDValue LH = extractWide32Hi(DAG, DL, LHS);
+    SDValue RL = extractWide32Lo(DAG, DL, RHS);
+    SDValue RH = extractWide32Hi(DAG, DL, RHS);
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      SDValue EqLo = DAG.getSetCC(DL, VT, LL, RL, ISD::SETEQ);
+      SDValue EqHi = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ);
+      SDValue Eq = DAG.getNode(ISD::AND, DL, VT, EqLo, EqHi);
+      if (CC == ISD::SETNE)
+        Eq = DAG.getNode(ISD::XOR, DL, VT, Eq, DAG.getConstant(1, DL, VT));
+      return Eq;
+    }
+    ISD::CondCode HiCC, LoCCu;
+    switch (CC) {
+    case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break;
+    case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break;
+    case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break;
+    case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break;
+    case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break;
+    case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break;
+    case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break;
+    case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break;
+    default:
+      report_fatal_error("W65816: unexpected i32 SETCC condition");
+    }
+    SDValue HiOk = DAG.getSetCC(DL, VT, LH, RH, HiCC);
+    SDValue HiEq = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ);
+    SDValue LoOk = DAG.getSetCC(DL, VT, LL, RL, LoCCu);
+    SDValue Tie = DAG.getNode(ISD::AND, DL, VT, HiEq, LoOk);
+    return DAG.getNode(ISD::OR, DL, VT, HiOk, Tie);
+  }
   SDValue One = DAG.getConstant(1, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero,
@@ -426,15 +643,48 @@ SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDLoc DL(Op);
 
+  // i32 SELECT_CC: synthesize an i16 boolean from the i32 compare via
+  // LowerSETCC's i32 path, then select between the i32 halves driven
+  // by the boolean. Avoids creating the i32 W65816::CMP we have no
+  // pattern for.
+  if (LHS.getValueType() == MVT::i32) {
+    // Materialise the i16 boolean.
+    SDValue Bool = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i16);
+    if (Op.getValueType() == MVT::i32) {
+      SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+      SDValue THi = extractWide32Hi(DAG, DL, TVal);
+      SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+      SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+      SDValue Lo = DAG.getSelectCC(DL, Bool, Zero, TLo, FLo, ISD::SETNE);
+      SDValue Hi = DAG.getSelectCC(DL, Bool, Zero, THi, FHi, ISD::SETNE);
+      return buildWide32(DAG, DL, Lo, Hi);
+    }
+    return DAG.getSelectCC(DL, Bool, Zero, TVal, FVal, ISD::SETNE);
+  }
+
+  // SELECT_CC with i32 result (i16 LHS): split TVal/FVal into halves
+  // and run a per-half i16 SELECT_CC sharing the same condition.
+  if (Op.getValueType() == MVT::i32) {
+    SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+    SDValue THi = extractWide32Hi(DAG, DL, TVal);
+    SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+    SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+    SDValue Lo = DAG.getSelectCC(DL, LHS, RHS, TLo, FLo, CC);
+    SDValue Hi = DAG.getSelectCC(DL, LHS, RHS, THi, FHi, CC);
+    return buildWide32(DAG, DL, Lo, Hi);
+  }
+
   W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
     report_fatal_error("W65816: select_cc condition not yet implemented");
 
   SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
   SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  // SDTypeProfile declares 1 result (the selected value). Earlier
+  // code passed a 2-VT list (value + Glue) which was silently wrong
+  // and trips an SDNode-validity assertion in assertions builds.
   SDValue Ops[] = {TVal, FVal, CCOp, Glue};
-  return DAG.getNode(W65816ISD::SELECT_CC, DL, VTs, Ops);
+  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), Ops);
 }
 
 // i8 -> i16 sign extend. Branchless 3-instruction trick:
@@ -457,6 +707,316 @@ SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
   return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
 }
 
+// ptr32 foundation hook. In ptr16 mode (PointerWidth=16, current
+// default) addresses are i16 and we return SDValue() so the legalizer
+// keeps the load and the existing LDAptr / STAptr selection patterns
+// match. In ptr32 mode addresses are i32 and we wrap the load in
+// W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter
+// can take the bank byte from sub_hi instead of forcing 0.
+//
+// Byte loads (zextload, anyext, true i8) keep going through the i16
+// LDA + AND #$FF idiom — same trick the existing LDAptr uses; for
+// ptr32 mode the load is still 16 bits, just bank-explicit.
+SDValue W65816TargetLowering::LowerLoad(SDValue Op,
+                                        SelectionDAG &DAG) const {
+  LoadSDNode *Ld = cast<LoadSDNode>(Op);
+  SDValue Chain = Ld->getChain();
+  SDValue Ptr = Ld->getBasePtr();
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  // i32 LOAD: split into two i16 loads at offsets 0 and 2 then
+  // REG_SEQUENCE the halves into a Wide32. Address may be i16 (stack
+  // slot, global) or i32 (ptr32 deref); the recursive ADD handles
+  // address arithmetic correctly via LowerI32Bin.
+  if (VT == MVT::i32) {
+    EVT PtrVT = Ptr.getValueType();
+    SDValue Two = DAG.getConstant(2, DL, PtrVT);
+    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
+    SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr,
+                             Ld->getPointerInfo(),
+                             Ld->getAlign(),
+                             Ld->getMemOperand()->getFlags());
+    SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2,
+                             Ld->getPointerInfo().getWithOffset(2),
+                             Ld->getAlign(),
+                             Ld->getMemOperand()->getFlags());
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   Lo.getValue(1), Hi.getValue(1));
+    SDValue Val = buildWide32(DAG, DL, Lo, Hi);
+    return DAG.getMergeValues({Val, NewChain}, DL);
+  }
+
+  // ptr16 mode: address is i16, let the default selection handle it.
+  if (Ptr.getValueType() != MVT::i32)
+    return SDValue();
+
+  EVT MemVT = Ld->getMemoryVT();
+  SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr };
+  SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops,
+                                           MVT::i16, Ld->getMemOperand());
+  SDValue Val = LdNode;
+  // Byte memory access: mask the high byte for zextload, leave anyext.
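+  // Post-processing shapes (illustrative): for a byte load through a
+  // 32-bit pointer, the 16-bit LD_PTR result becomes
+  //   zextload i8: (and LD_PTR, 0xFF)
+  //   sextload i8: (sign_extend_inreg LD_PTR, i8)
+  //   anyext:      LD_PTR as-is (high byte is don't-care)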
+  if (MemVT == MVT::i8) {
+    if (Ld->getExtensionType() == ISD::ZEXTLOAD)
+      Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val,
+                        DAG.getConstant(0xFF, DL, MVT::i16));
+    else if (Ld->getExtensionType() == ISD::SEXTLOAD)
+      Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val,
+                        DAG.getValueType(MVT::i8));
+  }
+  // Narrow back to i8 if the consumer wanted i8.
+  if (VT == MVT::i8)
+    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
+  return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL);
+}
+
+// ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16
+// payload and a 0 / sign-fill / undef high half.
+SDValue W65816TargetLowering::LowerExtend(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+  SDValue X = Op.getOperand(0);
+  // Promote i8 inputs to i16 first via the same opcode.
+  if (X.getValueType() == MVT::i8)
+    X = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X);
+  SDValue Lo = X;
+  SDValue Hi;
+  if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    Hi = DAG.getConstant(0, DL, MVT::i16);
+  } else if (Op.getOpcode() == ISD::SIGN_EXTEND) {
+    // Sign-fill via SRA #15 — uses our SRA15A pattern (4 insns) and
+    // stays i16-typed in both LHS and RHS, dodging the combiner's
+    // shift-amount-promote when ptr32 makes pointer-typed shift
+    // amounts i32.
+    Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
+                     DAG.getConstant(15, DL, MVT::i16));
+  } else {
+    Hi = DAG.getUNDEF(MVT::i16);
+  }
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low
+// N bits of an i32 input to fill all 32 bits. The legalizer leaves
+// this op alone when i32 is legal — but no tablegen pattern matches
+// the i32 form, so without this Custom hook isel aborts with
+// "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like
+// `-(crc & 1ul)` in CRC32 loops.
+//
+// Strategy: for inner VT V (= i1 / i8 / i16), the low half's
+// `sext_inreg` (already pattern-matched at i16) produces the signed
+// i16 value — then sign-fill the high half via SRA #15 of the lo
+// result.
+SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT ResVT = Op.getValueType();
+
+  // i16 result: replicate the existing tablegen patterns. We MUST
+  // handle this case rather than returning SDValue(), because
+  // setOperationAction's Custom-returns-SDValue() falls through to
+  // default Expand (= SRA/SHL chain), not to tablegen pattern match.
+  // The two existing patterns are:
+  //   (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1)
+  //   (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80
+  // Reproduce them at the SDAG level so the legalizer's Custom
+  // dispatch returns a fully-lowered tree.
+  if (ResVT == MVT::i16) {
+    if (InnerVT == MVT::i1) {
+      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
+                                DAG.getConstant(1, DL, MVT::i16));
+      return DAG.getNode(ISD::SUB, DL, MVT::i16,
+                         DAG.getConstant(0, DL, MVT::i16), Bit);
+    }
+    if (InnerVT == MVT::i8) {
+      SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
+                                   DAG.getConstant(0xFF, DL, MVT::i16));
+      SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
+                                  DAG.getConstant(0x80, DL, MVT::i16));
+      return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
+                         DAG.getConstant(0x80, DL, MVT::i16));
+    }
+    // inner i16 = no-op.
+    return X;
+  }
+
+  if (ResVT != MVT::i32)
+    return SDValue();
+
+  // i32 result: project the input's low half (X is i32 Wide32 here),
+  // apply the inner-VT sext on the i16 low half, sign-fill the hi.
+  SDValue Lo = extractWide32Lo(DAG, DL, X);
+  if (InnerVT != MVT::i16) {
+    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
+                     DAG.getValueType(InnerVT));
+  }
+  // Sign-fill the hi half via SRA #15 — same idiom LowerExtend uses for
+  // SIGN_EXTEND i16 -> i32.
+  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
+                           DAG.getConstant(15, DL, MVT::i16));
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// TRUNCATE i32 -> i16: project sub_lo.
+SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getOperand(0).getValueType() != MVT::i32)
+    return SDValue();
+  if (Op.getValueType() == MVT::i16)
+    return extractWide32Lo(DAG, DL, Op.getOperand(0));
+  if (Op.getValueType() == MVT::i8) {
+    // i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC
+    // level; the i16 sub_lo extract is the work.
+    SDValue Lo16 = extractWide32Lo(DAG, DL, Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16);
+  }
+  return SDValue();
+}
+
+// i32 Constant: split into two i16 constants and REG_SEQUENCE.
+SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32) return SDValue();
+  uint64_t V = cast<ConstantSDNode>(Op)->getZExtValue();
+  SDValue Lo = DAG.getConstant(V & 0xFFFFu, DL, MVT::i16);
+  SDValue Hi = DAG.getConstant((V >> 16) & 0xFFFFu, DL, MVT::i16);
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// ADD/SUB/AND/OR/XOR i32 -> per-half i16 op. ADDC/ADDE chain for ADD,
+// SUBC/SUBE for SUB. AND/OR/XOR are independent halves.
+SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+  SDValue L = Op.getOperand(0);
+  SDValue R = Op.getOperand(1);
+  SDValue LL = extractWide32Lo(DAG, DL, L);
+  SDValue LH = extractWide32Hi(DAG, DL, L);
+  SDValue RL = extractWide32Lo(DAG, DL, R);
+  SDValue RH = extractWide32Hi(DAG, DL, R);
+  SDValue Lo, Hi;
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    Lo = DAG.getNode(ISD::AND, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::AND, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::OR:
+    Lo = DAG.getNode(ISD::OR, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::OR, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::XOR:
+    Lo = DAG.getNode(ISD::XOR, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::XOR, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::ADD: {
+    SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
+    SDValue Lo2 = DAG.getNode(ISD::ADDC, DL, VTs, LL, RL);
+    Lo = Lo2.getValue(0);
+    SDValue Carry = Lo2.getValue(1);
+    Hi = DAG.getNode(ISD::ADDE, DL, VTs, LH, RH, Carry).getValue(0);
+    break;
+  }
+  case ISD::SUB: {
+    SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
+    SDValue Lo2 = DAG.getNode(ISD::SUBC, DL, VTs, LL, RL);
+    Lo = Lo2.getValue(0);
+    SDValue Borrow = Lo2.getValue(1);
+    Hi = DAG.getNode(ISD::SUBE, DL, VTs, LH, RH, Borrow).getValue(0);
+    break;
+  }
+  default:
+    return SDValue();
+  }
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// Store companion to LowerLoad. For i32 addresses, dispatch to the
+// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
+// MemoryVT. For i16 addresses (ptr16 mode), bail out and let the
+// existing STAptr / STBptr patterns match.
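+// Illustrative chain shape (sketch): for `store i32 %v, ptr %p` in
+// ptr32 mode the split below emits, chained in order,
+//   ST_PTR (lo16 of %v) -> [%p + 0]
+//   ST_PTR (hi16 of %v) -> [%p + 2]
+// and it is the target-opaque ST_PTR nodes that keep the combiner's
+// MergeConsecutiveStores from re-gluing the halves into the very i32
+// store we just split.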
+SDValue W65816TargetLowering::LowerStore(SDValue Op,
+                                         SelectionDAG &DAG) const {
+  StoreSDNode *St = cast<StoreSDNode>(Op);
+  SDValue Chain = St->getChain();
+  SDValue Val = St->getValue();
+  SDValue Ptr = St->getBasePtr();
+  EVT MemVT = St->getMemoryVT();
+  SDLoc DL(Op);
+
+  // i32 STORE: split into two halves. Critical: the per-half stores
+  // MUST go through the target-specific W65816ISD::ST_PTR node and not
+  // through plain ISD::STORE, otherwise the SDAG combiner's
+  // MergeConsecutiveStores re-combines them into a single i32 store
+  // that re-enters LowerStore — infinite loop, OOM in the combiner.
+  // For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular
+  // store-merger doesn't trip there because address splitting via
+  // ISD::ADD on i16 doesn't itself fan out into ptr-pair operations.
+  if (Val.getValueType() == MVT::i32) {
+    SDValue Lo = extractWide32Lo(DAG, DL, Val);
+    SDValue Hi = extractWide32Hi(DAG, DL, Val);
+    EVT PtrVT = Ptr.getValueType();
+    SDValue Two = DAG.getConstant(2, DL, PtrVT);
+    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
+    if (PtrVT == MVT::i32) {
+      // ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially
+      // chained. The combiner cannot merge target-opaque MemIntrinsic
+      // stores.
+      SDVTList VTs = DAG.getVTList(MVT::Other);
+      SDValue OpsLo[] = { Chain, Lo, Ptr };
+      SDValue StLo = DAG.getMemIntrinsicNode(
+          W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16,
+          St->getMemOperand());
+      SDValue OpsHi[] = { StLo, Hi, Ptr2 };
+      MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand(
+          St->getMemOperand(), 2, 2);
+      SDValue StHi = DAG.getMemIntrinsicNode(
+          W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi);
+      return StHi;
+    }
+    // ptr16 path — emit two regular i16 stores serially chained so the
+    // store-merger sees them as a 4-byte sequence (which it will likely
+    // leave alone since the resulting i32 store has no legal target
+    // pattern in ptr16 mode anyway).
+    SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr,
+                                St->getPointerInfo(),
+                                St->getAlign(),
+                                St->getMemOperand()->getFlags());
+    SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2,
+                                St->getPointerInfo().getWithOffset(2),
+                                St->getAlign(),
+                                St->getMemOperand()->getFlags());
+    return StHi;
+  }
+
+  if (Ptr.getValueType() != MVT::i32)
+    return SDValue();
+
+  // The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap
+  // around STBptr32 narrows in memory. Promote i8 values to i16 with
+  // ANY_EXTEND — the inserter only writes one byte, so the high half
+  // is don't-care.
+  if (Val.getValueType() == MVT::i8)
+    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
+
+  unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR)
+                                        : unsigned(W65816ISD::ST_PTR);
+  SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Val, Ptr };
+  return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT,
+                                 St->getMemOperand());
+}
+
 // VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
 // expansion, we do NOT align ap to the type's preferred alignment —
 // caller-pushed varargs land at byte-granular addresses (PHA from an
@@ -509,12 +1069,45 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::SETCC: return LowerSETCC(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
+  case ISD::SELECT: {
+    // Custom-lower SELECT for i32 result: split into per-half
+    // selects. Without this, the legalizer's default (rewriting
 // VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
 // expansion, we do NOT align ap to the type's preferred alignment —
 // caller-pushed varargs land at byte-granular addresses (PHA from an
@@ -509,12 +1069,45 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
   case ISD::BR_CC:      return LowerBR_CC(Op, DAG);
   case ISD::SETCC:      return LowerSETCC(Op, DAG);
   case ISD::SELECT_CC:  return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
+  case ISD::SELECT: {
+    // Custom-lower SELECT for i32 result: split into per-half
+    // selects. Without this, the legalizer's default (rewriting
+    // SELECT to SELECT_CC against zero) produces SELECT_CC i32 of
+    // a different shape that re-enters Custom and creates a cycle.
+    if (Op.getValueType() != MVT::i32)
+      return SDValue();
+    SDValue Cond = Op.getOperand(0);
+    SDValue TVal = Op.getOperand(1);
+    SDValue FVal = Op.getOperand(2);
+    SDLoc DL(Op);
+    SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+    SDValue THi = extractWide32Hi(DAG, DL, TVal);
+    SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+    SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+    SDValue Lo = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TLo, FLo);
+    SDValue Hi = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, THi, FHi);
+    return buildWide32(DAG, DL, Lo, Hi);
+  }
+  case ISD::SIGN_EXTEND:
+    if (Op.getValueType() == MVT::i32) return LowerExtend(Op, DAG);
+    return LowerSignExtend(Op, DAG);
   case ISD::VASTART:    return LowerVASTART(Op, DAG);
   case ISD::VAARG:      return LowerVAARG(Op, DAG);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:        return LowerShift(Op, DAG);
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: return LowerExtend(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG: return LowerSignExtendInReg(Op, DAG);
+  case ISD::TRUNCATE:   return LowerTruncate(Op, DAG);
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:        return LowerI32Bin(Op, DAG);
+  case ISD::LOAD:       return LowerLoad(Op, DAG);
+  case ISD::STORE:      return LowerStore(Op, DAG);
+  case ISD::Constant:   return LowerI32Constant(Op, DAG);
   // SJLJ EH: setup_dispatch is a no-op on this target — the dispatcher
   // logic lives entirely in the SJLJ runtime (_Unwind_SjLj_Resume +
   // longjmp into the function context's jmp_buf). The isel layer
@@ -621,30 +1214,30 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   // The type-legalizer's i32-shift-by-1 expansion emits this exact
   // node for the high-half "bit-from-low" slot.
   // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3).
+  // i16 only — i32 always routes to libcall (no inline i32 patterns).
   SDValue Amount = Op.getOperand(1);
-  if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
-    uint64_t N = C->getZExtValue();
-    // SHL/SRL by 1..7 chain ASLA16/LSRA16; by 8 use SHL8A/SRL8A; by 9..14
-    // chain on top of those. All have inline tablegen patterns.
-    if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
-        N >= 1 && N <= 14)
-      return Op;
-    // SHL/SRL by 15 is just (asl/ror to put bit 0/15 into low/high).
-    if (N == 15 &&
-        (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
-      return Op;
-    // SRA only has inline patterns at 1 and 15 (sign-fill).
-    if (N == 1 && Op.getOpcode() == ISD::SRA)
-      return Op;
-    if (N == 15 && Op.getOpcode() == ISD::SRA)
-      return Op;
+  if (Op.getValueType() == MVT::i16) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
+      uint64_t N = C->getZExtValue();
+      if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
+          N >= 1 && N <= 14)
+        return Op;
+      if (N == 15 &&
+          (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
+        return Op;
+      if (N == 1 && Op.getOpcode() == ISD::SRA)
+        return Op;
+      if (N == 15 && Op.getOpcode() == ISD::SRA)
+        return Op;
+    }
   }
+  bool IsI32 = Op.getValueType() == MVT::i32;
   RTLIB::Libcall LC;
   switch (Op.getOpcode()) {
-  case ISD::SHL: LC = RTLIB::SHL_I16; break;
-  case ISD::SRL: LC = RTLIB::SRL_I16; break;
-  case ISD::SRA: LC = RTLIB::SRA_I16; break;
+  case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break;
+  case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break;
+  case ISD::SRA: LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16; break;
   default: llvm_unreachable("not a shift");
   }
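Restated as a standalone predicate (illustrative only, not code from the backend), the constant-amount policy LowerShift applies to i16 shifts before falling back to a libcall:

#include <cassert>

enum class ShiftOp { SHL, SRL, SRA };

// True when a constant i16 shift amount keeps its inline pattern; false
// when LowerShift routes it to __ashlhi3 / __lshrhi3 / __ashrhi3.
static bool staysInline(ShiftOp op, unsigned n) {
  if (op == ShiftOp::SHL || op == ShiftOp::SRL)
    return n >= 1 && n <= 15; // 1..14 chain, 15 is the bit-extract form
  return n == 1 || n == 15;   // SRA: single step or full sign-fill only
}

int main() {
  assert(staysInline(ShiftOp::SHL, 8));
  assert(staysInline(ShiftOp::SRA, 15));
  assert(!staysInline(ShiftOp::SRA, 3)); // goes to __ashrhi3
  return 0;
}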
@@ -661,17 +1254,19 @@ SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
   auto *GA = cast<GlobalAddressSDNode>(Op);
   SDLoc DL(Op);
-  SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
+  EVT PtrVT = Op.getValueType(); // i16 in ptr16 mode, i32 in ptr32 mode
+  SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, PtrVT,
                                            GA->getOffset());
-  return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Tgt);
+  return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt);
 }
 
 SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                   SelectionDAG &DAG) const {
   auto *ES = cast<ExternalSymbolSDNode>(Op);
   SDLoc DL(Op);
-  SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16);
-  return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Tgt);
+  EVT PtrVT = Op.getValueType();
+  SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), PtrVT);
+  return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt);
 }
 
 SDValue W65816TargetLowering::LowerFormalArguments(
@@ -696,11 +1291,10 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  // i32 first-arg ABI: if the first original argument is i32 (the
-  // type legalizer split it into two i16 InputArgs both with
-  // OrigArgIndex == 0), pass it in A:X (lo:hi) — matching the i32
-  // return ABI (also A:X). Saves one stack slot for the i32 arg.
-  bool I32FirstArg =
+  // i32 first-arg ABI. Two flavors as in LowerCall:
+  //  - Legal-i32 (Wide32 reg class registered): single i32 InputArg.
+  //  - Split-i32 (legacy): two i16 InputArgs both with OrigArgIndex==0.
+  bool I32SplitFirstArg =
       Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
       Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0;
   // True iff the FIRST original arg spans 4 i16s (i.e., is i64). Used
@@ -709,11 +1303,24 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   // doesn't get the same treatment because the change pessimizes
   // simple functions like `int add32(int a, int b) { return a+b; }`
   // where greedy's regular A:X handling is fine.
+  // Two shapes for i64-first-arg under different ptr modes:
+  //   ptr16 (i32 illegal): Ins[0..3] = 4 i16 halves of arg0
+  //   ptr32 (i32 legal):   Ins[0..1] = 2 i32 halves of arg0 — but the
+  //                        IR-level "single i64 first arg" still splits
+  //                        to 4 i16 in Outs/Ins because i64 isn't legal.
+  //                        So the i16-form detection still applies here.
   bool I64FirstArg =
       Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
       Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 &&
       Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 &&
       Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0;
+  // Also detect the i32-split shape: Ins[0..1] = 2 i32 halves of arg0
+  // (with OrigArgIndex==0 on both). This happens with ptr32 active and
+  // i64 legalized via i32-split rather than i16-quad-split.
+  if (!I64FirstArg && Ins.size() >= 2 && Ins[0].VT == MVT::i32 &&
+      Ins[1].VT == MVT::i32 && Ins[0].OrigArgIndex == 0 &&
+      Ins[1].OrigArgIndex == 0)
+    I64FirstArg = true;
 
   unsigned ArgIdx = 0;
   // Stack offset is measured from S+1 (the WDC convention) and grows
@@ -721,16 +1328,50 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   unsigned StackOffset = 4; // Skip 3 ret-addr bytes; first slot at S+4.
for (const ISD::InputArg &Arg : Ins) { MVT VT = Arg.VT; - if (VT != MVT::i16 && VT != MVT::i8) + if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32) report_fatal_error("W65816: argument type not yet supported"); - if (ArgIdx == 0) { - // First arg in A. - Register VReg = MRI.createVirtualRegister( - VT == MVT::i16 ? &W65816::Acc16RegClass : &W65816::Acc8RegClass); + if (ArgIdx == 0 && VT == MVT::i32) { + // Whole-i32 first arg: lo half live-in via $a, hi via $x. + // The W65816LowerWide32 pre-RA pass walks the resulting + // REG_SEQUENCE and rewrites Wide32 uses into pairs of i16 + // operations — keeping AX32 out of the regalloc's pair- + // allocation path entirely. + // For i64-first-arg signatures (the IR has a single i64 arg + // that splits to 2 i32 in Ins[0..1] under ptr32), route BOTH + // halves through Img16. Without this the regalloc emits + // `TXA; STA spill_X; STA spill_A` at function entry — the TXA + // clobbers $a (arg0_0) before the A-spill saves it, so both + // spill slots end up holding arg0_1. Caused __adddf3(1.5,2.5) + // → 1.5 because the cb-test path read TXA-corrupted A. + const TargetRegisterClass *VRegLoRC = + I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass; + const TargetRegisterClass *VRegHiRC = + I64FirstArg ? &W65816::Img16RegClass : &W65816::Idx16RegClass; + Register VRegLo = MRI.createVirtualRegister(VRegLoRC); + Register VRegHi = MRI.createVirtualRegister(VRegHiRC); + MRI.addLiveIn(W65816::A, VRegLo); + MRI.addLiveIn(W65816::X, VRegHi); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, VRegLo, MVT::i16); + SDValue Hi = DAG.getCopyFromReg(Chain, DL, VRegHi, MVT::i16); + InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); + } else if (ArgIdx == 0) { + // First arg in A. For i64-first-arg signatures (4 i16 halves of + // arg0 with OrigArgIndex==0), route arg0_0 through Img16 the same + // way ArgIdx==1 does — via an entry STA-to-DP-slot at function + // entry. Without this, the regalloc emits a TXA bridge for + // arg0_1's spill that clobbers $a (= arg0_0) BEFORE arg0_0 has + // been saved, and BOTH arg0_0 and arg0_1's spill slots end up + // holding arg0_1. Observed as `__adddf3(1.5, 2.5) → 1.5` because + // the cb-test BEQ sees flags from a TXA-clobbered LDA cb path. + const TargetRegisterClass *RC = + (VT == MVT::i16) + ? (I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass) + : &W65816::Acc8RegClass; + Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); - } else if (ArgIdx == 1 && I32FirstArg) { + } else if (ArgIdx == 1 && I32SplitFirstArg) { // First-arg hi half (or arg0_ml for i64-first-arg): in X. // For i64-first-arg signatures (4 i16s with OrigArgIndex 0), use // Img16 so greedy parks the value in an IMG slot via STX_DP, @@ -743,6 +1384,19 @@ SDValue W65816TargetLowering::LowerFormalArguments( Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::X, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); + } else if (VT == MVT::i32) { + // i32 stack arg: 4 bytes, loaded as 2 i16 halves and assembled + // via REG_SEQUENCE into a Wide32 SDValue. 
+      int FILo = MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true);
+      int FIHi = MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true);
+      StackOffset += 4;
+      SDValue FINLo = DAG.getFrameIndex(FILo, MVT::i16);
+      SDValue FINHi = DAG.getFrameIndex(FIHi, MVT::i16);
+      SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, FINLo,
+                               MachinePointerInfo::getFixedStack(MF, FILo));
+      SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, FINHi,
+                               MachinePointerInfo::getFixedStack(MF, FIHi));
+      InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
     } else {
       // Subsequent args are loaded from the stack. i8 args are
       // promoted to i16 slots (matching CC_W65816's CCPromoteToType)
@@ -824,23 +1478,29 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   for (const ISD::OutputArg &O : Outs) {
-    if (O.VT != MVT::i16 && O.VT != MVT::i8)
+    if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32)
       report_fatal_error("W65816: argument type not yet supported");
   }
 
-  // i32 first-arg ABI: if Outs[0] and Outs[1] are halves of the same
-  // original i32 first arg (OrigArgIndex == 0), pass them in A:X.
-  bool I32FirstArg =
+  // i32 first-arg ABI. Two flavors:
+  //  - Legal-i32: Outs[0].VT == i32 (whole pair). Pass in AX32.
+  //  - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0.
+  //    Pass low in A, high in X.
+  bool I32WholeFirstArg =
+      !Outs.empty() && Outs[0].VT == MVT::i32;
+  bool I32SplitFirstArg =
       Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
       Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
-  unsigned FirstStackArg = I32FirstArg ? 2 : 1;
+  unsigned FirstStackArg = I32WholeFirstArg ? 1
+                         : I32SplitFirstArg ? 2 : 1;
 
   // i8 stack args are promoted to i16 (2-byte slots) so the callee can
   // read them with a 16-bit M load — matches LowerFormalArguments and
-  // CC_W65816's CCPromoteToType. Arg 0 stays in A in its native
-  // width; only stack-passed args promote.
-  unsigned StackBytes = 2 * (Outs.size() > FirstStackArg
-                             ? Outs.size() - FirstStackArg : 0);
+  // CC_W65816's CCPromoteToType. i32 stack args occupy 4 bytes
+  // (2 PUSH16s).
+  unsigned StackBytes = 0;
+  for (unsigned i = FirstStackArg; i < Outs.size(); ++i)
+    StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2;
 
   Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);
 
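A toy stack model (plain C++, names invented) of the push order the loop below uses for an i32 stack argument: the hi half is pushed first so it lands at the higher address, leaving the callee to read a little-endian i32 starting at the lower slot:

#include <cassert>
#include <cstdint>

int main() {
  uint16_t slots[8];
  int sp = 8;                                  // grows downward, 16-bit slots
  auto push16 = [&](uint16_t v) { slots[--sp] = v; };

  uint32_t arg = 0xDEADBEEFu;
  push16(uint16_t(arg >> 16));                 // hi first -> higher address
  push16(uint16_t(arg));                       // lo second -> lower address

  // Callee view: the i32 starts at the lowest-addressed slot.
  uint32_t seen = slots[sp] | (uint32_t(slots[sp + 1]) << 16);
  assert(seen == arg);
  return 0;
}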
@@ -851,15 +1511,8 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // from X via PHX — saves the TXA + A-spill round-trip that would
   // otherwise be required.
   SDValue Glue;
-  for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) {
-    SDValue V = OutVals[i];
-    if (Outs[i].VT == MVT::i8)
-      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
-    // Detect "value is already in X" — either as a physreg
-    // CopyFromReg($x), or as a vreg in the Idx16 class that's
-    // live-in from $x. In the i32-first-arg-in-A:X path,
-    // LowerFormalArguments creates a vreg in Idx16 and addLiveIn's
-    // it to $x.
+  // Helper: push a single i16-sized value via PHA.
+  auto pushI16 = [&](SDValue V) {
     bool ViaX = false;
     if (V.getOpcode() == ISD::CopyFromReg) {
       auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
@@ -880,8 +1533,6 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     }
     if (ViaX) {
-      // CopyToReg(X, X) is a no-op but it threads the Glue chain so the
-      // PUSH_X can be sequenced correctly relative to other pushes.
       Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue);
       Glue = Chain.getValue(1);
       Chain = DAG.getNode(W65816ISD::PUSH_X, DL,
@@ -893,17 +1544,44 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
     }
     Glue = Chain.getValue(1);
+  };
+
+  for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) {
+    SDValue V = OutVals[i];
+    if (Outs[i].VT == MVT::i32) {
+      // Push i32 stack arg: hi half first (lands at higher address),
+      // lo half second (lands at lower address = the slot the callee
+      // reads as the start of the i32).
+      SDValue Lo = extractWide32Lo(DAG, DL, V);
+      SDValue Hi = extractWide32Hi(DAG, DL, V);
+      pushI16(Hi);
+      pushI16(Lo);
+      continue;
+    }
+    if (Outs[i].VT == MVT::i8)
+      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
+    pushI16(V);
   }
 
-  // i32 first-arg hi half goes in X. Emit before the A copy so the
-  // CopyToReg for X is glued, then A's copy follows.
-  if (I32FirstArg) {
+  // i32 first-arg. Whole (legal-i32): split into lo/hi and copy
+  // to $a/$x separately — avoids AX32 in the MIR (see
+  // W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first,
+  // then lo in A below.
+  if (I32WholeFirstArg) {
+    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
+    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
+    Glue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
+    Glue = Chain.getValue(1);
+  } else if (I32SplitFirstArg) {
     Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
     Glue = Chain.getValue(1);
   }
-  // Arg 0 in A.
-  if (!OutVals.empty()) {
+  // Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32
+  // already copied to A/X above.
+  if (!I32WholeFirstArg && !OutVals.empty()) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
     Glue = Chain.getValue(1);
   }
@@ -914,10 +1592,14 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16);
 
   SmallVector<SDValue, 8> CallOps = {Chain, Callee};
-  if (!OutVals.empty())
+  if (I32WholeFirstArg) {
+    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
+    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
+  } else if (!OutVals.empty()) {
     CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
-  if (I32FirstArg)
-    CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
+    if (I32SplitFirstArg)
+      CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
+  }
   if (Glue.getNode())
     CallOps.push_back(Glue);
 
@@ -928,38 +1610,60 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
   Glue = Chain.getValue(1);
 
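The single whole-i32 return read that follows is just a pair recombination; in host terms (illustrative C++, function name invented):

#include <cassert>
#include <cstdint>

// Reading an i32 return: lo arrives in A, hi in X, and the Wide32 value
// is their little-endian pairing.
static uint32_t readI32Return(uint16_t a /*$a*/, uint16_t x /*$x*/) {
  return uint32_t(a) | (uint32_t(x) << 16);
}

int main() {
  assert(readI32Return(0x5678, 0x1234) == 0x12345678u);
  return 0;
}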
-  // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X,
-  // i64 in A:X:Y plus a load from DP $F0 for the highest half.
-  if (Ins.size() > 4)
-    report_fatal_error("W65816: return type wider than 64 bits not supported");
-  static constexpr Register RetRegs[3] = {W65816::A, W65816::X, W65816::Y};
+  // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in
+  // AX32 (whole) or split A/X (legacy), and 4-half (i64 / 2x i32) in
+  // A, X, Y, DPF0. i32 Ins are read as a single i32 from the half
+  // pair (A:X for the first, Y:DPF0 for a second-pair-of-halves).
+  // Whole-i32 single return: read lo from $a, hi from $x. Avoids
+  // using AX32 in the SDAG / MIR — see W65816LowerWide32 pass.
+  if (Ins.size() == 1 && Ins[0].VT == MVT::i32) {
+    SDValue Lo = DAG.getCopyFromReg(Chain, DL, W65816::A, MVT::i16, Glue);
+    Chain = Lo.getValue(1);
+    Glue = Lo.getValue(2);
+    SDValue Hi = DAG.getCopyFromReg(Chain, DL, W65816::X, MVT::i16, Glue);
+    Chain = Hi.getValue(1);
+    Glue = Hi.getValue(2);
+    InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
+    return Chain;
+  }
+  // Build a flat list of i16 halves expected from the call. Then
+  // walk it, copying from A, X, Y, DPF0 in order. Re-assemble i32
+  // halves into a Wide32 SDValue at the end.
+  SmallVector<MVT, 4> ExpVT;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     MVT VT = Ins[i].VT;
-    if (VT != MVT::i16 && VT != MVT::i8)
-      report_fatal_error("W65816: return half must be i8 or i16");
-    if (i >= 1 && VT != MVT::i16)
-      report_fatal_error("W65816: split return halves must all be i16");
-    if (i < 3) {
-      SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue);
-      Chain = V.getValue(1);
-      Glue = V.getValue(2);
-      InVals.push_back(V);
+    if (VT == MVT::i32) {
+      ExpVT.push_back(MVT::i16);
+      ExpVT.push_back(MVT::i16);
+    } else if (VT == MVT::i16 || VT == MVT::i8) {
+      ExpVT.push_back(VT);
     } else {
-      // 4th half: read DP[$F0..$F1] via CopyFromReg(DPF0). DPF0 is a
-      // pseudo-physreg modeled as JSLpseudo's implicit-def, so each
-      // call's CopyFromReg has Glue tied to the corresponding call —
-      // the SDAG combiner can't merge them and the scheduler can't
-      // reorder them past the next call. copyPhysReg lowers DPF0 →
-      // A as `LDA $F0`. Without this, plain `getLoad(0xF0)` was
-      // being CSE'd / reordered across i64-returning calls, causing
-      // `dmath = (a+b)*(a-b)` to return 4 instead of 16.
-      SDValue V = DAG.getCopyFromReg(Chain, DL, W65816::DPF0, VT, Glue);
-      Chain = V.getValue(1);
-      Glue = V.getValue(2);
-      InVals.push_back(V);
+      report_fatal_error("W65816: return half must be i8/i16/i32");
+    }
+  }
+  if (ExpVT.size() > 4)
+    report_fatal_error("W65816: return type wider than 64 bits not supported");
+  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
+                                          W65816::DPF0};
+  SmallVector<SDValue, 4> Halves;
+  for (unsigned i = 0; i != ExpVT.size(); ++i) {
+    SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue);
+    Chain = V.getValue(1);
+    Glue = V.getValue(2);
+    Halves.push_back(V);
+  }
+  // Re-pack halves into the original Ins shape (i32s rebuild via
+  // REG_SEQUENCE; i8/i16 pass through).
+  unsigned hi = 0;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (Ins[i].VT == MVT::i32) {
+      InVals.push_back(buildWide32(DAG, DL, Halves[hi], Halves[hi + 1]));
+      hi += 2;
+    } else {
+      InVals.push_back(Halves[hi]);
+      hi += 1;
     }
   }
-
   return Chain;
 }
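The multi-half read-back implemented above, in one picture (a sketch assuming the A, X, Y, DPF0 slot order stated in the comments; helper names invented):

#include <cassert>
#include <cstdint>
#include <vector>

// Caller-side repack: the call returns up to four 16-bit halves in the
// slot order A, X, Y, DPF0; i32 results consume two consecutive halves.
static std::vector<uint32_t> repack(const std::vector<uint16_t> &halves,
                                    const std::vector<int> &widths) {
  std::vector<uint32_t> vals;
  size_t h = 0;
  for (int w : widths) {
    if (w == 32) {
      vals.push_back(halves[h] | (uint32_t(halves[h + 1]) << 16));
      h += 2;
    } else {
      vals.push_back(halves[h++]); // i8/i16 pass through
    }
  }
  return vals;
}

int main() {
  // An i64 returned as two i32s: halves land in A, X, Y, DPF0.
  std::vector<uint16_t> halves = {0x5678, 0x1234, 0xBEEF, 0xDEAD};
  auto vals = repack(halves, {32, 32});
  assert(vals[0] == 0x12345678u && vals[1] == 0xDEADBEEFu);
  return 0;
}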
@@ -979,18 +1683,53 @@ SDValue W65816TargetLowering::LowerReturn(
   // first so that the regalloc can place each through A (the only
   // ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves
   // A, so subsequent low-half copies to A don't clobber.
-  if (Outs.size() > 4)
-    report_fatal_error("W65816: return type wider than 64 bits not supported");
+  // With i32 legal, an Outs entry may be MVT::i32; we expand each i32
+  // into its two i16 halves (sub_lo/sub_hi via EXTRACT_SUBREG) so the
+  // legacy A/X/Y/DPF0 4-half return ABI continues to work for the
+  // multi-half return cases (i64 returned as 2 i32, struct of 2 long
+  // returned as 2 i32, etc.).
+  SmallVector<MVT, 4> ExpVT;
+  SmallVector<SDValue, 4> ExpVals;
   for (unsigned i = 0; i != Outs.size(); ++i) {
     MVT VT = Outs[i].VT;
-    if (VT != MVT::i16 && VT != MVT::i8)
-      report_fatal_error("W65816: return half must be i8 or i16");
-    if (i >= 1 && VT != MVT::i16)
-      report_fatal_error("W65816: split return halves must all be i16");
+    if (VT == MVT::i32) {
+      ExpVT.push_back(MVT::i16);
+      ExpVT.push_back(MVT::i16);
+      ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i]));
+      ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i]));
+    } else if (VT == MVT::i16 || VT == MVT::i8) {
+      ExpVT.push_back(VT);
+      ExpVals.push_back(OutVals[i]);
+    } else {
+      report_fatal_error("W65816: return half must be i8/i16/i32");
+    }
   }
+  if (ExpVT.size() > 4)
+    report_fatal_error("W65816: return type wider than 64 bits not supported");
+
+  // Single whole-i32 return: copy directly to AX32 instead of two
+  // halves to A and X. Saves the regalloc/coalescer some work.
+  bool I32WholeReturn = (Outs.size() == 1 && Outs[0].VT == MVT::i32);
 
   SDValue Glue;
   SmallVector<SDValue, 4> RetOps(1, Chain);
+  if (I32WholeReturn) {
+    // Split the i32 OutVal into lo/hi and copy each separately to
+    // $a / $x (no AX32 in the SDAG — see W65816LowerWide32).
+    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
+    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
+    Glue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
+    RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
+    RetOps[0] = Chain;
+    if (Glue.getNode())
+      RetOps.push_back(Glue);
+    return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
+  }
+
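Worth restating why the copy sequence below runs DPF0, Y, X, then A (a descriptive note, not new behavior): every placement is bridged through the accumulator, so A's own half must be the final write. A toy model of that ordering:

#include <cassert>
#include <cstdint>

// Emission-order model: halves[3..0] are placed into DPF0, Y, X, A.
// Each placement passes through A (the only ALU register), so A's own
// half is written last or it would clobber a half still in flight.
int main() {
  uint16_t halves[4] = {0xAAAA /*->A*/, 0xBBBB /*->X*/,
                        0xCCCC /*->Y*/, 0xDDDD /*->DPF0*/};
  uint16_t A = 0, X = 0, Y = 0, DPF0 = 0;
  A = halves[3]; DPF0 = A; // STA $F0
  A = halves[2]; Y = A;    // TAY
  A = halves[1]; X = A;    // TAX
  A = halves[0];           // low half stays in A
  assert(A == 0xAAAA && X == 0xBBBB && Y == 0xCCCC && DPF0 == 0xDDDD);
  return 0;
}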
   // Outs[3] -> DP $F0 via CopyToReg(DPF0). Using the DPF0 fake physreg
   // (lowered to `STA $F0` by copyPhysReg) is critical: a generic
   // ISD::STORE with addr=0xF0 lowered to `sta (d,s),y`, an indirect
   // computation can use A freely before A holds the low result. Glued
   // to RET_GLUE via the RetOps Register entry below so DCE doesn't
   // strip the COPY.
-  if (Outs.size() >= 4) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, OutVals[3], Glue);
+  // Use the expanded i16-half list (i32 outs split into 2 i16 halves).
+  if (ExpVals.size() >= 4) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, ExpVals[3], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[2] -> Y.
-  if (Outs.size() >= 3) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, OutVals[2], Glue);
+  if (ExpVals.size() >= 3) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, ExpVals[2], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[1] -> X.
-  if (Outs.size() >= 2) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
+  if (ExpVals.size() >= 2) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, ExpVals[1], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[0] -> A.
-  if (!Outs.empty()) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
+  if (!ExpVals.empty()) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, ExpVals[0], Glue);
     Glue = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
+    RetOps.push_back(DAG.getRegister(W65816::A, ExpVT[0]));
   }
-  if (Outs.size() >= 2)
-    RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
-  if (Outs.size() >= 3)
-    RetOps.push_back(DAG.getRegister(W65816::Y, Outs[2].VT));
-  if (Outs.size() >= 4)
-    RetOps.push_back(DAG.getRegister(W65816::DPF0, Outs[3].VT));
+  if (ExpVals.size() >= 2)
+    RetOps.push_back(DAG.getRegister(W65816::X, ExpVT[1]));
+  if (ExpVals.size() >= 3)
+    RetOps.push_back(DAG.getRegister(W65816::Y, ExpVT[2]));
+  if (ExpVals.size() >= 4)
+    RetOps.push_back(DAG.getRegister(W65816::DPF0, ExpVT[3]));
 
   RetOps[0] = Chain;
   if (Glue.getNode())
@@ -1046,7 +1783,13 @@ W65816TargetLowering::PerformDAGCombine(SDNode *N,
   // `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`).
-  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32) {
+  // (shl i32 X, K) -> ADD chain for small K — but only when i32 is
+  // ILLEGAL (i.e., gets type-split into i16 halves). When i32 is a
+  // legal type (Wide32 reg class for ptr32 mode), the rewrite cycles
+  // against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the
+  // i64 → 2 i32 split path, hanging the legalizer.
+  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
+      !isTypeLegal(N->getValueType(0))) {
     if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
       uint64_t K = C->getZExtValue();
       if (K >= 1 && K <= 2) {
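Numerically, the combine above rewrites a small constant shift as repeated self-adds; the equivalence is the point (sketch, function name invented):

#include <cassert>
#include <cstdint>

// (shl x, K) equals K rounds of (add x, x); this identity is also why
// the generic (add x, x) -> (shl x, 1) combine can bounce the rewrite
// straight back once i32 is a legal type.
static uint32_t shlViaAdds(uint32_t x, unsigned k) {
  while (k--)
    x = x + x;
  return x;
}

int main() {
  assert(shlViaAdds(0x00010001u, 2) == (0x00010001u << 2));
  return 0;
}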
@@ -1191,6 +1934,214 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
   case W65816::CMP_RR:
     return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
+  case W65816::LDAptr32S:
+  case W65816::STAptr32S:
+  case W65816::STBptr32S: {
+    // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of
+    // 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass
+    // to dodge pair-allocation pressure. Otherwise identical to
+    // the LDAptr32 inserter below.
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32S;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32S;
+    // Operand 0 is the dst (load) or the value (store); in both shapes
+    // the ptr halves sit at operands 1 and 2.
+    Register PtrLo = MI.getOperand(1).getReg();
+    Register PtrHi = MI.getOperand(2).getReg();
+
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FIHi).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE2);
+
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
+    } else {
+      Register Val = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::SEP)).addImm(0x20);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::REP)).addImm(0x20);
+    }
+    MI.eraseFromParent();
+    return BB;
+  }
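The $E0..$E2 staging these inserters share encodes a 24-bit ORCA-style far pointer. In host arithmetic (illustrative helpers, not backend code), the effective address and the constant-offset fold with its carry into the bank byte look like this:

#include <cassert>
#include <cstdint>

// sub_lo = low 16 address bits ($E0..$E1); the bank byte is the low
// byte of sub_hi ($E2). [dp],Y then dereferences bank:offset.
static uint32_t effectiveAddr(uint16_t subLo, uint16_t subHi) {
  return (uint32_t(subHi & 0xFF) << 16) | subLo;
}

// Constant-offset fold: CLC; ADC #off on the low half, then ADC #0 on
// sub_hi so a wrap of the 16-bit offset carries into the bank byte.
static uint32_t effectiveAddrPlusOff(uint16_t subLo, uint16_t subHi,
                                     uint16_t off) {
  uint32_t lo = uint32_t(subLo) + off;
  uint16_t hiPlusCarry = uint16_t(subHi + (lo >> 16)); // ADC #0
  return effectiveAddr(uint16_t(lo), hiPlusCarry);
}

int main() {
  assert(effectiveAddr(0x1234, 0x0002) == 0x021234u);
  // Offset wraps the low half: the bank byte picks up the carry.
  assert(effectiveAddrPlusOff(0xFFFE, 0x0002, 4) == 0x030002u);
  return 0;
}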
+  case W65816::LDAptr32:
+  case W65816::STAptr32:
+  case W65816::STBptr32: {
+    // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
+    // pointer is a Wide32 register pair: sub_lo carries the low 16
+    // bits of the address, sub_hi carries the bank byte in its low
+    // half (high half is pad, ORCA convention). Stage at $E0..$E2,
+    // then [dp],Y addresses the right bank without forcing 0.
+    //
+    // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
+    // on i32 address type).
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    const W65816RegisterInfo &TRI = TII.getRegisterInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
+    // Operand 1 is the ptr in both the load and store shapes.
+    Register Ptr = MI.getOperand(1).getReg();
+    Register PtrLo = TRI.getSubReg(Ptr, llvm::sub_lo);
+    Register PtrHi = TRI.getSubReg(Ptr, llvm::sub_hi);
+
+    // Spill each half to a fresh slot, reload via LDAfi. Same RA-
+    // pinning rationale as the i16 LDAptr inserter.
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    // Stage the 24-bit address at $E0..$E2: sub_lo at $E0..$E1,
+    // bank byte (low half of sub_hi) at $E2. We write 16 bits at $E2
+    // — the high byte ($E3) gets sub_hi's pad byte (0 by ORCA) — but
+    // only $E2 is consulted by [dp],Y so $E3 contamination is harmless
+    // until something else uses $E3.
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FIHi).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE2);
+
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
+    } else {
+      Register Val = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::SEP)).addImm(0x20);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::REP)).addImm(0x20);
+    }
+    MI.eraseFromParent();
+    return BB;
+  }
+  case W65816::LDAptr32Off:
+  case W65816::STAptr32Off:
+  case W65816::STBptr32Off: {
+    // ptr32 deref with constant offset. Compute (sub_lo + off) into A
+    // with CLC; ADC, store at $E0..$E1; then propagate the carry into
+    // the bank byte via ADC #0 on (sub_hi) and store at $E2. Carry
+    // propagation is conservatively always emitted — bank wrapping is
+    // rare but real (bank-spanning struct or negative offset).
+    //
+    // Dead unless ptr32 mode is active.
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    const W65816RegisterInfo &TRI = TII.getRegisterInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off;
+    Register Ptr = MI.getOperand(1).getReg();
+    int64_t Off = MI.getOperand(2).getImm();
+    Register PtrLo = TRI.getSubReg(Ptr, llvm::sub_lo);
+    Register PtrHi = TRI.getSubReg(Ptr, llvm::sub_hi);
+
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    // (sub_lo + off) -> $E0..$E1
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::ADC_Imm16)).addImm(Off);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+
+    // (sub_hi + 0 + carry) -> $E2..$E3. ADC #0 picks up the carry
+    // from the previous ADC; if no carry, sub_hi is unchanged.
+ BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), + W65816::A).addFrameIndex(FIHi).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::ADC_Imm16)).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDY_Imm16)).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); + } else { + Register Val = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDY_Imm16)).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } case W65816::LDAptrOff: case W65816::STAptrOff: case W65816::STBptrOff: { @@ -1228,8 +2179,16 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, TII.get(W65816::ADC_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); - BuildMI(*BB, MI.getIterator(), DL, - TII.get(W65816::STZ_DP)).addImm(0xE2); + if (LoaderBankDeref) { + // Bank byte from $BE (crt0-initialised) — Loader compat path. + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DP)).addImm(0xBE); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + } else { + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STZ_DP)).addImm(0xE2); + } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); @@ -1326,8 +2285,16 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); - BuildMI(*BB, MI.getIterator(), DL, - TII.get(W65816::STZ_DP)).addImm(0xE2); + if (LoaderBankDeref) { + // Bank byte from $BE (crt0-initialised) — Loader compat path. + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DP)).addImm(0xBE); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + } else { + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STZ_DP)).addImm(0xE2); + } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h index db92d66..1d640af 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h @@ -46,6 +46,26 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + // Lock i16 shift amounts to i16 (not i32) even when i32 is a legal + // type. Without this, the DAG combiner promotes i16 shift amounts + // to i32 once i32 is registered as legal, leaving (sra i16, i32:K) + // with no matching pattern. Only narrow when LHS is i16; leave i32 + // shifts (which go to libcall via LowerShift) alone. 
+ MVT getScalarShiftAmountTy(const DataLayout &DL, + EVT LHSTy) const override { + if (LHSTy == MVT::i16 || LHSTy == MVT::i8) return MVT::i16; + return TargetLoweringBase::getScalarShiftAmountTy(DL, LHSTy); + } + + // ptr32-mode hook: with patches/0007-targetlowering-virtual- + // gettypeconversion making the base function virtual, this can be + // overridden to force i64 to expand directly to i16 halves rather + // than going through i32 (the next-smaller-legal type). Currently + // not overridden — the override-calling-base passthrough caused + // regressions in unrelated functions (likely due to subtle + // de-virtualization changes when the function becomes virtual). + // Future fix needs to test the override more carefully. + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -147,6 +167,23 @@ public: return TargetLowering::isTypeDesirableForOp(Opc, VT); } + // Disallow merging stores into wider ones. With ptr32 active and i32 + // a Custom-lowered op, the SDAG combiner's MergeConsecutiveStores + // takes our LowerStore-split pair (2x i16 stores at &t and &t+2) and + // merges them back into a single i32 store, which re-enters + // LowerStore, splits again, and loops forever — observed as + // "LLVM ERROR: out of memory" on `*t = K` for any K (including 0 + // when the SDAG state lets the combiner pick the merge ahead of any + // STZ-pattern simplification). Anything wider than i16 has no + // legal ptr-store pattern in our backend anyway, so merging into + // wider VTs is purely counterproductive. + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const MachineFunction &MF) const override { + if (MemVT.isInteger() && MemVT.getSizeInBits() > 16) + return false; + return TargetLowering::canMergeStoresTo(AS, MemVT, MF); + } + private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; @@ -156,6 +193,31 @@ private: SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const; + // Foundation hooks for ptr32 mode. In ptr16 mode (current default), + // both return SDValue() so the legalizer falls through to the default + // i16-pointer LDAptr/STAptr selection. When ptr32 mode is enabled + // (PointerWidth=32 + Wide32 added as i32 reg class), they detect i32 + // addresses and wrap the load/store in W65816ISD::LD_PTR / ST_PTR / + // STB_PTR so the [dp],Y inserter takes the bank byte from the + // pointer's hi half instead of forcing 0. + SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const; + // ZERO/SIGN/ANY_EXTEND i16 -> i32 and TRUNCATE i32 -> i16 lowering + // via REG_SEQUENCE / EXTRACT_SUBREG on the sub_lo/sub_hi indexes of + // the Wide32 register class. Active once i32 is registered as a + // legal type. + SDValue LowerExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTruncate(SDValue Op, SelectionDAG &DAG) const; + // SIGN_EXTEND_INREG i32 with inner type i1 / i8 / i16: sign-extend + // the low N bits of the i32 input to fill all 32 bits. Splits to + // (sext_inreg lo, innerVT) for the low half and SRA #15 of the + // resulting i16 for the high half. + SDValue LowerSignExtendInReg(SDValue Op, SelectionDAG &DAG) const; + // ADD/SUB/AND/OR/XOR i32 split into per-half i16 ops. The carry- + // chain ADDC/ADDE pseudos handle the cross-half link for ADD/SUB. 
+ SDValue LowerI32Bin(SDValue Op, SelectionDAG &DAG) const; + // i32 ConstantNode: split into two i16 constants and REG_SEQUENCE. + SDValue LowerI32Constant(SDValue Op, SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index 0f58c13..990182b 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -100,6 +100,30 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg); return; } + // SP -> A via TSC. Used by alloca / setjmp asm machinery. + if (DestReg == W65816::A && SrcReg == W65816::SP) { + BuildMI(MBB, I, DL, get(W65816::TSC)); + return; + } + // A -> SP via TCS. + if (DestReg == W65816::SP && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TCS)); + return; + } + // X <-> Y via A: 65816 has no direct X<->Y transfer; bridge through + // A. Caller is responsible for ensuring A is dead at this program + // point (regalloc arranges this). Used by greedy when an i16 vreg + // forced into one Idx16 reg gets coalesced with a use in the other. + if (DestReg == W65816::Y && SrcReg == W65816::X) { + BuildMI(MBB, I, DL, get(W65816::TXA)); + BuildMI(MBB, I, DL, get(W65816::TAY)); + return; + } + if (DestReg == W65816::X && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::TYA)); + BuildMI(MBB, I, DL, get(W65816::TAX)); + return; + } // X → IMGn / IMGn → X: STX dp / LDX dp. Used by the i64-first-arg // entry COPY (LowerFormalArguments routes arg0_ml through Img16 to // dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped @@ -112,6 +136,18 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg); return; } + // Y -> IMGn / IMGn -> Y: STY dp / LDY dp. Symmetric with the X + // case above. Used by the i32-first-arg ABI's hi half (in X) and + // by Wide32 pair copies that have one half in Y after the per-half + // routing — see the lambda dispatch below. + if (dstImg >= 0 && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg); + return; + } + if (DestReg == W65816::Y && srcImg >= 0) { + BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg); + return; + } // DPF0 → A: emit `LDA $F0`. DPF0 is the pseudo-physreg carrier // for an i64-returning call's high 16 bits; LowerCall builds a // CopyFromReg(DPF0) glued to the call so the SDAG combiner / @@ -129,6 +165,56 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(0xF0); return; } + // Wide32 (AX32 or IMG-pair) <-> Wide32 copy: split on sub_lo / sub_hi + // and recurse. Use a hand-written dispatch instead of getSubReg + // because the MCRegisterInfo::getSubReg path crashes when called + // from TargetInstrInfo::lowerCopy on regs that are not pair regs + // (the table lookup walks past the end of the diff list). 
+  auto wide32Halves = [](Register R)
+      -> std::pair<Register, Register> {
+    switch (R) {
+    case W65816::AX32:    return {W65816::A, W65816::X};
+    case W65816::IMG01:   return {W65816::IMG0, W65816::IMG1};
+    case W65816::IMG23:   return {W65816::IMG2, W65816::IMG3};
+    case W65816::IMG45:   return {W65816::IMG4, W65816::IMG5};
+    case W65816::IMG67:   return {W65816::IMG6, W65816::IMG7};
+    case W65816::IMG89:   return {W65816::IMG8, W65816::IMG9};
+    case W65816::IMG1011: return {W65816::IMG10, W65816::IMG11};
+    case W65816::IMG1213: return {W65816::IMG12, W65816::IMG13};
+    case W65816::IMG1415: return {W65816::IMG14, W65816::IMG15};
+    default:              return {Register(), Register()};
+    }
+  };
+  auto [srcLo, srcHi] = wide32Halves(SrcReg);
+  auto [dstLo, dstHi] = wide32Halves(DestReg);
+  if (srcLo && srcHi && dstLo && dstHi) {
+    // Wide32 -> Wide32. Lo-first order is correct in every direction:
+    //   AX32 -> IMG_pair     : STA dstLo (A live), then STX dstHi
+    //   IMG_pair -> AX32     : LDA srcLo, then LDX srcHi (independent halves)
+    //   IMG_pair -> IMG_pair : LDA/STA chain twice (A is only per-half scratch)
+    copyPhysReg(MBB, I, DL, dstLo, srcLo, KillSrc,
+                RenamableDest, RenamableSrc);
+    copyPhysReg(MBB, I, DL, dstHi, srcHi, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
+  // Wide32 -> i16: take sub_lo of source. Arises post-RA when an
+  // EXTRACT_SUBREG was lowered as a parent-reg COPY (the SubRegIndex
+  // is dropped by lowerCopy).
+  if (srcLo && srcHi && !dstLo) {
+    copyPhysReg(MBB, I, DL, DestReg, srcLo, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
+  // i16 -> Wide32: write sub_lo only (sub_hi left as caller had it,
+  // matching INSERT_SUBREG semantics). Arises post-RA when REG_SEQUENCE
+  // is expanded into per-half COPY pseudos, then a parent-reg COPY of
+  // a sub-reg-only def appears.
+  if (!srcLo && dstLo && dstHi) {
+    copyPhysReg(MBB, I, DL, dstLo, SrcReg, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
 
   llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
 }
@@ -141,6 +227,37 @@ void W65816InstrInfo::storeRegToStackSlot(
   // and zero offset. When regalloc hands us a spill from X or Y, bridge
   // through A (TXA / TYA) — same rationale as loadRegFromStackSlot.
   DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+  // Wide32 spill: split into 2 i16 stores at offsets 0 and 2 of the
+  // 4-byte spill slot. Bridge each half through A using copyPhysReg.
+  if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
+      RC == &W65816::AnyWide32RegClass) {
+    Register Lo, Hi;
+    switch (SrcReg) {
+    case W65816::AX32:    Lo = W65816::A;     Hi = W65816::X;     break;
+    case W65816::IMG01:   Lo = W65816::IMG0;  Hi = W65816::IMG1;  break;
+    case W65816::IMG23:   Lo = W65816::IMG2;  Hi = W65816::IMG3;  break;
+    case W65816::IMG45:   Lo = W65816::IMG4;  Hi = W65816::IMG5;  break;
+    case W65816::IMG67:   Lo = W65816::IMG6;  Hi = W65816::IMG7;  break;
+    case W65816::IMG89:   Lo = W65816::IMG8;  Hi = W65816::IMG9;  break;
+    case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
+    case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
+    case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
+    default: llvm_unreachable("W65816: Wide32 spill of non-pair reg");
+    }
+    // Bridge lo through A, store at offset 0; bridge hi through A,
+    // store at offset 2. This is brittle in the face of regalloc
+    // expectations — Wide32 spills are best avoided by keeping the
+    // pair in registers if at all possible.
+ if (Lo != W65816::A) { + copyPhysReg(MBB, MI, DL, W65816::A, Lo, false); + } + BuildMI(MBB, MI, DL, get(W65816::STAfi)) + .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0); + copyPhysReg(MBB, MI, DL, W65816::A, Hi, false); + BuildMI(MBB, MI, DL, get(W65816::STAfi)) + .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(2); + return; + } if (SrcReg == W65816::X || SrcReg == W65816::Y) { unsigned XferOp = (SrcReg == W65816::X) ? W65816::TXA : W65816::TYA; BuildMI(MBB, MI, DL, get(XferOp)); @@ -166,6 +283,34 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // values for the second word (caught by udivmod's `a - q*b` mod // computation). DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); + // Wide32 reload: 2 i16 loads at offsets 0 and 2 of the 4-byte slot. + if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass || + RC == &W65816::AnyWide32RegClass) { + Register Lo, Hi; + switch (DestReg) { + case W65816::AX32: Lo = W65816::A; Hi = W65816::X; break; + case W65816::IMG01: Lo = W65816::IMG0; Hi = W65816::IMG1; break; + case W65816::IMG23: Lo = W65816::IMG2; Hi = W65816::IMG3; break; + case W65816::IMG45: Lo = W65816::IMG4; Hi = W65816::IMG5; break; + case W65816::IMG67: Lo = W65816::IMG6; Hi = W65816::IMG7; break; + case W65816::IMG89: Lo = W65816::IMG8; Hi = W65816::IMG9; break; + case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break; + case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break; + case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break; + default: llvm_unreachable("W65816: Wide32 reload to non-pair reg"); + } + // Lo half: LDA from offset 0, transfer to Lo if needed. + BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) + .addFrameIndex(FrameIdx).addImm(0); + if (Lo != W65816::A) + copyPhysReg(MBB, MI, DL, Lo, W65816::A, false); + // Hi half: LDA from offset 2, transfer to Hi. + BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) + .addFrameIndex(FrameIdx).addImm(2); + if (Hi != W65816::A) + copyPhysReg(MBB, MI, DL, Hi, W65816::A, false); + return; + } if (DestReg == W65816::A) { BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg) .addFrameIndex(FrameIdx) diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 14fe38c..8e8a7c5 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -88,6 +88,26 @@ def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca, [SDNPHasChain, SDNPSideEffect]>; +// ptr32 load / store: target-specific load/store nodes that take a 32-bit +// pointer (Wide32 = i32) and lower to [dp],Y indirect-long with the bank +// byte taken from the pointer's hi-half. Used for ptr32 mode where +// generic (load i32-addr) needs explicit lowering — wrapping in a target +// node prevents DAG combines from rewriting the load before isel. +// +// Loads always materialise an i16 in A (16-bit LDA); byte zext / anyext +// patterns AND-mask afterwards exactly as the existing LDAptr does. +// Stores split into two nodes: ST_PTR (full 16-bit STA) and STB_PTR +// (SEP/REP-wrapped 8-bit STA for truncating stores). 
+def SDT_W65816LdPtr : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>; +def SDT_W65816StPtr : SDTypeProfile<0, 2, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>; + +def W65816ldPtr : SDNode<"W65816ISD::LD_PTR", SDT_W65816LdPtr, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def W65816stPtr : SDNode<"W65816ISD::ST_PTR", SDT_W65816StPtr, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def W65816stbPtr : SDNode<"W65816ISD::STB_PTR", SDT_W65816StPtr, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1046,6 +1066,96 @@ def : Pat<(store Acc8:$val, (add Wide16:$ptr, (i16 imm:$off))), def : Pat<(store Acc8:$val, Wide16:$ptr), (STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Wide16:$ptr)>; +// --------------------------------------------------------------------- +// ptr32 deref pseudos. Same shape and inserter as LDAptr/STAptr/STBptr, +// but the pointer is a Wide32 (i32) value: sub_lo carries the low 16 +// bits of the address, sub_hi carries the bank byte in its low half. +// Inserter stages the low 16 bits at $E0..$E1 and the bank byte at $E2, +// then emits LDA/STA [dp],Y just like the i16 path — but with a +// pointer-derived bank instead of a forced 0. +// +// Dead unless ptr32 mode is active (LowerLoad/LowerStore only emit +// W65816ldPtr/stPtr/stbPtr when the address is i32). +// --------------------------------------------------------------------- +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32 : W65816Pseudo<(outs Acc16:$dst), (ins AnyWide32:$ptr), + "# LDAptr32 $dst, $ptr", + [(set Acc16:$dst, (W65816ldPtr AnyWide32:$ptr))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr), + "# STAptr32 $val, $ptr", + [(W65816stPtr Acc16:$val, AnyWide32:$ptr)]>; +def STBptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr), + "# STBptr32 $val, $ptr", + [(W65816stbPtr Acc16:$val, AnyWide32:$ptr)]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32Off : W65816Pseudo<(outs Acc16:$dst), + (ins AnyWide32:$ptr, i16imm:$off), + "# LDAptr32Off $dst, $ptr, $off", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32Off : W65816Pseudo<(outs), + (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off), + "# STAptr32Off $val, $ptr, $off", []>; +def STBptr32Off : W65816Pseudo<(outs), + (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off), + "# STBptr32Off $val, $ptr, $off", []>; +} + +// Direct ptr32 load/store patterns over generic ISD::LOAD / ISD::STORE +// when the address is an i32 (AnyWide32) reg. These are unreachable +// while i32 is not a legal type (ptr16 mode). When ptr32 mode is +// activated they fire instead of the i16-pointer LDAptr / STAptr. 
+def : Pat<(i16 (load AnyWide32:$ptr)), + (LDAptr32 AnyWide32:$ptr)>; +def : Pat<(store Acc16:$val, AnyWide32:$ptr), + (STAptr32 Acc16:$val, AnyWide32:$ptr)>; +def : Pat<(truncstorei8 Acc16:$val, AnyWide32:$ptr), + (STBptr32 Acc16:$val, AnyWide32:$ptr)>; +def : Pat<(i16 (zextloadi8 AnyWide32:$ptr)), + (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF)>; +def : Pat<(i16 (extloadi8 AnyWide32:$ptr)), + (LDAptr32 AnyWide32:$ptr)>; +def : Pat<(i8 (load AnyWide32:$ptr)), + (COPY_TO_REGCLASS (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF), Acc8)>; +def : Pat<(store Acc8:$val, AnyWide32:$ptr), + (STBptr32 (COPY_TO_REGCLASS Acc8:$val, Acc16), AnyWide32:$ptr)>; + +// Off variants — folded constant-offset add patterns deferred until +// ptr32 mode is activated and we can profile real cases. The base +// LDAptr32/STAptr32 pseudos handle the general (add ptr, off) case +// correctly via a separate i32 ADD; the Off pseudos are an optional +// optimization for small constant offsets. + +// Split-pair variants: same semantics as LDAptr32/STAptr32/STBptr32 but +// the ptr is two separate i16 register operands (lo + hi) instead of +// one Wide32 register pair. Used by the W65816LowerWide32 pre-RA pass +// to relieve register-pair allocation pressure: it walks REG_SEQUENCE +// + LDAptr32 chains, decomposes the Wide32 vregs into pairs of i16 +// vregs, and rewrites the LDAptr32-family to take the two halves +// directly. +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32S : W65816Pseudo<(outs Acc16:$dst), + (ins Wide16:$ptrLo, Wide16:$ptrHi), + "# LDAptr32S $dst, $ptrLo, $ptrHi", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32S : W65816Pseudo<(outs), + (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi), + "# STAptr32S $val, $ptrLo, $ptrHi", []>; +def STBptr32S : W65816Pseudo<(outs), + (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi), + "# STBptr32S $val, $ptrLo, $ptrHi", []>; +} + // i8 load via Acc16 pointer producing a true i8 (Acc8) result. Reuses // the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask // the high byte, then narrow to Acc8. COPY_TO_REGCLASS to Acc8 is a @@ -1478,15 +1588,18 @@ def : Pat<(store // function doesn't have to know how it was called to choose its // return instruction. A pseudo bridges the i16 symbol operand // to JSL_Long's 24-bit operand class. -// Defs include DPF0 — every i64-returning libcall clobbers DP[$F0] -// (it's the carrier for the highest 16 bits of the return). The -// LowerCall side captures the pre-call DPF0 via CopyFromReg(DPF0) -// glued to the call so the SDAG combiner / scheduler can't merge -// or reorder reads across calls. Without DPF0 in Defs, plain -// `getLoad(0xF0)` was being CSE'd across calls, leading to -// `dmath = (a+b)*(a-b)` returning 4 instead of 16. +// Defs lists ALL caller-clobbered regs. The 65816 has no +// caller/callee-save split — every callee may freely modify +// A/X/Y/DPF0/P/etc. Critically, i32/i64 returns place high +// halves in X (i32), Y and DPF0 (i64); without those in Defs, +// the InstrEmitter does not add implicit-defs for glued +// CopyFromReg(X/Y/DPF0) on the call MI, and the verifier sees +// the post-call `COPY $y` as reading an undefined register. +// DPF0 was historically the only "extra" def so getLoad(0xF0) +// wouldn't CSE across calls; the same anti-CSE rationale applies +// to A/X/Y, but more fundamentally those are call return slots. 
let isCall = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, - Defs = [A, DPF0] in { + Defs = [A, X, Y, DPF0] in { def JSLpseudo : W65816Pseudo<(outs), (ins i16imm:$dst), "# JSLpseudo $dst", []>; } diff --git a/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp new file mode 100644 index 0000000..66bc9c5 --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp @@ -0,0 +1,326 @@ +//===-- W65816LowerWide32.cpp - Wide32 -> 2x i16 pre-RA lowering ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Pre-regalloc pass that decomposes Wide32 register-pair vregs into pairs +// of i16 vregs. Without this, greedy / basic regalloc fails on i64-heavy +// functions (`RegAllocBase` crashes during `allocatePhysRegs`) because +// the i64-via-2-i32-via-Wide32 chain produces too many simultaneously +// live register-pair vregs. After this pass, only i16 vregs remain at +// the regalloc input — Wide32 lives only inside this pass and the new +// LDAptr32S / STAptr32S / STBptr32S pseudos that take 2 i16 ptr operands +// directly. +// +// Walks the MIR and: +// 1. Finds REG_SEQUENCE producing Wide32 / Acc32 / AnyWide32; records +// the (lo, hi) i16 source operands; queues the REG_SEQUENCE for +// erasure. +// 2. Finds COPY whose dest is a Wide32 vreg and whose src is another +// mapped Wide32 vreg; chains the (lo, hi) mapping forward. +// 3. Rewrites EXTRACT_SUBREG of mapped Wide32 vregs by replacing the +// destination vreg with the appropriate half (sub_lo or sub_hi). +// 4. Rewrites LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr +// to the corresponding LDAptr32S / STAptr32S / STBptr32S pseudo +// with two separate i16 operands. +// +// Bail / safety: any Wide32 vreg whose def we can't decompose is left +// in place — regalloc may still struggle but no miscompile. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "W65816.h"
+#include "W65816InstrInfo.h"
+#include "W65816Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "w65816-lower-wide32"
+
+namespace {
+
+class W65816LowerWide32 : public MachineFunctionPass {
+public:
+  static char ID;
+  W65816LowerWide32() : MachineFunctionPass(ID) {}
+  StringRef getPassName() const override {
+    return "W65816 Wide32 -> 2x i16 lowering";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // namespace
+
+char W65816LowerWide32::ID = 0;
+
+INITIALIZE_PASS(W65816LowerWide32, DEBUG_TYPE,
+                "W65816 Wide32 lowering", false, false)
+
+FunctionPass *llvm::createW65816LowerWide32() {
+  return new W65816LowerWide32();
+}
+
+static bool isWide32RC(const TargetRegisterClass *RC) {
+  return RC == &W65816::Wide32RegClass ||
+         RC == &W65816::Acc32RegClass ||
+         RC == &W65816::AnyWide32RegClass;
+}
+
+bool W65816LowerWide32::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
+  const W65816InstrInfo &TII = *STI.getInstrInfo();
+
+  // Map: Wide32 vreg -> (loVreg, hiVreg) of i16 type.
+  DenseMap<Register, std::pair<Register, Register>> wideMap;
+
+  // Pass 1: collect all Wide32 vregs.
+  SmallVector<Register, 8> wide32Vregs;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    Register R = Register::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(R))
+      continue;
+    if (!isWide32RC(MRI.getRegClass(R)))
+      continue;
+    wide32Vregs.push_back(R);
+  }
+
+  if (wide32Vregs.empty())
+    return false;
+
+  // Pass 2: process REG_SEQUENCE / chained-COPY / multi-subreg-def
+  // shapes; build the mapping. Iterate to fixed point because COPY
+  // chains depend on prior mappings.
+  SmallVector<MachineInstr *, 8> toErase;
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    for (Register W : wide32Vregs) {
+      if (wideMap.count(W))
+        continue;
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);
+      if (DefMI && DefMI->getOpcode() == TargetOpcode::REG_SEQUENCE) {
+        Register Lo, Hi;
+        for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+          if (!DefMI->getOperand(op).isReg() ||
+              !DefMI->getOperand(op + 1).isImm())
+            continue;
+          unsigned idx = DefMI->getOperand(op + 1).getImm();
+          Register Src = DefMI->getOperand(op).getReg();
+          if (idx == llvm::sub_lo)
+            Lo = Src;
+          else if (idx == llvm::sub_hi)
+            Hi = Src;
+        }
+        if (Lo && Hi) {
+          wideMap[W] = {Lo, Hi};
+          toErase.push_back(DefMI);
+          changed = true;
+          continue;
+        }
+      }
+      if (DefMI && DefMI->isCopy()) {
+        Register Src = DefMI->getOperand(1).getReg();
+        if (Src.isVirtual() && wideMap.count(Src)) {
+          wideMap[W] = wideMap[Src];
+          toErase.push_back(DefMI);
+          changed = true;
+          continue;
+        }
+      }
+      // Multi-subreg-def shape: separate sub-reg COPYs build %W:
+      //   undef %W.sub_lo:wide32 = COPY %A:acc16
+      //         %W.sub_hi:wide32 = COPY %B:acc16
+      // Equivalent to a REG_SEQUENCE %A, sub_lo, %B, sub_hi. softDouble
+      // at -O2 generates this heavily; without handling it the Wide32
+      // vreg survives to regalloc, which then asks for a spill/reload
+      // from a non-pair physreg and trips load/storeRegToStackSlot's
+      // llvm_unreachable.
+      Register LoSrc, HiSrc;
+      MachineInstr *LoDefMI = nullptr;
+      MachineInstr *HiDefMI = nullptr;
+      bool ok = true;
+      for (MachineInstr &MI : MRI.def_instructions(W)) {
+        if (!MI.isCopy()) { ok = false; break; }
+        const MachineOperand &Dst = MI.getOperand(0);
+        const MachineOperand &Src = MI.getOperand(1);
+        if (!Dst.isReg() || Dst.getReg() != W) { ok = false; break; }
+        unsigned SubIdx = Dst.getSubReg();
+        if (SubIdx == llvm::sub_lo) {
+          if (LoDefMI) { ok = false; break; }
+          LoDefMI = &MI;
+          LoSrc = Src.getReg();
+        } else if (SubIdx == llvm::sub_hi) {
+          if (HiDefMI) { ok = false; break; }
+          HiDefMI = &MI;
+          HiSrc = Src.getReg();
+        } else {
+          ok = false;
+          break;
+        }
+      }
+      if (ok && LoSrc && HiSrc) {
+        wideMap[W] = {LoSrc, HiSrc};
+        if (LoDefMI) toErase.push_back(LoDefMI);
+        if (HiDefMI) toErase.push_back(HiDefMI);
+        changed = true;
+      }
+    }
+  }
+
+  // Pass 2b: handle PHIs whose result is a Wide32 vreg by splitting
+  // into 2 PHIs (one per half). Iterate to fixed point: a PHI becomes
+  // resolvable only after all its sources have been mapped.
+  changed = true;
+  while (changed) {
+    changed = false;
+    for (Register W : wide32Vregs) {
+      if (wideMap.count(W))
+        continue;
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);
+      if (!DefMI || !DefMI->isPHI())
+        continue;
+      bool AllMapped = true;
+      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+        Register Src = DefMI->getOperand(op).getReg();
+        if (!Src.isVirtual() || !wideMap.count(Src)) {
+          AllMapped = false;
+          break;
+        }
+      }
+      if (!AllMapped)
+        continue;
+      Register NewLo = MRI.createVirtualRegister(&W65816::Acc16RegClass);
+      Register NewHi = MRI.createVirtualRegister(&W65816::Acc16RegClass);
+      MachineBasicBlock *MBB = DefMI->getParent();
+      DebugLoc DL = DefMI->getDebugLoc();
+      auto PHILo = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewLo);
+      auto PHIHi = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewHi);
+      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+        Register Src = DefMI->getOperand(op).getReg();
+        MachineBasicBlock *PredMBB = DefMI->getOperand(op + 1).getMBB();
+        auto [SrcLo, SrcHi] = wideMap[Src];
+        PHILo.addReg(SrcLo).addMBB(PredMBB);
+        PHIHi.addReg(SrcHi).addMBB(PredMBB);
+      }
+      wideMap[W] = {NewLo, NewHi};
+      toErase.push_back(DefMI);
+      changed = true;
+    }
+  }
+
+  // Pass 3: rewrite uses.
+  SmallVector<MachineInstr *, 8> useToErase;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator It = MBB.begin(); It != MBB.end();) {
+      MachineInstr *MI = &*It++;
+
+      // EXTRACT_SUBREG of a mapped Wide32 vreg: replace the dest vreg
+      // with the appropriate half (sub_lo or sub_hi).
+      if (MI->getOpcode() == TargetOpcode::EXTRACT_SUBREG) {
+        Register Src = MI->getOperand(1).getReg();
+        if (Src.isVirtual() && wideMap.count(Src)) {
+          unsigned SubIdx = MI->getOperand(2).getImm();
+          auto [Lo, Hi] = wideMap[Src];
+          Register Half = (SubIdx == llvm::sub_lo) ? Lo : Hi;
+          Register Dst = MI->getOperand(0).getReg();
+          MRI.replaceRegWith(Dst, Half);
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+
+      // COPY %V.sub_lo / %V.sub_hi (partial-reg COPY where source has a
+      // sub-reg specifier and the source vreg is a mapped Wide32).
+      // LLVM emits this shape instead of EXTRACT_SUBREG when projecting
+      // a half out of a Wide32 vreg. Only the shape with a full-reg
+      // destination is handled here — partial-reg destinations would
+      // imply the dst is itself a Wide32 sub-reg def, which the def-side
+      // multi-subreg-def handling covers separately.
+      if (MI->isCopy()) {
+        const MachineOperand &SrcOp = MI->getOperand(1);
+        const MachineOperand &DstOp = MI->getOperand(0);
+        if (SrcOp.isReg() && SrcOp.getReg().isVirtual() &&
+            wideMap.count(SrcOp.getReg()) && SrcOp.getSubReg() != 0 &&
+            DstOp.isReg() && DstOp.getSubReg() == 0) {
+          unsigned SubIdx = SrcOp.getSubReg();
+          auto [Lo, Hi] = wideMap[SrcOp.getReg()];
+          Register Half = (SubIdx == llvm::sub_lo) ? Lo : Hi;
+          MRI.replaceRegWith(DstOp.getReg(), Half);
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+
+      // LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr:
+      // rewrite to LDAptr32S / STAptr32S / STBptr32S.
+      unsigned Opc = MI->getOpcode();
+      bool isPtrOp = (Opc == W65816::LDAptr32 || Opc == W65816::STAptr32 ||
+                      Opc == W65816::STBptr32);
+      if (isPtrOp) {
+        Register Ptr = MI->getOperand(1).getReg();
+        if (Ptr.isVirtual() && wideMap.count(Ptr)) {
+          auto [Lo, Hi] = wideMap[Ptr];
+          unsigned NewOpc = (Opc == W65816::LDAptr32)   ? W65816::LDAptr32S
+                            : (Opc == W65816::STAptr32) ? W65816::STAptr32S
+                                                        : W65816::STBptr32S;
+          DebugLoc DL = MI->getDebugLoc();
+          MachineBasicBlock *ParentMBB = MI->getParent();
+          if (Opc == W65816::LDAptr32) {
+            Register Dst = MI->getOperand(0).getReg();
+            BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc), Dst)
+                .addReg(Lo)
+                .addReg(Hi);
+          } else {
+            Register Val = MI->getOperand(0).getReg();
+            BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc))
+                .addReg(Val)
+                .addReg(Lo)
+                .addReg(Hi);
+          }
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+    }
+  }
+
+  // Erase use-side instructions (EXTRACT_SUBREG, LDAptr32-family) first
+  // so the Wide32 vreg becomes dead.
+  for (auto *MI : useToErase)
+    MI->eraseFromParent();
+
+  // Now check each REG_SEQUENCE / chained-COPY def: only erase if the
+  // Wide32 vreg has no remaining uses. Any leftover use means the pass
+  // didn't cover that opcode — leaving the def in place keeps the MIR
+  // well-formed (at the cost of pair-allocation pressure for that
+  // specific case).
+  bool eraseAny = !useToErase.empty();
+  for (auto *MI : toErase) {
+    if (MI->getNumOperands() == 0)
+      continue;
+    Register Dst = MI->getOperand(0).getReg();
+    if (!Dst.isVirtual() || MRI.use_nodbg_empty(Dst)) {
+      MI->eraseFromParent();
+      eraseAny = true;
+    }
+  }
+
+  return eraseAny;
+}
diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.td b/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
index 0d3a505..01525bc 100644
--- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
+++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
@@ -17,6 +17,13 @@ class W65816Reg<bits<16> num, string n> : Register<n> {
   let DwarfNumbers = [num];
 }
 
+// SubRegIndices for synthetic 32-bit register pairs. sub_lo addresses the
+// low 16 bits (the natural i16-aligned half), sub_hi the high 16 bits.
+// Used by Acc32 / Wide32 / AnyWide32 to model i32 (i.e. ptr32) values as
+// pairs of i16 physical registers.
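+//
+// Worked example (illustrative values, not taken from any test): for an
+// i32 value 0x12345678 held in a pair, the sub_lo half holds 0x5678 and
+// the sub_hi half holds 0x1234.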
+def sub_lo : SubRegIndex<16, 0>; +def sub_hi : SubRegIndex<16, 16>; + //===----------------------------------------------------------------------===// // Registers //===----------------------------------------------------------------------===// @@ -127,3 +134,61 @@ def DPF0Reg : RegisterClass<"W65816", [i16], 16, (add DPF0)> { def StatusReg : RegisterClass<"W65816", [i8], 8, (add P)> { let isAllocatable = 0; } + +//===----------------------------------------------------------------------===// +// Synthetic 32-bit Register Pairs (for ptr32 mode) +//===----------------------------------------------------------------------===// +// +// The W65816 has no native 32-bit registers. For 32-bit-pointer mode and +// any other i32 traffic we synthesize register pairs whose halves are +// existing i16 registers, accessed via sub_lo / sub_hi. +// +// AX32 pairs A:X for the calling-convention slot (first i32 arg/return). +// Heterogeneous: sub_lo is in Acc16, sub_hi is in Idx16. Because of the +// heterogeneity, AX32 lives in its own single-element class (Acc32) — if +// it were grouped with the homogeneous IMG pairs in Wide32, TableGen would +// auto-derive a "wide32_with_sub_hi_in_idx8" subclass that pins the whole +// class to AX32. +// +// IMG01..IMG1415 pair adjacent IMG slots (each pair is 4 bytes of DP) into +// homogeneous i16-i16 pairs. These hold ptr32 values backed entirely by +// direct page, so register-pair allocation can spill cleanly via Img16's +// existing rules. +// +// Acc32 / Wide32 / AnyWide32: +// Acc32 = {AX32} — calling-convention slot only; not for general allocation. +// Wide32 = {IMG01..IMG1415} — homogeneous i16-i16 pairs, freely allocatable. +// AnyWide32 = Acc32 ∪ Wide32 — pre-RA flexibility for ptr32 vregs that +// are not constrained to AX32; greedy regalloc can pick AX32 or any +// Wide32 pair. 
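+//
+// Intended-use sketch (orientation only; the calling-convention hookup
+// itself lives in W65816ISelLowering): a first i32 argument or an i32
+// return is pinned to Acc32 (i.e. AX32), while other ptr32 temporaries
+// are created as AnyWide32 so regalloc can pick AX32 or an IMG pair as
+// pressure allows.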
+let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in { + def AX32 : RegisterWithSubRegs<"ax32", [A, X]>, + DwarfRegNum<[40]> { let Namespace = "W65816"; } + def IMG01 : RegisterWithSubRegs<"img01", [IMG0, IMG1]>, + DwarfRegNum<[41]> { let Namespace = "W65816"; } + def IMG23 : RegisterWithSubRegs<"img23", [IMG2, IMG3]>, + DwarfRegNum<[42]> { let Namespace = "W65816"; } + def IMG45 : RegisterWithSubRegs<"img45", [IMG4, IMG5]>, + DwarfRegNum<[43]> { let Namespace = "W65816"; } + def IMG67 : RegisterWithSubRegs<"img67", [IMG6, IMG7]>, + DwarfRegNum<[44]> { let Namespace = "W65816"; } + def IMG89 : RegisterWithSubRegs<"img89", [IMG8, IMG9]>, + DwarfRegNum<[45]> { let Namespace = "W65816"; } + def IMG1011 : RegisterWithSubRegs<"img1011", [IMG10, IMG11]>, + DwarfRegNum<[46]> { let Namespace = "W65816"; } + def IMG1213 : RegisterWithSubRegs<"img1213", [IMG12, IMG13]>, + DwarfRegNum<[47]> { let Namespace = "W65816"; } + def IMG1415 : RegisterWithSubRegs<"img1415", [IMG14, IMG15]>, + DwarfRegNum<[48]> { let Namespace = "W65816"; } +} + +def Acc32 : RegisterClass<"W65816", [i32], 16, (add AX32)>; + +def Wide32 : RegisterClass<"W65816", [i32], 16, + (add IMG01, IMG23, IMG45, IMG67, + IMG89, IMG1011, IMG1213, IMG1415)>; + +def AnyWide32 : RegisterClass<"W65816", [i32], 16, + (add AX32, + IMG01, IMG23, IMG45, IMG67, + IMG89, IMG1011, IMG1213, IMG1415)>; diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp index 420df02..78982a9 100644 --- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp @@ -419,6 +419,26 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(0).isImm()) { int K = MI.getOperand(0).getImm() & 0xFFFF; if (yKnown == K) { + // Before erasing this redundant LDY: the prior LDY is still in + // scope, so all of its Y-uses between the two LDYs are still + // valid uses. But liveness already marked the LAST one (just + // before the redundant LDY) as `implicit killed $y`, because + // that LDY was about to redefine Y. After erasure, Y survives + // through to the NEXT use, so the prior "kill" annotation is + // wrong and the machine verifier rejects. Walk backward and + // clear the kill flag on the most recent Y-using operand. + for (auto Back = std::prev(It2);; --Back) { + bool clearedAny = false; + for (MachineOperand &MO : Back->operands()) { + if (MO.isReg() && MO.getReg() == W65816::Y && + MO.isUse() && MO.isKill()) { + MO.setIsKill(false); + clearedAny = true; + } + } + if (clearedAny) break; + if (Back == MBB.begin()) break; + } auto Erase = It2++; Erase->eraseFromParent(); Changed = true; diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index c4bbb06..ef6555c 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -748,6 +748,15 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { } }; auto isLdaLike = [](unsigned Opc) { + // COPY between physregs: lowers in AsmPrinter to one of TXA/TYA/ + // LDA $D? (for IMG↔A bridges) etc. — all of which set N/Z based + // on the loaded value. Treating COPY as flag-defining caused the + // wrap pass to identify a PHI-elim COPY as the "Test" and wrap + // too narrow a range, so the cb-test LDA's flags were trampled + // by intervening A-loads before reaching the BEQ. 
+    // Including COPY in the corrupting set forces the pass to walk
+    // past these PHI-elim copies to find the real test (a CMP).
+    if (Opc == TargetOpcode::COPY) return true;
     // Pure load / register-transfer instructions: only side effect on
     // flags is N/Z from the loaded/transferred value. Never a "test"
     // — they just move data. Treated as corruption when between the
@@ -1365,7 +1374,42 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
           Cmp->getOperand(1).getImm() != 0)
         continue;
       bool Found = walkbackBefore(Cmp->getIterator(), MBB.begin());
-      if (Found) {
+      if (!Found) continue;
+      // Only eliminate if there are NO LdaLike instructions between
+      // this CMP and the next Bxx (or end of MBB). Otherwise the
+      // CMP is the only flag-setting marker between the test value
+      // and the consuming branch — without it, the Bxx ends up
+      // testing the latest LdaLike's N/Z (typically a PHI-elim COPY
+      // or stack reload that has nothing to do with the original
+      // condition). Caused __adddf3's renormalize while-loop to
+      // skip its body even though `mr & ~mask` was non-zero.
+      bool SafeToErase = true;
+      for (auto It = std::next(Cmp->getIterator());
+           It != Cmp->getParent()->end(); ++It) {
+        if (It->isDebugInstr()) continue;
+        if (It->isBranch() || It->isReturn()) break;
+        if (It->getOpcode() == TargetOpcode::COPY) {
+          SafeToErase = false;
+          break;
+        }
+        unsigned Opc = It->getOpcode();
+        // Conservative: any LDA/LDX/LDY/transfer disqualifies erasure.
+        // Stores and stack-mgmt are flag-preserving and OK.
+        switch (Opc) {
+        case W65816::STAfi: case W65816::STAfi_indY: case W65816::STA8fi:
+        case W65816::STA_StackRel: case W65816::STA_StackRelIndY:
+        case W65816::STA_DP: case W65816::STA_Abs: case W65816::STA_Long:
+        case W65816::STX_DP: case W65816::STX_Abs:
+        case W65816::STY_DP: case W65816::STY_Abs:
+        case W65816::ADJCALLSTACKDOWN: case W65816::ADJCALLSTACKUP:
+        case W65816::PHA: case W65816::PHX: case W65816::PHY:
+          continue;
+        }
+        // Anything else (LDA, transfer, ALU op...): bail.
+        SafeToErase = false;
+        break;
+      }
+      if (SafeToErase) {
         Cmp->eraseFromParent();
         Changed = true;
       }
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index be3a394..eeae746 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -48,6 +48,7 @@ LLVMInitializeW65816Target() {
   initializeW65816NegYIndYPass(PR);
   initializeW65816PreSpillCrossCallPass(PR);
   initializeW65816SjLjFinalizePass(PR);
+  initializeW65816LowerWide32Pass(PR);
 
   // Default IndVarSimplify's exit-value rewriter to "never". The
   // closed-form replacement frequently widens an i16 induction var
@@ -150,6 +151,11 @@ void W65816PassConfig::addMachineSSAOptimization() {
 }
 
 void W65816PassConfig::addPreRegAlloc() {
+  // Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs
+  // BEFORE the other Acc16-targeting pre-RA passes run. Each later
+  // pass walks Acc16/Idx16/Img16 vregs; running this first means they
+  // see the decomposed halves uniformly.
+ addPass(createW65816LowerWide32()); addPass(createW65816ABridgeViaX()); addPass(createW65816TiedDefSpill()); addPass(createW65816WidenAcc16()); @@ -176,6 +182,18 @@ void W65816PassConfig::addPostRegAlloc() { addPass(createW65816SpillToX()); addPass(createW65816StackSlotCleanup()); addPass(createW65816SpillToX()); + // Disable MachineCopyPropagation: it eliminates `COPY $img = $a` + // thinking the IMG dest is dead (no explicit physreg use of $img + // remains after PEI expands STAfi-with-Img16-source into LDA_DP). + // The COPY actually expands to STA_DP $D0 — a memory store to a + // DP slot that libcalls (softDouble, softFloat) ALSO use as their + // own arg-save scratch. When MCP drops the COPY, the subsequent + // LDA_DP $D0 reads stale memory. Caught by `g = g/x` Newton loop: + // iter-1's saved x_ml at $D0 was never actually written, so iter-2 + // read garbage. The principled fix would mark IMG-targeted COPYs + // as memory-side-effecting, but TII doesn't expose that hook; + // disabling MCP loses some optimization but is safe. + disablePass(&llvm::MachineCopyPropagationID); } void W65816PassConfig::addPreEmitPass() { diff --git a/src/llvm/test/CodeGen/W65816/add-i16.ll b/src/llvm/test/CodeGen/W65816/add-i16.ll new file mode 100644 index 0000000..aff5581 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/add-i16.ll @@ -0,0 +1,12 @@ +; Smoke test: confirm llc accepts the W65816 target via lit. +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +define i16 @add_i16(i16 %a, i16 %b) { +; CHECK-LABEL: add_i16: +; CHECK: rep #0x30 +; CHECK: clc +; CHECK: adc 0x4, s +; CHECK: rtl + %r = add i16 %a, %b + ret i16 %r +} diff --git a/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll b/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll new file mode 100644 index 0000000..d2fea2d --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll @@ -0,0 +1,30 @@ +; Pin: canMergeStoresTo refuses to merge i16 stores into i32+. +; +; The SDAG store-merge combine sees two adjacent i16 stores and tries +; to widen them into one i32 store. Our i32 store path is Custom- +; lowered back to two i16 stores, and the merge runs again, and the +; cycle repeats until OOM. Override fixes it by capping merge MemVT +; at i16. See feedback_canmergestores_disable.md. +; +; Repro: write two adjacent i16 fields of a struct. Without the cap, +; this either OOMs or burns >5s on a 4-line function. With the cap, +; the lowered code shows two distinct i16 stores (no widened form). +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +%struct.Pair = type { i16, i16 } + +define void @write_pair(ptr %p, i16 %a, i16 %b) { +; CHECK-LABEL: write_pair: +; Two distinct i16 stores must remain — not merged into one i32. +; Each i16 store under our i32-illegal path uses the same DP-indirect +; family ([dp],y) but on a freshly-loaded $e0 pointer for each half. +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + %f0 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 0 + %f1 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 1 + store i16 %a, ptr %f0 + store i16 %b, ptr %f1 + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll b/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll new file mode 100644 index 0000000..fb0f536 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll @@ -0,0 +1,36 @@ +; Pin: extractWide32Lo/Hi looks through REG_SEQUENCE shortcut. 
+; +; Without the shortcut, `*p = 0` (or any i32 store of a constant or +; freshly-built i32 vreg) hits the SDAG combiner repeatedly, the +; combiner re-merges and Custom-lower re-splits, the cycle runs for +; tens of seconds and 100MB+ peak. See feedback_extract_wide32_regseq_shortcut.md. +; +; Two functions: +; - clear_i32: simplest *(i32*)p = 0 case (the original repro) +; - clear_i32_pair: two adjacent i32 zero-stores (combiner stress) +; +; If the shortcut regresses, llc either OOMs (process killed) or +; takes >5s on these tiny functions. We assert on the lowered shape. +; +; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s + +define void @clear_i32(ptr %p) { +; CHECK-LABEL: clear_i32: +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + ret void +} + +define void @clear_i32_pair(ptr %p, ptr %q) { +; CHECK-LABEL: clear_i32_pair: +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + store i32 0, ptr %q + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll new file mode 100644 index 0000000..4ec08d1 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll @@ -0,0 +1,36 @@ +; Pin: i64-first-arg routes arg0 halves through Img16 (DP $C0..$DE). +; +; Without the Img16 routing, regalloc emits `TXA; STA spillA; +; STA spillX` at function entry — the TXA clobbers $a (arg0_lo) +; before the A-spill saves it, so both spill slots end up holding +; arg0_ml. Caused __adddf3(1.5, 2.5) → 1.5. See +; feedback_i64_first_arg_x_class.md. +; +; Fix: route arg0_lo via STA $dp and arg0_ml via STX $dp. Visible at +; function entry as a pair of `stx 0x[cd]?` and `sta 0x[cd]?` writes +; into the IMG region of direct page. +; +; Trigger: i64 first arg with enough cross-call live range that arg0 +; halves must be saved. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +declare i64 @ext1(i64 %x, i64 %y) +declare i64 @ext2(i64 %a) + +define i64 @i64_first_pressure(i64 %x) { +; CHECK-LABEL: i64_first_pressure: +; Entry stores arg0_ml (X) and arg0_lo (A) into IMG slots, NOT a +; TXA-bridge sequence. $D0 / $D2 are concrete IMG slots (the IMG +; region is $C0..$DE). Match a stx in that range, followed by an +; sta in the same range, before the first jsl. +; CHECK: stx 0xd +; CHECK: sta 0xd +; CHECK: jsl ext2 +; CHECK: rtl +entry: + %a = call i64 @ext2(i64 %x) + %b = add i64 %a, %x + %c = call i64 @ext1(i64 %b, i64 %x) + ret i64 %c +} diff --git a/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll b/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll new file mode 100644 index 0000000..929c693 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll @@ -0,0 +1,32 @@ +; Pin: MachineCopyPropagation must NOT eliminate `COPY $img = $reg` — +; that COPY actually expands to STA_DP $D? (a DP-memory store to an +; IMG slot). Libcalls (softDouble, softFloat) use those same DP +; slots for their own arg-save scratch, so dropping the COPY makes +; the subsequent LDA_DP read stale memory. Caught by `g = g/x` +; Newton loop: iter-1's saved x_ml at $D0 was never actually written +; because MCP dropped the COPY, so iter-2's call to __divdf3 read +; garbage as its x_ml argument. See feedback_jslpseudo_libcall_img_clobber.md. +; +; Fix: disable MachineCopyPropagation in addPostRegAlloc. 
+;
+; Symptom shape we pin: for an i64-first-arg double function that
+; calls a libcall, the entry must contain BOTH `stx 0xd?` AND `sta
+; 0xd?` (for I64FirstArg's Img16 arg-save dance) — and they must
+; survive to the asm output. Without the MCP-disable, only one of
+; those (or neither) appears.
+;
+; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
+
+declare double @ext_div(double %a, double %b)
+
+define double @div_chain(double %x) {
+; CHECK-LABEL: div_chain:
+; Img16 arg-save at function entry — both halves must reach asm:
+; CHECK: stx 0xd
+; CHECK: sta 0xd
+; CHECK: jsl ext_div
+; CHECK: rtl
+entry:
+  %r = call double @ext_div(double %x, double %x)
+  ret double %r
+}
diff --git a/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll b/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
new file mode 100644
index 0000000..2b02872
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
@@ -0,0 +1,28 @@
+; Pin: JSLpseudo declares Defs = [A, X, Y, DPF0].
+;
+; Without X, Y, DPF0 in the Defs list, an i64-returning libcall
+; (which returns lo16 in A, mid16 in X, hi16 in Y, hh16 in DPF0)
+; verifier-fails with "$y undefined" in math.c::floor. See
+; feedback_jslpseudo_caller_save.md.
+;
+; This test compiles a call to an i64-returning external function
+; with -verify-machineinstrs. If JSLpseudo's Defs were stripped, the
+; post-call X/Y/DPF0 reads would hit physregs the call never declared
+; as defined, and -verify-machineinstrs would fail.
+;
+; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s
+
+declare i64 @ext_i64(i64 %x)
+
+define i64 @i64_libcall_uses_xy(i64 %x) {
+; CHECK-LABEL: i64_libcall_uses_xy:
+; CHECK: jsl ext_i64
+; The post-call sequence stores the i64 return value (lo16 in A, mid16
+; in X, hi16 in Y, hh16 in DPF0) back to the caller's frame. If
+; JSLpseudo did not Def X, the txa here would verifier-fail because X
+; would be undefined after the call.
+; CHECK: txa
+; CHECK: rtl
+  %r = call i64 @ext_i64(i64 %x)
+  ret i64 %r
+}
diff --git a/src/llvm/test/CodeGen/W65816/lit.local.cfg b/src/llvm/test/CodeGen/W65816/lit.local.cfg
new file mode 100644
index 0000000..dce57e0
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'W65816' in config.root.targets:
+    config.unsupported = True
diff --git a/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll b/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
new file mode 100644
index 0000000..ee0d05a
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
@@ -0,0 +1,29 @@
+; Pin: SepRepCleanup's redundant-LDY elision must clear the now-stale
+; `killed $y` flag on the prior Y-user.
+;
+; Trigger: any sequence that emits two LDY_Imm16 #N back-to-back with
+; STA [dp],y between (e.g. an i32 store that splits into two i16
+; stores, each going through STAptr32 inserter which emits its own
+; LDY #0). Without the fix, the third peephole in SepRepCleanup
+; deletes the second LDY, but the first STA's `implicit killed $y`
+; annotation was set under the assumption that the second LDY was
+; about to redefine Y — leaving the second STA reading "dead" Y.
+;
+; The fix walks backward from the erased LDY to the most recent
+; Y-using operand and clears its kill flag. -verify-machineinstrs
+; catches the bug if it regresses.
+; +; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s + +define void @two_i32_stores_share_y(ptr %p) { +; CHECK-LABEL: two_i32_stores_share_y: +; The fix is invisible in asm output — both STAs emit identically with +; or without the kill-flag fix. The pin is `-verify-machineinstrs` +; not aborting. Match a minimal shape so the test still has structure. +; CHECK: ldy #0x0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll b/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll new file mode 100644 index 0000000..d815330 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll @@ -0,0 +1,41 @@ +; Pin: SIGN_EXTEND_INREG with i32 result and inner type i1 / i8 / i16 +; must Custom-lower to per-half ops. Without the Custom hook, the +; combiner emits `sext_inreg(REG_SEQUENCE(...), i1)` which has no +; tablegen pattern and isel aborts with "Cannot select". +; +; The i1 case shows up in CRC32 loops (`-(crc & 1ul)` reduces to +; sign_extend_inreg with i1). See feedback_sext_inreg_i32_isel_gap.md. +; +; Note: -verify-machineinstrs intentionally omitted because i32 store +; lowering still trips the i32-store-pair `implicit killed $y` +; concern in some chains; orthogonal to this fix. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +; The CRC32 idiom: -(x & 1) = sign_extend_inreg x, i1 (after combiner). +define i32 @neg_lowbit(i32 %x) { +; CHECK-LABEL: neg_lowbit: +; CHECK: and #0x1 +; CHECK: rtl + %a = and i32 %x, 1 + %b = sub i32 0, %a + ret i32 %b +} + +; (int32_t)(int8_t)x — sign-extend low byte to i32. +define i32 @sext_i8_to_i32(i32 %x) { +; CHECK-LABEL: sext_i8_to_i32: +; CHECK: rtl + %t = trunc i32 %x to i8 + %r = sext i8 %t to i32 + ret i32 %r +} + +; (int32_t)(int16_t)x — sign-extend low halfword to i32. +define i32 @sext_i16_to_i32(i32 %x) { +; CHECK-LABEL: sext_i16_to_i32: +; CHECK: rtl + %t = trunc i32 %x to i16 + %r = sext i16 %t to i32 + ret i32 %r +} diff --git a/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll b/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll new file mode 100644 index 0000000..f3ed61b --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll @@ -0,0 +1,32 @@ +; Pin: W65816LowerWide32 Pass 2b splits Wide32 PHIs. +; +; Without PHI splitting, an i32 phi (loop-carried 32-bit value) +; survives to RA, hits "Wide32 reload to non-pair reg" UNREACHABLE. +; softDouble at -O2 was the original repro (ma/mb mantissa loops). +; +; This test mimics the shape: an i32 carried across a loop. If +; LowerWide32 doesn't split the PHI, llc aborts. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +define i32 @sum_i32_loop(ptr %p, i16 %n) { +; CHECK-LABEL: sum_i32_loop: +; CHECK: rtl +entry: + %is_zero = icmp eq i16 %n, 0 + br i1 %is_zero, label %done, label %loop + +loop: + %i = phi i16 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi i32 [ 0, %entry ], [ %acc.next, %loop ] + %addr = getelementptr inbounds i32, ptr %p, i16 %i + %v = load i32, ptr %addr + %acc.next = add i32 %acc, %v + %i.next = add i16 %i, 1 + %cond = icmp eq i16 %i.next, %n + br i1 %cond, label %done, label %loop + +done: + %r = phi i32 [ 0, %entry ], [ %acc.next, %loop ] + ret i32 %r +}
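+
+; Post-pass shape (illustrative sketch; vreg names are invented and
+; deliberately not FileCheck'd): LowerWide32 Pass 2b replaces the
+; single i32 PHI for %acc with one i16 PHI per half, e.g.
+;   %acc_lo:acc16 = PHI %zero_lo, %entry, %sum_lo, %loop
+;   %acc_hi:acc16 = PHI %zero_hi, %entry, %sum_hi, %loop
+; so only i16 vregs reach register allocation.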