From 18ef7e1fa6d5afcdb6a4bf3f3ff24fccf0e260fc Mon Sep 17 00:00:00 2001
From: Scott Duensing <scott.duensing@gmail.com>
Date: Fri, 1 May 2026 17:22:55 -0500
Subject: [PATCH] Checkpoint

---
 .gitignore                                    |  2 +
 STATUS.md                                     | 66 ++++++++++++-------
 runtime/build.sh                              | 10 +--
 runtime/src/libc.c                            | 57 +++++-----------
 runtime/src/snprintf.c                        | 45 +++++--------
 scripts/smokeTest.sh                          | 22 +++----
 src/link816/link816.cpp                       | 15 +++++
 src/llvm/lib/Target/W65816/W65816InstrInfo.td | 18 +++--
 .../Target/W65816/W65816StackSlotCleanup.cpp  | 50 +++++++++-----
 .../lib/Target/W65816/W65816TargetMachine.cpp | 20 ++----
 10 files changed, 162 insertions(+), 143 deletions(-)
diff --git a/.gitignore b/.gitignore
index 9abef7d..52b4a98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,8 +8,10 @@ tools/
 # Runtime build artifacts: regenerable via runtime/build.sh from
 # runtime/src/*.s.  The source files (.s, build.sh) are tracked.
 runtime/*.o
+runtime/*.o.bak
 
 # Editor / OS
 *.swp
 *.swo
 .DS_Store
+*~
diff --git a/STATUS.md b/STATUS.md
index 67df5eb..5984da1 100644
--- a/STATUS.md
+++ b/STATUS.md
@@ -149,11 +149,16 @@ atol/llabs/atexit/exit/abort, and a smoke test that exercises
 malloc + struct pointers + strcmp/strcpy via a working hash table
 end-to-end in MAME.
 
-`strtok` / `strtok_r` live in their own TU built at `-O0` — the
-`-O2` codegen for the str==NULL continuation path miscompiles
-(documented as the same backend-fragility class as #70 / qsort,
-both mitigated by reaching for fast regalloc per-TU).  Multi-call
-strtok over "a,b,,c" works end-to-end in smoke.
+`strtok` / `strtok_r` live in their own TU at `-O2` (with
+`__attribute__((noinline))` on `strtok_r` so the strtok() wrapper
+doesn't duplicate it).  Multi-call strtok over "a,b,,c" works
+end-to-end in smoke.  Latent backend issue: at certain rodata
+layouts, -O2 strtok_r's BB0_7 inner CMP loop miscompiles due to
+LICM/sink interaction; current smoke layout passes but adding
+bytes upstream (e.g. growing softDouble.o) can shift delim into
+a failing address.  Surgical workaround `-mllvm -disable-machine-
+sink` on strtok.c is documented; not currently applied because
+smoke is green.
 
 A small **RPN calculator** test (smoke #87) chains strtok, atol,
 push/pop over a static stack, snprintf "%ld", and strcmp to verify
@@ -198,22 +203,19 @@ sidecar bytes.
 
 ## Known issues / workarounds
 
-- **Greedy register allocator mis-orders spills** in iterative
-  quicksort with `if/else` recursion choice (#70). Live-range
-  tracking for `hi` is wrong across the inner loop and post-loop
-  swap call, producing miscompiled code. Reproduces only at
-  `-O1`/`-O2` with greedy. Workarounds (any one):
-  - `__attribute__((noinline,optnone))` on the affected function —
-    routes through fast regalloc per-function. Verified in smoke
-    test; recommended for new code that hits this.
-  - `-mllvm -regalloc=fast` for the whole translation unit.
-    `softDouble.c` already uses this for `__muldf3` (build.sh
-    applies it automatically).
-  - Rewrite the loop with explicit recursion guards instead of
-    the iterative tail-elim form.
-
-  Real fix needs deeper greedy work; deferred behind the per-
-  function attribute since it covers the practical cases.
+- **#70 FIXED**: greedy regalloc + W65816StackSlotCleanup Pass -2
+  was deleting an entry-side store to a slot that the loop body
+  read.  Pass -2 collapses `LDAfi slotA; STAfi slotB; LDAfi slotC;
+  OPfi slotB` into `LDAfi slotC; OPfi slotA` (memory-to-memory copy
+  through A elimination), but didn't check whether slotB had other
+  refs in the function.  In iterative qsort, slotB happened to be
+  the spill home for `hi` — the Pass -2 transform deleted the only
+  initialiser, leaving the loop body's `lda <hi-slot>, s` reading
+  garbage.  Fix: function-wide `slotHasOtherRefs` safety check
+  before erasing the spill.  `softDouble.c` still uses
+  `-mllvm -regalloc=fast` for `__muldf3`'s 64×64→128 multiply
+  (different greedy bug — register-pressure-driven, not
+  spill-deletion-driven).
 
 - **(d,s),y / (sr,s),y addressing wraps the bank** when Y is
   negative as 16-bit unsigned. Worked around by `W65816NegYIndY`
@@ -228,12 +230,26 @@ sidecar bytes.
   beyond the existing DPF0 fake-physreg routing for the i64-return
   high half.
 
+- **strtok -O2 layout-sensitive miscompile FIXED** — modelling
+  `Uses=[P]` on the conditional branches (BEQ/BNE/BCS/BCC/BMI/BPL/
+  BVS/BVC) made MachineCSE see the dependency between an earlier
+  CMP and the consuming Bxx, eliminating an entire class of
+  layout-sensitive flag-corruption bugs.  Verified by sweeping
+  `--rodata-base` from text-end to text-end+300 in 13 increments
+  — every layout returns the correct strtok result.
+  As a follow-on, MachineCSE has been re-enabled (was previously
+  disabled in `W65816TargetMachine::addMachineSSAOptimization` as
+  a workaround for the same root cause).
+
 ## What's still needed for a "ship-ready" toolchain
 
-- **Greedy regalloc spill-ordering fix** — see above. Removes the
-  need for the per-file `-regalloc=fast` workaround on
-  `softDouble.c` and unblocks pattern-rich code that currently
-  must be compiled at `-O0` for correctness.
+- **softDouble.c -O1 hold-out** — `__muldf3`'s 64×64→128 multiply
+  with inlined alignment shifts overflows the greedy register
+  allocator at -O2 ("ran out of registers during register
+  allocation").  Builds correctly at -O1 (replaces the previous
+  -O2 + -mllvm -regalloc=fast workaround; -O1 is smaller and
+  doesn't require the non-default flag).
+
 
 - **More of the C standard library**: real `<stdio.h>` file I/O
   (`fopen`, `fread`, `fwrite`, `fseek` are currently stubs
diff --git a/runtime/build.sh b/runtime/build.sh
index c41dae1..e9865df 100755
--- a/runtime/build.sh
+++ b/runtime/build.sh
@@ -41,9 +41,11 @@ cc  "$SRC/extras.c"
 cc  "$SRC/strtok.c"
 cc  "$SRC/math.c"
 cc  "$SRC/softFloat.c"
-# softDouble.c needs -regalloc=fast: __muldf3's 64x64 -> 128 mul +
-# inlined alignment shifts overflows the greedy allocator on the
-# single-A target.
-cc  "$SRC/softDouble.c" -mllvm -regalloc=fast
+# softDouble.c builds at -O1 instead of -O2: __muldf3's 64x64 -> 128
+# mul + inlined alignment shifts overflows the greedy allocator on
+# the single-A target ("ran out of registers during register
+# allocation").  -O1 produces correct + smaller code than the
+# previous -O2 + -regalloc=fast workaround.
+cc  "$SRC/softDouble.c" -O1
 
 echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects"
diff --git a/runtime/src/libc.c b/runtime/src/libc.c
index 620a2e5..8551044 100644
--- a/runtime/src/libc.c
+++ b/runtime/src/libc.c
@@ -162,11 +162,9 @@ int puts(const char *s) {
 
 // ---- minimal printf ----
 
-// Forward-declared because varargs use stdarg.h's __builtin_va_list,
-// but our libc doesn't include stdarg.h yet — clang's built-in
-// va_arg/va_start/va_end work without an explicit include on most
-// targets.  Re-declare the types/macros locally to avoid including
-// the system header (which would pull in target-specific quirks).
+// Re-declare va_list / va_* locally rather than including stdarg.h —
+// clang's built-ins work directly and skip a header pull-in that's
+// otherwise harmless but adds compile time on every TU.
 typedef __builtin_va_list  va_list;
 #define va_start(ap, last) __builtin_va_start(ap, last)
 #define va_arg(ap, ty)     __builtin_va_arg(ap, ty)
@@ -208,12 +206,10 @@ static void writeStr(const char *s) {
     while (*s) { putchar(*s); s++; }
 }
 
-// Each format-spec handler is its own function so vprintf's main loop
-// stays small (avoids the W65816 backend's long-branch limitation
-// which fails to relax conditional branches > 128 bytes; nesting all
-// the format handlers inline produced functions whose internal Bxx
-// targets exceeded that range).
-__attribute__((noinline))
+// Format-spec handlers used to be marked noinline to keep vprintf's
+// main loop small for the long-branch limitation; now W65816BranchExpand
+// reliably promotes Bxx to BRL when needed, so the inliner is free to
+// merge them when it wants.
 static void writeSignedLong(long n) {
     if (n < 0) { putchar('-'); writeULong((unsigned long)(-n)); }
     else        writeULong((unsigned long)n);
@@ -225,7 +221,6 @@ static void writeSignedLong(long n) {
 // non-finite values.  Not IEEE-precise (intermediate truncation in
 // the soft-double mul/div), but good enough for typical formatted
 // numeric output.
-__attribute__((noinline))
 static void writeDouble(double v, int prec) {
     if (prec < 0) prec = 6;
     if (prec > 9) prec = 9;
@@ -585,46 +580,26 @@ int atexit(AtexitFn fn) {
     return 0;
 }
 
-// ---- File I/O via GS/OS toolbox calls ----
+// ---- File I/O stubs ----
 //
-// On a real Apple IIgs running GS/OS, these route through the GS/OS
-// dispatcher at $E100A8.  When running outside GS/OS (e.g., bare
-// MAME tests), every call returns failure so user code degrades
-// gracefully instead of trapping.
-//
-// Pclass-1 parameter blocks are stack-allocated as packed structs
-// matching the GS/OS class-1 layout; we pass the block's pointer
-// and call number to a single helper.
+// A real implementation would route through the GS/OS dispatcher at
+// $E100A8 (build a class-1 parm block, push its pointer, JSL with X
+// = callNum, copy the refNum out).  fopen would maintain a small
+// FD table mapping FILE* magic values back to GS/OS refNums.
+// Until that lands, every call returns failure so code that links
+// against stdio degrades gracefully instead of trapping.
 
-typedef unsigned long u32_t;
-typedef unsigned int  u16_t;
-typedef int           s16_t;
-
-// File descriptor table: fopen returns a FILE* whose 'magic' field
-// holds (u16)refNum + 0x8000 — distinguishing real fds from the
-// pre-baked stdin/stdout/stderr.
-#define FOPEN_MAGIC_BASE 0x8000
-
-// Static table of refNum-bearing FILE objects.  16 simultaneous opens.
-#define MAX_OPEN_FDS 16
-static FILE __fds[MAX_OPEN_FDS];
-static unsigned char __fdInUse[MAX_OPEN_FDS];
-
-// Stub fopen: a real implementation would build a GS/OS Open ($2010)
-// class-1 parm block (pathname as Pascal string, requestAccess, etc.),
-// invoke the dispatcher at $E100A8 with X=callNum and a PHA'd parm-
-// block pointer, then copy the refNum out.  For now, return NULL.
 FILE *fopen(const char *path, const char *mode) {
     (void)path; (void)mode;
     return (FILE *)0;
 }
 
-unsigned int fread(void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
+size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
     (void)ptr; (void)size; (void)nmemb; (void)stream;
     return 0;
 }
 
-unsigned int fwrite(const void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) {
+size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
     (void)ptr; (void)size; (void)nmemb; (void)stream;
     return 0;
 }
diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c
index e5bbbad..ee16452 100644
--- a/runtime/src/snprintf.c
+++ b/runtime/src/snprintf.c
@@ -19,17 +19,24 @@
 // number actually written.  This lets callers detect truncation.
 //
 // **Sink state lives in file-static globals** instead of an explicit
-// struct passed by pointer.  Two W65816 backend bugs forced this:
-//   (1) The address of a stack-resident struct is computed wrong
-//       (&s came out as SP+5 = address of s.end instead of SP+3).
-//       emit() then read garbage cur/end values, the cur >= end branch
-//       skipped every write, and snprintf returned the right length
-//       with an empty buffer.
+// struct passed by pointer.  This was originally a workaround for two
+// W65816 backend bugs (since fixed):
+//   (1) The address of a stack-resident struct used to be computed
+//       wrong (&s came out as SP+5 = address of s.end instead of SP+3).
 //   (2) Functions taking fmt as arg1 (stack) didn't initialize the
 //       fmt local before the loop body — first char came from the
 //       arg slot but the loop's fmt++ ran on uninitialized memory.
-//       Fixed by making fmt always arg0 (A reg).
+// The struct-sink form now compiles correctly, but switching back to it
+// would shift every TU's branch distances; left as-is for stability.
 // Single-threaded use only, but that matches the rest of this runtime.
+//
+// Reverse-emit pattern (used by emitUDec / emitULong / emitHex): the
+// natural countdown forms (`while (i > 0) emit(buf[--i])`,
+// `while (i > 0) { i--; emit(buf[i]); }`,
+// `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
+// do-while whose `dec a; bpl` exit condition runs the loop one
+// extra time on this backend, leaking a `buf[-1]` read.  Use the
+// forward count + index-arithmetic form instead.
 
 typedef unsigned int  size_t;
 typedef __builtin_va_list  va_list;
@@ -75,13 +82,7 @@ static void emitUDec(unsigned int n) {
         buf[i++] = '0' + (n % 10);
         n /= 10;
     }
-    // Reverse-emit using forward index arithmetic.  The natural
-    // countdown forms (`while (i > 0) emit(buf[--i])`,
-    // `while (i > 0) { i--; emit(buf[i]); }`,
-    // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
-    // do-while whose `dec a; bpl` exit condition runs the loop one
-    // extra time on this backend, leaking a buf[-1] read.  The forward
-    // count + index-arithmetic form below avoids the bad lowering.
+    // Reverse-emit; see file header for the forward-index rationale.
     int top = i;
     for (int j = 0; j < top; j++) {
         emit(buf[top - 1 - j]);
@@ -112,13 +113,7 @@ static void emitULong(unsigned long n) {
         buf[i++] = '0' + (n % 10);
         n /= 10;
     }
-    // Reverse-emit using forward index arithmetic.  The natural
-    // countdown forms (`while (i > 0) emit(buf[--i])`,
-    // `while (i > 0) { i--; emit(buf[i]); }`,
-    // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
-    // do-while whose `dec a; bpl` exit condition runs the loop one
-    // extra time on this backend, leaking a buf[-1] read.  The forward
-    // count + index-arithmetic form below avoids the bad lowering.
+    // Reverse-emit; see file header for the forward-index rationale.
     int top = i;
     for (int j = 0; j < top; j++) {
         emit(buf[top - 1 - j]);
@@ -152,13 +147,7 @@ static void emitHex(unsigned int n, int width) {
     while (i < width) {
         buf[i++] = '0';
     }
-    // Reverse-emit using forward index arithmetic.  The natural
-    // countdown forms (`while (i > 0) emit(buf[--i])`,
-    // `while (i > 0) { i--; emit(buf[i]); }`,
-    // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a
-    // do-while whose `dec a; bpl` exit condition runs the loop one
-    // extra time on this backend, leaking a buf[-1] read.  The forward
-    // count + index-arithmetic form below avoids the bad lowering.
+    // Reverse-emit; see file header for the forward-index rationale.
     int top = i;
     for (int j = 0; j < top; j++) {
         emit(buf[top - 1 - j]);
diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh
index 64824b4..35a2e94 100755
--- a/scripts/smokeTest.sh
+++ b/scripts/smokeTest.sh
@@ -1104,10 +1104,10 @@ int toInt(double x) { return (int)x; }
 double fromInt(int n) { return (double)n; }
 EOF
     "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cDblFile" -o "$oDblFile"
-    # softDouble.c uses -regalloc=fast because __muldf3's 64x64 -> 128
+    # softDouble.c builds at -O1 (not -O2): __muldf3's 64x64 -> 128
     # multiply with the inlined alignment shifts overflows the greedy
-    # allocator's spill heuristics on the single-A target.
-    "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \
+    # allocator's spill heuristics on the single-A target at -O2.
+    "$CLANG" --target=w65816 -O1 -ffunction-sections \
         -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdFile"
     "$PROJECT_ROOT/tools/link816" -o "$binDblFile" \
         --text-base 0x8000 --map "$mapDblFile" \
@@ -1304,7 +1304,7 @@ int main(void) {
 }
 EOF
     "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cDblMame" -o "$oDblMame"
-    "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \
+    "$CLANG" --target=w65816 -O1 -ffunction-sections \
         -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdMame"
     "$PROJECT_ROOT/tools/link816" -o "$binDblMame" \
         --text-base 0x1000 \
@@ -1443,7 +1443,7 @@ EOF
             -c "$PROJECT_ROOT/runtime/src/math.c" -o "$oMathF"
         "$CLANG" --target=w65816 -O2 -ffunction-sections \
             -c "$PROJECT_ROOT/runtime/src/softFloat.c" -o "$oSfF"
-        "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \
+        "$CLANG" --target=w65816 -O1 -ffunction-sections \
             -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdF"
         oCrt0F="$(mktemp --suffix=.o)"
         "$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" -arch=w65816 \
@@ -2056,7 +2056,7 @@ EOF
         fi
         rm -f "$cAofFile" "$oAofFile" "$binAofFile"
 
-        log "check: MAME runs iterative qsort([3,1,2]) with optnone → [1,2,3] (#70 workaround)"
+        log "check: MAME runs iterative qsort([3,1,2]) at -O2 → [1,2,3] (#70 fixed)"
         cQsFile="$(mktemp --suffix=.c)"
         oQsFile="$(mktemp --suffix=.o)"
         binQsFile="$(mktemp --suffix=.bin)"
@@ -2067,10 +2067,10 @@ __attribute__((noinline)) void switchToBank2(void) {
 __attribute__((noinline)) void swap(short *a, short *b) {
     short t = *a; *a = *b; *b = t;
 }
-// optnone routes this function through fast regalloc (per the
-// W65816TargetMachine choice), avoiding the greedy spill-ordering
-// bug that mis-compiles the iterative if/else recursion form.
-__attribute__((noinline,optnone))
+// #70 fixed (W65816StackSlotCleanup Pass -2 safety check): greedy
+// regalloc no longer mis-compiles the iterative if/else recursion
+// form.  This compiles at -O2 + greedy now, no per-function workaround.
+__attribute__((noinline))
 void qsortIter(short *arr, short lo, short hi) {
     while (lo < hi) {
         short pivot = arr[hi];
@@ -2108,7 +2108,7 @@ EOF
         if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \
                   "$binQsFile" --check 0x025000=0001 0x025002=0002 \
                   0x025004=0003 >/dev/null 2>&1; then
-            die "MAME: optnone qsort([3,1,2]) wrong (workaround for #70 broken)"
+            die "MAME: -O2 qsort([3,1,2]) wrong (#70 fix regression — Pass -2 safety check)"
         fi
         rm -f "$cQsFile" "$oQsFile" "$binQsFile"
 
diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp
index 160269b..ed5326f 100644
--- a/src/link816/link816.cpp
+++ b/src/link816/link816.cpp
@@ -460,6 +460,21 @@ struct Linker {
         L.bssSize    = curBss;
         L.rodataBase = rodataBase ? rodataBase : (textBase + curText);
         L.rodataSize = curRodata;
+        // Reject a --rodata-base that overlaps text.  Without this
+        // check, the gap between text-end and rodata-base goes
+        // negative, the unsigned subtraction wraps to ~4GB, and the
+        // image-write loop creates a multi-gigabyte file with no
+        // diagnostic.  Caught while sweeping --rodata-base values
+        // in a strtok layout-sensitivity test.
+        if (rodataBase && L.rodataBase < L.textBase + L.textSize) {
+            char msg[160];
+            std::snprintf(msg, sizeof(msg),
+                "--rodata-base 0x%X overlaps text 0x%X+%u "
+                "(rodata must start at or after 0x%X)",
+                L.rodataBase, L.textBase, L.textSize,
+                L.textBase + L.textSize);
+            die(msg);
+        }
         // .init_array goes immediately after .rodata in the image.
         L.initBase = L.rodataBase + L.rodataSize;
         L.initSize = curInit;
diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td
index 550465f..0345959 100644
--- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td
+++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td
@@ -187,10 +187,10 @@ def SELECT_CC8  : W65816Pseudo<(outs Acc8:$dst),
 // NOT model that with `Defs = [P]`.  Adding `Defs = [P]` lets the
 // scheduler legally place an LDA between CMP and Bxx (P just gets
 // re-defined; the latest def is what Bxx tests) — same flag-corruption
-// bug, different mechanism.  The proper fix is the 4-block SELECT_CC
-// inserter (landed) for SETCC patterns and a similar BR_CC stub-block
-// pass (still TODO) for `while`/`for`/`if-goto` tests — see
-// memory/project_known_issue_lda_flags.md.
+// bug, different mechanism.  Two complementary fixes carry the load:
+// the 4-block SELECT_CC inserter for SETCC patterns, and the post-RA
+// PHP/PLP wrap pass (W65816StackSlotCleanup Pass -2.5) for BR_CC
+// patterns (`while`/`for`/`if-goto`).  Both landed.
 let isAsCheapAsAMove = 1, isReMaterializable = 1,
     hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm),
@@ -1257,7 +1257,15 @@ def PEA : InstAbs<0xF4, "pea">;
 def PER : InstPCRel16<0x62, "per">;
 
 //---------------------------------------------------------------- Branches
-let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0 in {
+// Conditional branches READ the P (status) register.  Without this
+// Uses, MachineCSE saw no dependency between an earlier CMP (which
+// defines P) and the consuming Bxx, and would happily reuse a
+// "redundant" CMP whose flags had been clobbered by an intervening
+// LDA/STA/ADC.  Modelling the dep is the principled fix; the
+// W65816TargetMachine workaround that disabled MachineCSE entirely
+// can come back out once this is verified.
+let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0,
+    Uses = [P] in {
 def BEQ : InstPCRel8<0xF0, "beq">;
 def BNE : InstPCRel8<0xD0, "bne">;
 def BCS : InstPCRel8<0xB0, "bcs">;
diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
index f5310d3..32f2b8e 100644
--- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
+++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp
@@ -666,9 +666,9 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
 
   // Pass -2.5: BR_CC flag-corruption mitigation via PHP/PLP.  When a
   // flag-test (CMP/ORA/etc.) is followed by P-corrupting ops (LDA/LDX
-  // /AND/etc.) and then a flag-testing branch (Bxx), the branch ends
-  // up testing the corrupting op's N/Z instead of the test's.  This is
-  // a real correctness bug — `while (n > 0)` always exits on first
+  // /AND/etc.) and then a flag-testing branch (Bxx), the branch would
+  // test the corrupting op's N/Z instead of the test's.  This is a
+  // real correctness bug — `while (n > 0)` always exits on first
   // iteration; `eq_test(0)` returns 0; etc.  Wrap the corrupting span
   // with PHP (push flags) / PLP (pop flags), preserving the test's
   // flags across the corruption.  Costs 2 bytes / 8 cycles per
@@ -680,6 +680,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
   //     and LDA doesn't touch C, so they're not affected.
   //   - There's at least one P-corrupting instruction between the
   //     flag-defining test and the Bxx.
+  //
+  // NOTE: Bxx instructions now declare `Uses = [P]` (W65816InstrInfo.td),
+  // so the pre-RA scheduler / regalloc / peephole won't legally insert
+  // a P-corrupting op between a CMP and the consuming Bxx.  This pass
+  // is now mostly defensive — it stays in place to catch any post-RA
+  // pass that might still violate the dep, and to wrap the rare cases
+  // where the IR-level test is a load (LDA flag side-effect) rather
+  // than an explicit CMP.
   for (MachineBasicBlock &MBB : MF) {
     SmallVector<MachineInstr *, 4> Branches;
     for (MachineInstr &MI : MBB) {
@@ -945,18 +953,9 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
       if (!OpfiTarget) continue;
       // Verify slotB has no OTHER references in this function (besides
       // Sta and OpfiTarget).  If it does, we can't safely drop Sta.
-      bool OtherUse = false;
-      for (MachineBasicBlock &MBBO : MF) {
-        for (MachineInstr &MIO : MBBO) {
-          if (&MIO == &Sta || &MIO == OpfiTarget) continue;
-          for (const MachineOperand &MO : MIO.operands()) {
-            if (MO.isFI() && MO.getIndex() == SlotB) { OtherUse = true; break; }
-          }
-          if (OtherUse) break;
-        }
-        if (OtherUse) break;
-      }
-      if (OtherUse) continue;
+      const MachineInstr *ignoreS2c[] = {&Sta, OpfiTarget};
+      if (slotHasOtherRefs(MF, SlotB, ignoreS2c))
+        continue;
       // Apply rewrite.
       OpfiTarget->getOperand(RewriteIdx).setIndex(SlotA);
       Erased.insert(Lda);
@@ -1026,6 +1025,18 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
           Op.getOperand(FiIdx + 1).getImm() != 0)
         continue;
 
+      // Function-wide safety check: slotB must have no other refs
+      // besides Sta (which we erase) and Op (which we rewrite to use
+      // slotA instead).  Without this, deleting Sta orphans any other
+      // reader of slotB (e.g. a loop-header LDAfi reading the same
+      // copy that this entry-side STA initialises) — surfaces as the
+      // qsort #70 miscompile where greedy parks a value into a slot
+      // both for the immediate CMPfi and for a downstream loop-body
+      // reload, and Pass -2 silently kills the only initialiser.
+      const MachineInstr *ignoreS2[] = {&Sta, &Op};
+      if (slotHasOtherRefs(MF, SlotB, ignoreS2))
+        continue;
+
       // Rewrite OP to use slotA, drop Lda1+Sta.
       Op.getOperand(FiIdx).setIndex(SlotA);
       Lda1->eraseFromParent();
@@ -1097,6 +1108,15 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &Op = *It;
       if (!matchCommutativeFiOpOnSlot(Op, SlotB)) continue;
 
+      // Function-wide safety check: slot_b must have no other refs
+      // besides Sta2 (which we erase) and Op (which we rewrite).
+      // Same class of bug as #70's Pass -2: an inserter-spawned
+      // spill slot can get reused as a regalloc spill home, and
+      // erasing the only initialiser would orphan downstream reads.
+      const MachineInstr *ignoreS1[] = {&Sta2, &Op};
+      if (slotHasOtherRefs(MF, SlotB, ignoreS1))
+        continue;
+
       // Rewrite Op to use slot_a instead of slot_b, erase Sta2 + Lda.
       Op.getOperand(2).setIndex(SlotA);
       Sta2.eraseFromParent();
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index 6ca79fa..bb3072d 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -106,20 +106,12 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 void W65816PassConfig::addMachineSSAOptimization() {
-  // MachineCSE incorrectly eliminates "redundant" CMP instructions when
-  // it sees an earlier identical CMP elsewhere in the function — the
-  // P (status) flag is considered "available", but on this target P is
-  // clobbered by every intervening LDA/STA/ADC, so the surviving Bxx
-  // ends up dispatching on stale flags.  We don't model `Uses=[P]` on
-  // Bxx because doing so causes regalloc/layout shifts that uncovered
-  // a different latent bug in vprintf.  Disabling the pass entirely
-  // is the lower-cost workaround until the Bxx-Uses=[P] regression is
-  // root-caused.  Caught by `printf("%d", n)` returning 0.
-  //
-  // Other SSA opts (early-tailduplication, opt-phis, dead-mi-elim,
-  // licm, machine-sink, peephole-opt, etc.) still run by chaining
-  // through the default impl — we just skip MachineCSE.
-  disablePass(&MachineCSELegacyID);
+  // MachineCSE used to be disabled here because it incorrectly
+  // eliminated "redundant" CMP instructions: P was considered
+  // "available" but on this target P is clobbered by every
+  // intervening LDA/STA/ADC.  The principled fix is to model
+  // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the
+  // pass run normally — that landed in W65816InstrInfo.td.
   TargetPassConfig::addMachineSSAOptimization();
 }