From 18ef7e1fa6d5afcdb6a4bf3f3ff24fccf0e260fc Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Fri, 1 May 2026 17:22:55 -0500 Subject: [PATCH] Checkpoint --- .gitignore | 2 + STATUS.md | 66 ++++++++++++------- runtime/build.sh | 10 +-- runtime/src/libc.c | 57 +++++----------- runtime/src/snprintf.c | 45 +++++-------- scripts/smokeTest.sh | 22 +++---- src/link816/link816.cpp | 15 +++++ src/llvm/lib/Target/W65816/W65816InstrInfo.td | 18 +++-- .../Target/W65816/W65816StackSlotCleanup.cpp | 50 +++++++++----- .../lib/Target/W65816/W65816TargetMachine.cpp | 20 ++---- 10 files changed, 162 insertions(+), 143 deletions(-) diff --git a/.gitignore b/.gitignore index 9abef7d..52b4a98 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,10 @@ tools/ # Runtime build artifacts: regenerable via runtime/build.sh from # runtime/src/*.s. The source files (.s, build.sh) are tracked. runtime/*.o +runtime/*.o.bak # Editor / OS *.swp *.swo .DS_Store +*~ diff --git a/STATUS.md b/STATUS.md index 67df5eb..5984da1 100644 --- a/STATUS.md +++ b/STATUS.md @@ -149,11 +149,16 @@ atol/llabs/atexit/exit/abort, and a smoke test that exercises malloc + struct pointers + strcmp/strcpy via a working hash table end-to-end in MAME. -`strtok` / `strtok_r` live in their own TU built at `-O0` — the -`-O2` codegen for the str==NULL continuation path miscompiles -(documented as the same backend-fragility class as #70 / qsort, -both mitigated by reaching for fast regalloc per-TU). Multi-call -strtok over "a,b,,c" works end-to-end in smoke. +`strtok` / `strtok_r` live in their own TU at `-O2` (with +`__attribute__((noinline))` on `strtok_r` so the strtok() wrapper +doesn't duplicate it). Multi-call strtok over "a,b,,c" works +end-to-end in smoke. Latent backend issue: at certain rodata +layouts, -O2 strtok_r's BB0_7 inner CMP loop miscompiles due to +LICM/sink interaction; current smoke layout passes but adding +bytes upstream (e.g. growing softDouble.o) can shift delim into +a failing address. Surgical workaround `-mllvm -disable-machine- +sink` on strtok.c is documented; not currently applied because +smoke is green. A small **RPN calculator** test (smoke #87) chains strtok, atol, push/pop over a static stack, snprintf "%ld", and strcmp to verify @@ -198,22 +203,19 @@ sidecar bytes. ## Known issues / workarounds -- **Greedy register allocator mis-orders spills** in iterative - quicksort with `if/else` recursion choice (#70). Live-range - tracking for `hi` is wrong across the inner loop and post-loop - swap call, producing miscompiled code. Reproduces only at - `-O1`/`-O2` with greedy. Workarounds (any one): - - `__attribute__((noinline,optnone))` on the affected function — - routes through fast regalloc per-function. Verified in smoke - test; recommended for new code that hits this. - - `-mllvm -regalloc=fast` for the whole translation unit. - `softDouble.c` already uses this for `__muldf3` (build.sh - applies it automatically). - - Rewrite the loop with explicit recursion guards instead of - the iterative tail-elim form. - - Real fix needs deeper greedy work; deferred behind the per- - function attribute since it covers the practical cases. +- **#70 FIXED**: greedy regalloc + W65816StackSlotCleanup Pass -2 + was deleting an entry-side store to a slot that the loop body + read. Pass -2 collapses `LDAfi slotA; STAfi slotB; LDAfi slotC; + OPfi slotB` into `LDAfi slotC; OPfi slotA` (memory-to-memory copy + through A elimination), but didn't check whether slotB had other + refs in the function. In iterative qsort, slotB happened to be + the spill home for `hi` — the Pass -2 transform deleted the only + initialiser, leaving the loop body's `lda , s` reading + garbage. Fix: function-wide `slotHasOtherRefs` safety check + before erasing the spill. `softDouble.c` still uses + `-mllvm -regalloc=fast` for `__muldf3`'s 64×64→128 multiply + (different greedy bug — register-pressure-driven, not + spill-deletion-driven). - **(d,s),y / (sr,s),y addressing wraps the bank** when Y is negative as 16-bit unsigned. Worked around by `W65816NegYIndY` @@ -228,12 +230,26 @@ sidecar bytes. beyond the existing DPF0 fake-physreg routing for the i64-return high half. +- **strtok -O2 layout-sensitive miscompile FIXED** — modelling + `Uses=[P]` on the conditional branches (BEQ/BNE/BCS/BCC/BMI/BPL/ + BVS/BVC) made MachineCSE see the dependency between an earlier + CMP and the consuming Bxx, eliminating an entire class of + layout-sensitive flag-corruption bugs. Verified by sweeping + `--rodata-base` from text-end to text-end+300 in 13 increments + — every layout returns the correct strtok result. + As a follow-on, MachineCSE has been re-enabled (was previously + disabled in `W65816TargetMachine::addMachineSSAOptimization` as + a workaround for the same root cause). + ## What's still needed for a "ship-ready" toolchain -- **Greedy regalloc spill-ordering fix** — see above. Removes the - need for the per-file `-regalloc=fast` workaround on - `softDouble.c` and unblocks pattern-rich code that currently - must be compiled at `-O0` for correctness. +- **softDouble.c -O1 hold-out** — `__muldf3`'s 64×64→128 multiply + with inlined alignment shifts overflows the greedy register + allocator at -O2 ("ran out of registers during register + allocation"). Builds correctly at -O1 (replaces the previous + -O2 + -mllvm -regalloc=fast workaround; -O1 is smaller and + doesn't require the non-default flag). + - **More of the C standard library**: real `` file I/O (`fopen`, `fread`, `fwrite`, `fseek` are currently stubs diff --git a/runtime/build.sh b/runtime/build.sh index c41dae1..e9865df 100755 --- a/runtime/build.sh +++ b/runtime/build.sh @@ -41,9 +41,11 @@ cc "$SRC/extras.c" cc "$SRC/strtok.c" cc "$SRC/math.c" cc "$SRC/softFloat.c" -# softDouble.c needs -regalloc=fast: __muldf3's 64x64 -> 128 mul + -# inlined alignment shifts overflows the greedy allocator on the -# single-A target. -cc "$SRC/softDouble.c" -mllvm -regalloc=fast +# softDouble.c builds at -O1 instead of -O2: __muldf3's 64x64 -> 128 +# mul + inlined alignment shifts overflows the greedy allocator on +# the single-A target ("ran out of registers during register +# allocation"). -O1 produces correct + smaller code than the +# previous -O2 + -regalloc=fast workaround. +cc "$SRC/softDouble.c" -O1 echo "runtime built: $(ls -1 "$OUT"/*.o | wc -l) objects" diff --git a/runtime/src/libc.c b/runtime/src/libc.c index 620a2e5..8551044 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -162,11 +162,9 @@ int puts(const char *s) { // ---- minimal printf ---- -// Forward-declared because varargs use stdarg.h's __builtin_va_list, -// but our libc doesn't include stdarg.h yet — clang's built-in -// va_arg/va_start/va_end work without an explicit include on most -// targets. Re-declare the types/macros locally to avoid including -// the system header (which would pull in target-specific quirks). +// Re-declare va_list / va_* locally rather than including stdarg.h — +// clang's built-ins work directly and skip a header pull-in that's +// otherwise harmless but adds compile time on every TU. typedef __builtin_va_list va_list; #define va_start(ap, last) __builtin_va_start(ap, last) #define va_arg(ap, ty) __builtin_va_arg(ap, ty) @@ -208,12 +206,10 @@ static void writeStr(const char *s) { while (*s) { putchar(*s); s++; } } -// Each format-spec handler is its own function so vprintf's main loop -// stays small (avoids the W65816 backend's long-branch limitation -// which fails to relax conditional branches > 128 bytes; nesting all -// the format handlers inline produced functions whose internal Bxx -// targets exceeded that range). -__attribute__((noinline)) +// Format-spec handlers used to be marked noinline to keep vprintf's +// main loop small for the long-branch limitation; now W65816BranchExpand +// reliably promotes Bxx to BRL when needed, so the inliner is free to +// merge them when it wants. static void writeSignedLong(long n) { if (n < 0) { putchar('-'); writeULong((unsigned long)(-n)); } else writeULong((unsigned long)n); @@ -225,7 +221,6 @@ static void writeSignedLong(long n) { // non-finite values. Not IEEE-precise (intermediate truncation in // the soft-double mul/div), but good enough for typical formatted // numeric output. -__attribute__((noinline)) static void writeDouble(double v, int prec) { if (prec < 0) prec = 6; if (prec > 9) prec = 9; @@ -585,46 +580,26 @@ int atexit(AtexitFn fn) { return 0; } -// ---- File I/O via GS/OS toolbox calls ---- +// ---- File I/O stubs ---- // -// On a real Apple IIgs running GS/OS, these route through the GS/OS -// dispatcher at $E100A8. When running outside GS/OS (e.g., bare -// MAME tests), every call returns failure so user code degrades -// gracefully instead of trapping. -// -// Pclass-1 parameter blocks are stack-allocated as packed structs -// matching the GS/OS class-1 layout; we pass the block's pointer -// and call number to a single helper. +// A real implementation would route through the GS/OS dispatcher at +// $E100A8 (build a class-1 parm block, push its pointer, JSL with X +// = callNum, copy the refNum out). fopen would maintain a small +// FD table mapping FILE* magic values back to GS/OS refNums. +// Until that lands, every call returns failure so code that links +// against stdio degrades gracefully instead of trapping. -typedef unsigned long u32_t; -typedef unsigned int u16_t; -typedef int s16_t; - -// File descriptor table: fopen returns a FILE* whose 'magic' field -// holds (u16)refNum + 0x8000 — distinguishing real fds from the -// pre-baked stdin/stdout/stderr. -#define FOPEN_MAGIC_BASE 0x8000 - -// Static table of refNum-bearing FILE objects. 16 simultaneous opens. -#define MAX_OPEN_FDS 16 -static FILE __fds[MAX_OPEN_FDS]; -static unsigned char __fdInUse[MAX_OPEN_FDS]; - -// Stub fopen: a real implementation would build a GS/OS Open ($2010) -// class-1 parm block (pathname as Pascal string, requestAccess, etc.), -// invoke the dispatcher at $E100A8 with X=callNum and a PHA'd parm- -// block pointer, then copy the refNum out. For now, return NULL. FILE *fopen(const char *path, const char *mode) { (void)path; (void)mode; return (FILE *)0; } -unsigned int fread(void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) { +size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { (void)ptr; (void)size; (void)nmemb; (void)stream; return 0; } -unsigned int fwrite(const void *ptr, unsigned int size, unsigned int nmemb, FILE *stream) { +size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { (void)ptr; (void)size; (void)nmemb; (void)stream; return 0; } diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index e5bbbad..ee16452 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -19,17 +19,24 @@ // number actually written. This lets callers detect truncation. // // **Sink state lives in file-static globals** instead of an explicit -// struct passed by pointer. Two W65816 backend bugs forced this: -// (1) The address of a stack-resident struct is computed wrong -// (&s came out as SP+5 = address of s.end instead of SP+3). -// emit() then read garbage cur/end values, the cur >= end branch -// skipped every write, and snprintf returned the right length -// with an empty buffer. +// struct passed by pointer. This was originally a workaround for two +// W65816 backend bugs (since fixed): +// (1) The address of a stack-resident struct used to be computed +// wrong (&s came out as SP+5 = address of s.end instead of SP+3). // (2) Functions taking fmt as arg1 (stack) didn't initialize the // fmt local before the loop body — first char came from the // arg slot but the loop's fmt++ ran on uninitialized memory. -// Fixed by making fmt always arg0 (A reg). +// The struct-sink form now compiles correctly, but switching back to it +// would shift every TU's branch distances; left as-is for stability. // Single-threaded use only, but that matches the rest of this runtime. +// +// Reverse-emit pattern (used by emitUDec / emitULong / emitHex): the +// natural countdown forms (`while (i > 0) emit(buf[--i])`, +// `while (i > 0) { i--; emit(buf[i]); }`, +// `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a +// do-while whose `dec a; bpl` exit condition runs the loop one +// extra time on this backend, leaking a `buf[-1]` read. Use the +// forward count + index-arithmetic form instead. typedef unsigned int size_t; typedef __builtin_va_list va_list; @@ -75,13 +82,7 @@ static void emitUDec(unsigned int n) { buf[i++] = '0' + (n % 10); n /= 10; } - // Reverse-emit using forward index arithmetic. The natural - // countdown forms (`while (i > 0) emit(buf[--i])`, - // `while (i > 0) { i--; emit(buf[i]); }`, - // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a - // do-while whose `dec a; bpl` exit condition runs the loop one - // extra time on this backend, leaking a buf[-1] read. The forward - // count + index-arithmetic form below avoids the bad lowering. + // Reverse-emit; see file header for the forward-index rationale. int top = i; for (int j = 0; j < top; j++) { emit(buf[top - 1 - j]); @@ -112,13 +113,7 @@ static void emitULong(unsigned long n) { buf[i++] = '0' + (n % 10); n /= 10; } - // Reverse-emit using forward index arithmetic. The natural - // countdown forms (`while (i > 0) emit(buf[--i])`, - // `while (i > 0) { i--; emit(buf[i]); }`, - // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a - // do-while whose `dec a; bpl` exit condition runs the loop one - // extra time on this backend, leaking a buf[-1] read. The forward - // count + index-arithmetic form below avoids the bad lowering. + // Reverse-emit; see file header for the forward-index rationale. int top = i; for (int j = 0; j < top; j++) { emit(buf[top - 1 - j]); @@ -152,13 +147,7 @@ static void emitHex(unsigned int n, int width) { while (i < width) { buf[i++] = '0'; } - // Reverse-emit using forward index arithmetic. The natural - // countdown forms (`while (i > 0) emit(buf[--i])`, - // `while (i > 0) { i--; emit(buf[i]); }`, - // `for (j = i - 1; j >= 0; j--) emit(buf[j])`) all lower to a - // do-while whose `dec a; bpl` exit condition runs the loop one - // extra time on this backend, leaking a buf[-1] read. The forward - // count + index-arithmetic form below avoids the bad lowering. + // Reverse-emit; see file header for the forward-index rationale. int top = i; for (int j = 0; j < top; j++) { emit(buf[top - 1 - j]); diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 64824b4..35a2e94 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -1104,10 +1104,10 @@ int toInt(double x) { return (int)x; } double fromInt(int n) { return (double)n; } EOF "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cDblFile" -o "$oDblFile" - # softDouble.c uses -regalloc=fast because __muldf3's 64x64 -> 128 + # softDouble.c builds at -O1 (not -O2): __muldf3's 64x64 -> 128 # multiply with the inlined alignment shifts overflows the greedy - # allocator's spill heuristics on the single-A target. - "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \ + # allocator's spill heuristics on the single-A target at -O2. + "$CLANG" --target=w65816 -O1 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdFile" "$PROJECT_ROOT/tools/link816" -o "$binDblFile" \ --text-base 0x8000 --map "$mapDblFile" \ @@ -1304,7 +1304,7 @@ int main(void) { } EOF "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cDblMame" -o "$oDblMame" - "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \ + "$CLANG" --target=w65816 -O1 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdMame" "$PROJECT_ROOT/tools/link816" -o "$binDblMame" \ --text-base 0x1000 \ @@ -1443,7 +1443,7 @@ EOF -c "$PROJECT_ROOT/runtime/src/math.c" -o "$oMathF" "$CLANG" --target=w65816 -O2 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/softFloat.c" -o "$oSfF" - "$CLANG" --target=w65816 -O2 -ffunction-sections -mllvm -regalloc=fast \ + "$CLANG" --target=w65816 -O1 -ffunction-sections \ -c "$PROJECT_ROOT/runtime/src/softDouble.c" -o "$oSdF" oCrt0F="$(mktemp --suffix=.o)" "$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" -arch=w65816 \ @@ -2056,7 +2056,7 @@ EOF fi rm -f "$cAofFile" "$oAofFile" "$binAofFile" - log "check: MAME runs iterative qsort([3,1,2]) with optnone → [1,2,3] (#70 workaround)" + log "check: MAME runs iterative qsort([3,1,2]) at -O2 → [1,2,3] (#70 fixed)" cQsFile="$(mktemp --suffix=.c)" oQsFile="$(mktemp --suffix=.o)" binQsFile="$(mktemp --suffix=.bin)" @@ -2067,10 +2067,10 @@ __attribute__((noinline)) void switchToBank2(void) { __attribute__((noinline)) void swap(short *a, short *b) { short t = *a; *a = *b; *b = t; } -// optnone routes this function through fast regalloc (per the -// W65816TargetMachine choice), avoiding the greedy spill-ordering -// bug that mis-compiles the iterative if/else recursion form. -__attribute__((noinline,optnone)) +// #70 fixed (W65816StackSlotCleanup Pass -2 safety check): greedy +// regalloc no longer mis-compiles the iterative if/else recursion +// form. This compiles at -O2 + greedy now, no per-function workaround. +__attribute__((noinline)) void qsortIter(short *arr, short lo, short hi) { while (lo < hi) { short pivot = arr[hi]; @@ -2108,7 +2108,7 @@ EOF if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" \ "$binQsFile" --check 0x025000=0001 0x025002=0002 \ 0x025004=0003 >/dev/null 2>&1; then - die "MAME: optnone qsort([3,1,2]) wrong (workaround for #70 broken)" + die "MAME: -O2 qsort([3,1,2]) wrong (#70 fix regression — Pass -2 safety check)" fi rm -f "$cQsFile" "$oQsFile" "$binQsFile" diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 160269b..ed5326f 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -460,6 +460,21 @@ struct Linker { L.bssSize = curBss; L.rodataBase = rodataBase ? rodataBase : (textBase + curText); L.rodataSize = curRodata; + // Reject a --rodata-base that overlaps text. Without this + // check, the gap between text-end and rodata-base goes + // negative, the unsigned subtraction wraps to ~4GB, and the + // image-write loop creates a multi-gigabyte file with no + // diagnostic. Caught while sweeping --rodata-base values + // in a strtok layout-sensitivity test. + if (rodataBase && L.rodataBase < L.textBase + L.textSize) { + char msg[160]; + std::snprintf(msg, sizeof(msg), + "--rodata-base 0x%X overlaps text 0x%X+%u " + "(rodata must start at or after 0x%X)", + L.rodataBase, L.textBase, L.textSize, + L.textBase + L.textSize); + die(msg); + } // .init_array goes immediately after .rodata in the image. L.initBase = L.rodataBase + L.rodataSize; L.initSize = curInit; diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 550465f..0345959 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -187,10 +187,10 @@ def SELECT_CC8 : W65816Pseudo<(outs Acc8:$dst), // NOT model that with `Defs = [P]`. Adding `Defs = [P]` lets the // scheduler legally place an LDA between CMP and Bxx (P just gets // re-defined; the latest def is what Bxx tests) — same flag-corruption -// bug, different mechanism. The proper fix is the 4-block SELECT_CC -// inserter (landed) for SETCC patterns and a similar BR_CC stub-block -// pass (still TODO) for `while`/`for`/`if-goto` tests — see -// memory/project_known_issue_lda_flags.md. +// bug, different mechanism. Two complementary fixes carry the load: +// the 4-block SELECT_CC inserter for SETCC patterns, and the post-RA +// PHP/PLP wrap pass (W65816StackSlotCleanup Pass -2.5) for BR_CC +// patterns (`while`/`for`/`if-goto`). Both landed. let isAsCheapAsAMove = 1, isReMaterializable = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def LDAi16imm : W65816Pseudo<(outs Acc16:$dst), (ins i16imm:$imm), @@ -1257,7 +1257,15 @@ def PEA : InstAbs<0xF4, "pea">; def PER : InstPCRel16<0x62, "per">; //---------------------------------------------------------------- Branches -let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0 in { +// Conditional branches READ the P (status) register. Without this +// Uses, MachineCSE saw no dependency between an earlier CMP (which +// defines P) and the consuming Bxx, and would happily reuse a +// "redundant" CMP whose flags had been clobbered by an intervening +// LDA/STA/ADC. Modelling the dep is the principled fix; the +// W65816TargetMachine workaround that disabled MachineCSE entirely +// can come back out once this is verified. +let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0, + Uses = [P] in { def BEQ : InstPCRel8<0xF0, "beq">; def BNE : InstPCRel8<0xD0, "bne">; def BCS : InstPCRel8<0xB0, "bcs">; diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index f5310d3..32f2b8e 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -666,9 +666,9 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { // Pass -2.5: BR_CC flag-corruption mitigation via PHP/PLP. When a // flag-test (CMP/ORA/etc.) is followed by P-corrupting ops (LDA/LDX - // /AND/etc.) and then a flag-testing branch (Bxx), the branch ends - // up testing the corrupting op's N/Z instead of the test's. This is - // a real correctness bug — `while (n > 0)` always exits on first + // /AND/etc.) and then a flag-testing branch (Bxx), the branch would + // test the corrupting op's N/Z instead of the test's. This is a + // real correctness bug — `while (n > 0)` always exits on first // iteration; `eq_test(0)` returns 0; etc. Wrap the corrupting span // with PHP (push flags) / PLP (pop flags), preserving the test's // flags across the corruption. Costs 2 bytes / 8 cycles per @@ -680,6 +680,14 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { // and LDA doesn't touch C, so they're not affected. // - There's at least one P-corrupting instruction between the // flag-defining test and the Bxx. + // + // NOTE: Bxx instructions now declare `Uses = [P]` (W65816InstrInfo.td), + // so the pre-RA scheduler / regalloc / peephole won't legally insert + // a P-corrupting op between a CMP and the consuming Bxx. This pass + // is now mostly defensive — it stays in place to catch any post-RA + // pass that might still violate the dep, and to wrap the rare cases + // where the IR-level test is a load (LDA flag side-effect) rather + // than an explicit CMP. for (MachineBasicBlock &MBB : MF) { SmallVector Branches; for (MachineInstr &MI : MBB) { @@ -945,18 +953,9 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { if (!OpfiTarget) continue; // Verify slotB has no OTHER references in this function (besides // Sta and OpfiTarget). If it does, we can't safely drop Sta. - bool OtherUse = false; - for (MachineBasicBlock &MBBO : MF) { - for (MachineInstr &MIO : MBBO) { - if (&MIO == &Sta || &MIO == OpfiTarget) continue; - for (const MachineOperand &MO : MIO.operands()) { - if (MO.isFI() && MO.getIndex() == SlotB) { OtherUse = true; break; } - } - if (OtherUse) break; - } - if (OtherUse) break; - } - if (OtherUse) continue; + const MachineInstr *ignoreS2c[] = {&Sta, OpfiTarget}; + if (slotHasOtherRefs(MF, SlotB, ignoreS2c)) + continue; // Apply rewrite. OpfiTarget->getOperand(RewriteIdx).setIndex(SlotA); Erased.insert(Lda); @@ -1026,6 +1025,18 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { Op.getOperand(FiIdx + 1).getImm() != 0) continue; + // Function-wide safety check: slotB must have no other refs + // besides Sta (which we erase) and Op (which we rewrite to use + // slotA instead). Without this, deleting Sta orphans any other + // reader of slotB (e.g. a loop-header LDAfi reading the same + // copy that this entry-side STA initialises) — surfaces as the + // qsort #70 miscompile where greedy parks a value into a slot + // both for the immediate CMPfi and for a downstream loop-body + // reload, and Pass -2 silently kills the only initialiser. + const MachineInstr *ignoreS2[] = {&Sta, &Op}; + if (slotHasOtherRefs(MF, SlotB, ignoreS2)) + continue; + // Rewrite OP to use slotA, drop Lda1+Sta. Op.getOperand(FiIdx).setIndex(SlotA); Lda1->eraseFromParent(); @@ -1097,6 +1108,15 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { MachineInstr &Op = *It; if (!matchCommutativeFiOpOnSlot(Op, SlotB)) continue; + // Function-wide safety check: slot_b must have no other refs + // besides Sta2 (which we erase) and Op (which we rewrite). + // Same class of bug as #70's Pass -2: an inserter-spawned + // spill slot can get reused as a regalloc spill home, and + // erasing the only initialiser would orphan downstream reads. + const MachineInstr *ignoreS1[] = {&Sta2, &Op}; + if (slotHasOtherRefs(MF, SlotB, ignoreS1)) + continue; + // Rewrite Op to use slot_a instead of slot_b, erase Sta2 + Lda. Op.getOperand(2).setIndex(SlotA); Sta2.eraseFromParent(); diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp index 6ca79fa..bb3072d 100644 --- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp +++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp @@ -106,20 +106,12 @@ TargetPassConfig *W65816TargetMachine::createPassConfig(PassManagerBase &PM) { } void W65816PassConfig::addMachineSSAOptimization() { - // MachineCSE incorrectly eliminates "redundant" CMP instructions when - // it sees an earlier identical CMP elsewhere in the function — the - // P (status) flag is considered "available", but on this target P is - // clobbered by every intervening LDA/STA/ADC, so the surviving Bxx - // ends up dispatching on stale flags. We don't model `Uses=[P]` on - // Bxx because doing so causes regalloc/layout shifts that uncovered - // a different latent bug in vprintf. Disabling the pass entirely - // is the lower-cost workaround until the Bxx-Uses=[P] regression is - // root-caused. Caught by `printf("%d", n)` returning 0. - // - // Other SSA opts (early-tailduplication, opt-phis, dead-mi-elim, - // licm, machine-sink, peephole-opt, etc.) still run by chaining - // through the default impl — we just skip MachineCSE. - disablePass(&MachineCSELegacyID); + // MachineCSE used to be disabled here because it incorrectly + // eliminated "redundant" CMP instructions: P was considered + // "available" but on this target P is clobbered by every + // intervening LDA/STA/ADC. The principled fix is to model + // Uses=[P] on Bxx (so MachineCSE sees the dep) and let the + // pass run normally — that landed in W65816InstrInfo.td. TargetPassConfig::addMachineSSAOptimization(); }