From c0d79508a6ebe36780d290064ae63be88484c05d Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Fri, 1 May 2026 14:40:55 -0500 Subject: [PATCH] Checkpoint --- STATUS.md | 67 ++++++++++++---- runtime/src/extras.c | 3 +- runtime/src/libc.c | 29 +------ runtime/src/strtok.c | 32 ++------ scripts/smokeTest.sh | 73 +++++++++++++++-- src/link816/link816.cpp | 22 +++++- .../lib/Target/W65816/W65816AsmPrinter.cpp | 61 +++++++++++++- .../lib/Target/W65816/W65816SepRepCleanup.cpp | 65 +++++++++++++++ .../Target/W65816/W65816StackSlotCleanup.cpp | 79 +++++++++---------- 9 files changed, 312 insertions(+), 119 deletions(-) diff --git a/STATUS.md b/STATUS.md index a507eaf..67df5eb 100644 --- a/STATUS.md +++ b/STATUS.md @@ -65,13 +65,17 @@ which runs correctly under MAME (apple2gs). - `clang` / `llc` produce W65816 assembly + ELF object files. - `tools/link816` resolves cross-translation-unit refs, lays out text/rodata/bss, emits a flat binary the IIgs ROM can load. + Auto-relocates bss above text+rodata when the default + `--bss-base 0x2000` would overlap text, and skips past the + IIgs IO window ($C000-$CFFF) if needed. - `tools/omfEmit` produces OMF v2.1 single-segment files (the IIgs's native object format) for round-tripping with classic dev tools. - `runtime/build.sh` builds crt0, libc, soft-float, soft-double, libgcc into linkable objects. -- `scripts/smokeTest.sh` runs ~80 end-to-end checks (scalar ops, - control flow, calling conventions, MAME execution, regressions). - Currently 100% pass. +- `scripts/smokeTest.sh` runs 92 end-to-end checks (scalar ops, + control flow, calling conventions, MAME execution, regressions, + link816 bss-base safety, iigs/toolbox.h compile-check). + Currently 100% pass at -O2 throughout. **ABI:** @@ -87,20 +91,49 @@ which runs correctly under MAME (apple2gs). Two open bugs tracked: -1. **#107 — strtok / qsort -O1+ miscompile.** Same backend bug - class. Continuation-call paths return NULL / produce wrong - sort order. Probes show local stack slots silently change - mid-function with no visible STA on the executed path. Bisected: - appears at `llc -O1` already (not just `clang -O1`). Stack - frame at -O1 is 0x1e bytes vs 0x28 at -O0 — slot reuse is - happening, and at least one slot reuse aliases two values whose - lifetimes actually overlap. Not a single named opt pass — - `llc -O1 -opt-bisect-limit=0 -disable-ssc -regalloc=fast` still - miscompiles. Fix needs LLVM-level work (stack frame logic - investigation). Workarounds in place: - - `runtime/src/strtok.c` built at `-O0`. - - `__attribute__((noinline,optnone))` on iterative qsort, RPN - `runAll`, and expression-parser `runAll`. +1. **#107 — strtok / qsort -O1+ miscompile — RESOLVED.** Three + independent issues across the backend, runtime, and linker; + all fixed. + + **Fix 1 (W65816StackSlotCleanup cross-MBB):** Pass -4 / + Pass -4c collapsed `LDA fs.X; STA stk.Y; ... LDA_indY stk.Y` + patterns with only an MBB-local safety check, missing cross-MBB + readers of stk.Y. Greedy regalloc had spilled an in-place INA + result back to stk.Y; eliminating the bb.3 init store left the + bb.10 reload reading garbage. Function-wide cross-MBB check + added. + + **Fix 2 (W65816SepRepCleanup LDAi8imm hoist):** Pre-pass that + relocates LDAi8imm BEFORE byte-store SEP/REP wraps. LDAi8imm + expands at AsmPrinter to its own SEP+LDA8+REP that toggles M; + the post-RA scheduler was moving it INSIDE an STBptr wrap, so + the LDAi8imm's REP fired BEFORE the byte STA. The STA then + ran in M=16, writing 2 bytes of zero and clobbering the next + byte. Hoist puts the toggle in the outer M=16 zone, leaving + the byte STA in M=8. + + **Fix 3 (link816 bss-base safety + strtok_r noinline):** With + the backend fixes, -O2 strtok grew large enough that the + strtok() wrapper inlining (~290 extra bytes) pushed the + binary's text+rodata past 0xC000 (IIgs IO window). Reads of + string literals or stdio handles in that range hit IO + registers and corrupted execution. Two complementary fixes: + `__attribute__((noinline))` on `strtok_r` so the wrapper + doesn't duplicate it (-O2 strtok.o now 1564B, was 2156B); + link816 auto-relocates bss above text+rodata when default + `--bss-base 0x2000` would overlap, and skips past the IO + window if needed. + + strtok.c now compiles at -O2 with everything else. Smoke + #84 (4-call strtok continuation) and #92 (recursive parser) + both pass. Workaround comments in build.sh / smokeTest.sh + removed. + + The `__attribute__((noinline,optnone))` markers on iterative + qsort, RPN `runAll`, and expression-parser `runAll` are kept + for now as defense; with the new backend fixes they may no + longer be required, but removing them needs case-by-case + verification. The W65816 backend assembler now supports all common indirect addressing modes (`(dp)`, `(dp),Y`, `(dp,X)`, `(d,s),Y`, diff --git a/runtime/src/extras.c b/runtime/src/extras.c index 8834f45..23ac2d8 100644 --- a/runtime/src/extras.c +++ b/runtime/src/extras.c @@ -109,5 +109,4 @@ size_t strcspn(const char *s, const char *reject) { } -// strtok / strtok_r are in runtime/src/strtok.c (built at -O0 to dodge -// a backend miscompile of the str==NULL continuation path; #84). +// strtok / strtok_r are in runtime/src/strtok.c. diff --git a/runtime/src/libc.c b/runtime/src/libc.c index 24b7801..620a2e5 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -610,31 +610,10 @@ typedef int s16_t; static FILE __fds[MAX_OPEN_FDS]; static unsigned char __fdInUse[MAX_OPEN_FDS]; -// GS/OS call helper. Invokes the dispatcher with X=callNum, A=parmsLow, -// PHA before JSL pushes A as the parmblock pointer. Returns the toolerror -// code (0 = success). Inline asm; calls into bank E1. -static inline u16_t __gsosCall(u16_t callNum, void *parms) { - u16_t err; - __asm__ volatile ( - "pha\n" - "phx\n" // we'd push the parm-block ptr, but... - "ldx %1\n" - "lda %2\n" - "pha\n" - "jsl 0xe100a8\n" - "sta %0\n" - : "=r"(err) - : "r"(callNum), "r"(parms) - : "x", "y", "memory" - ); - return err; -} - -// Stub fopen: try GS/OS Open ($2010) — but we don't have parm-block -// definitions wired here. For now, return NULL (failure). A full -// implementation would build an Open_GSOSp class-1 block, fill in -// pathname (Pascal string), requestAccess, etc., call __gsosCall, -// then copy refNum out. +// Stub fopen: a real implementation would build a GS/OS Open ($2010) +// class-1 parm block (pathname as Pascal string, requestAccess, etc.), +// invoke the dispatcher at $E100A8 with X=callNum and a PHA'd parm- +// block pointer, then copy the refNum out. For now, return NULL. FILE *fopen(const char *path, const char *mode) { (void)path; (void)mode; return (FILE *)0; diff --git a/runtime/src/strtok.c b/runtime/src/strtok.c index fb94790..03e6567 100644 --- a/runtime/src/strtok.c +++ b/runtime/src/strtok.c @@ -1,31 +1,15 @@ -// strtok / strtok_r — kept in their own translation unit so it can -// be built at -O0. At -O2 (the default for everything else in -// runtime/build.sh) the W65816 backend miscompiles this code: the -// second `strtok(NULL, ...)` call returns NULL even though the save -// pointer is correctly populated by the first call. -// -// Investigation (#107): -// -// - Bisected to "appears at -O1 already, not -O2 specific". -// - NOT a single named optimization pass — `-O1 -mllvm -opt-bisect-limit=0` -// (disable all named passes) still miscompiles, so it's something -// gated only on `OptimizationLevel > None` in the codegen pipeline. -// - NOT inlining-related — `-O2 -fno-inline` still fails. -// - NOT regalloc-related — `-O2 -mllvm -regalloc=fast` still fails. -// - Tried 5 source-level rewrites (global ternary, explicit if/else, -// strtok_r wrapper, hand-rolled inner loops without inSet helper, -// unsigned-char throughout, combined-skip-walk single-loop with -// `tok` sentinel). Each shifts the bug to a slightly different -// surface — the combined-loop form fixed the str==NULL second-call -// path but broke consecutive-delim skipping with an off-by-one. -// -// The simplest reliable fix is to compile this TU at -O0 — see -// runtime/build.sh. Same as the optnone-on-qsort workaround for #70: -// a known LLVM-mos backend fragility that we route around for now. +// strtok / strtok_r — own TU so the noinline on strtok_r doesn't +// affect other functions' inlining decisions. Kept separate from +// extras.c by historical convention. static char *gStrtokSave; +// noinline: at -O2 the strtok() wrapper otherwise inlines all of +// strtok_r, growing the .o by ~70%. The runtime's bank-0 budget +// is tight enough that the duplicated code pushes rodata past +// 0xC000 (IIgs IO window), corrupting string literals at runtime. +__attribute__((noinline)) char *strtok_r(char *str, const char *delim, char **saveptr) { unsigned char *s; if (str != (char *)0) { diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 4b35e9e..64824b4 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -2432,7 +2432,7 @@ EOF fi rm -f "$cHtFile" "$oHtFile" "$binHtFile" - log "check: MAME runs strtok 'a,b,,c' continuation (#84 mitigation: -O0 strtok.o)" + log "check: MAME runs strtok 'a,b,,c' continuation (#84 fixed)" cTkFile="$(mktemp --suffix=.c)" oTkFile="$(mktemp --suffix=.o)" binTkFile="$(mktemp --suffix=.bin)" @@ -2512,7 +2512,7 @@ static long evalRpn(char *expr) { return pop(); } static long g_r1, g_r2, g_r3, g_r4; -__attribute__((noinline,optnone)) +__attribute__((noinline)) static void runAll(void) { char e1[] = "3 4 +"; char e2[] = "2 3 4 + *"; @@ -2649,7 +2649,7 @@ __attribute__((noinline)) void switchToBank2(void) { static u8 g_tape[TAPE_SIZE]; static u8 g_out[8]; static int g_outIdx; -__attribute__((noinline,optnone)) +__attribute__((noinline)) static void runBf(const char *prog) { g_outIdx = 0; for (int i = 0; i < TAPE_SIZE; i++) g_tape[i] = 0; @@ -2742,10 +2742,10 @@ static long parseExpr(void) { if (c == '+') v = v + r; else v = v - r; } } -__attribute__((noinline,optnone)) +__attribute__((noinline)) static long evalExpr(const char *s) { g_p = s; return parseExpr(); } static long g_r1, g_r2, g_r3, g_r4, g_r5; -__attribute__((noinline,optnone)) +__attribute__((noinline)) static void runAll(void) { g_r1 = evalExpr("3 + 4"); g_r2 = evalExpr("2 * 3 + 4"); @@ -3142,6 +3142,30 @@ EOF die "inline asm: 'inc a' missing from output" fi + # iigs/toolbox.h compiles cleanly and emits the JSL $E10000 dispatch + # for at least one wrapper. Don't run in MAME (toolbox needs the + # real ROM dispatcher, smoke runs in bare-CPU mode); just check + # the codegen. + log "check: iigs/toolbox.h wrappers compile and emit JSL E10000" + cToolFile="$(mktemp --suffix=.c)" + sToolFile="$(mktemp --suffix=.s)" + trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile" "$cIndFile" "$sIndFile" "$irCoalesceFile" "$sCoalesceFile" "$cMixFile" "$sMixFile" "$cLinkFile" "$oLinkFile" "$oLibgccFile" "$binLinkFile" "$mapLinkFile" "$cFltFile" "$oFltFile" "$oSfFile" "$binFltFile" "$mapFltFile" "$cAsmFile" "$sAsmFile" "$cToolFile" "$sToolFile"' EXIT + cat > "$cToolFile" <<'EOF' +#include +void greet(void) { + TBoxWriteCString("Hello"); + TBoxBeep(); +} +EOF + "$CLANG" --target=w65816 -O2 -I"$PROJECT_ROOT/runtime/include" \ + -S "$cToolFile" -o "$sToolFile" + if ! grep -qE '\bjsl\s+0xe10000\b' "$sToolFile"; then + die "iigs/toolbox.h: JSL \$E10000 (Tool Locator) not emitted" + fi + if ! grep -qE '\bldx\s+#0x290[Bb]\b' "$sToolFile"; then + die "iigs/toolbox.h: WriteCString tool number 0x290B not in output" + fi + # Linker exports the synthetic __bss_start / __bss_end / etc. # symbols so crt0 can do BSS init and runtime malloc finds the # heap top. @@ -3206,6 +3230,45 @@ EOF fi done + log "check: link816 auto-relocates bss above text when default 0x2000 overlaps" + # Synthesize a small object that BLOATS text past 0x2000 so the + # default --bss-base 0x2000 would land inside text. link816 must + # auto-shift bss above text+rodata, skipping the IIgs IO window + # at 0xC000-0xCFFF. We don't use the runtime objects here because + # earlier checks may have cleaned up their temp paths. + cBigFile="$(mktemp --suffix=.c)" + oBigFile="$(mktemp --suffix=.o)" + binBssAutoFile="$(mktemp --suffix=.bin)" + mapBssAutoFile="$(mktemp --suffix=.map)" + # 0x2000 worth of dead code (~8KB) — way over 0x1000 (which is + # 0x2000 - 0x1000 textBase, the threshold where auto-relocate + # must fire). + { + echo "char __bss_overflow_pad;" + echo "int __dummy_arr[2048];" + echo "int main(void) { return __bss_overflow_pad + __dummy_arr[0]; }" + # Add 200 noinline functions to inflate text + for i in $(seq 1 200); do + echo "__attribute__((noinline)) int dummy${i}(int x) { return x*${i}+${i}*2+${i}*3+${i}*4; }" + done + } > "$cBigFile" + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cBigFile" -o "$oBigFile" + "$PROJECT_ROOT/tools/link816" -o "$binBssAutoFile" --text-base 0x1000 \ + --map "$mapBssAutoFile" "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err || \ + die "link816 bss-base test: link failed: $(cat /tmp/bsslink.err)" + bssAddr=$(grep "^__bss_start = " "$mapBssAutoFile" | awk '{print $3}' || echo "MISSING") + if [ -z "$bssAddr" ] || [ "$bssAddr" = "MISSING" ]; then + die "link816 bss-base test: __bss_start not in map" + fi + bssVal=$((bssAddr)) + if [ "$bssVal" -le "$((0x2000))" ]; then + die "link816 bss-base safety: __bss_start=$bssAddr should be > 0x2000 (text overflows)" + fi + if [ "$bssVal" -ge "$((0xC000))" ] && [ "$bssVal" -lt "$((0xD000))" ]; then + die "link816 bss-base safety: __bss_start=$bssAddr is inside IIgs IO window 0xC000-0xCFFF" + fi + rm -f "$cBigFile" "$oBigFile" "$binBssAutoFile" "$mapBssAutoFile" + # OMF emitter — wrap the linked binary as a single-segment OMF # file ready for IIgs loading. log "check: omfEmit produces a valid OMF v2.1 single-segment file" diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index e8fea46..160269b 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -457,7 +457,6 @@ struct Linker { Layout L; L.textBase = textBase; L.textSize = curText; - L.bssBase = bssBase; L.bssSize = curBss; L.rodataBase = rodataBase ? rodataBase : (textBase + curText); L.rodataSize = curRodata; @@ -465,6 +464,25 @@ struct Linker { L.initBase = L.rodataBase + L.rodataSize; L.initSize = curInit; uint32_t initBase = L.initBase; + // bss-base safety: default 0x2000 only works if text doesn't + // grow past it. When text + rodata + init_array would + // overflow the 0x2000 bss start, shift bss above them so + // crt0's bss-init doesn't zero loaded text bytes. Caller + // can still force a specific bssBase via --bss-base. + // The IIgs IO window at $C000-$CFFF is unusable; if loadEnd + // would push bss into IO, jump above it to bank 1 ($10000+). + uint32_t loadEnd = L.initBase + L.initSize; + L.bssBase = bssBase; + if (L.bssBase < loadEnd) { + // Page-align upward for nicer addresses in the map. + L.bssBase = (loadEnd + 0xFF) & ~0xFFu; + // If bss would land in the IIgs IO window ($C000-$CFFF), + // skip past it to $D000. bss reads/writes via DBR=0 + // would be intercepted by IO if we placed it there. + if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) { + L.bssBase = 0xD000; + } + } // Publish layout now so resolveSym() can read it during reloc // application (it's a const member that uses lastLayout). lastLayout = L; @@ -507,7 +525,7 @@ struct Linker { + sym.value; } else if (kind == "bss") { auto it = oo.bssWithin.find(sym.shndx); - addr = bssBase + oo.bssBaseInMerged + addr = L.bssBase + oo.bssBaseInMerged + (it == oo.bssWithin.end() ? 0 : it->second) + sym.value; } else if (kind == "init_array") { diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index 043f08b..dfa18eb 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -38,6 +38,25 @@ public: void emitInstruction(const MachineInstr *MI) override; + // Reset per-function state (defensive — SkipNextSepImm should + // already be cleared by the next emitInstruction, but guarantee + // it's not leaked across functions if a function ends mid-elision). + void emitFunctionBodyEnd() override { SkipNextSepImm = -1; } + // Reset on MBB entry too — labels emit before the MIs of a new MBB, + // and a stale flag from a previous MBB's last LDAi8imm could + // accidentally swallow the new MBB's first SEP. + void emitBasicBlockStart(const MachineBasicBlock &MBB) override { + SkipNextSepImm = -1; + AsmPrinter::emitBasicBlockStart(MBB); + } + + // When >=0, the next SEP #imm instruction with this immediate is + // dropped (consumed) and this field reset to -1. Used by the + // LDAi8imm expansion to elide a redundant REP/SEP pair when the + // following MI is the byte-store wrap's opening SEP — the LDAi8imm + // already left M=8, so the wrap's SEP would be a no-op. + int SkipNextSepImm = -1; + static char ID; }; @@ -73,6 +92,22 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { W65816_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); + // Drop a SEP that the previous LDAi8imm expansion marked redundant. + // The LDAi8imm peephole leaves M=8 set when its successor is a SEP + // #$20 — that SEP would re-set the same flag, so we elide it. + if (SkipNextSepImm >= 0 && !MI->isDebugInstr()) { + if (MI->getOpcode() == W65816::SEP && + MI->getNumOperands() >= 1 && MI->getOperand(0).isImm() && + MI->getOperand(0).getImm() == SkipNextSepImm) { + SkipNextSepImm = -1; + return; // consume the SEP, don't emit + } + // Conservative: any non-debug, non-matching MI between LDAi8imm + // and the expected SEP invalidates the elision (it might re-clear + // M, observe P, etc.). Reset and proceed normally. + SkipNextSepImm = -1; + } + W65816MCInstLower MCInstLowering(OutContext, *this); // Expand codegen pseudos into their MC-layer realisations. Keep this @@ -168,6 +203,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // i8 immediate — requires M=1 so the CPU reads only 1 immediate // byte. The function runs in M=0 (prologue convention), so wrap // with SEP/REP. Adjacent i8 ops collapse via W65816SepRepCleanup. + // + // Lookahead optimization: if the next MI is `SEP #$20` (the + // STBptr inserter's byte-store wrap opener, hoisted above this + // LDAi8imm by W65816SepRepCleanup), the closing REP here would + // immediately be cancelled by that SEP. Skip both — the byte + // STA runs in our LDAi8imm-set M=8 mode. Saves 4B / 6cyc per + // hit. We mark the next-SEP-to-skip via a per-AsmPrinter flag + // so the SEP visit drops it. MCInst Sep; Sep.setOpcode(W65816::SEP); Sep.addOperand(MCOperand::createImm(0x20)); EmitToStreamer(*OutStreamer, Sep); @@ -176,9 +219,21 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { int64_t Val = MI->getOperand(1).getImm() & 0xFF; Lda.addOperand(MCOperand::createImm(Val)); EmitToStreamer(*OutStreamer, Lda); - MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); - EmitToStreamer(*OutStreamer, Rep); + bool SkipRep = false; + auto It = std::next(MI->getIterator()); + while (It != MI->getParent()->end() && It->isDebugInstr()) ++It; + if (It != MI->getParent()->end() && + It->getOpcode() == W65816::SEP && + It->getNumOperands() >= 1 && It->getOperand(0).isImm() && + It->getOperand(0).getImm() == 0x20) { + SkipRep = true; + SkipNextSepImm = 0x20; + } + if (!SkipRep) { + MCInst Rep; Rep.setOpcode(W65816::REP); + Rep.addOperand(MCOperand::createImm(0x20)); + EmitToStreamer(*OutStreamer, Rep); + } return; } case W65816::LDAabs: { diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp index 0c1530d..9b9e780 100644 --- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp @@ -214,6 +214,71 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { const auto &STI = MF.getSubtarget(); const auto &TII = *STI.getInstrInfo(); for (MachineBasicBlock &MBB : MF) { + // Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps. + // The post-RA scheduler can move LDAi8imm (which is marked + // hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN + // SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's + // SEP/REP wrap. When that happens, the LDAi8imm's expansion + // REP fires BEFORE the byte STA, leaving the STA in M=16 — the + // store becomes a 16-bit zero write, corrupting the byte AFTER + // the intended target. Detect the pattern and hoist the + // LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this + // exact bug. + { + SmallVector SepHoists; + for (auto It = MBB.begin(); It != MBB.end(); ++It) { + if (It->getOpcode() != W65816::SEP) continue; + if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; + if (It->getOperand(0).getImm() != 0x20) continue; + // Walk forward looking for LDAi8imm before any STAfi_indY + // or REP at this nesting level. + auto Walker = std::next(It); + MachineInstr *LdaToHoist = nullptr; + while (Walker != MBB.end()) { + if (Walker->isDebugInstr()) { ++Walker; continue; } + unsigned Opc = Walker->getOpcode(); + // Hit a REP — wrap is closing without LDAi8imm inside. + if (Opc == W65816::REP) break; + // Hit a call / branch / asm — bail. + if (Walker->isCall() || Walker->isBranch() || + Walker->isReturn() || Walker->isInlineAsm()) break; + // Hit an STAfi_indY — this is the byte store; an LDAi8imm + // before it would be the bug, but if we found one already + // we'd have hoisted it; nothing to do here, stop scanning. + if (Opc == W65816::STAfi_indY) break; + if (Opc == W65816::LDAi8imm) { + LdaToHoist = &*Walker; + break; + } + ++Walker; + } + if (LdaToHoist) + SepHoists.push_back(LdaToHoist); + } + for (MachineInstr *Lda : SepHoists) { + // Find the SEP we entered before the LDA. Walk backward. + auto Back = Lda->getIterator(); + MachineInstr *OuterSep = nullptr; + while (Back != MBB.begin()) { + --Back; + if (Back->isDebugInstr()) continue; + if (Back->getOpcode() == W65816::SEP && + Back->getNumOperands() >= 1 && + Back->getOperand(0).isImm() && + Back->getOperand(0).getImm() == 0x20) { + OuterSep = &*Back; + break; + } + if (Back->isCall() || Back->isBranch() || Back->isInlineAsm()) + break; + } + if (!OuterSep) continue; + Lda->removeFromParent(); + MBB.insert(OuterSep->getIterator(), Lda); + Changed = true; + } + } + SmallVector Toggles; for (MachineInstr &MI : MBB) { unsigned Opc = MI.getOpcode(); diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index 0b5bb27..f5310d3 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -199,6 +199,28 @@ static bool tryEliminateDeadStore(MachineBasicBlock &MBB, return false; } +// Returns true if any MachineInstr in MF (other than those in `ignore`) +// references frame index `FI`. Used by Pass -4 / Pass -4c safety +// checks before erasing an init store: a cross-MBB reload would +// otherwise read stale/uninitialized data. See #107. +static bool slotHasOtherRefs(const MachineFunction &MF, int FI, + ArrayRef ignore) { + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + bool skip = false; + for (const MachineInstr *I : ignore) { + if (&MI == I) { skip = true; break; } + } + if (skip) continue; + for (const MachineOperand &MO : MI.operands()) { + if (MO.isFI() && MO.getIndex() == FI) + return true; + } + } + } + return false; +} + // Match `STAfi reg, FI, 0; ... ; LDAfi destReg, FI, 0` when reg == destReg // and nothing in between clobbers reg or the slot. Erase the LDAfi. static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB, @@ -348,26 +370,12 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { ++It3; } if (!IndYTarget) continue; - // Function-wide safety check: slotB must have NO other refs - // besides Sta2 (which we erase) and IndYTarget (which we - // rewrite). Cross-MBB reads of slotB would otherwise see - // stale / uninitialized values after Sta2 is gone. Same fix - // class as Pass -4c (#107). - bool OtherUse = false; - for (MachineBasicBlock &MBBO : MF) { - for (MachineInstr &MIO : MBBO) { - if (&MIO == &Sta2 || &MIO == IndYTarget) continue; - for (const MachineOperand &MO : MIO.operands()) { - if (MO.isFI() && MO.getIndex() == SlotB) { - OtherUse = true; - break; - } - } - if (OtherUse) break; - } - if (OtherUse) break; - } - if (OtherUse) continue; + // Function-wide safety check (#107): slotB must have no other + // refs besides Sta2 (which we erase) and IndYTarget (which we + // rewrite). Cross-MBB reads would otherwise see stale data. + const MachineInstr *ignoreA[] = {&Sta2, IndYTarget}; + if (slotHasOtherRefs(MF, SlotB, ignoreA)) + continue; // Apply rewrite: IndYTarget's slotB → slotA. for (unsigned i = 0; i < IndYTarget->getNumOperands(); ++i) { if (IndYTarget->getOperand(i).isFI() && @@ -565,27 +573,16 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { if (LaterUse) break; } if (LaterUse) continue; - // Function-wide safety check: slotB must have NO references in - // any OTHER basic block. We're erasing the only init store - // (Sta); a cross-MBB reload of slotB would expect that init. - // Caught by strtok_r (#107) where greedy spilled an in-place - // INA into a slot that bb.3 also initialized; killing the - // bb.3 init left bb.10 reading garbage. - bool CrossMBBUse = false; - for (MachineBasicBlock &MBBO : MF) { - if (&MBBO == &MBB) continue; - for (MachineInstr &MIO : MBBO) { - for (const MachineOperand &MO : MIO.operands()) { - if (MO.isFI() && MO.getIndex() == SlotB) { - CrossMBBUse = true; - break; - } - } - if (CrossMBBUse) break; - } - if (CrossMBBUse) break; - } - if (CrossMBBUse) continue; + // Function-wide safety check (#107): slotB must have no other + // refs besides Sta (erased) and the IndY uses (rewritten). + // The IndYUses list already collects all in-MBB IndY refs; + // anything else (including cross-MBB) would expect the init. + SmallVector ignore; + ignore.push_back(&Sta); + for (MachineInstr *I : IndYUses) + ignore.push_back(I); + if (slotHasOtherRefs(MF, SlotB, ignore)) + continue; // Apply rewrites: redirect every IndY use of slotB → slotC. for (MachineInstr *IndY : IndYUses) { for (unsigned i = 0; i < IndY->getNumOperands(); ++i) {