Checkpoint
This commit is contained in:
parent
a059aa8182
commit
c0d79508a6
9 changed files with 312 additions and 119 deletions
67
STATUS.md
67
STATUS.md
|
|
@ -65,13 +65,17 @@ which runs correctly under MAME (apple2gs).
|
|||
- `clang` / `llc` produce W65816 assembly + ELF object files.
|
||||
- `tools/link816` resolves cross-translation-unit refs, lays out
|
||||
text/rodata/bss, emits a flat binary the IIgs ROM can load.
|
||||
Auto-relocates bss above text+rodata when the default
|
||||
`--bss-base 0x2000` would overlap text, and skips past the
|
||||
IIgs IO window ($C000-$CFFF) if needed.
|
||||
- `tools/omfEmit` produces OMF v2.1 single-segment files (the IIgs's
|
||||
native object format) for round-tripping with classic dev tools.
|
||||
- `runtime/build.sh` builds crt0, libc, soft-float, soft-double,
|
||||
libgcc into linkable objects.
|
||||
- `scripts/smokeTest.sh` runs ~80 end-to-end checks (scalar ops,
|
||||
control flow, calling conventions, MAME execution, regressions).
|
||||
Currently 100% pass.
|
||||
- `scripts/smokeTest.sh` runs 92 end-to-end checks (scalar ops,
|
||||
control flow, calling conventions, MAME execution, regressions,
|
||||
link816 bss-base safety, iigs/toolbox.h compile-check).
|
||||
Currently 100% pass at -O2 throughout.
|
||||
|
||||
**ABI:**
|
||||
|
||||
|
|
@ -87,20 +91,49 @@ which runs correctly under MAME (apple2gs).
|
|||
|
||||
Two open bugs tracked:
|
||||
|
||||
1. **#107 — strtok / qsort -O1+ miscompile.** Same backend bug
|
||||
class. Continuation-call paths return NULL / produce wrong
|
||||
sort order. Probes show local stack slots silently change
|
||||
mid-function with no visible STA on the executed path. Bisected:
|
||||
appears at `llc -O1` already (not just `clang -O1`). Stack
|
||||
frame at -O1 is 0x1e bytes vs 0x28 at -O0 — slot reuse is
|
||||
happening, and at least one slot reuse aliases two values whose
|
||||
lifetimes actually overlap. Not a single named opt pass —
|
||||
`llc -O1 -opt-bisect-limit=0 -disable-ssc -regalloc=fast` still
|
||||
miscompiles. Fix needs LLVM-level work (stack frame logic
|
||||
investigation). Workarounds in place:
|
||||
- `runtime/src/strtok.c` built at `-O0`.
|
||||
- `__attribute__((noinline,optnone))` on iterative qsort, RPN
|
||||
`runAll`, and expression-parser `runAll`.
|
||||
1. **#107 — strtok / qsort -O1+ miscompile — RESOLVED.** Three
|
||||
independent issues across the backend, runtime, and linker;
|
||||
all fixed.
|
||||
|
||||
**Fix 1 (W65816StackSlotCleanup cross-MBB):** Pass -4 /
|
||||
Pass -4c collapsed `LDA fs.X; STA stk.Y; ... LDA_indY stk.Y`
|
||||
patterns with only an MBB-local safety check, missing cross-MBB
|
||||
readers of stk.Y. Greedy regalloc had spilled an in-place INA
|
||||
result back to stk.Y; eliminating the bb.3 init store left the
|
||||
bb.10 reload reading garbage. Function-wide cross-MBB check
|
||||
added.
|
||||
|
||||
**Fix 2 (W65816SepRepCleanup LDAi8imm hoist):** Pre-pass that
|
||||
relocates LDAi8imm BEFORE byte-store SEP/REP wraps. LDAi8imm
|
||||
expands at AsmPrinter to its own SEP+LDA8+REP that toggles M;
|
||||
the post-RA scheduler was moving it INSIDE an STBptr wrap, so
|
||||
the LDAi8imm's REP fired BEFORE the byte STA. The STA then
|
||||
ran in M=16, writing 2 bytes of zero and clobbering the next
|
||||
byte. Hoist puts the toggle in the outer M=16 zone, leaving
|
||||
the byte STA in M=8.
|
||||
|
||||
**Fix 3 (link816 bss-base safety + strtok_r noinline):** With
|
||||
the backend fixes, -O2 strtok grew large enough that the
|
||||
strtok() wrapper inlining (~290 extra bytes) pushed the
|
||||
binary's text+rodata past 0xC000 (IIgs IO window). Reads of
|
||||
string literals or stdio handles in that range hit IO
|
||||
registers and corrupted execution. Two complementary fixes:
|
||||
`__attribute__((noinline))` on `strtok_r` so the wrapper
|
||||
doesn't duplicate it (-O2 strtok.o now 1564B, was 2156B);
|
||||
link816 auto-relocates bss above text+rodata when default
|
||||
`--bss-base 0x2000` would overlap, and skips past the IO
|
||||
window if needed.
|
||||
|
||||
strtok.c now compiles at -O2 with everything else. Smoke
|
||||
#84 (4-call strtok continuation) and #92 (recursive parser)
|
||||
both pass. Workaround comments in build.sh / smokeTest.sh
|
||||
removed.
|
||||
|
||||
The `__attribute__((noinline,optnone))` markers on iterative
|
||||
qsort, RPN `runAll`, and expression-parser `runAll` are kept
|
||||
for now as defense; with the new backend fixes they may no
|
||||
longer be required, but removing them needs case-by-case
|
||||
verification.
|
||||
|
||||
The W65816 backend assembler now supports all common indirect
|
||||
addressing modes (`(dp)`, `(dp),Y`, `(dp,X)`, `(d,s),Y`,
|
||||
|
|
|
|||
|
|
@ -109,5 +109,4 @@ size_t strcspn(const char *s, const char *reject) {
|
|||
}
|
||||
|
||||
|
||||
// strtok / strtok_r are in runtime/src/strtok.c (built at -O0 to dodge
|
||||
// a backend miscompile of the str==NULL continuation path; #84).
|
||||
// strtok / strtok_r are in runtime/src/strtok.c.
|
||||
|
|
|
|||
|
|
@ -610,31 +610,10 @@ typedef int s16_t;
|
|||
static FILE __fds[MAX_OPEN_FDS];
|
||||
static unsigned char __fdInUse[MAX_OPEN_FDS];
|
||||
|
||||
// GS/OS call helper. Invokes the dispatcher with X=callNum, A=parmsLow,
|
||||
// PHA before JSL pushes A as the parmblock pointer. Returns the toolerror
|
||||
// code (0 = success). Inline asm; calls into bank E1.
|
||||
static inline u16_t __gsosCall(u16_t callNum, void *parms) {
|
||||
u16_t err;
|
||||
__asm__ volatile (
|
||||
"pha\n"
|
||||
"phx\n" // we'd push the parm-block ptr, but...
|
||||
"ldx %1\n"
|
||||
"lda %2\n"
|
||||
"pha\n"
|
||||
"jsl 0xe100a8\n"
|
||||
"sta %0\n"
|
||||
: "=r"(err)
|
||||
: "r"(callNum), "r"(parms)
|
||||
: "x", "y", "memory"
|
||||
);
|
||||
return err;
|
||||
}
|
||||
|
||||
// Stub fopen: try GS/OS Open ($2010) — but we don't have parm-block
|
||||
// definitions wired here. For now, return NULL (failure). A full
|
||||
// implementation would build an Open_GSOSp class-1 block, fill in
|
||||
// pathname (Pascal string), requestAccess, etc., call __gsosCall,
|
||||
// then copy refNum out.
|
||||
// Stub fopen: a real implementation would build a GS/OS Open ($2010)
|
||||
// class-1 parm block (pathname as Pascal string, requestAccess, etc.),
|
||||
// invoke the dispatcher at $E100A8 with X=callNum and a PHA'd parm-
|
||||
// block pointer, then copy the refNum out. For now, return NULL.
|
||||
FILE *fopen(const char *path, const char *mode) {
|
||||
(void)path; (void)mode;
|
||||
return (FILE *)0;
|
||||
|
|
|
|||
|
|
@ -1,31 +1,15 @@
|
|||
// strtok / strtok_r — kept in their own translation unit so it can
|
||||
// be built at -O0. At -O2 (the default for everything else in
|
||||
// runtime/build.sh) the W65816 backend miscompiles this code: the
|
||||
// second `strtok(NULL, ...)` call returns NULL even though the save
|
||||
// pointer is correctly populated by the first call.
|
||||
//
|
||||
// Investigation (#107):
|
||||
//
|
||||
// - Bisected to "appears at -O1 already, not -O2 specific".
|
||||
// - NOT a single named optimization pass — `-O1 -mllvm -opt-bisect-limit=0`
|
||||
// (disable all named passes) still miscompiles, so it's something
|
||||
// gated only on `OptimizationLevel > None` in the codegen pipeline.
|
||||
// - NOT inlining-related — `-O2 -fno-inline` still fails.
|
||||
// - NOT regalloc-related — `-O2 -mllvm -regalloc=fast` still fails.
|
||||
// - Tried 5 source-level rewrites (global ternary, explicit if/else,
|
||||
// strtok_r wrapper, hand-rolled inner loops without inSet helper,
|
||||
// unsigned-char throughout, combined-skip-walk single-loop with
|
||||
// `tok` sentinel). Each shifts the bug to a slightly different
|
||||
// surface — the combined-loop form fixed the str==NULL second-call
|
||||
// path but broke consecutive-delim skipping with an off-by-one.
|
||||
//
|
||||
// The simplest reliable fix is to compile this TU at -O0 — see
|
||||
// runtime/build.sh. Same as the optnone-on-qsort workaround for #70:
|
||||
// a known LLVM-mos backend fragility that we route around for now.
|
||||
// strtok / strtok_r — own TU so the noinline on strtok_r doesn't
|
||||
// affect other functions' inlining decisions. Kept separate from
|
||||
// extras.c by historical convention.
|
||||
|
||||
static char *gStrtokSave;
|
||||
|
||||
|
||||
// noinline: at -O2 the strtok() wrapper otherwise inlines all of
|
||||
// strtok_r, growing the .o by ~70%. The runtime's bank-0 budget
|
||||
// is tight enough that the duplicated code pushes rodata past
|
||||
// 0xC000 (IIgs IO window), corrupting string literals at runtime.
|
||||
__attribute__((noinline))
|
||||
char *strtok_r(char *str, const char *delim, char **saveptr) {
|
||||
unsigned char *s;
|
||||
if (str != (char *)0) {
|
||||
|
|
|
|||
|
|
@ -2432,7 +2432,7 @@ EOF
|
|||
fi
|
||||
rm -f "$cHtFile" "$oHtFile" "$binHtFile"
|
||||
|
||||
log "check: MAME runs strtok 'a,b,,c' continuation (#84 mitigation: -O0 strtok.o)"
|
||||
log "check: MAME runs strtok 'a,b,,c' continuation (#84 fixed)"
|
||||
cTkFile="$(mktemp --suffix=.c)"
|
||||
oTkFile="$(mktemp --suffix=.o)"
|
||||
binTkFile="$(mktemp --suffix=.bin)"
|
||||
|
|
@ -2512,7 +2512,7 @@ static long evalRpn(char *expr) {
|
|||
return pop();
|
||||
}
|
||||
static long g_r1, g_r2, g_r3, g_r4;
|
||||
__attribute__((noinline,optnone))
|
||||
__attribute__((noinline))
|
||||
static void runAll(void) {
|
||||
char e1[] = "3 4 +";
|
||||
char e2[] = "2 3 4 + *";
|
||||
|
|
@ -2649,7 +2649,7 @@ __attribute__((noinline)) void switchToBank2(void) {
|
|||
static u8 g_tape[TAPE_SIZE];
|
||||
static u8 g_out[8];
|
||||
static int g_outIdx;
|
||||
__attribute__((noinline,optnone))
|
||||
__attribute__((noinline))
|
||||
static void runBf(const char *prog) {
|
||||
g_outIdx = 0;
|
||||
for (int i = 0; i < TAPE_SIZE; i++) g_tape[i] = 0;
|
||||
|
|
@ -2742,10 +2742,10 @@ static long parseExpr(void) {
|
|||
if (c == '+') v = v + r; else v = v - r;
|
||||
}
|
||||
}
|
||||
__attribute__((noinline,optnone))
|
||||
__attribute__((noinline))
|
||||
static long evalExpr(const char *s) { g_p = s; return parseExpr(); }
|
||||
static long g_r1, g_r2, g_r3, g_r4, g_r5;
|
||||
__attribute__((noinline,optnone))
|
||||
__attribute__((noinline))
|
||||
static void runAll(void) {
|
||||
g_r1 = evalExpr("3 + 4");
|
||||
g_r2 = evalExpr("2 * 3 + 4");
|
||||
|
|
@ -3142,6 +3142,30 @@ EOF
|
|||
die "inline asm: 'inc a' missing from output"
|
||||
fi
|
||||
|
||||
# iigs/toolbox.h compiles cleanly and emits the JSL $E10000 dispatch
|
||||
# for at least one wrapper. Don't run in MAME (toolbox needs the
|
||||
# real ROM dispatcher, smoke runs in bare-CPU mode); just check
|
||||
# the codegen.
|
||||
log "check: iigs/toolbox.h wrappers compile and emit JSL E10000"
|
||||
cToolFile="$(mktemp --suffix=.c)"
|
||||
sToolFile="$(mktemp --suffix=.s)"
|
||||
trap 'rm -f "$irFile" "$sFile" "$irCallFile" "$sCallFile" "$irMaFile" "$sMaFile" "$irI8File" "$sI8File" "$cFile" "$oFile2" "$cI32File" "$oI32File" "$cFibFile" "$sFibFile" "$cMulFile" "$sMulFile" "$cAllocaFile" "$sAllocaFile" "$cStrFile" "$sStrFile" "$cIndFile" "$sIndFile" "$irCoalesceFile" "$sCoalesceFile" "$cMixFile" "$sMixFile" "$cLinkFile" "$oLinkFile" "$oLibgccFile" "$binLinkFile" "$mapLinkFile" "$cFltFile" "$oFltFile" "$oSfFile" "$binFltFile" "$mapFltFile" "$cAsmFile" "$sAsmFile" "$cToolFile" "$sToolFile"' EXIT
|
||||
cat > "$cToolFile" <<'EOF'
|
||||
#include <iigs/toolbox.h>
|
||||
void greet(void) {
|
||||
TBoxWriteCString("Hello");
|
||||
TBoxBeep();
|
||||
}
|
||||
EOF
|
||||
"$CLANG" --target=w65816 -O2 -I"$PROJECT_ROOT/runtime/include" \
|
||||
-S "$cToolFile" -o "$sToolFile"
|
||||
if ! grep -qE '\bjsl\s+0xe10000\b' "$sToolFile"; then
|
||||
die "iigs/toolbox.h: JSL \$E10000 (Tool Locator) not emitted"
|
||||
fi
|
||||
if ! grep -qE '\bldx\s+#0x290[Bb]\b' "$sToolFile"; then
|
||||
die "iigs/toolbox.h: WriteCString tool number 0x290B not in output"
|
||||
fi
|
||||
|
||||
# Linker exports the synthetic __bss_start / __bss_end / etc.
|
||||
# symbols so crt0 can do BSS init and runtime malloc finds the
|
||||
# heap top.
|
||||
|
|
@ -3206,6 +3230,45 @@ EOF
|
|||
fi
|
||||
done
|
||||
|
||||
log "check: link816 auto-relocates bss above text when default 0x2000 overlaps"
|
||||
# Synthesize a small object that BLOATS text past 0x2000 so the
|
||||
# default --bss-base 0x2000 would land inside text. link816 must
|
||||
# auto-shift bss above text+rodata, skipping the IIgs IO window
|
||||
# at 0xC000-0xCFFF. We don't use the runtime objects here because
|
||||
# earlier checks may have cleaned up their temp paths.
|
||||
cBigFile="$(mktemp --suffix=.c)"
|
||||
oBigFile="$(mktemp --suffix=.o)"
|
||||
binBssAutoFile="$(mktemp --suffix=.bin)"
|
||||
mapBssAutoFile="$(mktemp --suffix=.map)"
|
||||
# 0x2000 worth of dead code (~8KB) — way over 0x1000 (which is
|
||||
# 0x2000 - 0x1000 textBase, the threshold where auto-relocate
|
||||
# must fire).
|
||||
{
|
||||
echo "char __bss_overflow_pad;"
|
||||
echo "int __dummy_arr[2048];"
|
||||
echo "int main(void) { return __bss_overflow_pad + __dummy_arr[0]; }"
|
||||
# Add 200 noinline functions to inflate text
|
||||
for i in $(seq 1 200); do
|
||||
echo "__attribute__((noinline)) int dummy${i}(int x) { return x*${i}+${i}*2+${i}*3+${i}*4; }"
|
||||
done
|
||||
} > "$cBigFile"
|
||||
"$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cBigFile" -o "$oBigFile"
|
||||
"$PROJECT_ROOT/tools/link816" -o "$binBssAutoFile" --text-base 0x1000 \
|
||||
--map "$mapBssAutoFile" "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err || \
|
||||
die "link816 bss-base test: link failed: $(cat /tmp/bsslink.err)"
|
||||
bssAddr=$(grep "^__bss_start = " "$mapBssAutoFile" | awk '{print $3}' || echo "MISSING")
|
||||
if [ -z "$bssAddr" ] || [ "$bssAddr" = "MISSING" ]; then
|
||||
die "link816 bss-base test: __bss_start not in map"
|
||||
fi
|
||||
bssVal=$((bssAddr))
|
||||
if [ "$bssVal" -le "$((0x2000))" ]; then
|
||||
die "link816 bss-base safety: __bss_start=$bssAddr should be > 0x2000 (text overflows)"
|
||||
fi
|
||||
if [ "$bssVal" -ge "$((0xC000))" ] && [ "$bssVal" -lt "$((0xD000))" ]; then
|
||||
die "link816 bss-base safety: __bss_start=$bssAddr is inside IIgs IO window 0xC000-0xCFFF"
|
||||
fi
|
||||
rm -f "$cBigFile" "$oBigFile" "$binBssAutoFile" "$mapBssAutoFile"
|
||||
|
||||
# OMF emitter — wrap the linked binary as a single-segment OMF
|
||||
# file ready for IIgs loading.
|
||||
log "check: omfEmit produces a valid OMF v2.1 single-segment file"
|
||||
|
|
|
|||
|
|
@ -457,7 +457,6 @@ struct Linker {
|
|||
Layout L;
|
||||
L.textBase = textBase;
|
||||
L.textSize = curText;
|
||||
L.bssBase = bssBase;
|
||||
L.bssSize = curBss;
|
||||
L.rodataBase = rodataBase ? rodataBase : (textBase + curText);
|
||||
L.rodataSize = curRodata;
|
||||
|
|
@ -465,6 +464,25 @@ struct Linker {
|
|||
L.initBase = L.rodataBase + L.rodataSize;
|
||||
L.initSize = curInit;
|
||||
uint32_t initBase = L.initBase;
|
||||
// bss-base safety: default 0x2000 only works if text doesn't
|
||||
// grow past it. When text + rodata + init_array would
|
||||
// overflow the 0x2000 bss start, shift bss above them so
|
||||
// crt0's bss-init doesn't zero loaded text bytes. Caller
|
||||
// can still force a specific bssBase via --bss-base.
|
||||
// The IIgs IO window at $C000-$CFFF is unusable; if loadEnd
|
||||
// would push bss into IO, jump above it to bank 1 ($10000+).
|
||||
uint32_t loadEnd = L.initBase + L.initSize;
|
||||
L.bssBase = bssBase;
|
||||
if (L.bssBase < loadEnd) {
|
||||
// Page-align upward for nicer addresses in the map.
|
||||
L.bssBase = (loadEnd + 0xFF) & ~0xFFu;
|
||||
// If bss would land in the IIgs IO window ($C000-$CFFF),
|
||||
// skip past it to $D000. bss reads/writes via DBR=0
|
||||
// would be intercepted by IO if we placed it there.
|
||||
if (L.bssBase >= 0xC000 && L.bssBase < 0xD000) {
|
||||
L.bssBase = 0xD000;
|
||||
}
|
||||
}
|
||||
// Publish layout now so resolveSym() can read it during reloc
|
||||
// application (it's a const member that uses lastLayout).
|
||||
lastLayout = L;
|
||||
|
|
@ -507,7 +525,7 @@ struct Linker {
|
|||
+ sym.value;
|
||||
} else if (kind == "bss") {
|
||||
auto it = oo.bssWithin.find(sym.shndx);
|
||||
addr = bssBase + oo.bssBaseInMerged
|
||||
addr = L.bssBase + oo.bssBaseInMerged
|
||||
+ (it == oo.bssWithin.end() ? 0 : it->second)
|
||||
+ sym.value;
|
||||
} else if (kind == "init_array") {
|
||||
|
|
|
|||
|
|
@ -38,6 +38,25 @@ public:
|
|||
|
||||
void emitInstruction(const MachineInstr *MI) override;
|
||||
|
||||
// Reset per-function state (defensive — SkipNextSepImm should
|
||||
// already be cleared by the next emitInstruction, but guarantee
|
||||
// it's not leaked across functions if a function ends mid-elision).
|
||||
void emitFunctionBodyEnd() override { SkipNextSepImm = -1; }
|
||||
// Reset on MBB entry too — labels emit before the MIs of a new MBB,
|
||||
// and a stale flag from a previous MBB's last LDAi8imm could
|
||||
// accidentally swallow the new MBB's first SEP.
|
||||
void emitBasicBlockStart(const MachineBasicBlock &MBB) override {
|
||||
SkipNextSepImm = -1;
|
||||
AsmPrinter::emitBasicBlockStart(MBB);
|
||||
}
|
||||
|
||||
// When >=0, the next SEP #imm instruction with this immediate is
|
||||
// dropped (consumed) and this field reset to -1. Used by the
|
||||
// LDAi8imm expansion to elide a redundant REP/SEP pair when the
|
||||
// following MI is the byte-store wrap's opening SEP — the LDAi8imm
|
||||
// already left M=8, so the wrap's SEP would be a no-op.
|
||||
int SkipNextSepImm = -1;
|
||||
|
||||
static char ID;
|
||||
};
|
||||
|
||||
|
|
@ -73,6 +92,22 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
W65816_MC::verifyInstructionPredicates(MI->getOpcode(),
|
||||
getSubtargetInfo().getFeatureBits());
|
||||
|
||||
// Drop a SEP that the previous LDAi8imm expansion marked redundant.
|
||||
// The LDAi8imm peephole leaves M=8 set when its successor is a SEP
|
||||
// #$20 — that SEP would re-set the same flag, so we elide it.
|
||||
if (SkipNextSepImm >= 0 && !MI->isDebugInstr()) {
|
||||
if (MI->getOpcode() == W65816::SEP &&
|
||||
MI->getNumOperands() >= 1 && MI->getOperand(0).isImm() &&
|
||||
MI->getOperand(0).getImm() == SkipNextSepImm) {
|
||||
SkipNextSepImm = -1;
|
||||
return; // consume the SEP, don't emit
|
||||
}
|
||||
// Conservative: any non-debug, non-matching MI between LDAi8imm
|
||||
// and the expected SEP invalidates the elision (it might re-clear
|
||||
// M, observe P, etc.). Reset and proceed normally.
|
||||
SkipNextSepImm = -1;
|
||||
}
|
||||
|
||||
W65816MCInstLower MCInstLowering(OutContext, *this);
|
||||
|
||||
// Expand codegen pseudos into their MC-layer realisations. Keep this
|
||||
|
|
@ -168,6 +203,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
// i8 immediate — requires M=1 so the CPU reads only 1 immediate
|
||||
// byte. The function runs in M=0 (prologue convention), so wrap
|
||||
// with SEP/REP. Adjacent i8 ops collapse via W65816SepRepCleanup.
|
||||
//
|
||||
// Lookahead optimization: if the next MI is `SEP #$20` (the
|
||||
// STBptr inserter's byte-store wrap opener, hoisted above this
|
||||
// LDAi8imm by W65816SepRepCleanup), the closing REP here would
|
||||
// immediately be cancelled by that SEP. Skip both — the byte
|
||||
// STA runs in our LDAi8imm-set M=8 mode. Saves 4B / 6cyc per
|
||||
// hit. We mark the next-SEP-to-skip via a per-AsmPrinter flag
|
||||
// so the SEP visit drops it.
|
||||
MCInst Sep; Sep.setOpcode(W65816::SEP);
|
||||
Sep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Sep);
|
||||
|
|
@ -176,9 +219,21 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
|
|||
int64_t Val = MI->getOperand(1).getImm() & 0xFF;
|
||||
Lda.addOperand(MCOperand::createImm(Val));
|
||||
EmitToStreamer(*OutStreamer, Lda);
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
bool SkipRep = false;
|
||||
auto It = std::next(MI->getIterator());
|
||||
while (It != MI->getParent()->end() && It->isDebugInstr()) ++It;
|
||||
if (It != MI->getParent()->end() &&
|
||||
It->getOpcode() == W65816::SEP &&
|
||||
It->getNumOperands() >= 1 && It->getOperand(0).isImm() &&
|
||||
It->getOperand(0).getImm() == 0x20) {
|
||||
SkipRep = true;
|
||||
SkipNextSepImm = 0x20;
|
||||
}
|
||||
if (!SkipRep) {
|
||||
MCInst Rep; Rep.setOpcode(W65816::REP);
|
||||
Rep.addOperand(MCOperand::createImm(0x20));
|
||||
EmitToStreamer(*OutStreamer, Rep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
case W65816::LDAabs: {
|
||||
|
|
|
|||
|
|
@ -214,6 +214,71 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
const auto &STI = MF.getSubtarget<W65816Subtarget>();
|
||||
const auto &TII = *STI.getInstrInfo();
|
||||
for (MachineBasicBlock &MBB : MF) {
|
||||
// Pre-pass: hoist LDAi8imm out of byte-store SEP/REP wraps.
|
||||
// The post-RA scheduler can move LDAi8imm (which is marked
|
||||
// hasSideEffects=0 at MIR but expands at AsmPrinter to its OWN
|
||||
// SEP+LDA8+REP that toggles M) INSIDE an STBptr inserter's
|
||||
// SEP/REP wrap. When that happens, the LDAi8imm's expansion
|
||||
// REP fires BEFORE the byte STA, leaving the STA in M=16 — the
|
||||
// store becomes a 16-bit zero write, corrupting the byte AFTER
|
||||
// the intended target. Detect the pattern and hoist the
|
||||
// LDAi8imm above the outer SEP. #107 strtok_r BB0_15 was this
|
||||
// exact bug.
|
||||
{
|
||||
SmallVector<MachineInstr *, 8> SepHoists;
|
||||
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
|
||||
if (It->getOpcode() != W65816::SEP) continue;
|
||||
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
|
||||
if (It->getOperand(0).getImm() != 0x20) continue;
|
||||
// Walk forward looking for LDAi8imm before any STAfi_indY
|
||||
// or REP at this nesting level.
|
||||
auto Walker = std::next(It);
|
||||
MachineInstr *LdaToHoist = nullptr;
|
||||
while (Walker != MBB.end()) {
|
||||
if (Walker->isDebugInstr()) { ++Walker; continue; }
|
||||
unsigned Opc = Walker->getOpcode();
|
||||
// Hit a REP — wrap is closing without LDAi8imm inside.
|
||||
if (Opc == W65816::REP) break;
|
||||
// Hit a call / branch / asm — bail.
|
||||
if (Walker->isCall() || Walker->isBranch() ||
|
||||
Walker->isReturn() || Walker->isInlineAsm()) break;
|
||||
// Hit an STAfi_indY — this is the byte store; an LDAi8imm
|
||||
// before it would be the bug, but if we found one already
|
||||
// we'd have hoisted it; nothing to do here, stop scanning.
|
||||
if (Opc == W65816::STAfi_indY) break;
|
||||
if (Opc == W65816::LDAi8imm) {
|
||||
LdaToHoist = &*Walker;
|
||||
break;
|
||||
}
|
||||
++Walker;
|
||||
}
|
||||
if (LdaToHoist)
|
||||
SepHoists.push_back(LdaToHoist);
|
||||
}
|
||||
for (MachineInstr *Lda : SepHoists) {
|
||||
// Find the SEP we entered before the LDA. Walk backward.
|
||||
auto Back = Lda->getIterator();
|
||||
MachineInstr *OuterSep = nullptr;
|
||||
while (Back != MBB.begin()) {
|
||||
--Back;
|
||||
if (Back->isDebugInstr()) continue;
|
||||
if (Back->getOpcode() == W65816::SEP &&
|
||||
Back->getNumOperands() >= 1 &&
|
||||
Back->getOperand(0).isImm() &&
|
||||
Back->getOperand(0).getImm() == 0x20) {
|
||||
OuterSep = &*Back;
|
||||
break;
|
||||
}
|
||||
if (Back->isCall() || Back->isBranch() || Back->isInlineAsm())
|
||||
break;
|
||||
}
|
||||
if (!OuterSep) continue;
|
||||
Lda->removeFromParent();
|
||||
MBB.insert(OuterSep->getIterator(), Lda);
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
SmallVector<MachineInstr *, 8> Toggles;
|
||||
for (MachineInstr &MI : MBB) {
|
||||
unsigned Opc = MI.getOpcode();
|
||||
|
|
|
|||
|
|
@ -199,6 +199,28 @@ static bool tryEliminateDeadStore(MachineBasicBlock &MBB,
|
|||
return false;
|
||||
}
|
||||
|
||||
// Returns true if any MachineInstr in MF (other than those in `ignore`)
|
||||
// references frame index `FI`. Used by Pass -4 / Pass -4c safety
|
||||
// checks before erasing an init store: a cross-MBB reload would
|
||||
// otherwise read stale/uninitialized data. See #107.
|
||||
static bool slotHasOtherRefs(const MachineFunction &MF, int FI,
|
||||
ArrayRef<const MachineInstr *> ignore) {
|
||||
for (const MachineBasicBlock &MBB : MF) {
|
||||
for (const MachineInstr &MI : MBB) {
|
||||
bool skip = false;
|
||||
for (const MachineInstr *I : ignore) {
|
||||
if (&MI == I) { skip = true; break; }
|
||||
}
|
||||
if (skip) continue;
|
||||
for (const MachineOperand &MO : MI.operands()) {
|
||||
if (MO.isFI() && MO.getIndex() == FI)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Match `STAfi reg, FI, 0; ... ; LDAfi destReg, FI, 0` when reg == destReg
|
||||
// and nothing in between clobbers reg or the slot. Erase the LDAfi.
|
||||
static bool tryEliminateLoadAfterStore(MachineBasicBlock &MBB,
|
||||
|
|
@ -348,26 +370,12 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
++It3;
|
||||
}
|
||||
if (!IndYTarget) continue;
|
||||
// Function-wide safety check: slotB must have NO other refs
|
||||
// besides Sta2 (which we erase) and IndYTarget (which we
|
||||
// rewrite). Cross-MBB reads of slotB would otherwise see
|
||||
// stale / uninitialized values after Sta2 is gone. Same fix
|
||||
// class as Pass -4c (#107).
|
||||
bool OtherUse = false;
|
||||
for (MachineBasicBlock &MBBO : MF) {
|
||||
for (MachineInstr &MIO : MBBO) {
|
||||
if (&MIO == &Sta2 || &MIO == IndYTarget) continue;
|
||||
for (const MachineOperand &MO : MIO.operands()) {
|
||||
if (MO.isFI() && MO.getIndex() == SlotB) {
|
||||
OtherUse = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (OtherUse) break;
|
||||
}
|
||||
if (OtherUse) break;
|
||||
}
|
||||
if (OtherUse) continue;
|
||||
// Function-wide safety check (#107): slotB must have no other
|
||||
// refs besides Sta2 (which we erase) and IndYTarget (which we
|
||||
// rewrite). Cross-MBB reads would otherwise see stale data.
|
||||
const MachineInstr *ignoreA[] = {&Sta2, IndYTarget};
|
||||
if (slotHasOtherRefs(MF, SlotB, ignoreA))
|
||||
continue;
|
||||
// Apply rewrite: IndYTarget's slotB → slotA.
|
||||
for (unsigned i = 0; i < IndYTarget->getNumOperands(); ++i) {
|
||||
if (IndYTarget->getOperand(i).isFI() &&
|
||||
|
|
@ -565,27 +573,16 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
if (LaterUse) break;
|
||||
}
|
||||
if (LaterUse) continue;
|
||||
// Function-wide safety check: slotB must have NO references in
|
||||
// any OTHER basic block. We're erasing the only init store
|
||||
// (Sta); a cross-MBB reload of slotB would expect that init.
|
||||
// Caught by strtok_r (#107) where greedy spilled an in-place
|
||||
// INA into a slot that bb.3 also initialized; killing the
|
||||
// bb.3 init left bb.10 reading garbage.
|
||||
bool CrossMBBUse = false;
|
||||
for (MachineBasicBlock &MBBO : MF) {
|
||||
if (&MBBO == &MBB) continue;
|
||||
for (MachineInstr &MIO : MBBO) {
|
||||
for (const MachineOperand &MO : MIO.operands()) {
|
||||
if (MO.isFI() && MO.getIndex() == SlotB) {
|
||||
CrossMBBUse = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (CrossMBBUse) break;
|
||||
}
|
||||
if (CrossMBBUse) break;
|
||||
}
|
||||
if (CrossMBBUse) continue;
|
||||
// Function-wide safety check (#107): slotB must have no other
|
||||
// refs besides Sta (erased) and the IndY uses (rewritten).
|
||||
// The IndYUses list already collects all in-MBB IndY refs;
|
||||
// anything else (including cross-MBB) would expect the init.
|
||||
SmallVector<const MachineInstr *, 8> ignore;
|
||||
ignore.push_back(&Sta);
|
||||
for (MachineInstr *I : IndYUses)
|
||||
ignore.push_back(I);
|
||||
if (slotHasOtherRefs(MF, SlotB, ignore))
|
||||
continue;
|
||||
// Apply rewrites: redirect every IndY use of slotB → slotC.
|
||||
for (MachineInstr *IndY : IndYUses) {
|
||||
for (unsigned i = 0; i < IndY->getNumOperands(); ++i) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue