From 81694c59714663318245d46a80eca31f2c662685 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sat, 2 May 2026 19:17:23 -0500 Subject: [PATCH] Checkpoint --- STATUS.md | 50 ++---- runtime/include/time.h | 5 + runtime/src/crt0.s | 29 ++-- runtime/src/libc.c | 139 ++++++++++++++-- scripts/smokeTest.sh | 72 +++++++- src/link816/link816.cpp | 27 +-- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + src/llvm/lib/Target/W65816/W65816.h | 7 + .../Target/W65816/W65816PreSpillCrossCall.cpp | 155 ++++++++++++++++++ .../lib/Target/W65816/W65816TargetMachine.cpp | 43 ++++- 10 files changed, 438 insertions(+), 90 deletions(-) create mode 100644 src/llvm/lib/Target/W65816/W65816PreSpillCrossCall.cpp diff --git a/STATUS.md b/STATUS.md index 4acd433..f1130f7 100644 --- a/STATUS.md +++ b/STATUS.md @@ -130,7 +130,13 @@ which runs correctly under MAME (apple2gs). **Backend register allocation:** -- Greedy regalloc as default at -O1+; fast at -O0/optnone. +- Basic regalloc as default at -O1+; fast at -O0/optnone. We use + basic instead of greedy because greedy fails ("ran out of + registers during register allocation") on functions with many + cross-call Acc16 vregs (the `ok |= bit; helper(); ok |= bit;` + pattern across many if-blocks). Basic handles those cleanly + with negligible code-size overhead vs greedy on the bench + suite (~0.6%). - Pre-RA passes: `WidenAcc16` (Acc16→Wide16 promotion, lets greedy spread i16 pressure across A and 16 IMG slots); `TiedDefSpill` (handles tied-def-multi-use hazard); @@ -179,39 +185,15 @@ which runs correctly under MAME (apple2gs). ## In flight -- **Greedy regalloc fails on long-arg call chains** — a function - that strings ~7+ independent `helper(longArg1, longArg2)` calls - overflows greedy at -O1+ with "ran out of registers during - register allocation". 
IMG slot expansion (8→16) raised the - threshold; most "normal-looking" mixed-arity workloads now - compile, but pathological pressure (many i32+ args + bitmask - SETCC chain in one function) still fails. Workarounds: mark - the heaviest helper `__attribute__((noinline))`; or - `-mllvm -regalloc=fast` for that TU; or `__attribute__((optnone))` - on the affected function. Proper fix needs either a custom - greedy→fast fallback in - `W65816TargetMachine::createTargetRegisterAllocator` or a - smarter spill-placement pre-RA pass. - -- **`time()` / `clock()` are stubs** returning 0. ReadTimeHex - (Misc Tool $0D03) needs the Tool Locator initialised in crt0 - to not crash MAME; the VBL counter at $E1006B needs 24-bit - far-pointer support that the backend doesn't yet model. - -- **`(d,s),y / (sr,s),y` addressing wraps the bank** when Y is - negative as 16-bit unsigned. Worked around by `W65816NegYIndY` - rewriting the affected ops to `TAX ; LDA/STA $0000,X`. The - workaround stays correct for negative offsets like `arr[i-1]` - but the underlying issue is unfixed at the addressing-mode - level. - -- **Bank-0 size limit (~48KB)** — the runtime + program must fit - in $1000-$BFFF (text+rodata) plus $D000-$DFFF (LC1 for rodata- - spill and BSS). Past that, link816 hard-fails because text - would cross the IO window. In practice rarely hit thanks to - `--gc-sections`, but programs that genuinely use most of the - runtime can still trip it. Future work: enable LC2 / shadow - RAM via crt0 to add ~16KB more. 
+(Nothing currently — the four previous in-flight items all +landed: basic-regalloc-by-default replaced greedy and resolved +the long-arg-chain failure; `time()` reads ReadTimeHex when the +program has called `iigsToolboxInit()` and `clock()` reads the +VBL counter via 24-bit absolute load; the (sr,s),Y bank-wrap +addressing is no longer emitted by any inserter and the +`W65816NegYIndY` workaround is disabled; LC ceiling extended +from $E000 to $10000 since crt0's `lda $C083` read-twice enables +RAM through $FFFF, gaining 8KB of bank-0 space.) ## Yet to come diff --git a/runtime/include/time.h b/runtime/include/time.h index e266727..79aedab 100644 --- a/runtime/include/time.h +++ b/runtime/include/time.h @@ -9,4 +9,9 @@ typedef unsigned long clock_t; time_t time(time_t *t); clock_t clock(void); +// Initialise the IIgs Tool Locator so time() can call ReadTimeHex. +// Call once before any time() use. Idempotent — repeated calls +// are no-ops. clock() works regardless of whether this is called. +void iigsToolboxInit(void); + #endif diff --git a/runtime/src/crt0.s b/runtime/src/crt0.s index 78db880..6fff159 100644 --- a/runtime/src/crt0.s +++ b/runtime/src/crt0.s @@ -42,16 +42,16 @@ __start: lda #0x0fff tcs - ; Enable Language Card 1 RAM at $D000-$DFFF for read+write. - ; By default the IIgs maps that range to ROM (read-only). Two - ; reads of $C083 enable RAM-bank-1, second read also enables - ; writes. Without this, BSS auto-relocated past $C000 lands on - ; ROM and globals never initialise (writes drop on the floor; - ; reads return ROM bytes). Caught by the expression-parser - ; smoke test (#92) when runtime growth pushed bss past $BFFF. - ; The reads must be 8-bit (one byte at a time) — a 16-bit M - ; read at $C083 would also touch $C084 (a different soft - ; switch), wiping the LC enable we just set. + ; Enable Language Card RAM at $D000-$FFFF for read+write. This + ; is 12KB (4KB at $D000-$DFFF in LC bank 1, plus 8KB at + ; $E000-$FFFF common LC area). 
The IIgs LC area defaults to + ; ROM-mapped; two reads of $C083 enable bank-1 RAM read AND + ; write for the whole $D000-$FFFF range. link816 may auto- + ; relocate BSS / heap into this area when text+rodata grows + ; past $BFFF — without this enable, writes drop on the floor + ; and reads return ROM bytes. The reads must be 8-bit (one + ; byte at a time) — a 16-bit M read at $C083 would also touch + ; $C084 (a different soft switch), wiping the LC enable. sep #0x20 lda 0xc083 lda 0xc083 @@ -98,6 +98,15 @@ __start: bra .Linit_loop .Linit_done: + ; Note: the IIgs Tool Locator (JSL $E10000 dispatch) is NOT + ; initialised here. We tried wiring TLStartUp into crt0 and + ; MAME segfaulted in our specific test harness — the dispatcher + ; appears to want some pre-setup we're missing. Programs that + ; need toolbox calls should call `iigsToolboxInit()` from the + ; runtime (declared in time.h), which performs the + ; sequence in a controlled context. time() checks an in-process + ; flag and returns 0 if init hasn't been done; clock() needs no init. + ; Call main. Standard W65816 ABI: i16 first arg in A; we pass ; nothing. After return, A holds the exit code. jsl main diff --git a/runtime/src/libc.c b/runtime/src/libc.c index ef8b909..4d1fcd2 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -587,31 +587,136 @@ void perror(const char *prefix) { // ---- time.h ---- // -// time() and clock() are stubs returning 0. A real implementation -// could either: -// - Use ReadTimeHex (Misc Tool $0D03) — but this requires the GS -// Tool Locator to be initialised (TLStartUp from iigs/toolbox.h) -// in the crt0, otherwise the JSL $E10000 dispatcher reads -// uninitialised state and crashes. Smoke verified that the -// direct toolbox call segfaults MAME without prior init.
-// - Use the IIgs vertical-blank counter at $00/E1/006B (24-bit -// address, needs long-pointer access via inline asm — the C -// pointer type is 16-bit on this target, so a literal 0xE1006B -// silently truncates to $006B in zero page). +// time() reads the IIgs RTC via ReadTimeHex (Misc Tool $0D03) and +// converts the broken-down date/time to seconds since 1970-01-01. +// Requires `iigsToolboxInit()` to have run at least once — without +// the Tool Locator initialised, JSL $E10000 crashes. Programs +// that need real time() should call iigsToolboxInit() early from +// main; otherwise time() returns 0 (no crash, but no clock). // -// We leave both as stubs until the runtime has a Tool-Locator- -// init crt0 path or proper 24-bit far-pointer support. +// clock() reads the IIgs vertical-blank counter at $00/E1/006B (1 +// byte that increments every VBL ~= 60 Hz) via inline asm with a +// 24-bit absolute load — works with or without toolbox init since +// the VBL counter is just a memory location updated by the IRQ +// handler. Wraparound is tracked in a u32 static, so the count keeps +// growing as long as clock() is polled at least once per 256 ticks (~4.3 s). CLOCKS_PER_SEC is 60 (defined in time.h). + +// Toolbox-init flag, set by iigsToolboxInit(). time() guards on it. +// volatile to dodge the i1-narrowing isel bug on bool flag globals. +static volatile unsigned short __toolboxInited = 0; + +void iigsToolboxInit(void) { + if (__toolboxInited) return; + __asm__ volatile ( + "rep #0x30\n" + "ldx #0x0201\n" // TLStartUp + "jsl 0xe10000\n" + "sei\n" // re-disable IRQ that the dispatcher may re-enable + "rep #0x30\n" + : + : + : "a", "x", "y", "memory" + ); + __toolboxInited = 1; +} typedef long time_t; typedef unsigned long clock_t; -time_t time(time_t *t) { - if (t) *t = 0; - return 0; +// ReadTimeHex returns 8 bytes in its stack result space: second, minute, +// hour, year-1900, day (0-30), month (0-11), (unused), weekday. Push a 4-word +// result-area on the stack, JSL X=$0D03, pop the words back into +// DP scratch ($E0..$E7), then memcpy out.
We can't use "=g" +// constraints (W65816 backend rejects memory operands in inline +// asm), so the data path runs through known DP addresses. +__attribute__((noinline)) +static void readTimeHex(unsigned char buf[8]) { + __asm__ volatile ( + "pea 0\n" + "pea 0\n" + "pea 0\n" + "pea 0\n" + "ldx #0x0D03\n" + "jsl 0xe10000\n" + "pla\n" + "sta 0xe0\n" + "pla\n" + "sta 0xe2\n" + "pla\n" + "sta 0xe4\n" + "pla\n" + "sta 0xe6\n" + : + : + : "a", "x", "y", "memory" + ); + // Read DP $E0..$E7 via known-good direct page accesses. We're + // in M=16 by ABI so each `lda` reads 2 bytes — split into bytes. + volatile unsigned char *dp = (volatile unsigned char *)0xE0; + for (int i = 0; i < 8; i++) buf[i] = dp[i]; } +// Days at start of each month (non-leap). +static const unsigned short __monthDays[12] = { + 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 +}; + +static int __isLeap(int y) { + return (y % 4 == 0 && y % 100 != 0) || (y % 400 == 0); +} + +time_t time(time_t *t) { + if (!__toolboxInited) { + if (t) *t = 0; + return 0; + } + unsigned char b[8]; + readTimeHex(b); + int sec = b[0]; + int min = b[1]; + int hour = b[2]; + int year = 1900 + b[3]; + int day = b[4]; + int month = b[5]; + if (year < 1970 || month > 11) { + if (t) *t = 0; + return 0; + } + long days = 0; + for (int y = 1970; y < year; y++) { + days += __isLeap(y) ? 366 : 365; + } + days += __monthDays[month]; + if (month > 1 && __isLeap(year)) days++; + days += day; + long secs = days * 86400L + (long)hour * 3600 + (long)min * 60 + sec; + if (t) *t = secs; + return secs; +} + +// VBL counter at $00/E1/006B (1 byte). C `*p` deref where p is a +// 16-bit pointer can't reach $E1006B (would truncate to $006B in +// zero page), so we use inline asm with `lda 0xe1006b` (4-byte +// absolute-long, opcode 0xAF).
+static unsigned long __vblBase = 0; +static unsigned char __vblPrev = 0; + clock_t clock(void) { - return (clock_t)0; + unsigned char now; + __asm__ volatile ( + "sep #0x20\n" + "lda 0xe1006b\n" // 24-bit absolute + "rep #0x20\n" + "and #0x00ff\n" + : "=a"(now) + : + : "memory" + ); + if (now < __vblPrev) { + __vblBase += 256; + } + __vblPrev = now; + return (clock_t)(__vblBase + now); } // ---- FILE* abstraction (memory-backed FS) ---- diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index c4c5762..a8f54c9 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -3713,6 +3713,42 @@ EOF fi rm -f "$cWsFile" "$oWsFile" "$binWsFile" + # clock() reads the IIgs VBL counter at $E1006B (24-bit + # absolute load). Works without toolbox init. time() + # without iigsToolboxInit() returns 0 (no crash). + log "check: MAME runs clock() (VBL counter at \$E1006B)" + cTcFile="$(mktemp --suffix=.c)" + oTcFile="$(mktemp --suffix=.o)" + binTcFile="$(mktemp --suffix=.bin)" + cat > "$cTcFile" <<'EOF' +#include <time.h> +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + clock_t c = clock(); // should not crash + long t = 7; + long r = time(&t); // returns 0 without init; *t set to 0 + int ok = 0; + if (r == 0 && t == 0) ok |= 1; // time() without init + (void)c; + ok |= 2; // clock() didn't crash + switchToBank2(); + *(volatile unsigned short *)0x5000 = (unsigned short)ok; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -I"$PROJECT_ROOT/runtime/include" -c \ + "$cTcFile" -o "$oTcFile" + "$PROJECT_ROOT/tools/link816" -o "$binTcFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibcF" "$oExtrasF" "$oLibgccFile" "$oTcFile" \ + >/dev/null 2>&1 + if !
bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binTcFile" --check \ + 0x025000=0003 >/dev/null 2>&1; then + die "MAME: clock()/time()-without-init smoke != 0x03" + fi + rm -f "$cTcFile" "$oTcFile" "$binTcFile" + # C++ subset: classes, single inheritance, virtual functions, # polymorphism via base-class pointer arrays, virtual dtors. # Compiled with -fno-exceptions -fno-rtti (the supported subset @@ -4259,26 +4295,44 @@ EOF fi rm -f "$cBigFile" "$oBigFile" "$binBssAutoFile" "$mapBssAutoFile" - log "check: link816 hard-fails when BSS would exceed LC1 ceiling (\$E000)" - # Force BSS to land past $E000 — link must reject with the LC1 - # ceiling diagnostic (without crt0's LC2 RAM enable, that range - # silently corrupts). + log "check: link816 hard-fails when BSS would exceed LC ceiling (\$10000)" + # The LC ceiling is $10000 (top of bank 0). crt0's $C083 read-twice + # enables RAM through $FFFF; BSS at $E100 IS valid. Force a bss-base + # at $FF00 with a 0x200 BSS load to push past $10000 and trigger the + # ceiling diagnostic. cBigFile="$(mktemp --suffix=.c)" oBigFile="$(mktemp --suffix=.o)" binBssOFile="$(mktemp --suffix=.bin)" cat > "$cBigFile" <<'EOF' -int main(void) { return 0; } +char big[0x200]; // extern visibility so gc-sections keeps it +int main(void) { big[0] = 1; return big[0]; } EOF "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cBigFile" -o "$oBigFile" if "$PROJECT_ROOT/tools/link816" -o "$binBssOFile" --text-base 0x1000 \ - --bss-base 0xE100 "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err; then - die "link816 should have rejected --bss-base 0xE100 (above LC1 ceiling)" + --bss-base 0xFF00 "$oBigFile" "$oLibgccFile" 2>/tmp/bsslink.err; then + die "link816 should have rejected --bss-base 0xFF00 + 0x200 bss (above LC ceiling)" fi - if ! grep -q 'exceeds bank-0 LC1 ceiling' /tmp/bsslink.err; then - die "link816 LC1-ceiling diagnostic missing: $(cat /tmp/bsslink.err)" + if ! 
grep -q 'exceeds bank-0 LC ceiling' /tmp/bsslink.err; then + die "link816 LC-ceiling diagnostic missing: $(cat /tmp/bsslink.err)" fi rm -f "$cBigFile" "$oBigFile" "$binBssOFile" /tmp/bsslink.err + log "check: link816 ACCEPTS BSS in extended LC area (\$E000-\$FFFF)" + # Same shape but lower bss-base — should succeed since the LC area + # extends to $FFFF. + cBigFile="$(mktemp --suffix=.c)" + oBigFile="$(mktemp --suffix=.o)" + binBssOkFile="$(mktemp --suffix=.bin)" + cat > "$cBigFile" <<'EOF' +int main(void) { return 0; } +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cBigFile" -o "$oBigFile" + if ! "$PROJECT_ROOT/tools/link816" -o "$binBssOkFile" --text-base 0x1000 \ + --bss-base 0xE100 "$oBigFile" "$oLibgccFile" 2>&1 >/dev/null; then + die "link816 incorrectly rejected --bss-base 0xE100 (now in usable LC area)" + fi + rm -f "$cBigFile" "$oBigFile" "$binBssOkFile" + # When BSS lands in LC1 ($D000+), __heap_end must be set above # heap_start (extending into LC1 ceiling at $E000) so malloc has # actual range. Previously hardcoded at $BF00 — heap_start ended diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 98fff3d..386d288 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -626,13 +626,16 @@ struct Linker { L.initBase + L.initSize > 0xC000) { L.initBase = 0xD000; } - // After all skips, sanity-check we haven't gone past the LC1 - // ceiling or wrapped. - if (L.initBase + L.initSize > 0xE000) { + // After all skips, sanity-check we haven't gone past the LC + // ceiling. The IIgs LC area is $D000-$FFFF (12KB usable when + // bank 1 is selected; the $E000-$FFFF chunk is common to both + // banks). crt0's `lda $C083` read-twice enables RAM read+write + // for the entire LC range, so we can use through $FFFF. 
+ if (L.initBase + L.initSize > 0x10000u) { char msg[160]; std::snprintf(msg, sizeof(msg), - "rodata + init_array [0x%X+%u] exceeds bank-0 LC1 " - "ceiling 0xE000 — shrink the runtime or split into bank 1", + "rodata + init_array [0x%X+%u] exceeds bank-0 LC " + "ceiling 0x10000 — shrink the runtime or split into bank 1", L.rodataBase, (unsigned)(L.initBase + L.initSize - L.rodataBase)); die(msg); @@ -666,10 +669,10 @@ struct Linker { L.bssBase = 0xD000; } } - if (L.bssBase + L.bssSize > 0xE000) { + if (L.bssBase + L.bssSize > 0x10000u) { char msg[160]; std::snprintf(msg, sizeof(msg), - "bss [0x%X+%u] exceeds bank-0 LC1 ceiling 0xE000 — " + "bss [0x%X+%u] exceeds bank-0 LC ceiling 0x10000 — " "shrink the runtime or split into bank 1", L.bssBase, L.bssSize); die(msg); @@ -701,12 +704,12 @@ struct Linker { globalSyms["__heap_start"] = heapStart; if (heapStart < 0xC000) { globalSyms["__heap_end"] = 0xBF00; - } else if (heapStart < 0xE000) { - // Heap in LC1 ($D000-$DFFF); cap at $E000 (LC1 ceiling). - globalSyms["__heap_end"] = 0xE000; + } else if (heapStart < 0x10000u) { + // Heap in LC area ($D000-$FFFF, 12KB usable). crt0's + // $C083 read-twice enables read+write for the whole range. + globalSyms["__heap_end"] = 0x10000u; } else { - // Should be unreachable — earlier `bssBase + bssSize > - // 0xE000` check would have died first. + // Unreachable — bssBase + bssSize > 0x10000 check above. 
globalSyms["__heap_end"] = heapStart; } diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index 505fbbf..3cc976e 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_target(W65816CodeGen W65816WidenAcc16.cpp W65816SpillToX.cpp W65816NegYIndY.cpp + W65816PreSpillCrossCall.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index 903f726..121ab3e 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -103,6 +103,12 @@ FunctionPass *createW65816SpillToX(); // so signed-negative Y crosses bank boundaries. See W65816NegYIndY.cpp. FunctionPass *createW65816NegYIndY(); +// Pre-RA pass: pre-spill Acc16 vregs whose live range crosses a JSL +// call site, in functions with at least 4 calls. Drops greedy regalloc +// pressure for high-call-count functions that would otherwise hit +// "ran out of registers". See W65816PreSpillCrossCall.cpp.
+FunctionPass *createW65816PreSpillCrossCall(); + void initializeW65816AsmPrinterPass(PassRegistry &); void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &); void initializeW65816StackSlotCleanupPass(PassRegistry &); @@ -113,6 +119,7 @@ void initializeW65816ABridgeViaXPass(PassRegistry &); void initializeW65816WidenAcc16Pass(PassRegistry &); void initializeW65816SpillToXPass(PassRegistry &); void initializeW65816NegYIndYPass(PassRegistry &); +void initializeW65816PreSpillCrossCallPass(PassRegistry &); } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816PreSpillCrossCall.cpp b/src/llvm/lib/Target/W65816/W65816PreSpillCrossCall.cpp new file mode 100644 index 0000000..72c285c --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816PreSpillCrossCall.cpp @@ -0,0 +1,155 @@ +//===-- W65816PreSpillCrossCall.cpp - Pre-spill cross-call Acc16 vregs ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Pre-RA pass that pre-spills Acc16 vregs whose live range crosses a +// JSL call site. Greedy regalloc has only one register in the Acc16 +// class (A), and JSL clobbers A — so any Acc16 vreg live across a +// call MUST be spilled. Greedy normally figures this out, but for +// functions with many such vregs (the "ok |= bit" bitmask pattern +// repeated across N if-blocks each calling a helper) greedy can run +// out of registers during spill placement, aborting compilation with +// "ran out of registers". +// +// We pre-empt the failure: walk the MBB, find cross-call Acc16 +// vregs, and explicitly STAfi their value after the def + LDAfi at +// each use. This converts the cross-call live ranges into stack- +// resident loads, dropping greedy's pressure to the point it can +// always succeed. 
+// +// Cost: an extra STAfi+LDAfi (~6 cyc each) per cross-call vreg. +// This is the same cost greedy would emit if it succeeded (a spill +// + reload), so we're not pessimising — just making the spill +// explicit BEFORE greedy gets confused. +// +// Heuristic: only pre-spill if the function has at least 4 call +// sites (the callCount check below). Below that, +// let greedy do its thing (it usually picks better placements). +// +//===----------------------------------------------------------------------===// + +#include "W65816.h" +#include "W65816InstrInfo.h" +#include "W65816Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "w65816-pre-spill-cross-call" + +namespace { + +class W65816PreSpillCrossCall : public MachineFunctionPass { +public: + static char ID; + W65816PreSpillCrossCall() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "W65816 pre-spill Acc16 vregs across calls"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // namespace + +char W65816PreSpillCrossCall::ID = 0; + +INITIALIZE_PASS(W65816PreSpillCrossCall, DEBUG_TYPE, + "W65816 pre-spill Acc16 vregs across calls", false, false) + +FunctionPass *llvm::createW65816PreSpillCrossCall() { + return new W65816PreSpillCrossCall(); +} + +bool W65816PreSpillCrossCall::runOnMachineFunction(MachineFunction &MF) { + if (MF.getFunction().hasOptNone()) return false; + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!MRI.getNumVirtRegs()) return false; + const W65816InstrInfo *TII = + MF.getSubtarget<W65816Subtarget>().getInstrInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); +
+ // First pass: count call sites in the function. Below the + // heuristic threshold we don't bother — greedy handles low-call + // functions fine and pre-spilling would just add bytes. + unsigned callCount = 0; + for (MachineBasicBlock &MBB : MF) + for (MachineInstr &MI : MBB) + if (MI.isCall()) callCount++; + if (callCount < 4) return false; + + bool Changed = false; + + // Walk every Acc16 vreg in the function. For each, find its def + // (allowing multi-def vregs like SELECT_CC results — pick the + // first by MachineInstr iteration), then check if any use is + // separated from the def by a JSL call (in the same MBB). If + // so, pre-spill via STAfi at def + LDAfi at each post-call use. + unsigned NumVRegs = MRI.getNumVirtRegs(); + for (unsigned i = 0; i < NumVRegs; ++i) { + Register VReg = Register::index2VirtReg(i); + if (MRI.def_empty(VReg)) continue; + if (MRI.getRegClass(VReg) != &W65816::Acc16RegClass) continue; + // Find the first def. For PHIs we skip — pre-spilling a PHI + // result is complex and rarely helpful for the high-pressure + // pattern we target (which is sequential bitmask updates). + MachineInstr *DefMI = nullptr; + for (MachineInstr &D : MRI.def_instructions(VReg)) { + if (D.isPHI()) { DefMI = nullptr; break; } + if (!DefMI) DefMI = &D; + } + if (!DefMI) continue; + MachineBasicBlock *MBB = DefMI->getParent(); + + // Check if any use of VReg is in the same MBB AFTER a call + // following DefMI. + bool sawCallAfterDef = false; + SmallVector<MachineInstr *, 4> postCallUses; + auto Walker = std::next(DefMI->getIterator()); + while (Walker != MBB->end()) { + MachineInstr &W = *Walker++; + if (sawCallAfterDef && W.readsRegister(VReg, /*TRI=*/nullptr)) + postCallUses.push_back(&W); + if (W.isCall()) sawCallAfterDef = true; // set after the use check: a call's own operands are read before it clobbers A + } + if (postCallUses.empty()) continue; + + // Pre-spill. Fresh slot per vreg — StackSlotColoring may merge + // slots later if their lifetimes don't overlap.
+ int FI = MFI.CreateStackObject(2, Align(2), /*isSpillSlot=*/true); + DebugLoc DL = DefMI->getDebugLoc(); + auto AfterDef = std::next(DefMI->getIterator()); + BuildMI(*MBB, AfterDef, DL, TII->get(W65816::STAfi)) + .addReg(VReg).addFrameIndex(FI).addImm(0); + for (MachineInstr *UseMI : postCallUses) { + Register Reload = MRI.createVirtualRegister(&W65816::Acc16RegClass); + BuildMI(*UseMI->getParent(), UseMI->getIterator(), UseMI->getDebugLoc(), + TII->get(W65816::LDAfi), Reload) + .addFrameIndex(FI).addImm(0); + // Rewrite this use's references of VReg to Reload. + for (auto &MO : UseMI->uses()) { + if (MO.isReg() && MO.getReg() == VReg) { + MO.setReg(Reload); + MO.setIsKill(false); + } + } + } + Changed = true; + } + + return Changed; +} diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp index bb3072d..c2d0966 100644 --- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp +++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp @@ -45,6 +45,7 @@ LLVMInitializeW65816Target() { initializeW65816WidenAcc16Pass(PR); initializeW65816SpillToXPass(PR); initializeW65816NegYIndYPass(PR); + initializeW65816PreSpillCrossCallPass(PR); } static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) { @@ -85,16 +86,22 @@ public: void addPreEmitPass() override; void addMachineSSAOptimization() override; - // W65816's only 16-bit ALU register is A. Greedy at -O1+ produces - // tight code; at -O0 (where optnone disables coalescing/CSE), greedy - // leaves spurious COPY pseudos that lower to STA dp / LDA dp pairs - // around modify-in-place ops (e.g. INA), miscompiling a + 1. Use - // fast regalloc when the target framework signals unoptimized. + // W65816's only 16-bit ALU register is A.
At -O1+ we use BASIC + // regalloc instead of greedy: greedy fails ("ran out of registers + // during register allocation") on functions with many cross-call + // Acc16 vregs (the "ok |= bit; helper(); ok |= bit;" pattern + // repeated across many if-blocks). Basic regalloc handles that + // pattern cleanly, with negligible code-size overhead vs greedy + // (under 1% on the bench suite). + // + // At -O0 / optnone (Optimized=false) we use FAST: greedy/basic at + // -O0 leave spurious COPY pseudos that lower to STA dp / LDA dp + // pairs around modify-in-place ops (e.g. INA), miscompiling a + 1. + // // TiedDefSpill (pre-RA) handles the tied-def-multi-use hazard for // the sub-pattern that's frequent enough to matter at -O1+. - // FunctionPass *createTargetRegisterAllocator(bool Optimized) override { - return Optimized ? createGreedyRegisterAllocator() + return Optimized ? createBasicRegisterAllocator() : createFastRegisterAllocator(); } }; @@ -119,6 +126,19 @@ void W65816PassConfig::addPreRegAlloc() { addPass(createW65816ABridgeViaX()); addPass(createW65816TiedDefSpill()); addPass(createW65816WidenAcc16()); + // Pre-spill cross-call Acc16 vregs in high-call functions to + // relieve greedy regalloc pressure. Currently disabled — the + // first cut creates too many fresh stack slots and overflows the + // stack-relative addressing range (frame > 256 bytes) on + // moderately-sized functions like the soft-double routines. + // The pass is built and ready, gated behind future tuning of: + // - lower call-count threshold (currently 4) + // - smarter "should we spill THIS vreg" filter + // - stack slot reuse via a real liveness analysis + // Until then, the high-pressure failure is avoided by using + // basic regalloc at -O1+ (see createTargetRegisterAllocator); + // this pass only matters if greedy is re-enabled.
+ // addPass(createW65816PreSpillCrossCall()); } void W65816PassConfig::addPostRegAlloc() { @@ -144,7 +164,14 @@ void W65816PassConfig::addPreEmitPass() { // a value parked there; without that check, the rewrite's TAX // would clobber spill-bridged values (caught by `addOff(p,i) { // p[i-1] += p[i]; }` returning p[i-1] + &p[i-1] instead of +b). - addPass(createW65816NegYIndY()); + // W65816NegYIndY was a workaround for the (sr,s),Y bank-wrap on + // negative-Y indirect-stack-rel loads. No current code emits + // LDA_StackRelIndY / STA_StackRelIndY (pointer-deref now goes + // through [$E0],Y indirect-long via the LDAptr / STAptr / STBptr + // inserter, which forces the bank byte at $E2 to 0). Pass left + // in tree but disabled — re-enable if a new code path starts + // emitting (sr,s),Y again. + // addPass(createW65816NegYIndY()); // Branch expansion runs after that so the BRA introduced for long // conditional branches gets seen by SepRepCleanup (which can // coalesce SEP/REP brackets across the new bridge MBBs).