From 465f8ba94773e7e93ab9f774c5ae92dd544dacd4 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Tue, 5 May 2026 22:00:34 -0500 Subject: [PATCH] Checkpoint --- .gitignore | 1 + .../0002-triple-cpp-add-w65816-cases.patch | 11 +- patches/0005-target-data-layout-w65816.patch | 4 +- ...etlowering-virtual-gettypeconversion.patch | 13 + runtime/build.sh | 6 + runtime/src/crt0.s | 16 + runtime/src/crt0Gsos.s | 12 + runtime/src/snprintf.c | 11 +- runtime/src/softDouble.c | 39 +- runtime/src/timeExt.c | 84 +- scripts/runFileCheckTests.sh | 81 ++ scripts/smokeTest.sh | 53 + src/clang/lib/Basic/Targets/W65816.h | 2 +- src/llvm/lib/Target/W65816/CMakeLists.txt | 1 + src/llvm/lib/Target/W65816/W65816.h | 10 + .../lib/Target/W65816/W65816ISelDAGToDAG.cpp | 53 +- .../lib/Target/W65816/W65816ISelLowering.cpp | 1211 +++++++++++++++-- .../lib/Target/W65816/W65816ISelLowering.h | 62 + .../lib/Target/W65816/W65816InstrInfo.cpp | 145 ++ src/llvm/lib/Target/W65816/W65816InstrInfo.td | 129 +- .../lib/Target/W65816/W65816LowerWide32.cpp | 326 +++++ .../lib/Target/W65816/W65816RegisterInfo.td | 65 + .../lib/Target/W65816/W65816SepRepCleanup.cpp | 20 + .../Target/W65816/W65816StackSlotCleanup.cpp | 46 +- .../lib/Target/W65816/W65816TargetMachine.cpp | 18 + src/llvm/test/CodeGen/W65816/add-i16.ll | 12 + .../W65816/canmergestoresto-i16-cap.ll | 30 + .../CodeGen/W65816/extract-wide32-regseq.ll | 36 + .../CodeGen/W65816/i64-first-arg-img16.ll | 36 + .../CodeGen/W65816/img-copy-survives-mcp.ll | 32 + .../CodeGen/W65816/jslpseudo-caller-save.ll | 28 + src/llvm/test/CodeGen/W65816/lit.local.cfg | 2 + .../W65816/seprep-ldy-elision-kill-flag.ll | 29 + .../CodeGen/W65816/sign-extend-inreg-i32.ll | 41 + .../test/CodeGen/W65816/wide32-phi-split.ll | 32 + 35 files changed, 2496 insertions(+), 201 deletions(-) create mode 100644 patches/0007-targetlowering-virtual-gettypeconversion.patch create mode 100755 scripts/runFileCheckTests.sh create mode 100644 src/llvm/lib/Target/W65816/W65816LowerWide32.cpp create mode 100644 src/llvm/test/CodeGen/W65816/add-i16.ll create mode 100644 src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll create mode 100644 src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll create mode 100644 src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll create mode 100644 src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll create mode 100644 src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll create mode 100644 src/llvm/test/CodeGen/W65816/lit.local.cfg create mode 100644 src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll create mode 100644 src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll create mode 100644 src/llvm/test/CodeGen/W65816/wide32-phi-split.ll diff --git a/.gitignore b/.gitignore index 52b4a98..bf166c9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ tools/ # runtime/src/*.s. The source files (.s, build.sh) are tracked. 
runtime/*.o runtime/*.o.bak +runtime/*.o.tmp # Editor / OS *.swp diff --git a/patches/0002-triple-cpp-add-w65816-cases.patch b/patches/0002-triple-cpp-add-w65816-cases.patch index 4f8cc6c..65930a7 100644 --- a/patches/0002-triple-cpp-add-w65816-cases.patch +++ b/patches/0002-triple-cpp-add-w65816-cases.patch @@ -1,5 +1,5 @@ diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp -index 8aef55224..b6e467274 100644 +index 8aef55224..1ab00ce9f 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -80,6 +80,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) { @@ -75,3 +75,12 @@ index 8aef55224..b6e467274 100644 case Triple::nvptx64: case Triple::nvptx: case Triple::ppcle: +@@ -2704,6 +2714,8 @@ ExceptionHandling Triple::getDefaultExceptionHandling() const { + case Triple::xcore: + case Triple::xtensa: + return ExceptionHandling::DwarfCFI; ++ case Triple::w65816: ++ return ExceptionHandling::SjLj; + default: + break; + } diff --git a/patches/0005-target-data-layout-w65816.patch b/patches/0005-target-data-layout-w65816.patch index ecc8e11..ca3c6ec 100644 --- a/patches/0005-target-data-layout-w65816.patch +++ b/patches/0005-target-data-layout-w65816.patch @@ -1,5 +1,5 @@ diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp -index 8837d2f91..b796d9e86 100644 +index 8837d2f91..920b8ac8e 100644 --- a/llvm/lib/TargetParser/TargetDataLayout.cpp +++ b/llvm/lib/TargetParser/TargetDataLayout.cpp @@ -582,6 +582,8 @@ std::string Triple::computeDataLayout(StringRef ABIName) const { @@ -7,7 +7,7 @@ index 8837d2f91..b796d9e86 100644 case Triple::msp430: return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"; + case Triple::w65816: -+ return "e-m:e-p:16:8-i16:16-i32:16-n8:16-S16"; ++ return "e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"; case Triple::ppc: case Triple::ppcle: case Triple::ppc64: diff --git a/patches/0007-targetlowering-virtual-gettypeconversion.patch b/patches/0007-targetlowering-virtual-gettypeconversion.patch new file mode 100644 index 0000000..2c63316 --- /dev/null +++ b/patches/0007-targetlowering-virtual-gettypeconversion.patch @@ -0,0 +1,13 @@ +diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h +index 7c4c29fc3..7109a79fa 100644 +--- a/llvm/include/llvm/CodeGen/TargetLowering.h ++++ b/llvm/include/llvm/CodeGen/TargetLowering.h +@@ -1144,7 +1144,7 @@ public: + /// integer register, this contains one step in the expansion to get to the + /// smaller register. For illegal floating point types, this returns the + /// integer type to transform to. +- LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; ++ virtual LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; + + /// Return how we should legalize values of this type, either it is already + /// legal (return 'Legal') or we need to promote it to a larger type (return diff --git a/runtime/build.sh b/runtime/build.sh index 215e6ad..6a5aa8c 100755 --- a/runtime/build.sh +++ b/runtime/build.sh @@ -6,6 +6,12 @@ set -euo pipefail PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang" +# Apply CPU/memory caps so a runaway backend bug can't OOM-kill the +# entire tmux scope. Use `|| true` so when invoked from a parent that +# has already lowered the limit (e.g. 
smokeTest.sh sets 90s), we keep +# the parent's tighter cap rather than failing the build. +ulimit -v $((10 * 1024 * 1024)) 2>/dev/null || true +ulimit -t 1200 2>/dev/null || true [ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; } [ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; } diff --git a/runtime/src/crt0.s b/runtime/src/crt0.s index 6fff159..67f1ec2 100644 --- a/runtime/src/crt0.s +++ b/runtime/src/crt0.s @@ -57,6 +57,22 @@ __start: lda 0xc083 rep #0x20 + ; Persistent "current data bank" byte at DP $BE. The LDAptr/ + ; STAptr/STBptr inserters load this into $E2 before each [dp],Y + ; deref so pointer-deref lands in the user's bank, matching where + ; DBR-relative absolute stores go. Under MAME (no Loader), DBR=0 + ; and PBR=0 here, so $BE=0 — equivalent to the prior STZ $E2 + ; behavior. Under GS/OS Loader, crt0Gsos.s sets it to PBR. + ; + ; $BE chosen because it's outside both the libcall scratch range + ; ($E0..$FF used by libgcc.s for i64 ops) and the IMG slot range + ; ($C0..$DE). PHK pushes 1 byte; PLA in M=8 to pull just 1 byte. + sep #0x20 + phk + pla ; A's low byte = current PBR + sta 0xbe ; persistent data bank + rep #0x20 + ; Zero BSS. X iterates from __bss_start to __bss_end; each ; iteration writes one byte of zero at addr X (via DP=0 + ; offset 0 — which is just X). STZ in M=8 stores 1 byte and diff --git a/runtime/src/crt0Gsos.s b/runtime/src/crt0Gsos.s index 84e7fd6..6912139 100644 --- a/runtime/src/crt0Gsos.s +++ b/runtime/src/crt0Gsos.s @@ -41,6 +41,18 @@ __start: lda #0 tcd + ; Persistent "current data bank" byte at DP $BE. Set to PBR + ; (= our load bank) so the LDAptr/STAptr/STBptr inserters' + ; "LDA $BE; STA $E2" sequence puts pointer derefs in our bank, + ; matching DBR-relative absolute stores. $BE is outside the + ; libcall scratch range ($E0..$FF used by libgcc.s for i64 ops). + ; See crt0.s. + sep #0x20 + phk + pla + sta 0xbe + rep #0x20 + ; BSS zero-init. With DBR=our bank, `stz abs,X` writes to ; ourBank:X — correct as long as __bss_start/__bss_end fit in ; the segment's bank. diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index 6633870..8fd9fe7 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -103,6 +103,7 @@ static void emitDec(int n) { __attribute__((noinline)) +__attribute__((optnone)) static void emitULong(unsigned long n) { char buf[11]; int i = 0; @@ -122,7 +123,7 @@ static void emitULong(unsigned long n) { } -__attribute__((noinline)) +__attribute__((noinline,optnone)) static void emitSignedLong(long n) { // See emitDec: avoid the signed-overflow UB on LONG_MIN. if (n < 0) { @@ -221,6 +222,12 @@ static void emitDouble(double v, int prec) { // fmt is arg0 (A register); see banner comment for why the order matters. +// optnone: under ptr32 the regalloc reuses the same stack spill slot for +// both the va_list pointer `ap` and the fmt-walking pointer, so a `va_arg` +// after several fmt-character steps reads the wrong slot and gets 0 +// instead of the actual va_arg value. optnone forces fast regalloc which +// keeps each vreg in its own slot. See feedback_snprintf_va_arg_slot_alias.md. +__attribute__((optnone)) static int format(const char *fmt, va_list ap) { while (*fmt) { char c = *fmt++; @@ -295,6 +302,8 @@ static int format(const char *fmt, va_list ap) { } + +__attribute__((optnone)) int snprintf(char *buf, size_t n, const char *fmt, ...) { gCur = buf; // n == 0 must NOT touch the buffer (C99 7.19.6.5). 
Setting diff --git a/runtime/src/softDouble.c b/runtime/src/softDouble.c index 0b1e6a1..3e0885d 100644 --- a/runtime/src/softDouble.c +++ b/runtime/src/softDouble.c @@ -127,14 +127,23 @@ u64 __adddf3(u64 a, u64 b) { // Right-shift first to bring an over-wide sum back in range; then // left-shift if subtraction left the lead below 55. Reverse order // would shift an over-wide value out of u64 range entirely. - while (mr & ~((1ULL << 56) - 1)) { - u64 sticky = mr & 1; - mr = (mr >> 1) | sticky; - ea++; + // Use if + do-while because pure `while (cond) body` triggers a + // ptr32 backend bug: PHP/PLP wrap pass mis-identifies the loop's + // pre-test LDA reload as flag corruption and wraps the wrong + // range, so the BEQ tests stale flags and the loop body never + // fires. `do { } while (cond)` is unaffected (test-after-body). + if (mr & ~((1ULL << 56) - 1)) { + do { + u64 sticky_bit = mr & 1; + mr = (mr >> 1) | sticky_bit; + ea++; + } while (mr & ~((1ULL << 56) - 1)); } - while ((mr & (1ULL << 55)) == 0 && mr != 0) { - mr <<= 1; - ea--; + if ((mr & (1ULL << 55)) == 0 && mr != 0) { + do { + mr <<= 1; + ea--; + } while ((mr & (1ULL << 55)) == 0 && mr != 0); } // Round to nearest, ties to even. Bits 0/1 are sticky+round, bit 2 // is guard, bit 3 is mantissa LSB. @@ -259,14 +268,26 @@ u64 __divdf3(u64 a, u64 b) { // Handle the leading quotient bit explicitly. u64 q = DMANT_LEAD; u64 r = ma - mb; + // `volatile vmb`: forces mb to be re-read from memory inside the + // loop. Without this, the W65816 codegen miscompiles `r >= mb` and + // `r -= mb` when called as the 3rd+ chained `__divdf3` after prior + // softDouble libcalls (sqrt3 Newton iter — 3rd iter returned 0.0 + // instead of 1.41421). Adding `volatile` to either `r` or `mb` + // alone fixes it, suggesting the compiler is keeping one of them + // in registers across loop iterations and a JSL inside the loop + // (__ashlsi3 for `r <<= 1`) clobbers the held value. The real + // fix lives in the W65816 backend's u64-shift lowering; volatile + // here is the conservative workaround. + volatile u64 vmb = mb; // Compute 52 more fractional bits via standard shift-test-subtract. for (int i = 51; i >= 0; i--) { r <<= 1; - if (r >= mb) { - r -= mb; + if (r >= vmb) { + r -= vmb; q |= (1ULL << i); } } + mb = vmb; // resync in case below reads mb // Round to nearest, ties to even. Generate one extra bit (the // "guard"), examine the remainder for any non-zero "sticky" tail, // and round q up when guard=1 and (sticky || (q & 1)). Without diff --git a/runtime/src/timeExt.c b/runtime/src/timeExt.c index b7be5a4..d47c630 100644 --- a/runtime/src/timeExt.c +++ b/runtime/src/timeExt.c @@ -33,44 +33,20 @@ double difftime(time_t end, time_t start) { return (double)(end - start); } +struct tm *gmtime_r(const time_t *t, struct tm *out); + // gmtime / localtime: convert seconds-since-1970 to broken-down time. // "local" is identical to "gm" — no timezone support. // -// gmtime KNOWN-BROKEN under GS/OS Loader. The interface returns a -// pointer to a static global (`__gmtimeBuf`). User code reads -// `r->tm_field` which the W65816 backend lowers via [dp],Y with bank -// forced to 0 (DBR-independent — see W65816ISelLowering's LDAptr/STAptr -// inserter). But under Loader the buffer's IMM16 address gets cRELOC- -// patched to a runtime offset that's only valid in the user's bank, -// not bank 0 — so the user's reads land in unrelated bank-0 RAM. 
-// Even arranging for gmtime to write via [dp],y bank=0 makes both -// halves consistent at bank 0, but the cRELOC-patched address often -// falls in the Language Card area where bank-0 reads/writes aren't -// stable RAM. Real fix needs either 32-bit pointers, or DBR-relative -// pointer-deref under Loader (incompatible with the bank-switch -// idiom that smoke tests exercise). -// -// Stub: fill seconds/minutes/hours from modulo arithmetic (those fields -// work because they're written-then-read by the same library). Date -// fields stay at the 1970-01-01 sentinel. Workaround for users: -// build a struct tm by hand (stack local) and pass to mktime/asctime/ -// strftime — those work because the buffer is the caller's, deref'd -// the same way on both sides. +// Returns a pointer to a static global (`__gmtimeBuf`). Under GS/OS +// Loader (DBR != 0) caller-side pointer-deref reads need to land in +// the same bank where gmtime wrote; this requires the runtime build +// to enable `-mllvm -w65816-loader-bank-deref`, which makes +// LDAptr/STAptr load the bank byte from DP $BE (set by crt0 from +// PHK / current PBR). Without the flag, gmtime still works under +// MAME / non-Loader runs where DBR=0 throughout. struct tm *gmtime(const time_t *t) { - long secs = *t; - int sec = (int)(secs % 60L); secs /= 60L; - int min = (int)(secs % 60L); secs /= 60L; - int hour = (int)(secs % 24L); - __gmtimeBuf.tm_sec = sec; - __gmtimeBuf.tm_min = min; - __gmtimeBuf.tm_hour = hour; - __gmtimeBuf.tm_mday = 1; - __gmtimeBuf.tm_mon = 0; - __gmtimeBuf.tm_year = 70; // 1970 sentinel — date decomp KNOWN-BROKEN - __gmtimeBuf.tm_wday = 4; // Jan 1 1970 was Thursday - __gmtimeBuf.tm_yday = 0; - __gmtimeBuf.tm_isdst = -1; - return &__gmtimeBuf; + return gmtime_r(t, &__gmtimeBuf); } struct tm *localtime(const time_t *t) { @@ -82,13 +58,15 @@ struct tm *localtime(const time_t *t) { // is bank-0 in 65816 native mode regardless of DBR). This avoids the // bank-mismatch issue that breaks plain gmtime under Loader. // -// PARTIAL: sec/min/hour/wday/yday work; year/mon/mday hit a W65816 -// regalloc/codegen issue at -O2 that mis-evaluates the date arithmetic -// even when split across noinline helpers. Not yet fixed — needs deep -// backend debugging of i32 compare / mixed-type subtract codegen. -// -// Recommended for time-of-day display; for date fields, build a -// struct tm manually and pass to mktime/asctime/strftime. +// Full broken-down time computation. Marked optnone because at -O2 +// LLVM's combined IR optimizations (loop rotation + reassociation + +// induction-variable-simplify) mis-evaluate the year-increment loop's +// `days >= 365L + (__isLeap(...) ? 1 : 0)` comparison, leaving the +// loop body unexecuted and date fields stuck at the 1970 sentinel. +// optnone preserves the per-statement structure and the loop runs +// correctly. Verified end-to-end against 1710484245L → 2024-03-15 +// 06:30:45 UTC (Friday, day-of-year 74). +__attribute__((optnone)) struct tm *gmtime_r(const time_t *t, struct tm *out) { long secs = *t; int sec = (int)(secs % 60L); secs /= 60L; @@ -98,14 +76,30 @@ struct tm *gmtime_r(const time_t *t, struct tm *out) { int wday = (int)((days + 4L) % 7L); if (wday < 0) wday += 7; + int year = 70; // years since 1900 + while (days >= 365L + (__isLeap(1900 + year) ? 1 : 0)) { + days -= 365L + (__isLeap(1900 + year) ? 1 : 0); + year++; + } + int yday = (int)days; + int leap = __isLeap(1900 + year); + int mon = 11; + while (mon > 0) { + int firstDayOfMon = __monthDays[mon] + (leap && mon > 1 ? 
1 : 0); + if ((int)days >= firstDayOfMon) break; + mon--; + } + int firstDay = __monthDays[mon] + (leap && mon > 1 ? 1 : 0); + int mday = (int)days - firstDay + 1; + out->tm_sec = sec; out->tm_min = min; out->tm_hour = hour; - out->tm_mday = 1; // KNOWN-BROKEN — see header comment - out->tm_mon = 0; - out->tm_year = 70; + out->tm_mday = mday; + out->tm_mon = mon; + out->tm_year = year; out->tm_wday = wday; - out->tm_yday = 0; + out->tm_yday = yday; out->tm_isdst = -1; return out; } diff --git a/scripts/runFileCheckTests.sh b/scripts/runFileCheckTests.sh new file mode 100755 index 0000000..0c0e107 --- /dev/null +++ b/scripts/runFileCheckTests.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash +# runFileCheckTests.sh - run W65816 backend regression tests. +# +# Walks src/llvm/test/CodeGen/W65816/*.ll and for each: +# - reads RUN: lines from the test header (lit-compatible syntax) +# - executes them with %s -> the test path +# - any non-zero exit fails the run. +# +# Why not lit: the in-tree llvm-mos build is configured with +# LLVM_INCLUDE_TESTS=OFF (saves ~5 min from incremental rebuilds and +# ~2 GB of test artifacts). These regression tests are codegen-shape +# pins, not full lit-harness sweeps; FileCheck alone covers our needs. +# +# Usage: +# scripts/runFileCheckTests.sh # run all +# scripts/runFileCheckTests.sh foo.ll bar.ll # run named (relative to dir) + +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +TEST_DIR="$PROJECT_ROOT/src/llvm/test/CodeGen/W65816" +LLC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llc" +FILECHECK="$PROJECT_ROOT/tools/llvm-mos-build/bin/FileCheck" +NOT="$PROJECT_ROOT/tools/llvm-mos-build/bin/not" + +[ -x "$LLC" ] || { echo "missing $LLC" >&2; exit 2; } +[ -x "$FILECHECK" ] || { echo "missing $FILECHECK; build with 'ninja FileCheck not'" >&2; exit 2; } + +if [ $# -gt 0 ]; then + files=() + for f in "$@"; do + files+=("$TEST_DIR/$f") + done +else + mapfile -t files < <(find "$TEST_DIR" -maxdepth 1 -name '*.ll' | sort) +fi + +pass=0 +fail=0 +failed=() +for f in "${files[@]}"; do + [ -f "$f" ] || { echo "skip missing: $f"; continue; } + name="$(basename "$f")" + + runs=$(grep -E '^[[:space:]]*;[[:space:]]*RUN:' "$f" | sed -E 's/^[[:space:]]*;[[:space:]]*RUN:[[:space:]]*//') + if [ -z "$runs" ]; then + echo "SKIP $name (no RUN: line)" + continue + fi + + ok=1 + while IFS= read -r line; do + [ -z "$line" ] && continue + cmd=${line//%s/$f} + cmd=${cmd//llc/$LLC} + cmd=${cmd//FileCheck/$FILECHECK} + cmd=${cmd//not /$NOT } + out=$(bash -c "$cmd" 2>&1) || { + ok=0 + echo "FAIL $name" + echo " cmd: $cmd" + echo "$out" | sed 's/^/ | /' + break + } + done <<< "$runs" + + if [ $ok -eq 1 ]; then + echo "PASS $name" + pass=$((pass + 1)) + else + fail=$((fail + 1)) + failed+=("$name") + fi +done + +echo +echo "==== W65816 FileCheck: $pass pass, $fail fail ====" +if [ $fail -gt 0 ]; then + printf ' - %s\n' "${failed[@]}" + exit 1 +fi diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index adf617d..b3d22a9 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -3160,6 +3160,50 @@ EOF fi rm -f "$cTrFile" "$oTrFile" "$binTrFile" + log "check: MAME runs gmtime(1710484245) -> 2024-03-15 06:30:45 Fri (date math via real impl)" + cGmFile="$(mktemp --suffix=.c)" + oGmFile="$(mktemp --suffix=.o)" + oGmTime="$(mktemp --suffix=.o)" + binGmFile="$(mktemp --suffix=.bin)" + cat > "$cGmFile" <<'EOF' +typedef long time_t; +struct tm { + int tm_sec, tm_min, tm_hour; + int tm_mday, tm_mon, tm_year; + int tm_wday, tm_yday, tm_isdst; +}; +extern struct tm 
*gmtime(const time_t *); +__attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +int main(void) { + time_t t = 1710484245L; // 2024-03-15 06:30:45 UTC, Friday, day 74 + struct tm *r = gmtime(&t); + switchToBank2(); + *(volatile unsigned short *)0x5000 = r->tm_year; // 124 + *(volatile unsigned short *)0x5002 = r->tm_mon; // 2 + *(volatile unsigned short *)0x5004 = r->tm_mday; // 15 + *(volatile unsigned short *)0x5006 = r->tm_hour; // 6 + *(volatile unsigned short *)0x5008 = r->tm_min; // 30 + *(volatile unsigned short *)0x500a = r->tm_sec; // 45 + *(volatile unsigned short *)0x500c = r->tm_wday; // 5 + *(volatile unsigned short *)0x500e = r->tm_yday; // 74 + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cGmFile" -o "$oGmFile" + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -c "$PROJECT_ROOT/runtime/src/timeExt.c" -o "$oGmTime" + "$PROJECT_ROOT/tools/link816" -o "$binGmFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibgccFile" "$oGmTime" "$oGmFile" >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binGmFile" --check \ + 0x025000=007c 0x025002=0002 0x025004=000f \ + 0x025006=0006 0x025008=001e 0x02500a=002d \ + 0x02500c=0005 0x02500e=004a >/dev/null 2>&1; then + die "MAME: gmtime(1710484245) returned wrong date fields" + fi + rm -f "$cGmFile" "$oGmFile" "$oGmTime" "$binGmFile" + log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)" cUdmFile="$(mktemp --suffix=.c)" oUdmFile="$(mktemp --suffix=.o)" @@ -5255,4 +5299,13 @@ print(f'OK: {nCreloc} cRELOC opcodes match sidecar') rm -f "$cR1" "$oR1" "$binR1" "$mapR1" "$relR1" "$omfR1" fi +# W65816 codegen-shape regression pins. Tiny FileCheck assertions on +# specific lowering behaviors that have broken before; runs in well +# under a second. See scripts/runFileCheckTests.sh. +log "check: W65816 FileCheck regressions pass" +"$PROJECT_ROOT/scripts/runFileCheckTests.sh" >/tmp/fcOut 2>&1 || { + cat /tmp/fcOut >&2 + die "W65816 FileCheck regressions failed" +} + log "all smoke checks passed" diff --git a/src/clang/lib/Basic/Targets/W65816.h b/src/clang/lib/Basic/Targets/W65816.h index 8cabf41..d9a728d 100644 --- a/src/clang/lib/Basic/Targets/W65816.h +++ b/src/clang/lib/Basic/Targets/W65816.h @@ -45,7 +45,7 @@ public: IntPtrType = SignedInt; PtrDiffType = SignedInt; SigAtomicType = SignedLong; - resetDataLayout("e-m:e-p:16:8-i16:16-i32:16-n8:16-S16"); + resetDataLayout("e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16"); } void getTargetDefines(const LangOptions &Opts, diff --git a/src/llvm/lib/Target/W65816/CMakeLists.txt b/src/llvm/lib/Target/W65816/CMakeLists.txt index b3fe53f..d457117 100644 --- a/src/llvm/lib/Target/W65816/CMakeLists.txt +++ b/src/llvm/lib/Target/W65816/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target(W65816CodeGen W65816NegYIndY.cpp W65816PreSpillCrossCall.cpp W65816SjLjFinalize.cpp + W65816LowerWide32.cpp W65816TargetMachine.cpp W65816AsmPrinter.cpp W65816MCInstLower.cpp diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index 1860bb2..2bf5a91 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -116,6 +116,15 @@ FunctionPass *createW65816PreSpillCrossCall(); // W65816SjLjFinalize.cpp. FunctionPass *createW65816SjLjFinalize(); +// Pre-RA pass that lowers Wide32 register pairs into pairs of i16 +// vregs. 
Without this, greedy/basic regalloc can't fit the pair-
+// pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy
+// functions (RegAllocBase crashes during allocatePhysRegs). After
+// this pass, only i16 vregs reach regalloc, and the pair structure
+// lives only in the LDAptr32S / STAptr32S / STBptr32S pseudos which
+// take 2 i16 ptr operands directly.
+FunctionPass *createW65816LowerWide32();
+
 void initializeW65816AsmPrinterPass(PassRegistry &);
 void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
 void initializeW65816StackSlotCleanupPass(PassRegistry &);
@@ -128,6 +137,7 @@ void initializeW65816SpillToXPass(PassRegistry &);
 void initializeW65816NegYIndYPass(PassRegistry &);
 void initializeW65816PreSpillCrossCallPass(PassRegistry &);
 void initializeW65816SjLjFinalizePass(PassRegistry &);
+void initializeW65816LowerWide32Pass(PassRegistry &);
 
 } // namespace llvm
 
diff --git a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
index 84c8bfe..271a338 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelDAGToDAG.cpp
@@ -71,21 +71,52 @@ void W65816DAGToDAGISel::Select(SDNode *Node) {
     return;
   }
 
-  // Custom selection: bare FrameIndex SDValue used as an i16 pointer
-  // value (e.g. `&arr[0]` for a stack-allocated array). The
-  // auto-generated selector has no pattern for `(i16 frameindex)`
-  // because tablegen doesn't expose FrameIndex as a leaf type — so
-  // ISel fails with "Cannot select: FrameIndex" before ever reaching
-  // a load/store-context fold. Convert it to ADDframe (FI, 0); the
-  // frame-index elimination pass turns ADDframe into TSC + CLC + ADC
-  // #(offset+stackSize), producing SP+offset in A.
+  // Custom selection: bare FrameIndex SDValue used as a pointer value
+  // (e.g. `&arr[0]` for a stack-allocated array). The auto-generated
+  // selector has no pattern for `(i16 frameindex)` because tablegen
+  // doesn't expose FrameIndex as a leaf type — so ISel fails with
+  // "Cannot select: FrameIndex" before ever reaching a load/store-
+  // context fold. Convert to ADDframe (FI, 0); the frame-index
+  // elimination pass turns ADDframe into TSC + CLC + ADC #(offset +
+  // stackSize), producing SP+offset in A.
+  //
+  // ptr32 mode: a `(i32 frameindex)` is `&local` typed as a 32-bit
+  // pointer (bank+addr). Lower as REG_SEQUENCE(ADDframe, sub_lo, 0,
+  // sub_hi). Hi=0 reflects the program-bank assumption (stack lives
+  // in bank 0 for our crt0 startup). Without this, ISel hits
+  // "Cannot select: t# = FrameIndex" and the pass crashes —
+  // observed for softDouble's __adddf3 calling dclass(a, &sa, &ea,
+  // &ma) where the latter three become i32 frameindex SDValues.
   if (Node->getOpcode() == ISD::FrameIndex) {
     SDLoc DL(Node);
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    EVT VT = Node->getValueType(0);
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
-    SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i16);
-    CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero);
-    return;
+    SDValue Zero16 = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    if (VT == MVT::i16) {
+      CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero16);
+      return;
+    }
+    if (VT == MVT::i32) {
+      // Build (REG_SEQUENCE Wide32RC, ADDframe(FI,0), sub_lo,
+      // LDAi16imm(0), sub_hi). ADDframe materialises lo as an i16
+      // SDValue; the hi half is the literal bank byte (0).
+      SDNode *Lo = CurDAG->getMachineNode(W65816::ADDframe, DL,
+                                          MVT::i16, TFI, Zero16);
+      SDValue HiC = CurDAG->getTargetConstant(0, DL, MVT::i16);
+      // For the high half, just materialise an i16 zero via LDAi16imm.
+      SDNode *Hi = CurDAG->getMachineNode(W65816::LDAi16imm, DL,
+                                          MVT::i16, HiC);
+      SDValue RC = CurDAG->getTargetConstant(W65816::Wide32RegClassID,
+                                             DL, MVT::i32);
+      SDValue SubLo = CurDAG->getTargetConstant(llvm::sub_lo, DL, MVT::i32);
+      SDValue SubHi = CurDAG->getTargetConstant(llvm::sub_hi, DL, MVT::i32);
+      CurDAG->SelectNodeTo(Node, TargetOpcode::REG_SEQUENCE, MVT::i32,
+                           {RC, SDValue(Lo, 0), SubLo, SDValue(Hi, 0),
+                            SubHi});
+      return;
+    }
+    report_fatal_error("W65816: FrameIndex selection: unsupported VT");
   }
 
   // Defer to the auto-generated selector for everything else.
diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
index 5bc2a9f..f63d266 100644
--- a/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
+++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.cpp
@@ -23,12 +23,30 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "w65816-lower"
 
+// Loader-compat workaround: when set, LDAptr/STAptr/STBptr inserters
+// load the bank byte from DP $BE (initialized by crt0 to PHK / current
+// PBR) instead of forcing it to 0 via STZ $E2. This makes pointer
+// derefs land in the user's bank — matching where DBR-relative
+// absolute stores go — so library functions like gmtime that store
+// into static buffers via DBR-relative paths are visible to caller-
+// side pointer-deref reads. Costs 2 extra bytes / 4 cycles per ptr-
+// deref (LDA dp + STA dp vs STZ dp). Default off to keep
+// size-sensitive builds (toolbox) under the $C000 IO-window ceiling.
+static cl::opt<bool> LoaderBankDeref(
+    "w65816-loader-bank-deref",
+    cl::desc("LDAptr/STAptr inserters read bank from DP $BE (set by "
+             "crt0 to PHK) instead of STZ $E2. Required for GS/OS "
+             "Loader compatibility; default off for size-sensitive "
+             "builds."),
+    cl::init(false), cl::Hidden);
+
 W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
                                            const W65816Subtarget &STI)
     : TargetLowering(TM, STI) {
@@ -37,6 +55,7 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // for ensuring the dynamic mode matches the selected class.
   addRegisterClass(MVT::i8, &W65816::Acc8RegClass);
   addRegisterClass(MVT::i16, &W65816::Acc16RegClass);
+  addRegisterClass(MVT::i32, &W65816::Wide32RegClass);
 
   computeRegisterProperties(STI.getRegisterInfo());
 
@@ -79,6 +98,21 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   for (MVT VT : MVT::integer_valuetypes())
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
 
+  // Only register i32 ext-load / trunc-store and Custom actions when
+  // i32 is actually a legal type (ptr32 mode active). Otherwise the
+  // Custom-action calls intercept i16/i8 ops, and LowerTruncate's
+  // SDValue()-on-non-i32 bail breaks the i16→i8 trunc pattern (same
+  // root cause as the earlier LOAD-Custom-breaks-LDAptr issue).
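+  //
+  // Illustrative failure shape (a sketch, not a test from the suite):
+  // with these actions registered unconditionally, plain i16→i8 IR
+  // such as
+  //
+  //   %c = trunc i16 %x to i8
+  //   store i8 %c, ptr %p
+  //
+  // routes TRUNCATE into LowerTruncate, which bails with SDValue()
+  // for non-i32 sources, and the node then falls through to the
+  // generic expansion instead of the existing i16→i8 patterns.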
+ bool ptr32Active = isTypeLegal(MVT::i32); + if (ptr32Active) { + for (MVT MemVT : {MVT::i8, MVT::i16}) { + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MemVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MemVT, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::i32, MemVT, Expand); + setTruncStoreAction(MVT::i32, MemVT, Expand); + } + } + // Vararg support: VASTART writes the address of the first vararg slot // to the va_list pointer. VAARG/VACOPY/VAEND use the default // expansions that load through that pointer and bump it. This makes @@ -164,6 +198,15 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL, MVT::i8, Custom); setOperationAction(ISD::SRA, MVT::i8, Custom); + // LOAD / STORE Custom-lowering for ptr32 mode is intentionally NOT + // wired here in ptr16 mode. Setting LOAD Custom and returning + // SDValue() from LowerLoad short-circuits the i16-result LDAptr/ + // STAptr selection paths (the Custom→empty→Legal fall-through doesn't + // re-enter pattern matching). When ptr32 is activated, this hook + // needs a different gating mechanism — likely an isel-time + // replacement triggered by addrspacecast or a target DAG combine. + // See LowerLoad / LowerStore — currently dead code. + // ADDC/ADDE/SUBC/SUBE are the legacy SDNodes with implicit Glue carrying // the carry/borrow flag between the two halves of a multi-precision add or // sub. Setting them Legal triggers the type legalizer's carry-chain split @@ -203,6 +246,47 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM, // explicit SHL/SHL_PARTS action needed — the override forces the // type-legalizer's libcall path before SHL_PARTS would be emitted. } + // i64 shifts — route to libcall before the type legalizer tries + // to split via the next-legal-type (which becomes i32 in ptr32 mode + // and triggers a SDAG combine loop on `i64 >> K` patterns). By + // marking SHL/SRL/SRA i64 LibCall here, the operation legalizer + // picks up the libcall path even though i64 itself is illegal. + for (MVT VT : {MVT::i64}) { + setOperationAction(ISD::SHL, VT, LibCall); + setOperationAction(ISD::SRL, VT, LibCall); + setOperationAction(ISD::SRA, VT, LibCall); + } + + if (ptr32Active) { + for (unsigned Op : {ISD::ADD, ISD::SUB, ISD::AND, ISD::OR, ISD::XOR}) + setOperationAction(Op, MVT::i32, Custom); + setOperationAction(ISD::SHL, MVT::i32, Custom); + setOperationAction(ISD::SRL, MVT::i32, Custom); + setOperationAction(ISD::SRA, MVT::i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::i32, Custom); + // SIGN_EXTEND_INREG with i32 result and inner type i1/i8/i16: + // the combiner emits this for `(int32_t)((int8_t)x)` and for + // `-(crc & 1ul)` (the i1 case shows up in CRC32 loops). No + // tablegen pattern covers the i32 form; Custom-lower to per-half + // ops. IMPORTANT: LegalizeDAG looks up the action for + // SIGN_EXTEND_INREG using the INNER VT (the operand value type), + // not the result VT. 
+    // See LegalizeDAG.cpp:
+    //   Action = TLI.getOperationAction(Op, InnerType);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
+    setOperationAction(ISD::LOAD, MVT::i32, Custom);
+    setOperationAction(ISD::STORE, MVT::i32, Custom);
+    setOperationAction(ISD::SETCC, MVT::i32, Custom);
+    setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::i32, Custom);
+    setOperationAction(ISD::Constant, MVT::i32, Custom);
+  }
 
   // Disable jump tables. Generating them costs us BRIND (indirect
   // branch via 16-bit pointer load), which we don't have. A long
@@ -224,7 +308,8 @@ W65816TargetLowering::W65816TargetLowering(const TargetMachine &TM,
   // address-select reverse combine (see W65816TargetLowering::
   // PerformDAGCombine).
   // setTargetDAGCombine(ISD::LOAD); // bisecting pickif hang
-  setTargetDAGCombine(ISD::SHL);
+  // SHL combine disabled while debugging the ptr32 i64-phi hang.
+  // setTargetDAGCombine(ISD::SHL);
 }
 
 // Map an LLVM SETCC condition to a W65816 branch. Returns the condition
@@ -371,6 +456,57 @@ static W65816CC::CondCode normalizeCC(SDValue &LHS, SDValue &RHS,
   return TCC;
 }
 
+// Wide32 build/extract helpers, used by LowerLoad/Store/Extend/Truncate/
+// I32Bin/BR_CC to construct or destructure i32 SDValues across the
+// sub_lo / sub_hi halves of the Wide32 register class.
+static SDValue buildWide32(SelectionDAG &DAG, const SDLoc &DL,
+                           SDValue Lo, SDValue Hi) {
+  SDValue RC = DAG.getTargetConstant(W65816::Wide32RegClassID, DL, MVT::i32);
+  SDValue SubLo = DAG.getTargetConstant(llvm::sub_lo, DL, MVT::i32);
+  SDValue SubHi = DAG.getTargetConstant(llvm::sub_hi, DL, MVT::i32);
+  SDNode *RS = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::i32,
+                                  {RC, Lo, SubLo, Hi, SubHi});
+  return SDValue(RS, 0);
+}
+// Look through a buildWide32(Lo, Hi) -> REG_SEQUENCE(RC, Lo, sub_lo,
+// Hi, sub_hi) pair: if X is exactly that machine node, return the
+// matching half operand directly. Avoids a TargetExtractSubreg that
+// would re-enter the SDAG combiner and re-build the i32 constant /
+// pair, looping forever (observed as OOM in the combiner on `*t = 0`).
+static SDValue lookThroughRegSeq(SDValue X, unsigned WantSub) {
+  if (!X.getNode() || !X.isMachineOpcode()) return SDValue();
+  if (X.getMachineOpcode() != TargetOpcode::REG_SEQUENCE) return SDValue();
+  // Layout: op0 = RC, then (Reg, SubIdx) pairs.
+  for (unsigned i = 1; i + 1 < X.getNumOperands(); i += 2) {
+    SDValue SubIdx = X.getOperand(i + 1);
+    auto *CIdx = dyn_cast<ConstantSDNode>(SubIdx);
+    if (!CIdx) continue;
+    if (CIdx->getZExtValue() == WantSub)
+      return X.getOperand(i);
+  }
+  return SDValue();
+}
+static SDValue extractWide32Lo(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
+  // For constants, materialise the lo half as an i16 constant directly
+  // — getTargetExtractSubreg on a Constant SDNode produces a malformed
+  // MachineSDNode (constants don't carry sub-regs) and triggers
+  // SDAG combine loops downstream.
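+  //
+  // Worked example (illustrative only): for the i32 constant
+  // 0x00024000, the masks below yield lo = 0x4000 (bits 0..15) and
+  // hi = 0x0002 (bits 16..31), so a store like `*t = 0x24000`
+  // materialises two i16 constants instead of sub-reg-extracting a
+  // Constant node.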
+  if (auto *C = dyn_cast<ConstantSDNode>(X)) {
+    return DAG.getConstant(C->getZExtValue() & 0xFFFFu, DL, MVT::i16);
+  }
+  if (SDValue Half = lookThroughRegSeq(X, llvm::sub_lo))
+    return Half;
+  return DAG.getTargetExtractSubreg(llvm::sub_lo, DL, MVT::i16, X);
+}
+static SDValue extractWide32Hi(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
+  if (auto *C = dyn_cast<ConstantSDNode>(X)) {
+    return DAG.getConstant((C->getZExtValue() >> 16) & 0xFFFFu, DL, MVT::i16);
+  }
+  if (SDValue Half = lookThroughRegSeq(X, llvm::sub_hi))
+    return Half;
+  return DAG.getTargetExtractSubreg(llvm::sub_hi, DL, MVT::i16, X);
+}
+
 SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
@@ -379,6 +515,52 @@ SDValue W65816TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Dest = Op.getOperand(4);
   SDLoc DL(Op);
   EVT VT = LHS.getValueType();
+  // i32 BR_CC: synthesize an i16 boolean from per-half compares, then
+  // branch on (bool != 0). Avoids the legalizer's generic Expand that
+  // re-enters our SETCC/BR_CC custom paths in an infinite loop.
+  if (VT == MVT::i32) {
+    SDValue LL = extractWide32Lo(DAG, DL, LHS);
+    SDValue LH = extractWide32Hi(DAG, DL, LHS);
+    SDValue RL = extractWide32Lo(DAG, DL, RHS);
+    SDValue RH = extractWide32Hi(DAG, DL, RHS);
+    SDValue Bool;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      SDValue EqLo = DAG.getSetCC(DL, MVT::i16, LL, RL, ISD::SETEQ);
+      SDValue EqHi = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ);
+      Bool = DAG.getNode(ISD::AND, DL, MVT::i16, EqLo, EqHi);
+      if (CC == ISD::SETNE)
+        Bool = DAG.getNode(ISD::XOR, DL, MVT::i16, Bool,
+                           DAG.getConstant(1, DL, MVT::i16));
+    } else {
+      // (a CC b) where CC is ordered:
+      //   = (hi_a HiStrict hi_b) || (hi_a == hi_b && lo_a LoCC lo_b)
+      // HiStrict is the strict variant of CC (LE -> LT etc.) so the
+      // tie-breaker (hi==hi && lo CC lo) handles the equality case
+      // properly. LoCC is always the unsigned variant of CC because
+      // the low half is unsigned (the high half carries the sign).
+      ISD::CondCode HiCC, LoCCu;
+      switch (CC) {
+      case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break;
+      case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break;
+      case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break;
+      case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break;
+      case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break;
+      case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break;
+      case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break;
+      case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break;
+      default:
+        report_fatal_error("W65816: unexpected i32 BR_CC condition");
+      }
+      SDValue HiOk = DAG.getSetCC(DL, MVT::i16, LH, RH, HiCC);
+      SDValue HiEq = DAG.getSetCC(DL, MVT::i16, LH, RH, ISD::SETEQ);
+      SDValue LoOk = DAG.getSetCC(DL, MVT::i16, LL, RL, LoCCu);
+      SDValue Tie = DAG.getNode(ISD::AND, DL, MVT::i16, HiEq, LoOk);
+      Bool = DAG.getNode(ISD::OR, DL, MVT::i16, HiOk, Tie);
+    }
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i16);
+    return DAG.getNode(ISD::BR_CC, DL, MVT::Other, Chain,
+                       DAG.getCondCode(ISD::SETNE), Bool, Zero, Dest);
+  }
 
   W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
@@ -411,6 +593,41 @@ SDValue W65816TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
+  // i32 SETCC: split into per-half compares. Result type is i16 (the
+  // legalizer keeps the boolean result type narrow regardless of LHS
+  // width).
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue LL = extractWide32Lo(DAG, DL, LHS);
+    SDValue LH = extractWide32Hi(DAG, DL, LHS);
+    SDValue RL = extractWide32Lo(DAG, DL, RHS);
+    SDValue RH = extractWide32Hi(DAG, DL, RHS);
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      SDValue EqLo = DAG.getSetCC(DL, VT, LL, RL, ISD::SETEQ);
+      SDValue EqHi = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ);
+      SDValue Eq = DAG.getNode(ISD::AND, DL, VT, EqLo, EqHi);
+      if (CC == ISD::SETNE)
+        Eq = DAG.getNode(ISD::XOR, DL, VT, Eq, DAG.getConstant(1, DL, VT));
+      return Eq;
+    }
+    ISD::CondCode HiCC, LoCCu;
+    switch (CC) {
+    case ISD::SETLT: HiCC = ISD::SETLT; LoCCu = ISD::SETULT; break;
+    case ISD::SETLE: HiCC = ISD::SETLT; LoCCu = ISD::SETULE; break;
+    case ISD::SETGT: HiCC = ISD::SETGT; LoCCu = ISD::SETUGT; break;
+    case ISD::SETGE: HiCC = ISD::SETGT; LoCCu = ISD::SETUGE; break;
+    case ISD::SETULT: HiCC = ISD::SETULT; LoCCu = ISD::SETULT; break;
+    case ISD::SETULE: HiCC = ISD::SETULT; LoCCu = ISD::SETULE; break;
+    case ISD::SETUGT: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGT; break;
+    case ISD::SETUGE: HiCC = ISD::SETUGT; LoCCu = ISD::SETUGE; break;
+    default:
+      report_fatal_error("W65816: unexpected i32 SETCC condition");
+    }
+    SDValue HiOk = DAG.getSetCC(DL, VT, LH, RH, HiCC);
+    SDValue HiEq = DAG.getSetCC(DL, VT, LH, RH, ISD::SETEQ);
+    SDValue LoOk = DAG.getSetCC(DL, VT, LL, RL, LoCCu);
+    SDValue Tie = DAG.getNode(ISD::AND, DL, VT, HiEq, LoOk);
+    return DAG.getNode(ISD::OR, DL, VT, HiOk, Tie);
+  }
   SDValue One = DAG.getConstant(1, DL, VT);
   SDValue Zero = DAG.getConstant(0, DL, VT);
   return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, One, Zero,
@@ -426,15 +643,48 @@ SDValue W65816TargetLowering::LowerSELECT_CC(SDValue Op,
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDLoc DL(Op);
 
+  // i32 SELECT_CC: synthesize an i16 boolean from the i32 compare via
+  // LowerSETCC's i32 path, then select between the i32 halves driven
+  // by the boolean. Avoids creating the i32 W65816::CMP we have no
+  // pattern for.
+  if (LHS.getValueType() == MVT::i32) {
+    // Materialise the i16 boolean.
+    SDValue Bool = DAG.getSetCC(DL, MVT::i16, LHS, RHS, CC);
+    SDValue Zero = DAG.getConstant(0, DL, MVT::i16);
+    if (Op.getValueType() == MVT::i32) {
+      SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+      SDValue THi = extractWide32Hi(DAG, DL, TVal);
+      SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+      SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+      SDValue Lo = DAG.getSelectCC(DL, Bool, Zero, TLo, FLo, ISD::SETNE);
+      SDValue Hi = DAG.getSelectCC(DL, Bool, Zero, THi, FHi, ISD::SETNE);
+      return buildWide32(DAG, DL, Lo, Hi);
+    }
+    return DAG.getSelectCC(DL, Bool, Zero, TVal, FVal, ISD::SETNE);
+  }
+
+  // SELECT_CC with i32 result (i16 LHS): split TVal/FVal into halves
+  // and run a per-half i16 SELECT_CC sharing the same condition.
+  if (Op.getValueType() == MVT::i32) {
+    SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+    SDValue THi = extractWide32Hi(DAG, DL, TVal);
+    SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+    SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+    SDValue Lo = DAG.getSelectCC(DL, LHS, RHS, TLo, FLo, CC);
+    SDValue Hi = DAG.getSelectCC(DL, LHS, RHS, THi, FHi, CC);
+    return buildWide32(DAG, DL, Lo, Hi);
+  }
+
   W65816CC::CondCode TCC = normalizeCC(LHS, RHS, CC, DAG, DL);
   if (TCC == W65816CC::COND_INVALID)
     report_fatal_error("W65816: select_cc condition not yet implemented");
 
   SDValue Glue = DAG.getNode(W65816ISD::CMP, DL, MVT::Glue, LHS, RHS);
   SDValue CCOp = DAG.getTargetConstant(TCC, DL, MVT::i8);
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  // SDTypeProfile declares 1 result (the selected value). Earlier
+  // code passed a 2-VT list (value + Glue) which was silently wrong
+  // and trips an SDNode-validity assertion in assertions builds.
   SDValue Ops[] = {TVal, FVal, CCOp, Glue};
-  return DAG.getNode(W65816ISD::SELECT_CC, DL, VTs, Ops);
+  return DAG.getNode(W65816ISD::SELECT_CC, DL, Op.getValueType(), Ops);
 }
 
 // i8 -> i16 sign extend. Branchless 3-instruction trick:
@@ -457,6 +707,316 @@ SDValue W65816TargetLowering::LowerSignExtend(SDValue Op,
   return DAG.getNode(ISD::SUB, DL, MVT::i16, Xor, Sign);
 }
 
+// ptr32 foundation hook. In ptr16 mode (PointerWidth=16, current
+// default) addresses are i16 and we return SDValue() so the legalizer
+// keeps the load and the existing LDAptr / STAptr selection patterns
+// match. In ptr32 mode addresses are i32 and we wrap the load in
+// W65816ISD::LD_PTR via getMemIntrinsicNode so the [dp],Y inserter
+// can take the bank byte from sub_hi instead of forcing 0.
+//
+// Byte loads (zextload, anyext, true i8) keep going through the i16
+// LDA + AND #$FF idiom — same trick the existing LDAptr uses; for
+// ptr32 mode the load is still 16 bits, just bank-explicit.
+SDValue W65816TargetLowering::LowerLoad(SDValue Op,
+                                        SelectionDAG &DAG) const {
+  LoadSDNode *Ld = cast<LoadSDNode>(Op);
+  SDValue Chain = Ld->getChain();
+  SDValue Ptr = Ld->getBasePtr();
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  // i32 LOAD: split into two i16 loads at offsets 0 and 2 then
+  // REG_SEQUENCE the halves into a Wide32. Address may be i16 (stack
+  // slot, global) or i32 (ptr32 deref); the recursive ADD handles
+  // address arithmetic correctly via LowerI32Bin.
+  if (VT == MVT::i32) {
+    EVT PtrVT = Ptr.getValueType();
+    SDValue Two = DAG.getConstant(2, DL, PtrVT);
+    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
+    SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, Ptr,
+                             Ld->getPointerInfo(),
+                             Ld->getAlign(),
+                             Ld->getMemOperand()->getFlags());
+    SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, Ptr2,
+                             Ld->getPointerInfo().getWithOffset(2),
+                             Ld->getAlign(),
+                             Ld->getMemOperand()->getFlags());
+    SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                   Lo.getValue(1), Hi.getValue(1));
+    SDValue Val = buildWide32(DAG, DL, Lo, Hi);
+    return DAG.getMergeValues({Val, NewChain}, DL);
+  }
+
+  // ptr16 mode: address is i16, let the default selection handle it.
+  if (Ptr.getValueType() != MVT::i32)
+    return SDValue();
+
+  EVT MemVT = Ld->getMemoryVT();
+  SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr };
+  SDValue LdNode = DAG.getMemIntrinsicNode(W65816ISD::LD_PTR, DL, VTs, Ops,
+                                           MVT::i16, Ld->getMemOperand());
+  SDValue Val = LdNode;
+  // Byte memory access: mask the high byte for zextload, leave anyext.
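+  // Post-processing shapes (illustrative): for a byte load through a
+  // 32-bit pointer, the 16-bit LD_PTR result becomes
+  //   zextload i8: (and LD_PTR, 0xFF)
+  //   sextload i8: (sign_extend_inreg LD_PTR, i8)
+  //   anyext:      LD_PTR as-is (high byte is don't-care)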
+  if (MemVT == MVT::i8) {
+    if (Ld->getExtensionType() == ISD::ZEXTLOAD)
+      Val = DAG.getNode(ISD::AND, DL, MVT::i16, Val,
+                        DAG.getConstant(0xFF, DL, MVT::i16));
+    else if (Ld->getExtensionType() == ISD::SEXTLOAD)
+      Val = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Val,
+                        DAG.getValueType(MVT::i8));
+  }
+  // Narrow back to i8 if the consumer wanted i8.
+  if (VT == MVT::i8)
+    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Val);
+  return DAG.getMergeValues({Val, LdNode.getValue(1)}, DL);
+}
+
+// ZERO/SIGN/ANY_EXTEND i8/i16 -> i32: build a Wide32 from the i16
+// payload and a 0 / sign-fill / undef high half.
+SDValue W65816TargetLowering::LowerExtend(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+  SDValue X = Op.getOperand(0);
+  // Promote i8 inputs to i16 first via the same opcode.
+  if (X.getValueType() == MVT::i8)
+    X = DAG.getNode(Op.getOpcode(), DL, MVT::i16, X);
+  SDValue Lo = X;
+  SDValue Hi;
+  if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+    Hi = DAG.getConstant(0, DL, MVT::i16);
+  } else if (Op.getOpcode() == ISD::SIGN_EXTEND) {
+    // Sign-fill via SRA #15 — uses our SRA15A pattern (4 insns) and
+    // stays i16-typed in both LHS and RHS, dodging the combiner's
+    // shift-amount-promote when ptr32 makes pointer-typed shift
+    // amounts i32.
+    Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
+                     DAG.getConstant(15, DL, MVT::i16));
+  } else {
+    Hi = DAG.getUNDEF(MVT::i16);
+  }
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// SIGN_EXTEND_INREG i32 with inner type i1/i8/i16: sign-extend the low
+// N bits of an i32 input to fill all 32 bits. The legalizer leaves
+// this op alone when i32 is legal — but no tablegen pattern matches
+// the i32 form, so without this Custom hook isel aborts with
+// "Cannot select: sign_extend_inreg ... ValueType:i1" on shapes like
+// `-(crc & 1ul)` in CRC32 loops.
+//
+// Strategy: for inner VT V (= i1 / i8 / i16), the low half's
+// `sext_inreg` (already pattern-matched at i16) produces the signed
+// i16 value — then sign-fill the high half via SRA #15 of the lo
+// result.
+SDValue W65816TargetLowering::LowerSignExtendInReg(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue X = Op.getOperand(0);
+  EVT InnerVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT ResVT = Op.getValueType();
+
+  // i16 result: replicate the existing tablegen patterns. We MUST
+  // handle this case rather than returning SDValue(), because
+  // setOperationAction's Custom-returns-SDValue() falls through to
+  // default Expand (= SRA/SHL chain), not to tablegen pattern match.
+  // The two existing patterns are:
+  //   (sext_inreg Acc16:$src, i1) -> NEGA16 (AND $src, 1)
+  //   (sext_inreg Acc16:$src, i8) -> ((src & 0xFF) ^ 0x80) - 0x80
+  // Reproduce them at the SDAG level so the legalizer's Custom
+  // dispatch returns a fully-lowered tree.
+  if (ResVT == MVT::i16) {
+    if (InnerVT == MVT::i1) {
+      SDValue Bit = DAG.getNode(ISD::AND, DL, MVT::i16, X,
+                                DAG.getConstant(1, DL, MVT::i16));
+      return DAG.getNode(ISD::SUB, DL, MVT::i16,
+                         DAG.getConstant(0, DL, MVT::i16), Bit);
+    }
+    if (InnerVT == MVT::i8) {
+      SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::i16, X,
+                                   DAG.getConstant(0xFF, DL, MVT::i16));
+      SDValue Xored = DAG.getNode(ISD::XOR, DL, MVT::i16, Masked,
+                                  DAG.getConstant(0x80, DL, MVT::i16));
+      return DAG.getNode(ISD::SUB, DL, MVT::i16, Xored,
+                         DAG.getConstant(0x80, DL, MVT::i16));
+    }
+    // inner i16 = no-op.
+    return X;
+  }
+
+  if (ResVT != MVT::i32)
+    return SDValue();
+
+  // i32 result: project the input's low half (X is i32 Wide32 here),
+  // apply the inner-VT sext on the i16 low half, sign-fill the hi.
+  SDValue Lo = extractWide32Lo(DAG, DL, X);
+  if (InnerVT != MVT::i16) {
+    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i16, Lo,
+                     DAG.getValueType(InnerVT));
+  }
+  // Sign-fill the hi half via SRA #15 — same idiom LowerExtend uses for
+  // SIGN_EXTEND i16 -> i32.
+  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i16, Lo,
+                           DAG.getConstant(15, DL, MVT::i16));
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// TRUNCATE i32 -> i16: project sub_lo.
+SDValue W65816TargetLowering::LowerTruncate(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getOperand(0).getValueType() != MVT::i32)
+    return SDValue();
+  if (Op.getValueType() == MVT::i16)
+    return extractWide32Lo(DAG, DL, Op.getOperand(0));
+  if (Op.getValueType() == MVT::i8) {
+    // i32 -> i16 -> i8. The i8 trunc pattern is COPY_TO_REGCLASS at MC
+    // level; the i16 sub_lo extract is the work.
+    SDValue Lo16 = extractWide32Lo(DAG, DL, Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Lo16);
+  }
+  return SDValue();
+}
+
+// i32 Constant: split into two i16 constants and REG_SEQUENCE.
+SDValue W65816TargetLowering::LowerI32Constant(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32) return SDValue();
+  uint64_t V = cast<ConstantSDNode>(Op)->getZExtValue();
+  SDValue Lo = DAG.getConstant(V & 0xFFFFu, DL, MVT::i16);
+  SDValue Hi = DAG.getConstant((V >> 16) & 0xFFFFu, DL, MVT::i16);
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// ADD/SUB/AND/OR/XOR i32 -> per-half i16 op. ADDC/ADDE chain for ADD,
+// SUBC/SUBE for SUB. AND/OR/XOR are independent halves.
+SDValue W65816TargetLowering::LowerI32Bin(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+  SDValue L = Op.getOperand(0);
+  SDValue R = Op.getOperand(1);
+  SDValue LL = extractWide32Lo(DAG, DL, L);
+  SDValue LH = extractWide32Hi(DAG, DL, L);
+  SDValue RL = extractWide32Lo(DAG, DL, R);
+  SDValue RH = extractWide32Hi(DAG, DL, R);
+  SDValue Lo, Hi;
+  switch (Op.getOpcode()) {
+  case ISD::AND:
+    Lo = DAG.getNode(ISD::AND, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::AND, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::OR:
+    Lo = DAG.getNode(ISD::OR, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::OR, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::XOR:
+    Lo = DAG.getNode(ISD::XOR, DL, MVT::i16, LL, RL);
+    Hi = DAG.getNode(ISD::XOR, DL, MVT::i16, LH, RH);
+    break;
+  case ISD::ADD: {
+    SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
+    SDValue Lo2 = DAG.getNode(ISD::ADDC, DL, VTs, LL, RL);
+    Lo = Lo2.getValue(0);
+    SDValue Carry = Lo2.getValue(1);
+    Hi = DAG.getNode(ISD::ADDE, DL, VTs, LH, RH, Carry).getValue(0);
+    break;
+  }
+  case ISD::SUB: {
+    SDVTList VTs = DAG.getVTList(MVT::i16, MVT::Glue);
+    SDValue Lo2 = DAG.getNode(ISD::SUBC, DL, VTs, LL, RL);
+    Lo = Lo2.getValue(0);
+    SDValue Borrow = Lo2.getValue(1);
+    Hi = DAG.getNode(ISD::SUBE, DL, VTs, LH, RH, Borrow).getValue(0);
+    break;
+  }
+  default:
+    return SDValue();
+  }
+  return buildWide32(DAG, DL, Lo, Hi);
+}
+
+// Store companion to LowerLoad. For i32 addresses, dispatch to the
+// 16-bit ST_PTR or the byte-truncating STB_PTR target node based on
+// MemoryVT. For i16 addresses (ptr16 mode), bail out and let the
+// existing STAptr / STBptr patterns match.
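+// Illustrative chain shape (sketch): for `store i32 %v, ptr %p` in
+// ptr32 mode the split below emits, chained in order,
+//   ST_PTR (lo16 of %v) -> [%p + 0]
+//   ST_PTR (hi16 of %v) -> [%p + 2]
+// and it is the target-opaque ST_PTR nodes that keep the combiner's
+// MergeConsecutiveStores from re-gluing the halves into the very i32
+// store we just split.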
+SDValue W65816TargetLowering::LowerStore(SDValue Op,
+                                         SelectionDAG &DAG) const {
+  StoreSDNode *St = cast<StoreSDNode>(Op);
+  SDValue Chain = St->getChain();
+  SDValue Val = St->getValue();
+  SDValue Ptr = St->getBasePtr();
+  EVT MemVT = St->getMemoryVT();
+  SDLoc DL(Op);
+
+  // i32 STORE: split into two halves. Critical: the per-half stores
+  // MUST go through the target-specific W65816ISD::ST_PTR node and not
+  // through plain ISD::STORE, otherwise the SDAG combiner's
+  // MergeConsecutiveStores re-combines them into a single i32 store
+  // that re-enters LowerStore — infinite loop, OOM in the combiner.
+  // For i16 ptrs (legacy ptr16), fall back to ISD::STORE; the regular
+  // store-merger doesn't trip there because address splitting via
+  // ISD::ADD on i16 doesn't itself fan out into ptr-pair operations.
+  if (Val.getValueType() == MVT::i32) {
+    SDValue Lo = extractWide32Lo(DAG, DL, Val);
+    SDValue Hi = extractWide32Hi(DAG, DL, Val);
+    EVT PtrVT = Ptr.getValueType();
+    SDValue Two = DAG.getConstant(2, DL, PtrVT);
+    SDValue Ptr2 = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Two);
+    if (PtrVT == MVT::i32) {
+      // ptr32 path — emit two W65816ISD::ST_PTR target nodes, sequentially
+      // chained. The combiner cannot merge target-opaque MemIntrinsic
+      // stores.
+      SDVTList VTs = DAG.getVTList(MVT::Other);
+      SDValue OpsLo[] = { Chain, Lo, Ptr };
+      SDValue StLo = DAG.getMemIntrinsicNode(
+          W65816ISD::ST_PTR, DL, VTs, OpsLo, MVT::i16,
+          St->getMemOperand());
+      SDValue OpsHi[] = { StLo, Hi, Ptr2 };
+      MachineMemOperand *MMOHi = DAG.getMachineFunction().getMachineMemOperand(
+          St->getMemOperand(), 2, 2);
+      SDValue StHi = DAG.getMemIntrinsicNode(
+          W65816ISD::ST_PTR, DL, VTs, OpsHi, MVT::i16, MMOHi);
+      return StHi;
+    }
+    // ptr16 path — emit two regular i16 stores serially chained so the
+    // store-merger sees them as a 4-byte sequence (which it will likely
+    // leave alone since the resulting i32 store has no legal target
+    // pattern in ptr16 mode anyway).
+    SDValue StLo = DAG.getStore(Chain, DL, Lo, Ptr,
+                                St->getPointerInfo(),
+                                St->getAlign(),
+                                St->getMemOperand()->getFlags());
+    SDValue StHi = DAG.getStore(StLo, DL, Hi, Ptr2,
+                                St->getPointerInfo().getWithOffset(2),
+                                St->getAlign(),
+                                St->getMemOperand()->getFlags());
+    return StHi;
+  }
+
+  if (Ptr.getValueType() != MVT::i32)
+    return SDValue();
+
+  // The pseudos take Acc16 (i16) as the value half; the SEP/REP wrap
+  // around STBptr32 narrows in memory. Promote i8 values to i16 with
+  // ANY_EXTEND — the inserter only writes one byte, so the high half
+  // is don't-care.
+  if (Val.getValueType() == MVT::i8)
+    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, Val);
+
+  unsigned NodeOpc = (MemVT == MVT::i8) ? unsigned(W65816ISD::STB_PTR)
+                                        : unsigned(W65816ISD::ST_PTR);
+  SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Val, Ptr };
+  return DAG.getMemIntrinsicNode(NodeOpc, DL, VTs, Ops, MemVT,
+                                 St->getMemOperand());
+}
+
 // VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
 // expansion, we do NOT align ap to the type's preferred alignment —
 // caller-pushed varargs land at byte-granular addresses (PHA from an
@@ -509,12 +1069,45 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::SETCC: return LowerSETCC(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
+  case ISD::SELECT: {
+    // Custom-lower SELECT for i32 result: split into per-half
+    // selects. Without this, the legalizer's default (rewriting
 // VAARG: load *ap, advance ap by sizeof(VT). Unlike the default
 // expansion, we do NOT align ap to the type's preferred alignment —
 // caller-pushed varargs land at byte-granular addresses (PHA from an
@@ -509,12 +1069,45 @@ SDValue W65816TargetLowering::LowerOperation(SDValue Op,
   case ISD::BR_CC:      return LowerBR_CC(Op, DAG);
   case ISD::SETCC:      return LowerSETCC(Op, DAG);
   case ISD::SELECT_CC:  return LowerSELECT_CC(Op, DAG);
-  case ISD::SIGN_EXTEND: return LowerSignExtend(Op, DAG);
+  case ISD::SELECT: {
+    // Custom-lower SELECT for i32 result: split into per-half
+    // selects. Without this, the legalizer's default (rewriting
+    // SELECT to SELECT_CC against zero) produces SELECT_CC i32 of
+    // a different shape that re-enters Custom and creates a cycle.
+    if (Op.getValueType() != MVT::i32)
+      return SDValue();
+    SDValue Cond = Op.getOperand(0);
+    SDValue TVal = Op.getOperand(1);
+    SDValue FVal = Op.getOperand(2);
+    SDLoc DL(Op);
+    SDValue TLo = extractWide32Lo(DAG, DL, TVal);
+    SDValue THi = extractWide32Hi(DAG, DL, TVal);
+    SDValue FLo = extractWide32Lo(DAG, DL, FVal);
+    SDValue FHi = extractWide32Hi(DAG, DL, FVal);
+    SDValue Lo = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, TLo, FLo);
+    SDValue Hi = DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond, THi, FHi);
+    return buildWide32(DAG, DL, Lo, Hi);
+  }
+  case ISD::SIGN_EXTEND:
+    if (Op.getValueType() == MVT::i32) return LowerExtend(Op, DAG);
+    return LowerSignExtend(Op, DAG);
   case ISD::VASTART:    return LowerVASTART(Op, DAG);
   case ISD::VAARG:      return LowerVAARG(Op, DAG);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:        return LowerShift(Op, DAG);
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: return LowerExtend(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG: return LowerSignExtendInReg(Op, DAG);
+  case ISD::TRUNCATE:   return LowerTruncate(Op, DAG);
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:        return LowerI32Bin(Op, DAG);
+  case ISD::LOAD:       return LowerLoad(Op, DAG);
+  case ISD::STORE:      return LowerStore(Op, DAG);
+  case ISD::Constant:   return LowerI32Constant(Op, DAG);
   // SJLJ EH: setup_dispatch is a no-op on this target — the dispatcher
   // logic lives entirely in the SJLJ runtime (_Unwind_SjLj_Resume +
   // longjmp into the function context's jmp_buf). The isel layer
@@ -621,30 +1214,30 @@ SDValue W65816TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   // The type-legalizer's i32-shift-by-1 expansion emits this exact
   // node for the high-half "bit-from-low" slot.
   // Everything else goes to a libcall (__ashlhi3 / __lshrhi3 / __ashrhi3).
+  // i16 only — i32 always routes to libcall (no inline i32 patterns).
   SDValue Amount = Op.getOperand(1);
-  if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
-    uint64_t N = C->getZExtValue();
-    // SHL/SRL by 1..7 chain ASLA16/LSRA16; by 8 use SHL8A/SRL8A; by 9..14
-    // chain on top of those. All have inline tablegen patterns.
-    if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
-        N >= 1 && N <= 14)
-      return Op;
-    // SHL/SRL by 15 is just (asl/ror to put bit 0/15 into low/high).
-    if (N == 15 &&
-        (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
-      return Op;
-    // SRA only has inline patterns at 1 and 15 (sign-fill).
-    if (N == 1 && Op.getOpcode() == ISD::SRA)
-      return Op;
-    if (N == 15 && Op.getOpcode() == ISD::SRA)
-      return Op;
+  if (Op.getValueType() == MVT::i16) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Amount)) {
+      uint64_t N = C->getZExtValue();
+      if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
+          N >= 1 && N <= 14)
+        return Op;
+      if (N == 15 &&
+          (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL))
+        return Op;
+      if (N == 1 && Op.getOpcode() == ISD::SRA)
+        return Op;
+      if (N == 15 && Op.getOpcode() == ISD::SRA)
+        return Op;
+    }
   }
+  bool IsI32 = Op.getValueType() == MVT::i32;
   RTLIB::Libcall LC;
   switch (Op.getOpcode()) {
-  case ISD::SHL: LC = RTLIB::SHL_I16; break;
-  case ISD::SRL: LC = RTLIB::SRL_I16; break;
-  case ISD::SRA: LC = RTLIB::SRA_I16; break;
+  case ISD::SHL: LC = IsI32 ? RTLIB::SHL_I32 : RTLIB::SHL_I16; break;
+  case ISD::SRL: LC = IsI32 ? RTLIB::SRL_I32 : RTLIB::SRL_I16; break;
+  case ISD::SRA: LC = IsI32 ? RTLIB::SRA_I32 : RTLIB::SRA_I16; break;
   default: llvm_unreachable("not a shift");
   }
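Restated as a standalone predicate (illustrative only, not code from the backend), the constant-amount policy LowerShift applies to i16 shifts before falling back to a libcall:

#include <cassert>

enum class ShiftOp { SHL, SRL, SRA };

// True when a constant i16 shift amount keeps its inline pattern; false
// when LowerShift routes it to __ashlhi3 / __lshrhi3 / __ashrhi3.
static bool staysInline(ShiftOp op, unsigned n) {
  if (op == ShiftOp::SHL || op == ShiftOp::SRL)
    return n >= 1 && n <= 15; // 1..14 chain, 15 is the bit-extract form
  return n == 1 || n == 15;   // SRA: single step or full sign-fill only
}

int main() {
  assert(staysInline(ShiftOp::SHL, 8));
  assert(staysInline(ShiftOp::SRA, 15));
  assert(!staysInline(ShiftOp::SRA, 3)); // goes to __ashrhi3
  return 0;
}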
@@ -661,17 +1254,19 @@ SDValue W65816TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
   auto *GA = cast<GlobalAddressSDNode>(Op);
   SDLoc DL(Op);
-  SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, MVT::i16,
+  EVT PtrVT = Op.getValueType(); // i16 in ptr16 mode, i32 in ptr32 mode
+  SDValue Tgt = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, PtrVT,
                                            GA->getOffset());
-  return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Tgt);
+  return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt);
 }
 
 SDValue W65816TargetLowering::LowerExternalSymbol(SDValue Op,
                                                   SelectionDAG &DAG) const {
   auto *ES = cast<ExternalSymbolSDNode>(Op);
   SDLoc DL(Op);
-  SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16);
-  return DAG.getNode(W65816ISD::Wrapper, DL, MVT::i16, Tgt);
+  EVT PtrVT = Op.getValueType();
+  SDValue Tgt = DAG.getTargetExternalSymbol(ES->getSymbol(), PtrVT);
+  return DAG.getNode(W65816ISD::Wrapper, DL, PtrVT, Tgt);
 }
 
 SDValue W65816TargetLowering::LowerFormalArguments(
@@ -696,11 +1291,10 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  // i32 first-arg ABI: if the first original argument is i32 (the
-  // type legalizer split it into two i16 InputArgs both with
-  // OrigArgIndex == 0), pass it in A:X (lo:hi) — matching the i32
-  // return ABI (also A:X). Saves one stack slot for the i32 arg.
-  bool I32FirstArg =
+  // i32 first-arg ABI. Two flavors as in LowerCall:
+  //  - Legal-i32 (Wide32 reg class registered): single i32 InputArg.
+  //  - Split-i32 (legacy): two i16 InputArgs both with OrigArgIndex==0.
+  bool I32SplitFirstArg =
       Ins.size() >= 2 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
       Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0;
   // True iff the FIRST original arg spans 4 i16s (i.e., is i64). Used
@@ -709,11 +1303,24 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   // doesn't get the same treatment because the change pessimizes
   // simple functions like `int add32(int a, int b) { return a+b; }`
   // where greedy's regular A:X handling is fine.
+  // Two shapes for i64-first-arg under different ptr modes:
+  //   ptr16 (i32 illegal): Ins[0..3] = 4 i16 halves of arg0
+  //   ptr32 (i32 legal):   Ins[0..1] = 2 i32 halves of arg0 — but the
+  //                        IR-level "single i64 first arg" still splits
+  //                        to 4 i16 in Outs/Ins because i64 isn't legal.
+  //                        So the i16-form detection still applies here.
   bool I64FirstArg =
       Ins.size() >= 4 && Ins[0].VT == MVT::i16 && Ins[1].VT == MVT::i16 &&
       Ins[2].VT == MVT::i16 && Ins[3].VT == MVT::i16 &&
       Ins[0].OrigArgIndex == 0 && Ins[1].OrigArgIndex == 0 &&
       Ins[2].OrigArgIndex == 0 && Ins[3].OrigArgIndex == 0;
+  // Also detect the i32-split shape: Ins[0..1] = 2 i32 halves of arg0
+  // (with OrigArgIndex==0 on both). This happens with ptr32 active and
+  // i64 legalized via i32-split rather than i16-quad-split.
+  if (!I64FirstArg && Ins.size() >= 2 && Ins[0].VT == MVT::i32 &&
+      Ins[1].VT == MVT::i32 && Ins[0].OrigArgIndex == 0 &&
+      Ins[1].OrigArgIndex == 0)
+    I64FirstArg = true;
 
   unsigned ArgIdx = 0;
   // Stack offset is measured from S+1 (the WDC convention) and grows
@@ -721,16 +1328,50 @@ SDValue W65816TargetLowering::LowerFormalArguments(
   unsigned StackOffset = 4; // Skip 3 ret-addr bytes; first slot at S+4.
for (const ISD::InputArg &Arg : Ins) { MVT VT = Arg.VT; - if (VT != MVT::i16 && VT != MVT::i8) + if (VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i32) report_fatal_error("W65816: argument type not yet supported"); - if (ArgIdx == 0) { - // First arg in A. - Register VReg = MRI.createVirtualRegister( - VT == MVT::i16 ? &W65816::Acc16RegClass : &W65816::Acc8RegClass); + if (ArgIdx == 0 && VT == MVT::i32) { + // Whole-i32 first arg: lo half live-in via $a, hi via $x. + // The W65816LowerWide32 pre-RA pass walks the resulting + // REG_SEQUENCE and rewrites Wide32 uses into pairs of i16 + // operations — keeping AX32 out of the regalloc's pair- + // allocation path entirely. + // For i64-first-arg signatures (the IR has a single i64 arg + // that splits to 2 i32 in Ins[0..1] under ptr32), route BOTH + // halves through Img16. Without this the regalloc emits + // `TXA; STA spill_X; STA spill_A` at function entry — the TXA + // clobbers $a (arg0_0) before the A-spill saves it, so both + // spill slots end up holding arg0_1. Caused __adddf3(1.5,2.5) + // → 1.5 because the cb-test path read TXA-corrupted A. + const TargetRegisterClass *VRegLoRC = + I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass; + const TargetRegisterClass *VRegHiRC = + I64FirstArg ? &W65816::Img16RegClass : &W65816::Idx16RegClass; + Register VRegLo = MRI.createVirtualRegister(VRegLoRC); + Register VRegHi = MRI.createVirtualRegister(VRegHiRC); + MRI.addLiveIn(W65816::A, VRegLo); + MRI.addLiveIn(W65816::X, VRegHi); + SDValue Lo = DAG.getCopyFromReg(Chain, DL, VRegLo, MVT::i16); + SDValue Hi = DAG.getCopyFromReg(Chain, DL, VRegHi, MVT::i16); + InVals.push_back(buildWide32(DAG, DL, Lo, Hi)); + } else if (ArgIdx == 0) { + // First arg in A. For i64-first-arg signatures (4 i16 halves of + // arg0 with OrigArgIndex==0), route arg0_0 through Img16 the same + // way ArgIdx==1 does — via an entry STA-to-DP-slot at function + // entry. Without this, the regalloc emits a TXA bridge for + // arg0_1's spill that clobbers $a (= arg0_0) BEFORE arg0_0 has + // been saved, and BOTH arg0_0 and arg0_1's spill slots end up + // holding arg0_1. Observed as `__adddf3(1.5, 2.5) → 1.5` because + // the cb-test BEQ sees flags from a TXA-clobbered LDA cb path. + const TargetRegisterClass *RC = + (VT == MVT::i16) + ? (I64FirstArg ? &W65816::Img16RegClass : &W65816::Acc16RegClass) + : &W65816::Acc8RegClass; + Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::A, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, VT)); - } else if (ArgIdx == 1 && I32FirstArg) { + } else if (ArgIdx == 1 && I32SplitFirstArg) { // First-arg hi half (or arg0_ml for i64-first-arg): in X. // For i64-first-arg signatures (4 i16s with OrigArgIndex 0), use // Img16 so greedy parks the value in an IMG slot via STX_DP, @@ -743,6 +1384,19 @@ SDValue W65816TargetLowering::LowerFormalArguments( Register VReg = MRI.createVirtualRegister(RC); MRI.addLiveIn(W65816::X, VReg); InVals.push_back(DAG.getCopyFromReg(Chain, DL, VReg, MVT::i16)); + } else if (VT == MVT::i32) { + // i32 stack arg: 4 bytes, loaded as 2 i16 halves and assembled + // via REG_SEQUENCE into a Wide32 SDValue. 
+      int FILo = MFI.CreateFixedObject(2, StackOffset, /*Immutable*/true);
+      int FIHi = MFI.CreateFixedObject(2, StackOffset + 2, /*Immutable*/true);
+      StackOffset += 4;
+      SDValue FINLo = DAG.getFrameIndex(FILo, MVT::i16);
+      SDValue FINHi = DAG.getFrameIndex(FIHi, MVT::i16);
+      SDValue Lo = DAG.getLoad(MVT::i16, DL, Chain, FINLo,
+                               MachinePointerInfo::getFixedStack(MF, FILo));
+      SDValue Hi = DAG.getLoad(MVT::i16, DL, Chain, FINHi,
+                               MachinePointerInfo::getFixedStack(MF, FIHi));
+      InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
     } else {
       // Subsequent args are loaded from the stack. i8 args are
       // promoted to i16 slots (matching CC_W65816's CCPromoteToType)
@@ -824,23 +1478,29 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   for (const ISD::OutputArg &O : Outs) {
-    if (O.VT != MVT::i16 && O.VT != MVT::i8)
+    if (O.VT != MVT::i16 && O.VT != MVT::i8 && O.VT != MVT::i32)
       report_fatal_error("W65816: argument type not yet supported");
   }
 
-  // i32 first-arg ABI: if Outs[0] and Outs[1] are halves of the same
-  // original i32 first arg (OrigArgIndex == 0), pass them in A:X.
-  bool I32FirstArg =
+  // i32 first-arg ABI. Two flavors:
+  //  - Legal-i32: Outs[0].VT == i32 (whole pair). Pass in AX32.
+  //  - Split-i32 (legacy): Outs[0]/Outs[1] both i16 with OrigArgIndex==0.
+  //    Pass low in A, high in X.
+  bool I32WholeFirstArg =
+      !Outs.empty() && Outs[0].VT == MVT::i32;
+  bool I32SplitFirstArg =
       Outs.size() >= 2 && Outs[0].VT == MVT::i16 && Outs[1].VT == MVT::i16 &&
       Outs[0].OrigArgIndex == 0 && Outs[1].OrigArgIndex == 0;
-  unsigned FirstStackArg = I32FirstArg ? 2 : 1;
+  unsigned FirstStackArg = I32WholeFirstArg ? 1
+                         : I32SplitFirstArg ? 2 : 1;
 
   // i8 stack args are promoted to i16 (2-byte slots) so the callee can
   // read them with a 16-bit M load — matches LowerFormalArguments and
-  // CC_W65816's CCPromoteToType. Arg 0 stays in A in its native
-  // width; only stack-passed args promote.
-  unsigned StackBytes = 2 * (Outs.size() > FirstStackArg
-                             ? Outs.size() - FirstStackArg : 0);
+  // CC_W65816's CCPromoteToType. i32 stack args occupy 4 bytes
+  // (2 PUSH16s).
+  unsigned StackBytes = 0;
+  for (unsigned i = FirstStackArg; i < Outs.size(); ++i)
+    StackBytes += (Outs[i].VT == MVT::i32) ? 4 : 2;
 
   Chain = DAG.getCALLSEQ_START(Chain, StackBytes, 0, DL);
 
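A toy stack model (plain C++, names invented) of the push order the loop below uses for an i32 stack argument: the hi half is pushed first so it lands at the higher address, leaving the callee to read a little-endian i32 starting at the lower slot:

#include <cassert>
#include <cstdint>

int main() {
  uint16_t slots[8];
  int sp = 8;                                  // grows downward, 16-bit slots
  auto push16 = [&](uint16_t v) { slots[--sp] = v; };

  uint32_t arg = 0xDEADBEEFu;
  push16(uint16_t(arg >> 16));                 // hi first -> higher address
  push16(uint16_t(arg));                       // lo second -> lower address

  // Callee view: the i32 starts at the lowest-addressed slot.
  uint32_t seen = slots[sp] | (uint32_t(slots[sp + 1]) << 16);
  assert(seen == arg);
  return 0;
}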
@@ -851,15 +1511,8 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // from X via PHX — saves the TXA + A-spill round-trip that would
   // otherwise be required.
   SDValue Glue;
-  for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) {
-    SDValue V = OutVals[i];
-    if (Outs[i].VT == MVT::i8)
-      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
-    // Detect "value is already in X" — either as a physreg
-    // CopyFromReg($x), or as a vreg in the Idx16 class that's
-    // live-in from $x. In the i32-first-arg-in-A:X path,
-    // LowerFormalArguments creates a vreg in Idx16 and addLiveIn's
-    // it to $x.
+  // Helper: push a single i16-sized value via PHA.
+  auto pushI16 = [&](SDValue V) {
     bool ViaX = false;
     if (V.getOpcode() == ISD::CopyFromReg) {
       auto *RegN = dyn_cast<RegisterSDNode>(V.getOperand(1).getNode());
@@ -880,8 +1533,6 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     }
     if (ViaX) {
-      // CopyToReg(X, X) is a no-op but it threads the Glue chain so the
-      // PUSH_X can be sequenced correctly relative to other pushes.
       Chain = DAG.getCopyToReg(Chain, DL, W65816::X, V, Glue);
       Glue = Chain.getValue(1);
       Chain = DAG.getNode(W65816ISD::PUSH_X, DL,
@@ -893,17 +1544,44 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           DAG.getVTList(MVT::Other, MVT::Glue), Chain, Glue);
     }
     Glue = Chain.getValue(1);
+  };
+
+  for (int i = (int)Outs.size() - 1; i >= (int)FirstStackArg; --i) {
+    SDValue V = OutVals[i];
+    if (Outs[i].VT == MVT::i32) {
+      // Push i32 stack arg: hi half first (lands at higher address),
+      // lo half second (lands at lower address = the slot the callee
+      // reads as the start of the i32).
+      SDValue Lo = extractWide32Lo(DAG, DL, V);
+      SDValue Hi = extractWide32Hi(DAG, DL, V);
+      pushI16(Hi);
+      pushI16(Lo);
+      continue;
+    }
+    if (Outs[i].VT == MVT::i8)
+      V = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, V);
+    pushI16(V);
   }
 
-  // i32 first-arg hi half goes in X. Emit before the A copy so the
-  // CopyToReg for X is glued, then A's copy follows.
-  if (I32FirstArg) {
+  // i32 first-arg. Whole (legal-i32): split into lo/hi and copy
+  // to $a/$x separately — avoids AX32 in the MIR (see
+  // W65816LowerWide32). Split-i32 (legacy 2-i16): hi in X first,
+  // then lo in A below.
+  if (I32WholeFirstArg) {
+    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
+    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
+    Glue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
+    Glue = Chain.getValue(1);
+  } else if (I32SplitFirstArg) {
     Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
     Glue = Chain.getValue(1);
   }
-  // Arg 0 in A.
-  if (!OutVals.empty()) {
+  // Arg 0 in A — only for non-whole-i32 first-arg. Whole-i32
+  // already copied to A/X above.
+  if (!I32WholeFirstArg && !OutVals.empty()) {
    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
     Glue = Chain.getValue(1);
   }
@@ -914,10 +1592,14 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i16);
 
   SmallVector<SDValue, 8> CallOps = {Chain, Callee};
-  if (!OutVals.empty())
+  if (I32WholeFirstArg) {
+    CallOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
+    CallOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
+  } else if (!OutVals.empty()) {
     CallOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
-  if (I32FirstArg)
-    CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
+    if (I32SplitFirstArg)
+      CallOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
+  }
   if (Glue.getNode())
     CallOps.push_back(Glue);
 
@@ -928,38 +1610,60 @@ W65816TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   Chain = DAG.getCALLSEQ_END(Chain, StackBytes, 0, Glue, DL);
   Glue = Chain.getValue(1);
 
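The single whole-i32 return read that follows is just a pair recombination; in host terms (illustrative C++, function name invented):

#include <cassert>
#include <cstdint>

// Reading an i32 return: lo arrives in A, hi in X, and the Wide32 value
// is their little-endian pairing.
static uint32_t readI32Return(uint16_t a /*$a*/, uint16_t x /*$x*/) {
  return uint32_t(a) | (uint32_t(x) << 16);
}

int main() {
  assert(readI32Return(0x5678, 0x1234) == 0x12345678u);
  return 0;
}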
-  // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in A:X,
-  // i64 in A:X:Y plus a load from DP $F0 for the highest half.
-  if (Ins.size() > 4)
-    report_fatal_error("W65816: return type wider than 64 bits not supported");
-  static constexpr Register RetRegs[3] = {W65816::A, W65816::X, W65816::Y};
+  // Read return value(s). Mirrors LowerReturn: i8/i16 in A, i32 in
+  // AX32 (whole) or split A/X (legacy), and 4-half (i64 / 2x i32) in
+  // A, X, Y, DPF0. i32 Ins are read as a single i32 from the half
+  // pair (A:X for the first, Y:DPF0 for a second-pair-of-halves).
+  // Whole-i32 single return: read lo from $a, hi from $x. Avoids
+  // using AX32 in the SDAG / MIR — see W65816LowerWide32 pass.
+  if (Ins.size() == 1 && Ins[0].VT == MVT::i32) {
+    SDValue Lo = DAG.getCopyFromReg(Chain, DL, W65816::A, MVT::i16, Glue);
+    Chain = Lo.getValue(1);
+    Glue = Lo.getValue(2);
+    SDValue Hi = DAG.getCopyFromReg(Chain, DL, W65816::X, MVT::i16, Glue);
+    Chain = Hi.getValue(1);
+    Glue = Hi.getValue(2);
+    InVals.push_back(buildWide32(DAG, DL, Lo, Hi));
+    return Chain;
+  }
+  // Build a flat list of i16 halves expected from the call. Then
+  // walk it, copying from A, X, Y, DPF0 in order. Re-assemble i32
+  // halves into a Wide32 SDValue at the end.
+  SmallVector<MVT, 4> ExpVT;
   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
     MVT VT = Ins[i].VT;
-    if (VT != MVT::i16 && VT != MVT::i8)
-      report_fatal_error("W65816: return half must be i8 or i16");
-    if (i >= 1 && VT != MVT::i16)
-      report_fatal_error("W65816: split return halves must all be i16");
-    if (i < 3) {
-      SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], VT, Glue);
-      Chain = V.getValue(1);
-      Glue = V.getValue(2);
-      InVals.push_back(V);
+    if (VT == MVT::i32) {
+      ExpVT.push_back(MVT::i16);
+      ExpVT.push_back(MVT::i16);
+    } else if (VT == MVT::i16 || VT == MVT::i8) {
+      ExpVT.push_back(VT);
     } else {
-      // 4th half: read DP[$F0..$F1] via CopyFromReg(DPF0). DPF0 is a
-      // pseudo-physreg modeled as JSLpseudo's implicit-def, so each
-      // call's CopyFromReg has Glue tied to the corresponding call —
-      // the SDAG combiner can't merge them and the scheduler can't
-      // reorder them past the next call. copyPhysReg lowers DPF0 →
-      // A as `LDA $F0`. Without this, plain `getLoad(0xF0)` was
-      // being CSE'd / reordered across i64-returning calls, causing
-      // `dmath = (a+b)*(a-b)` to return 4 instead of 16.
-      SDValue V = DAG.getCopyFromReg(Chain, DL, W65816::DPF0, VT, Glue);
-      Chain = V.getValue(1);
-      Glue = V.getValue(2);
-      InVals.push_back(V);
+      report_fatal_error("W65816: return half must be i8/i16/i32");
+    }
+  }
+  if (ExpVT.size() > 4)
+    report_fatal_error("W65816: return type wider than 64 bits not supported");
+  static constexpr Register RetRegs[4] = {W65816::A, W65816::X, W65816::Y,
+                                          W65816::DPF0};
+  SmallVector<SDValue, 4> Halves;
+  for (unsigned i = 0; i != ExpVT.size(); ++i) {
+    SDValue V = DAG.getCopyFromReg(Chain, DL, RetRegs[i], ExpVT[i], Glue);
+    Chain = V.getValue(1);
+    Glue = V.getValue(2);
+    Halves.push_back(V);
+  }
+  // Re-pack halves into the original Ins shape (i32s rebuild via
+  // REG_SEQUENCE; i8/i16 pass through).
+  unsigned hi = 0;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (Ins[i].VT == MVT::i32) {
+      InVals.push_back(buildWide32(DAG, DL, Halves[hi], Halves[hi + 1]));
+      hi += 2;
+    } else {
+      InVals.push_back(Halves[hi]);
+      hi += 1;
     }
   }
-
   return Chain;
 }
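The multi-half read-back implemented above, in one picture (a sketch assuming the A, X, Y, DPF0 slot order stated in the comments; helper names invented):

#include <cassert>
#include <cstdint>
#include <vector>

// Caller-side repack: the call returns up to four 16-bit halves in the
// slot order A, X, Y, DPF0; i32 results consume two consecutive halves.
static std::vector<uint32_t> repack(const std::vector<uint16_t> &halves,
                                    const std::vector<int> &widths) {
  std::vector<uint32_t> vals;
  size_t h = 0;
  for (int w : widths) {
    if (w == 32) {
      vals.push_back(halves[h] | (uint32_t(halves[h + 1]) << 16));
      h += 2;
    } else {
      vals.push_back(halves[h++]); // i8/i16 pass through
    }
  }
  return vals;
}

int main() {
  // An i64 returned as two i32s: halves land in A, X, Y, DPF0.
  std::vector<uint16_t> halves = {0x5678, 0x1234, 0xBEEF, 0xDEAD};
  auto vals = repack(halves, {32, 32});
  assert(vals[0] == 0x12345678u && vals[1] == 0xDEADBEEFu);
  return 0;
}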
@@ -979,18 +1683,53 @@ SDValue W65816TargetLowering::LowerReturn(
   // first so that the regalloc can place each through A (the only
   // ALU reg) without conflict. The TAX/TAY in copyPhysReg preserves
   // A, so subsequent low-half copies to A don't clobber.
-  if (Outs.size() > 4)
-    report_fatal_error("W65816: return type wider than 64 bits not supported");
+  // With i32 legal, an Outs entry may be MVT::i32; we expand each i32
+  // into its two i16 halves (sub_lo/sub_hi via EXTRACT_SUBREG) so the
+  // legacy A/X/Y/DPF0 4-half return ABI continues to work for the
+  // multi-half return cases (i64 returned as 2 i32, struct of 2 long
+  // returned as 2 i32, etc.).
+  SmallVector<MVT, 4> ExpVT;
+  SmallVector<SDValue, 4> ExpVals;
   for (unsigned i = 0; i != Outs.size(); ++i) {
     MVT VT = Outs[i].VT;
-    if (VT != MVT::i16 && VT != MVT::i8)
-      report_fatal_error("W65816: return half must be i8 or i16");
-    if (i >= 1 && VT != MVT::i16)
-      report_fatal_error("W65816: split return halves must all be i16");
+    if (VT == MVT::i32) {
+      ExpVT.push_back(MVT::i16);
+      ExpVT.push_back(MVT::i16);
+      ExpVals.push_back(extractWide32Lo(DAG, DL, OutVals[i]));
+      ExpVals.push_back(extractWide32Hi(DAG, DL, OutVals[i]));
+    } else if (VT == MVT::i16 || VT == MVT::i8) {
+      ExpVT.push_back(VT);
+      ExpVals.push_back(OutVals[i]);
+    } else {
+      report_fatal_error("W65816: return half must be i8/i16/i32");
+    }
   }
+  if (ExpVT.size() > 4)
+    report_fatal_error("W65816: return type wider than 64 bits not supported");
+
+  // Single whole-i32 return: copy directly to AX32 instead of two
+  // halves to A and X. Saves the regalloc/coalescer some work.
+  bool I32WholeReturn = (Outs.size() == 1 && Outs[0].VT == MVT::i32);
 
   SDValue Glue;
   SmallVector<SDValue, 4> RetOps(1, Chain);
+  if (I32WholeReturn) {
+    // Split the i32 OutVal into lo/hi and copy each separately to
+    // $a / $x (no AX32 in the SDAG — see W65816LowerWide32).
+    SDValue Lo = extractWide32Lo(DAG, DL, OutVals[0]);
+    SDValue Hi = extractWide32Hi(DAG, DL, OutVals[0]);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, Hi, Glue);
+    Glue = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, Lo, Glue);
+    Glue = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(W65816::A, MVT::i16));
+    RetOps.push_back(DAG.getRegister(W65816::X, MVT::i16));
+    RetOps[0] = Chain;
+    if (Glue.getNode())
+      RetOps.push_back(Glue);
+    return DAG.getNode(W65816ISD::RET_GLUE, DL, MVT::Other, RetOps);
+  }
+
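Worth restating why the copy sequence below runs DPF0, Y, X, then A (a descriptive note, not new behavior): every placement is bridged through the accumulator, so A's own half must be the final write. A toy model of that ordering:

#include <cassert>
#include <cstdint>

// Emission-order model: halves[3..0] are placed into DPF0, Y, X, A.
// Each placement passes through A (the only ALU register), so A's own
// half is written last or it would clobber a half still in flight.
int main() {
  uint16_t halves[4] = {0xAAAA /*->A*/, 0xBBBB /*->X*/,
                        0xCCCC /*->Y*/, 0xDDDD /*->DPF0*/};
  uint16_t A = 0, X = 0, Y = 0, DPF0 = 0;
  A = halves[3]; DPF0 = A; // STA $F0
  A = halves[2]; Y = A;    // TAY
  A = halves[1]; X = A;    // TAX
  A = halves[0];           // low half stays in A
  assert(A == 0xAAAA && X == 0xBBBB && Y == 0xCCCC && DPF0 == 0xDDDD);
  return 0;
}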
   // Outs[3] -> DP $F0 via CopyToReg(DPF0). Using the DPF0 fake physreg
   // (lowered to `STA $F0` by copyPhysReg) is critical: a generic
   // ISD::STORE with addr=0xF0 lowered to `sta (d,s),y`, an indirect
   // computation can use A freely before A holds the low result. Glued
   // to RET_GLUE via the RetOps Register entry below so DCE doesn't
   // strip the COPY.
-  if (Outs.size() >= 4) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, OutVals[3], Glue);
+  // Use the expanded i16-half list (i32 outs split into 2 i16 halves).
+  if (ExpVals.size() >= 4) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::DPF0, ExpVals[3], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[2] -> Y.
-  if (Outs.size() >= 3) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, OutVals[2], Glue);
+  if (ExpVals.size() >= 3) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::Y, ExpVals[2], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[1] -> X.
-  if (Outs.size() >= 2) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, OutVals[1], Glue);
+  if (ExpVals.size() >= 2) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::X, ExpVals[1], Glue);
     Glue = Chain.getValue(1);
   }
-  // Outs[0] -> A.
-  if (!Outs.empty()) {
-    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, OutVals[0], Glue);
+  if (!ExpVals.empty()) {
+    Chain = DAG.getCopyToReg(Chain, DL, W65816::A, ExpVals[0], Glue);
     Glue = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(W65816::A, Outs[0].VT));
+    RetOps.push_back(DAG.getRegister(W65816::A, ExpVT[0]));
   }
-  if (Outs.size() >= 2)
-    RetOps.push_back(DAG.getRegister(W65816::X, Outs[1].VT));
-  if (Outs.size() >= 3)
-    RetOps.push_back(DAG.getRegister(W65816::Y, Outs[2].VT));
-  if (Outs.size() >= 4)
-    RetOps.push_back(DAG.getRegister(W65816::DPF0, Outs[3].VT));
+  if (ExpVals.size() >= 2)
+    RetOps.push_back(DAG.getRegister(W65816::X, ExpVT[1]));
+  if (ExpVals.size() >= 3)
+    RetOps.push_back(DAG.getRegister(W65816::Y, ExpVT[2]));
+  if (ExpVals.size() >= 4)
+    RetOps.push_back(DAG.getRegister(W65816::DPF0, ExpVT[3]));
 
   RetOps[0] = Chain;
   if (Glue.getNode())
@@ -1046,7 +1783,13 @@ W65816TargetLowering::PerformDAGCombine(SDNode *N,
   // `x*N` (which the combiner canonicalises pow-of-2 muls to `x<<K`).
-  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32) {
+  // (shl i32 X, K) -> ADD chain for small K — but only when i32 is
+  // ILLEGAL (i.e., gets type-split into i16 halves). When i32 is a
+  // legal type (Wide32 reg class for ptr32 mode), the rewrite cycles
+  // against LLVM's generic `(add x, x) -> (shl x, 1)` combine in the
+  // i64 → 2 i32 split path, hanging the legalizer.
+  if (N->getOpcode() == ISD::SHL && N->getValueType(0).getSizeInBits() >= 32 &&
+      !isTypeLegal(N->getValueType(0))) {
     if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
       uint64_t K = C->getZExtValue();
       if (K >= 1 && K <= 2) {
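Numerically, the combine above rewrites a small constant shift as repeated self-adds; the equivalence is the point (sketch, function name invented):

#include <cassert>
#include <cstdint>

// (shl x, K) equals K rounds of (add x, x); this identity is also why
// the generic (add x, x) -> (shl x, 1) combine can bounce the rewrite
// straight back once i32 is a legal type.
static uint32_t shlViaAdds(uint32_t x, unsigned k) {
  while (k--)
    x = x + x;
  return x;
}

int main() {
  assert(shlViaAdds(0x00010001u, 2) == (0x00010001u << 2));
  return 0;
}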
@@ -1191,6 +1934,214 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return emitRROp(MI, BB, W65816::STAfi, W65816::EORfi, /*HasOut=*/true);
   case W65816::CMP_RR:
     return emitRROp(MI, BB, W65816::STAfi, W65816::CMPfi, /*HasOut=*/false);
+  case W65816::LDAptr32S:
+  case W65816::STAptr32S:
+  case W65816::STBptr32S: {
+    // Split-pair variant: ptr is 2 i16 operands (lo + hi) instead of
+    // 1 Wide32 reg pair. Used by the W65816LowerWide32 pre-RA pass
+    // to dodge pair-allocation pressure. Otherwise identical to
+    // the LDAptr32 inserter below.
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32S;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32S;
+    // Operand 0 is the dst (load) or the value (store); in both shapes
+    // the ptr halves sit at operands 1 and 2.
+    Register PtrLo = MI.getOperand(1).getReg();
+    Register PtrHi = MI.getOperand(2).getReg();
+
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FIHi).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE2);
+
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
+    } else {
+      Register Val = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::SEP)).addImm(0x20);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::REP)).addImm(0x20);
+    }
+    MI.eraseFromParent();
+    return BB;
+  }
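The $E0..$E2 staging these inserters share encodes a 24-bit ORCA-style far pointer. In host arithmetic (illustrative helpers, not backend code), the effective address and the constant-offset fold with its carry into the bank byte look like this:

#include <cassert>
#include <cstdint>

// sub_lo = low 16 address bits ($E0..$E1); the bank byte is the low
// byte of sub_hi ($E2). [dp],Y then dereferences bank:offset.
static uint32_t effectiveAddr(uint16_t subLo, uint16_t subHi) {
  return (uint32_t(subHi & 0xFF) << 16) | subLo;
}

// Constant-offset fold: CLC; ADC #off on the low half, then ADC #0 on
// sub_hi so a wrap of the 16-bit offset carries into the bank byte.
static uint32_t effectiveAddrPlusOff(uint16_t subLo, uint16_t subHi,
                                     uint16_t off) {
  uint32_t lo = uint32_t(subLo) + off;
  uint16_t hiPlusCarry = uint16_t(subHi + (lo >> 16)); // ADC #0
  return effectiveAddr(uint16_t(lo), hiPlusCarry);
}

int main() {
  assert(effectiveAddr(0x1234, 0x0002) == 0x021234u);
  // Offset wraps the low half: the bank byte picks up the carry.
  assert(effectiveAddrPlusOff(0xFFFE, 0x0002, 4) == 0x030002u);
  return 0;
}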
+  case W65816::LDAptr32:
+  case W65816::STAptr32:
+  case W65816::STBptr32: {
+    // Same shape as the i16 LDAptr/STAptr/STBptr inserter, but the
+    // pointer is a Wide32 register pair: sub_lo carries the low 16
+    // bits of the address, sub_hi carries the bank byte in its low
+    // half (high half is pad, ORCA convention). Stage at $E0..$E2,
+    // then [dp],Y addresses the right bank without forcing 0.
+    //
+    // Dead unless ptr32 mode is active (LowerLoad/LowerStore are gated
+    // on i32 address type).
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    const W65816RegisterInfo &TRI = TII.getRegisterInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32;
+    // Operand 1 is the ptr in both the load and store shapes.
+    Register Ptr = MI.getOperand(1).getReg();
+    Register PtrLo = TRI.getSubReg(Ptr, llvm::sub_lo);
+    Register PtrHi = TRI.getSubReg(Ptr, llvm::sub_hi);
+
+    // Spill each half to a fresh slot, reload via LDAfi. Same RA-
+    // pinning rationale as the i16 LDAptr inserter.
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    // Stage the 24-bit address at $E0..$E2: sub_lo at $E0..$E1,
+    // bank byte (low half of sub_hi) at $E2. We write 16 bits at $E2
+    // — the high byte ($E3) gets sub_hi's pad byte (0 by ORCA) — but
+    // only $E2 is consulted by [dp],Y so $E3 contamination is harmless
+    // until something else uses $E3.
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FIHi).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE2);
+
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A);
+    } else {
+      Register Val = MI.getOperand(0).getReg();
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(TargetOpcode::COPY), W65816::A).addReg(Val);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::LDY_Imm16)).addImm(0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::SEP)).addImm(0x20);
+      BuildMI(*BB, MI.getIterator(), DL,
+              TII.get(W65816::STA_DPIndLongY)).addImm(0xE0);
+      if (IsByteStore)
+        BuildMI(*BB, MI.getIterator(), DL,
+                TII.get(W65816::REP)).addImm(0x20);
+    }
+    MI.eraseFromParent();
+    return BB;
+  }
+  case W65816::LDAptr32Off:
+  case W65816::STAptr32Off:
+  case W65816::STBptr32Off: {
+    // ptr32 deref with constant offset. Compute (sub_lo + off) into A
+    // with CLC; ADC, store at $E0..$E1; then propagate the carry into
+    // the bank byte via ADC #0 on (sub_hi) and store at $E2. Carry
+    // propagation is conservatively always emitted — bank wrapping is
+    // rare but real (bank-spanning struct or negative offset).
+    //
+    // Dead unless ptr32 mode is active.
+    MachineFunction *MF = BB->getParent();
+    const W65816Subtarget &STI = MF->getSubtarget<W65816Subtarget>();
+    const W65816InstrInfo &TII = *STI.getInstrInfo();
+    const W65816RegisterInfo &TRI = TII.getRegisterInfo();
+    DebugLoc DL = MI.getDebugLoc();
+    bool IsLoad = MI.getOpcode() == W65816::LDAptr32Off;
+    bool IsByteStore = MI.getOpcode() == W65816::STBptr32Off;
+    Register Ptr = MI.getOperand(1).getReg();
+    int64_t Off = MI.getOperand(2).getImm();
+    Register PtrLo = TRI.getSubReg(Ptr, llvm::sub_lo);
+    Register PtrHi = TRI.getSubReg(Ptr, llvm::sub_hi);
+
+    int FILo = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    int FIHi = MF->getFrameInfo().CreateStackObject(2, Align(2),
+                                                    /*isSpillSlot=*/false);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrLo).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STAfi))
+        .addReg(PtrHi).addFrameIndex(FIHi).addImm(0);
+
+    // (sub_lo + off) -> $E0..$E1
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi),
+            W65816::A).addFrameIndex(FILo).addImm(0);
+    BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::CLC));
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::ADC_Imm16)).addImm(Off);
+    BuildMI(*BB, MI.getIterator(), DL,
+            TII.get(W65816::STA_DP)).addImm(0xE0);
+
+    // (sub_hi + 0 + carry) -> $E2..$E3. ADC #0 picks up the carry
+    // from the previous ADC; if no carry, sub_hi is unchanged.
+ BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::LDAfi), + W65816::A).addFrameIndex(FIHi).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::ADC_Imm16)).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + + if (IsLoad) { + Register Dst = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDY_Imm16)).addImm(0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DPIndLongY)).addImm(0xE0); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(TargetOpcode::COPY), Dst).addReg(W65816::A); + } else { + Register Val = MI.getOperand(0).getReg(); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(TargetOpcode::COPY), W65816::A).addReg(Val); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDY_Imm16)).addImm(0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::SEP)).addImm(0x20); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DPIndLongY)).addImm(0xE0); + if (IsByteStore) + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::REP)).addImm(0x20); + } + MI.eraseFromParent(); + return BB; + } case W65816::LDAptrOff: case W65816::STAptrOff: case W65816::STBptrOff: { @@ -1228,8 +2179,16 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, TII.get(W65816::ADC_Imm16)).addImm(Off); BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); - BuildMI(*BB, MI.getIterator(), DL, - TII.get(W65816::STZ_DP)).addImm(0xE2); + if (LoaderBankDeref) { + // Bank byte from $BE (crt0-initialised) — Loader compat path. + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DP)).addImm(0xBE); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + } else { + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STZ_DP)).addImm(0xE2); + } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); @@ -1326,8 +2285,16 @@ W65816TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(*BB, MI.getIterator(), DL, TII.get(W65816::STA_DP)).addImm(0xE0); - BuildMI(*BB, MI.getIterator(), DL, - TII.get(W65816::STZ_DP)).addImm(0xE2); + if (LoaderBankDeref) { + // Bank byte from $BE (crt0-initialised) — Loader compat path. + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::LDA_DP)).addImm(0xBE); + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STA_DP)).addImm(0xE2); + } else { + BuildMI(*BB, MI.getIterator(), DL, + TII.get(W65816::STZ_DP)).addImm(0xE2); + } if (IsLoad) { Register Dst = MI.getOperand(0).getReg(); diff --git a/src/llvm/lib/Target/W65816/W65816ISelLowering.h b/src/llvm/lib/Target/W65816/W65816ISelLowering.h index db92d66..1d640af 100644 --- a/src/llvm/lib/Target/W65816/W65816ISelLowering.h +++ b/src/llvm/lib/Target/W65816/W65816ISelLowering.h @@ -46,6 +46,26 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + // Lock i16 shift amounts to i16 (not i32) even when i32 is a legal + // type. Without this, the DAG combiner promotes i16 shift amounts + // to i32 once i32 is registered as legal, leaving (sra i16, i32:K) + // with no matching pattern. Only narrow when LHS is i16; leave i32 + // shifts (which go to libcall via LowerShift) alone. 
+ MVT getScalarShiftAmountTy(const DataLayout &DL, + EVT LHSTy) const override { + if (LHSTy == MVT::i16 || LHSTy == MVT::i8) return MVT::i16; + return TargetLoweringBase::getScalarShiftAmountTy(DL, LHSTy); + } + + // ptr32-mode hook: with patches/0007-targetlowering-virtual- + // gettypeconversion making the base function virtual, this can be + // overridden to force i64 to expand directly to i16 halves rather + // than going through i32 (the next-smaller-legal type). Currently + // not overridden — the override-calling-base passthrough caused + // regressions in unrelated functions (likely due to subtle + // de-virtualization changes when the function becomes virtual). + // Future fix needs to test the override more carefully. + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; @@ -147,6 +167,23 @@ public: return TargetLowering::isTypeDesirableForOp(Opc, VT); } + // Disallow merging stores into wider ones. With ptr32 active and i32 + // a Custom-lowered op, the SDAG combiner's MergeConsecutiveStores + // takes our LowerStore-split pair (2x i16 stores at &t and &t+2) and + // merges them back into a single i32 store, which re-enters + // LowerStore, splits again, and loops forever — observed as + // "LLVM ERROR: out of memory" on `*t = K` for any K (including 0 + // when the SDAG state lets the combiner pick the merge ahead of any + // STZ-pattern simplification). Anything wider than i16 has no + // legal ptr-store pattern in our backend anyway, so merging into + // wider VTs is purely counterproductive. + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const MachineFunction &MF) const override { + if (MemVT.isInteger() && MemVT.getSizeInBits() > 16) + return false; + return TargetLowering::canMergeStoresTo(AS, MemVT, MF); + } + private: SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; @@ -156,6 +193,31 @@ private: SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const; + // Foundation hooks for ptr32 mode. In ptr16 mode (current default), + // both return SDValue() so the legalizer falls through to the default + // i16-pointer LDAptr/STAptr selection. When ptr32 mode is enabled + // (PointerWidth=32 + Wide32 added as i32 reg class), they detect i32 + // addresses and wrap the load/store in W65816ISD::LD_PTR / ST_PTR / + // STB_PTR so the [dp],Y inserter takes the bank byte from the + // pointer's hi half instead of forcing 0. + SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const; + // ZERO/SIGN/ANY_EXTEND i16 -> i32 and TRUNCATE i32 -> i16 lowering + // via REG_SEQUENCE / EXTRACT_SUBREG on the sub_lo/sub_hi indexes of + // the Wide32 register class. Active once i32 is registered as a + // legal type. + SDValue LowerExtend(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerTruncate(SDValue Op, SelectionDAG &DAG) const; + // SIGN_EXTEND_INREG i32 with inner type i1 / i8 / i16: sign-extend + // the low N bits of the i32 input to fill all 32 bits. Splits to + // (sext_inreg lo, innerVT) for the low half and SRA #15 of the + // resulting i16 for the high half. + SDValue LowerSignExtendInReg(SDValue Op, SelectionDAG &DAG) const; + // ADD/SUB/AND/OR/XOR i32 split into per-half i16 ops. The carry- + // chain ADDC/ADDE pseudos handle the cross-half link for ADD/SUB. 
+ SDValue LowerI32Bin(SDValue Op, SelectionDAG &DAG) const; + // i32 ConstantNode: split into two i16 constants and REG_SEQUENCE. + SDValue LowerI32Constant(SDValue Op, SelectionDAG &DAG) const; }; } // namespace llvm diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index 0f58c13..990182b 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -100,6 +100,30 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg); return; } + // SP -> A via TSC. Used by alloca / setjmp asm machinery. + if (DestReg == W65816::A && SrcReg == W65816::SP) { + BuildMI(MBB, I, DL, get(W65816::TSC)); + return; + } + // A -> SP via TCS. + if (DestReg == W65816::SP && SrcReg == W65816::A) { + BuildMI(MBB, I, DL, get(W65816::TCS)); + return; + } + // X <-> Y via A: 65816 has no direct X<->Y transfer; bridge through + // A. Caller is responsible for ensuring A is dead at this program + // point (regalloc arranges this). Used by greedy when an i16 vreg + // forced into one Idx16 reg gets coalesced with a use in the other. + if (DestReg == W65816::Y && SrcReg == W65816::X) { + BuildMI(MBB, I, DL, get(W65816::TXA)); + BuildMI(MBB, I, DL, get(W65816::TAY)); + return; + } + if (DestReg == W65816::X && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::TYA)); + BuildMI(MBB, I, DL, get(W65816::TAX)); + return; + } // X → IMGn / IMGn → X: STX dp / LDX dp. Used by the i64-first-arg // entry COPY (LowerFormalArguments routes arg0_ml through Img16 to // dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped @@ -112,6 +136,18 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg); return; } + // Y -> IMGn / IMGn -> Y: STY dp / LDY dp. Symmetric with the X + // case above. Used by the i32-first-arg ABI's hi half (in X) and + // by Wide32 pair copies that have one half in Y after the per-half + // routing — see the lambda dispatch below. + if (dstImg >= 0 && SrcReg == W65816::Y) { + BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg); + return; + } + if (DestReg == W65816::Y && srcImg >= 0) { + BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg); + return; + } // DPF0 → A: emit `LDA $F0`. DPF0 is the pseudo-physreg carrier // for an i64-returning call's high 16 bits; LowerCall builds a // CopyFromReg(DPF0) glued to the call so the SDAG combiner / @@ -129,6 +165,56 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(0xF0); return; } + // Wide32 (AX32 or IMG-pair) <-> Wide32 copy: split on sub_lo / sub_hi + // and recurse. Use a hand-written dispatch instead of getSubReg + // because the MCRegisterInfo::getSubReg path crashes when called + // from TargetInstrInfo::lowerCopy on regs that are not pair regs + // (the table lookup walks past the end of the diff list). 
+  auto wide32Halves = [](Register R)
+      -> std::pair<Register, Register> {
+    switch (R) {
+    case W65816::AX32:    return {W65816::A, W65816::X};
+    case W65816::IMG01:   return {W65816::IMG0, W65816::IMG1};
+    case W65816::IMG23:   return {W65816::IMG2, W65816::IMG3};
+    case W65816::IMG45:   return {W65816::IMG4, W65816::IMG5};
+    case W65816::IMG67:   return {W65816::IMG6, W65816::IMG7};
+    case W65816::IMG89:   return {W65816::IMG8, W65816::IMG9};
+    case W65816::IMG1011: return {W65816::IMG10, W65816::IMG11};
+    case W65816::IMG1213: return {W65816::IMG12, W65816::IMG13};
+    case W65816::IMG1415: return {W65816::IMG14, W65816::IMG15};
+    default:              return {Register(), Register()};
+    }
+  };
+  auto [srcLo, srcHi] = wide32Halves(SrcReg);
+  auto [dstLo, dstHi] = wide32Halves(DestReg);
+  if (srcLo && srcHi && dstLo && dstHi) {
+    // Wide32 -> Wide32. Lo-first order is correct in every direction:
+    //   AX32 -> IMG_pair     : STA dstLo (A live), then STX dstHi
+    //   IMG_pair -> AX32     : LDA srcLo, then LDX srcHi (independent halves)
+    //   IMG_pair -> IMG_pair : LDA/STA chain twice (A is only per-half scratch)
+    copyPhysReg(MBB, I, DL, dstLo, srcLo, KillSrc,
+                RenamableDest, RenamableSrc);
+    copyPhysReg(MBB, I, DL, dstHi, srcHi, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
+  // Wide32 -> i16: take sub_lo of source. Arises post-RA when an
+  // EXTRACT_SUBREG was lowered as a parent-reg COPY (the SubRegIndex
+  // is dropped by lowerCopy).
+  if (srcLo && srcHi && !dstLo) {
+    copyPhysReg(MBB, I, DL, DestReg, srcLo, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
+  // i16 -> Wide32: write sub_lo only (sub_hi left as caller had it,
+  // matching INSERT_SUBREG semantics). Arises post-RA when REG_SEQUENCE
+  // is expanded into per-half COPY pseudos, then a parent-reg COPY of
+  // a sub-reg-only def appears.
+  if (!srcLo && dstLo && dstHi) {
+    copyPhysReg(MBB, I, DL, dstLo, SrcReg, KillSrc,
+                RenamableDest, RenamableSrc);
+    return;
+  }
 
   llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
 }
@@ -141,6 +227,37 @@ void W65816InstrInfo::storeRegToStackSlot(
   // and zero offset. When regalloc hands us a spill from X or Y, bridge
   // through A (TXA / TYA) — same rationale as loadRegFromStackSlot.
   DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+  // Wide32 spill: split into 2 i16 stores at offsets 0 and 2 of the
+  // 4-byte spill slot. Bridge each half through A using copyPhysReg.
+  if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
+      RC == &W65816::AnyWide32RegClass) {
+    Register Lo, Hi;
+    switch (SrcReg) {
+    case W65816::AX32:    Lo = W65816::A;     Hi = W65816::X;     break;
+    case W65816::IMG01:   Lo = W65816::IMG0;  Hi = W65816::IMG1;  break;
+    case W65816::IMG23:   Lo = W65816::IMG2;  Hi = W65816::IMG3;  break;
+    case W65816::IMG45:   Lo = W65816::IMG4;  Hi = W65816::IMG5;  break;
+    case W65816::IMG67:   Lo = W65816::IMG6;  Hi = W65816::IMG7;  break;
+    case W65816::IMG89:   Lo = W65816::IMG8;  Hi = W65816::IMG9;  break;
+    case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
+    case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
+    case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
+    default: llvm_unreachable("W65816: Wide32 spill of non-pair reg");
+    }
+    // Bridge lo through A, store at offset 0; bridge hi through A,
+    // store at offset 2. This is brittle in the face of regalloc
+    // expectations — Wide32 spills are best avoided by keeping the
+    // pair in registers if at all possible.
+ if (Lo != W65816::A) { + copyPhysReg(MBB, MI, DL, W65816::A, Lo, false); + } + BuildMI(MBB, MI, DL, get(W65816::STAfi)) + .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0); + copyPhysReg(MBB, MI, DL, W65816::A, Hi, false); + BuildMI(MBB, MI, DL, get(W65816::STAfi)) + .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(2); + return; + } if (SrcReg == W65816::X || SrcReg == W65816::Y) { unsigned XferOp = (SrcReg == W65816::X) ? W65816::TXA : W65816::TYA; BuildMI(MBB, MI, DL, get(XferOp)); @@ -166,6 +283,34 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // values for the second word (caught by udivmod's `a - q*b` mod // computation). DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); + // Wide32 reload: 2 i16 loads at offsets 0 and 2 of the 4-byte slot. + if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass || + RC == &W65816::AnyWide32RegClass) { + Register Lo, Hi; + switch (DestReg) { + case W65816::AX32: Lo = W65816::A; Hi = W65816::X; break; + case W65816::IMG01: Lo = W65816::IMG0; Hi = W65816::IMG1; break; + case W65816::IMG23: Lo = W65816::IMG2; Hi = W65816::IMG3; break; + case W65816::IMG45: Lo = W65816::IMG4; Hi = W65816::IMG5; break; + case W65816::IMG67: Lo = W65816::IMG6; Hi = W65816::IMG7; break; + case W65816::IMG89: Lo = W65816::IMG8; Hi = W65816::IMG9; break; + case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break; + case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break; + case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break; + default: llvm_unreachable("W65816: Wide32 reload to non-pair reg"); + } + // Lo half: LDA from offset 0, transfer to Lo if needed. + BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) + .addFrameIndex(FrameIdx).addImm(0); + if (Lo != W65816::A) + copyPhysReg(MBB, MI, DL, Lo, W65816::A, false); + // Hi half: LDA from offset 2, transfer to Hi. + BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) + .addFrameIndex(FrameIdx).addImm(2); + if (Hi != W65816::A) + copyPhysReg(MBB, MI, DL, Hi, W65816::A, false); + return; + } if (DestReg == W65816::A) { BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg) .addFrameIndex(FrameIdx) diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 14fe38c..8e8a7c5 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -88,6 +88,26 @@ def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca, [SDNPHasChain, SDNPSideEffect]>; +// ptr32 load / store: target-specific load/store nodes that take a 32-bit +// pointer (Wide32 = i32) and lower to [dp],Y indirect-long with the bank +// byte taken from the pointer's hi-half. Used for ptr32 mode where +// generic (load i32-addr) needs explicit lowering — wrapping in a target +// node prevents DAG combines from rewriting the load before isel. +// +// Loads always materialise an i16 in A (16-bit LDA); byte zext / anyext +// patterns AND-mask afterwards exactly as the existing LDAptr does. +// Stores split into two nodes: ST_PTR (full 16-bit STA) and STB_PTR +// (SEP/REP-wrapped 8-bit STA for truncating stores). 
+def SDT_W65816LdPtr : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>; +def SDT_W65816StPtr : SDTypeProfile<0, 2, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>; + +def W65816ldPtr : SDNode<"W65816ISD::LD_PTR", SDT_W65816LdPtr, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def W65816stPtr : SDNode<"W65816ISD::ST_PTR", SDT_W65816StPtr, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def W65816stbPtr : SDNode<"W65816ISD::STB_PTR", SDT_W65816StPtr, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// @@ -1046,6 +1066,96 @@ def : Pat<(store Acc8:$val, (add Wide16:$ptr, (i16 imm:$off))), def : Pat<(store Acc8:$val, Wide16:$ptr), (STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Wide16:$ptr)>; +// --------------------------------------------------------------------- +// ptr32 deref pseudos. Same shape and inserter as LDAptr/STAptr/STBptr, +// but the pointer is a Wide32 (i32) value: sub_lo carries the low 16 +// bits of the address, sub_hi carries the bank byte in its low half. +// Inserter stages the low 16 bits at $E0..$E1 and the bank byte at $E2, +// then emits LDA/STA [dp],Y just like the i16 path — but with a +// pointer-derived bank instead of a forced 0. +// +// Dead unless ptr32 mode is active (LowerLoad/LowerStore only emit +// W65816ldPtr/stPtr/stbPtr when the address is i32). +// --------------------------------------------------------------------- +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32 : W65816Pseudo<(outs Acc16:$dst), (ins AnyWide32:$ptr), + "# LDAptr32 $dst, $ptr", + [(set Acc16:$dst, (W65816ldPtr AnyWide32:$ptr))]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr), + "# STAptr32 $val, $ptr", + [(W65816stPtr Acc16:$val, AnyWide32:$ptr)]>; +def STBptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr), + "# STBptr32 $val, $ptr", + [(W65816stbPtr Acc16:$val, AnyWide32:$ptr)]>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32Off : W65816Pseudo<(outs Acc16:$dst), + (ins AnyWide32:$ptr, i16imm:$off), + "# LDAptr32Off $dst, $ptr, $off", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32Off : W65816Pseudo<(outs), + (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off), + "# STAptr32Off $val, $ptr, $off", []>; +def STBptr32Off : W65816Pseudo<(outs), + (ins Acc16:$val, AnyWide32:$ptr, i16imm:$off), + "# STBptr32Off $val, $ptr, $off", []>; +} + +// Direct ptr32 load/store patterns over generic ISD::LOAD / ISD::STORE +// when the address is an i32 (AnyWide32) reg. These are unreachable +// while i32 is not a legal type (ptr16 mode). When ptr32 mode is +// activated they fire instead of the i16-pointer LDAptr / STAptr. 
+def : Pat<(i16 (load AnyWide32:$ptr)), + (LDAptr32 AnyWide32:$ptr)>; +def : Pat<(store Acc16:$val, AnyWide32:$ptr), + (STAptr32 Acc16:$val, AnyWide32:$ptr)>; +def : Pat<(truncstorei8 Acc16:$val, AnyWide32:$ptr), + (STBptr32 Acc16:$val, AnyWide32:$ptr)>; +def : Pat<(i16 (zextloadi8 AnyWide32:$ptr)), + (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF)>; +def : Pat<(i16 (extloadi8 AnyWide32:$ptr)), + (LDAptr32 AnyWide32:$ptr)>; +def : Pat<(i8 (load AnyWide32:$ptr)), + (COPY_TO_REGCLASS (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF), Acc8)>; +def : Pat<(store Acc8:$val, AnyWide32:$ptr), + (STBptr32 (COPY_TO_REGCLASS Acc8:$val, Acc16), AnyWide32:$ptr)>; + +// Off variants — folded constant-offset add patterns deferred until +// ptr32 mode is activated and we can profile real cases. The base +// LDAptr32/STAptr32 pseudos handle the general (add ptr, off) case +// correctly via a separate i32 ADD; the Off pseudos are an optional +// optimization for small constant offsets. + +// Split-pair variants: same semantics as LDAptr32/STAptr32/STBptr32 but +// the ptr is two separate i16 register operands (lo + hi) instead of +// one Wide32 register pair. Used by the W65816LowerWide32 pre-RA pass +// to relieve register-pair allocation pressure: it walks REG_SEQUENCE +// + LDAptr32 chains, decomposes the Wide32 vregs into pairs of i16 +// vregs, and rewrites the LDAptr32-family to take the two halves +// directly. +let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1, + Defs = [Y, P] in { +def LDAptr32S : W65816Pseudo<(outs Acc16:$dst), + (ins Wide16:$ptrLo, Wide16:$ptrHi), + "# LDAptr32S $dst, $ptrLo, $ptrHi", []>; +} +let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1, + Defs = [Y, P] in { +def STAptr32S : W65816Pseudo<(outs), + (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi), + "# STAptr32S $val, $ptrLo, $ptrHi", []>; +def STBptr32S : W65816Pseudo<(outs), + (ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi), + "# STBptr32S $val, $ptrLo, $ptrHi", []>; +} + // i8 load via Acc16 pointer producing a true i8 (Acc8) result. Reuses // the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask // the high byte, then narrow to Acc8. COPY_TO_REGCLASS to Acc8 is a @@ -1478,15 +1588,18 @@ def : Pat<(store // function doesn't have to know how it was called to choose its // return instruction. A pseudo bridges the i16 symbol operand // to JSL_Long's 24-bit operand class. -// Defs include DPF0 — every i64-returning libcall clobbers DP[$F0] -// (it's the carrier for the highest 16 bits of the return). The -// LowerCall side captures the pre-call DPF0 via CopyFromReg(DPF0) -// glued to the call so the SDAG combiner / scheduler can't merge -// or reorder reads across calls. Without DPF0 in Defs, plain -// `getLoad(0xF0)` was being CSE'd across calls, leading to -// `dmath = (a+b)*(a-b)` returning 4 instead of 16. +// Defs lists ALL caller-clobbered regs. The 65816 has no +// caller/callee-save split — every callee may freely modify +// A/X/Y/DPF0/P/etc. Critically, i32/i64 returns place high +// halves in X (i32), Y and DPF0 (i64); without those in Defs, +// the InstrEmitter does not add implicit-defs for glued +// CopyFromReg(X/Y/DPF0) on the call MI, and the verifier sees +// the post-call `COPY $y` as reading an undefined register. +// DPF0 was historically the only "extra" def so getLoad(0xF0) +// wouldn't CSE across calls; the same anti-CSE rationale applies +// to A/X/Y, but more fundamentally those are call return slots. 
let isCall = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, - Defs = [A, DPF0] in { + Defs = [A, X, Y, DPF0] in { def JSLpseudo : W65816Pseudo<(outs), (ins i16imm:$dst), "# JSLpseudo $dst", []>; } diff --git a/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp new file mode 100644 index 0000000..66bc9c5 --- /dev/null +++ b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp @@ -0,0 +1,326 @@ +//===-- W65816LowerWide32.cpp - Wide32 -> 2x i16 pre-RA lowering ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Pre-regalloc pass that decomposes Wide32 register-pair vregs into pairs +// of i16 vregs. Without this, greedy / basic regalloc fails on i64-heavy +// functions (`RegAllocBase` crashes during `allocatePhysRegs`) because +// the i64-via-2-i32-via-Wide32 chain produces too many simultaneously +// live register-pair vregs. After this pass, only i16 vregs remain at +// the regalloc input — Wide32 lives only inside this pass and the new +// LDAptr32S / STAptr32S / STBptr32S pseudos that take 2 i16 ptr operands +// directly. +// +// Walks the MIR and: +// 1. Finds REG_SEQUENCE producing Wide32 / Acc32 / AnyWide32; records +// the (lo, hi) i16 source operands; queues the REG_SEQUENCE for +// erasure. +// 2. Finds COPY whose dest is a Wide32 vreg and whose src is another +// mapped Wide32 vreg; chains the (lo, hi) mapping forward. +// 3. Rewrites EXTRACT_SUBREG of mapped Wide32 vregs by replacing the +// destination vreg with the appropriate half (sub_lo or sub_hi). +// 4. Rewrites LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr +// to the corresponding LDAptr32S / STAptr32S / STBptr32S pseudo +// with two separate i16 operands. +// +// Bail / safety: any Wide32 vreg whose def we can't decompose is left +// in place — regalloc may still struggle but no miscompile. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "W65816.h"
+#include "W65816InstrInfo.h"
+#include "W65816Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "w65816-lower-wide32"
+
+namespace {
+
+class W65816LowerWide32 : public MachineFunctionPass {
+public:
+  static char ID;
+  W65816LowerWide32() : MachineFunctionPass(ID) {}
+  StringRef getPassName() const override {
+    return "W65816 Wide32 -> 2x i16 lowering";
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // namespace
+
+char W65816LowerWide32::ID = 0;
+
+INITIALIZE_PASS(W65816LowerWide32, DEBUG_TYPE,
+                "W65816 Wide32 lowering", false, false)
+
+FunctionPass *llvm::createW65816LowerWide32() {
+  return new W65816LowerWide32();
+}
+
+static bool isWide32RC(const TargetRegisterClass *RC) {
+  return RC == &W65816::Wide32RegClass ||
+         RC == &W65816::Acc32RegClass ||
+         RC == &W65816::AnyWide32RegClass;
+}
+
+bool W65816LowerWide32::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
+  const W65816InstrInfo &TII = *STI.getInstrInfo();
+
+  // Map: Wide32 vreg -> (loVreg, hiVreg) of i16 type.
+  DenseMap<Register, std::pair<Register, Register>> wideMap;
+
+  // Pass 1: collect all Wide32 vregs.
+  SmallVector<Register, 8> wide32Vregs;
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    Register R = Register::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(R))
+      continue;
+    if (!isWide32RC(MRI.getRegClass(R)))
+      continue;
+    wide32Vregs.push_back(R);
+  }
+
+  if (wide32Vregs.empty())
+    return false;
+
+  // Pass 2: process REG_SEQUENCE / chained-COPY / multi-subreg-def
+  // shapes; build the mapping. Iterate to fixed point because COPY
+  // chains depend on prior mappings.
+  SmallVector<MachineInstr *, 8> toErase;
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    for (Register W : wide32Vregs) {
+      if (wideMap.count(W))
+        continue;
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);
+      if (DefMI && DefMI->getOpcode() == TargetOpcode::REG_SEQUENCE) {
+        Register Lo, Hi;
+        for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+          if (!DefMI->getOperand(op).isReg() ||
+              !DefMI->getOperand(op + 1).isImm())
+            continue;
+          unsigned idx = DefMI->getOperand(op + 1).getImm();
+          Register Src = DefMI->getOperand(op).getReg();
+          if (idx == llvm::sub_lo)
+            Lo = Src;
+          else if (idx == llvm::sub_hi)
+            Hi = Src;
+        }
+        if (Lo && Hi) {
+          wideMap[W] = {Lo, Hi};
+          toErase.push_back(DefMI);
+          changed = true;
+          continue;
+        }
+      }
+      if (DefMI && DefMI->isCopy()) {
+        Register Src = DefMI->getOperand(1).getReg();
+        if (Src.isVirtual() && wideMap.count(Src)) {
+          wideMap[W] = wideMap[Src];
+          toErase.push_back(DefMI);
+          changed = true;
+          continue;
+        }
+      }
+      // Multi-subreg-def shape: separate sub-reg COPYs build %W:
+      //   undef %W.sub_lo:wide32 = COPY %A:acc16
+      //         %W.sub_hi:wide32 = COPY %B:acc16
+      // Equivalent to a REG_SEQUENCE %A, sub_lo, %B, sub_hi. softDouble
+      // at -O2 generates this heavily; without handling it the Wide32
+      // vreg survives to regalloc, which then asks for a spill/reload
+      // from a non-pair physreg and trips load/storeRegToStackSlot's
+      // llvm_unreachable.
+      Register LoSrc, HiSrc;
+      MachineInstr *LoDefMI = nullptr;
+      MachineInstr *HiDefMI = nullptr;
+      bool ok = true;
+      for (MachineInstr &MI : MRI.def_instructions(W)) {
+        if (!MI.isCopy()) { ok = false; break; }
+        const MachineOperand &Dst = MI.getOperand(0);
+        const MachineOperand &Src = MI.getOperand(1);
+        if (!Dst.isReg() || Dst.getReg() != W) { ok = false; break; }
+        unsigned SubIdx = Dst.getSubReg();
+        if (SubIdx == llvm::sub_lo) {
+          if (LoDefMI) { ok = false; break; }
+          LoDefMI = &MI;
+          LoSrc = Src.getReg();
+        } else if (SubIdx == llvm::sub_hi) {
+          if (HiDefMI) { ok = false; break; }
+          HiDefMI = &MI;
+          HiSrc = Src.getReg();
+        } else {
+          ok = false;
+          break;
+        }
+      }
+      if (ok && LoSrc && HiSrc) {
+        wideMap[W] = {LoSrc, HiSrc};
+        if (LoDefMI) toErase.push_back(LoDefMI);
+        if (HiDefMI) toErase.push_back(HiDefMI);
+        changed = true;
+      }
+    }
+  }
+
+  // Pass 2b: handle PHIs whose result is a Wide32 vreg by splitting
+  // into 2 PHIs (one per half). Iterate to fixed point: a PHI becomes
+  // resolvable only after all its sources have been mapped.
+  changed = true;
+  while (changed) {
+    changed = false;
+    for (Register W : wide32Vregs) {
+      if (wideMap.count(W))
+        continue;
+      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);
+      if (!DefMI || !DefMI->isPHI())
+        continue;
+      bool AllMapped = true;
+      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+        Register Src = DefMI->getOperand(op).getReg();
+        if (!Src.isVirtual() || !wideMap.count(Src)) {
+          AllMapped = false;
+          break;
+        }
+      }
+      if (!AllMapped)
+        continue;
+      Register NewLo = MRI.createVirtualRegister(&W65816::Acc16RegClass);
+      Register NewHi = MRI.createVirtualRegister(&W65816::Acc16RegClass);
+      MachineBasicBlock *MBB = DefMI->getParent();
+      DebugLoc DL = DefMI->getDebugLoc();
+      auto PHILo = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewLo);
+      auto PHIHi = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewHi);
+      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
+        Register Src = DefMI->getOperand(op).getReg();
+        MachineBasicBlock *PredMBB = DefMI->getOperand(op + 1).getMBB();
+        auto [SrcLo, SrcHi] = wideMap[Src];
+        PHILo.addReg(SrcLo).addMBB(PredMBB);
+        PHIHi.addReg(SrcHi).addMBB(PredMBB);
+      }
+      wideMap[W] = {NewLo, NewHi};
+      toErase.push_back(DefMI);
+      changed = true;
+    }
+  }
+
+  // Pass 3: rewrite uses.
+  SmallVector<MachineInstr *, 8> useToErase;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator It = MBB.begin(); It != MBB.end();) {
+      MachineInstr *MI = &*It++;
+
+      // EXTRACT_SUBREG of a mapped Wide32 vreg: replace the dest vreg
+      // with the appropriate half (sub_lo or sub_hi).
+      if (MI->getOpcode() == TargetOpcode::EXTRACT_SUBREG) {
+        Register Src = MI->getOperand(1).getReg();
+        if (Src.isVirtual() && wideMap.count(Src)) {
+          unsigned SubIdx = MI->getOperand(2).getImm();
+          auto [Lo, Hi] = wideMap[Src];
+          Register Half = (SubIdx == llvm::sub_lo) ? Lo : Hi;
+          Register Dst = MI->getOperand(0).getReg();
+          MRI.replaceRegWith(Dst, Half);
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+
+      // COPY %V.sub_lo / %V.sub_hi (partial-reg COPY where source has a
+      // sub-reg specifier and the source vreg is a mapped Wide32).
+      // LLVM emits this shape instead of EXTRACT_SUBREG when projecting
+      // a half out of a Wide32 vreg. Only the shape with a full-reg
+      // destination is handled here — partial-reg destinations would
+      // imply the dst is itself a Wide32 sub-reg def, which the def-side
+      // multi-subreg-def handling covers separately.
+      if (MI->isCopy()) {
+        const MachineOperand &SrcOp = MI->getOperand(1);
+        const MachineOperand &DstOp = MI->getOperand(0);
+        if (SrcOp.isReg() && SrcOp.getReg().isVirtual() &&
+            wideMap.count(SrcOp.getReg()) && SrcOp.getSubReg() != 0 &&
+            DstOp.isReg() && DstOp.getSubReg() == 0) {
+          unsigned SubIdx = SrcOp.getSubReg();
+          auto [Lo, Hi] = wideMap[SrcOp.getReg()];
+          Register Half = (SubIdx == llvm::sub_lo) ? Lo : Hi;
+          MRI.replaceRegWith(DstOp.getReg(), Half);
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+
+      // LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr:
+      // rewrite to LDAptr32S / STAptr32S / STBptr32S.
+      unsigned Opc = MI->getOpcode();
+      bool isPtrOp = (Opc == W65816::LDAptr32 || Opc == W65816::STAptr32 ||
+                      Opc == W65816::STBptr32);
+      if (isPtrOp) {
+        Register Ptr = MI->getOperand(1).getReg();
+        if (Ptr.isVirtual() && wideMap.count(Ptr)) {
+          auto [Lo, Hi] = wideMap[Ptr];
+          unsigned NewOpc = (Opc == W65816::LDAptr32)   ? W65816::LDAptr32S
+                            : (Opc == W65816::STAptr32) ? W65816::STAptr32S
+                                                        : W65816::STBptr32S;
+          DebugLoc DL = MI->getDebugLoc();
+          MachineBasicBlock *ParentMBB = MI->getParent();
+          if (Opc == W65816::LDAptr32) {
+            Register Dst = MI->getOperand(0).getReg();
+            BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc), Dst)
+                .addReg(Lo)
+                .addReg(Hi);
+          } else {
+            Register Val = MI->getOperand(0).getReg();
+            BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc))
+                .addReg(Val)
+                .addReg(Lo)
+                .addReg(Hi);
+          }
+          useToErase.push_back(MI);
+          continue;
+        }
+      }
+    }
+  }
+
+  // Erase use-side instructions (EXTRACT_SUBREG, LDAptr32-family) first
+  // so the Wide32 vreg becomes dead.
+  for (auto *MI : useToErase)
+    MI->eraseFromParent();
+
+  // Now check each REG_SEQUENCE / chained-COPY def: only erase if the
+  // Wide32 vreg has no remaining uses. Any leftover use means the pass
+  // didn't cover that opcode — leaving the def in place keeps the MIR
+  // well-formed (at the cost of pair-allocation pressure for that
+  // specific case).
+  bool eraseAny = !useToErase.empty();
+  for (auto *MI : toErase) {
+    if (MI->getNumOperands() == 0)
+      continue;
+    Register Dst = MI->getOperand(0).getReg();
+    if (!Dst.isVirtual() || MRI.use_nodbg_empty(Dst)) {
+      MI->eraseFromParent();
+      eraseAny = true;
+    }
+  }
+
+  return eraseAny;
+}
diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.td b/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
index 0d3a505..01525bc 100644
--- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
+++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.td
@@ -17,6 +17,13 @@ class W65816Reg<bits<16> num, string n> : Register<n> {
   let DwarfNumbers = [num];
 }
 
+// SubRegIndices for synthetic 32-bit register pairs. sub_lo addresses the
+// low 16 bits (the natural i16-aligned half), sub_hi the high 16 bits.
+// Used by Acc32 / Wide32 / AnyWide32 to model i32 (i.e. ptr32) values as
+// pairs of i16 physical registers.
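+//
+// Worked example (illustrative values, not taken from any test): for an
+// i32 value 0x12345678 held in a pair, the sub_lo half holds 0x5678 and
+// the sub_hi half holds 0x1234.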
+def sub_lo : SubRegIndex<16, 0>; +def sub_hi : SubRegIndex<16, 16>; + //===----------------------------------------------------------------------===// // Registers //===----------------------------------------------------------------------===// @@ -127,3 +134,61 @@ def DPF0Reg : RegisterClass<"W65816", [i16], 16, (add DPF0)> { def StatusReg : RegisterClass<"W65816", [i8], 8, (add P)> { let isAllocatable = 0; } + +//===----------------------------------------------------------------------===// +// Synthetic 32-bit Register Pairs (for ptr32 mode) +//===----------------------------------------------------------------------===// +// +// The W65816 has no native 32-bit registers. For 32-bit-pointer mode and +// any other i32 traffic we synthesize register pairs whose halves are +// existing i16 registers, accessed via sub_lo / sub_hi. +// +// AX32 pairs A:X for the calling-convention slot (first i32 arg/return). +// Heterogeneous: sub_lo is in Acc16, sub_hi is in Idx16. Because of the +// heterogeneity, AX32 lives in its own single-element class (Acc32) — if +// it were grouped with the homogeneous IMG pairs in Wide32, TableGen would +// auto-derive a "wide32_with_sub_hi_in_idx8" subclass that pins the whole +// class to AX32. +// +// IMG01..IMG1415 pair adjacent IMG slots (each pair is 4 bytes of DP) into +// homogeneous i16-i16 pairs. These hold ptr32 values backed entirely by +// direct page, so register-pair allocation can spill cleanly via Img16's +// existing rules. +// +// Acc32 / Wide32 / AnyWide32: +// Acc32 = {AX32} — calling-convention slot only; not for general allocation. +// Wide32 = {IMG01..IMG1415} — homogeneous i16-i16 pairs, freely allocatable. +// AnyWide32 = Acc32 ∪ Wide32 — pre-RA flexibility for ptr32 vregs that +// are not constrained to AX32; greedy regalloc can pick AX32 or any +// Wide32 pair. 
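+//
+// Intended-use sketch (orientation only; the calling-convention hookup
+// itself lives in W65816ISelLowering): a first i32 argument or an i32
+// return is pinned to Acc32 (i.e. AX32), while other ptr32 temporaries
+// are created as AnyWide32 so regalloc can pick AX32 or an IMG pair as
+// pressure allows.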
+let SubRegIndices = [sub_lo, sub_hi], CoveredBySubRegs = 1 in { + def AX32 : RegisterWithSubRegs<"ax32", [A, X]>, + DwarfRegNum<[40]> { let Namespace = "W65816"; } + def IMG01 : RegisterWithSubRegs<"img01", [IMG0, IMG1]>, + DwarfRegNum<[41]> { let Namespace = "W65816"; } + def IMG23 : RegisterWithSubRegs<"img23", [IMG2, IMG3]>, + DwarfRegNum<[42]> { let Namespace = "W65816"; } + def IMG45 : RegisterWithSubRegs<"img45", [IMG4, IMG5]>, + DwarfRegNum<[43]> { let Namespace = "W65816"; } + def IMG67 : RegisterWithSubRegs<"img67", [IMG6, IMG7]>, + DwarfRegNum<[44]> { let Namespace = "W65816"; } + def IMG89 : RegisterWithSubRegs<"img89", [IMG8, IMG9]>, + DwarfRegNum<[45]> { let Namespace = "W65816"; } + def IMG1011 : RegisterWithSubRegs<"img1011", [IMG10, IMG11]>, + DwarfRegNum<[46]> { let Namespace = "W65816"; } + def IMG1213 : RegisterWithSubRegs<"img1213", [IMG12, IMG13]>, + DwarfRegNum<[47]> { let Namespace = "W65816"; } + def IMG1415 : RegisterWithSubRegs<"img1415", [IMG14, IMG15]>, + DwarfRegNum<[48]> { let Namespace = "W65816"; } +} + +def Acc32 : RegisterClass<"W65816", [i32], 16, (add AX32)>; + +def Wide32 : RegisterClass<"W65816", [i32], 16, + (add IMG01, IMG23, IMG45, IMG67, + IMG89, IMG1011, IMG1213, IMG1415)>; + +def AnyWide32 : RegisterClass<"W65816", [i32], 16, + (add AX32, + IMG01, IMG23, IMG45, IMG67, + IMG89, IMG1011, IMG1213, IMG1415)>; diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp index 420df02..78982a9 100644 --- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp @@ -419,6 +419,26 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(0).isImm()) { int K = MI.getOperand(0).getImm() & 0xFFFF; if (yKnown == K) { + // Before erasing this redundant LDY: the prior LDY is still in + // scope, so all of its Y-uses between the two LDYs are still + // valid uses. But liveness already marked the LAST one (just + // before the redundant LDY) as `implicit killed $y`, because + // that LDY was about to redefine Y. After erasure, Y survives + // through to the NEXT use, so the prior "kill" annotation is + // wrong and the machine verifier rejects. Walk backward and + // clear the kill flag on the most recent Y-using operand. + for (auto Back = std::prev(It2);; --Back) { + bool clearedAny = false; + for (MachineOperand &MO : Back->operands()) { + if (MO.isReg() && MO.getReg() == W65816::Y && + MO.isUse() && MO.isKill()) { + MO.setIsKill(false); + clearedAny = true; + } + } + if (clearedAny) break; + if (Back == MBB.begin()) break; + } auto Erase = It2++; Erase->eraseFromParent(); Changed = true; diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp index c4bbb06..ef6555c 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotCleanup.cpp @@ -748,6 +748,15 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) { } }; auto isLdaLike = [](unsigned Opc) { + // COPY between physregs: lowers in AsmPrinter to one of TXA/TYA/ + // LDA $D? (for IMG↔A bridges) etc. — all of which set N/Z based + // on the loaded value. Treating COPY as flag-defining caused the + // wrap pass to identify a PHI-elim COPY as the "Test" and wrap + // too narrow a range, so the cb-test LDA's flags were trampled + // by intervening A-loads before reaching the BEQ. 
+    // Including COPY in the corrupting set forces the pass to walk
+    // past these PHI-elim copies to find the real test (a CMP).
+    if (Opc == TargetOpcode::COPY) return true;
     // Pure load / register-transfer instructions: only side effect on
     // flags is N/Z from the loaded/transferred value. Never a "test"
     // — they just move data. Treated as corruption when between the
@@ -1365,7 +1374,42 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
           Cmp->getOperand(1).getImm() != 0)
         continue;
       bool Found = walkbackBefore(Cmp->getIterator(), MBB.begin());
-      if (Found) {
+      if (!Found) continue;
+      // Only eliminate if there are NO LdaLike instructions between
+      // this CMP and the next Bxx (or end of MBB). Otherwise the
+      // CMP is the only flag-setting marker between the test value
+      // and the consuming branch — without it, the Bxx ends up
+      // testing the latest LdaLike's N/Z (typically a PHI-elim COPY
+      // or stack reload that has nothing to do with the original
+      // condition). Caused __adddf3's renormalize while-loop to
+      // skip its body even though `mr & ~mask` was non-zero.
+      bool SafeToErase = true;
+      for (auto It = std::next(Cmp->getIterator());
+           It != Cmp->getParent()->end(); ++It) {
+        if (It->isDebugInstr()) continue;
+        if (It->isBranch() || It->isReturn()) break;
+        if (It->getOpcode() == TargetOpcode::COPY) {
+          SafeToErase = false;
+          break;
+        }
+        unsigned Opc = It->getOpcode();
+        // Conservative: any LDA/LDX/LDY/transfer disqualifies erasure.
+        // Stores and stack-mgmt are flag-preserving and OK.
+        switch (Opc) {
+        case W65816::STAfi: case W65816::STAfi_indY: case W65816::STA8fi:
+        case W65816::STA_StackRel: case W65816::STA_StackRelIndY:
+        case W65816::STA_DP: case W65816::STA_Abs: case W65816::STA_Long:
+        case W65816::STX_DP: case W65816::STX_Abs:
+        case W65816::STY_DP: case W65816::STY_Abs:
+        case W65816::ADJCALLSTACKDOWN: case W65816::ADJCALLSTACKUP:
+        case W65816::PHA: case W65816::PHX: case W65816::PHY:
+          continue;
+        }
+        // Anything else (LDA, transfer, ALU op...): bail.
+        SafeToErase = false;
+        break;
+      }
+      if (SafeToErase) {
         Cmp->eraseFromParent();
         Changed = true;
       }
diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
index be3a394..eeae746 100644
--- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
+++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp
@@ -48,6 +48,7 @@ LLVMInitializeW65816Target() {
   initializeW65816NegYIndYPass(PR);
   initializeW65816PreSpillCrossCallPass(PR);
   initializeW65816SjLjFinalizePass(PR);
+  initializeW65816LowerWide32Pass(PR);
 
   // Default IndVarSimplify's exit-value rewriter to "never". The
   // closed-form replacement frequently widens an i16 induction var
@@ -150,6 +151,11 @@ void W65816PassConfig::addMachineSSAOptimization() {
 }
 
 void W65816PassConfig::addPreRegAlloc() {
+  // Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs
+  // BEFORE the other Acc16-targeting pre-RA passes run. Each later
+  // pass walks Acc16/Idx16/Img16 vregs; running this first means they
+  // see the decomposed halves uniformly.
+ addPass(createW65816LowerWide32()); addPass(createW65816ABridgeViaX()); addPass(createW65816TiedDefSpill()); addPass(createW65816WidenAcc16()); @@ -176,6 +182,18 @@ void W65816PassConfig::addPostRegAlloc() { addPass(createW65816SpillToX()); addPass(createW65816StackSlotCleanup()); addPass(createW65816SpillToX()); + // Disable MachineCopyPropagation: it eliminates `COPY $img = $a` + // thinking the IMG dest is dead (no explicit physreg use of $img + // remains after PEI expands STAfi-with-Img16-source into LDA_DP). + // The COPY actually expands to STA_DP $D0 — a memory store to a + // DP slot that libcalls (softDouble, softFloat) ALSO use as their + // own arg-save scratch. When MCP drops the COPY, the subsequent + // LDA_DP $D0 reads stale memory. Caught by `g = g/x` Newton loop: + // iter-1's saved x_ml at $D0 was never actually written, so iter-2 + // read garbage. The principled fix would mark IMG-targeted COPYs + // as memory-side-effecting, but TII doesn't expose that hook; + // disabling MCP loses some optimization but is safe. + disablePass(&llvm::MachineCopyPropagationID); } void W65816PassConfig::addPreEmitPass() { diff --git a/src/llvm/test/CodeGen/W65816/add-i16.ll b/src/llvm/test/CodeGen/W65816/add-i16.ll new file mode 100644 index 0000000..aff5581 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/add-i16.ll @@ -0,0 +1,12 @@ +; Smoke test: confirm llc accepts the W65816 target via lit. +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +define i16 @add_i16(i16 %a, i16 %b) { +; CHECK-LABEL: add_i16: +; CHECK: rep #0x30 +; CHECK: clc +; CHECK: adc 0x4, s +; CHECK: rtl + %r = add i16 %a, %b + ret i16 %r +} diff --git a/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll b/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll new file mode 100644 index 0000000..d2fea2d --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll @@ -0,0 +1,30 @@ +; Pin: canMergeStoresTo refuses to merge i16 stores into i32+. +; +; The SDAG store-merge combine sees two adjacent i16 stores and tries +; to widen them into one i32 store. Our i32 store path is Custom- +; lowered back to two i16 stores, and the merge runs again, and the +; cycle repeats until OOM. Override fixes it by capping merge MemVT +; at i16. See feedback_canmergestores_disable.md. +; +; Repro: write two adjacent i16 fields of a struct. Without the cap, +; this either OOMs or burns >5s on a 4-line function. With the cap, +; the lowered code shows two distinct i16 stores (no widened form). +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +%struct.Pair = type { i16, i16 } + +define void @write_pair(ptr %p, i16 %a, i16 %b) { +; CHECK-LABEL: write_pair: +; Two distinct i16 stores must remain — not merged into one i32. +; Each i16 store under our i32-illegal path uses the same DP-indirect +; family ([dp],y) but on a freshly-loaded $e0 pointer for each half. +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + %f0 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 0 + %f1 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 1 + store i16 %a, ptr %f0 + store i16 %b, ptr %f1 + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll b/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll new file mode 100644 index 0000000..fb0f536 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll @@ -0,0 +1,36 @@ +; Pin: extractWide32Lo/Hi looks through REG_SEQUENCE shortcut. 
+; +; Without the shortcut, `*p = 0` (or any i32 store of a constant or +; freshly-built i32 vreg) hits the SDAG combiner repeatedly, the +; combiner re-merges and Custom-lower re-splits, the cycle runs for +; tens of seconds and 100MB+ peak. See feedback_extract_wide32_regseq_shortcut.md. +; +; Two functions: +; - clear_i32: simplest *(i32*)p = 0 case (the original repro) +; - clear_i32_pair: two adjacent i32 zero-stores (combiner stress) +; +; If the shortcut regresses, llc either OOMs (process killed) or +; takes >5s on these tiny functions. We assert on the lowered shape. +; +; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s + +define void @clear_i32(ptr %p) { +; CHECK-LABEL: clear_i32: +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + ret void +} + +define void @clear_i32_pair(ptr %p, ptr %q) { +; CHECK-LABEL: clear_i32_pair: +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + store i32 0, ptr %q + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll new file mode 100644 index 0000000..4ec08d1 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll @@ -0,0 +1,36 @@ +; Pin: i64-first-arg routes arg0 halves through Img16 (DP $C0..$DE). +; +; Without the Img16 routing, regalloc emits `TXA; STA spillA; +; STA spillX` at function entry — the TXA clobbers $a (arg0_lo) +; before the A-spill saves it, so both spill slots end up holding +; arg0_ml. Caused __adddf3(1.5, 2.5) → 1.5. See +; feedback_i64_first_arg_x_class.md. +; +; Fix: route arg0_lo via STA $dp and arg0_ml via STX $dp. Visible at +; function entry as a pair of `stx 0x[cd]?` and `sta 0x[cd]?` writes +; into the IMG region of direct page. +; +; Trigger: i64 first arg with enough cross-call live range that arg0 +; halves must be saved. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +declare i64 @ext1(i64 %x, i64 %y) +declare i64 @ext2(i64 %a) + +define i64 @i64_first_pressure(i64 %x) { +; CHECK-LABEL: i64_first_pressure: +; Entry stores arg0_ml (X) and arg0_lo (A) into IMG slots, NOT a +; TXA-bridge sequence. $D0 / $D2 are concrete IMG slots (the IMG +; region is $C0..$DE). Match a stx in that range, followed by an +; sta in the same range, before the first jsl. +; CHECK: stx 0xd +; CHECK: sta 0xd +; CHECK: jsl ext2 +; CHECK: rtl +entry: + %a = call i64 @ext2(i64 %x) + %b = add i64 %a, %x + %c = call i64 @ext1(i64 %b, i64 %x) + ret i64 %c +} diff --git a/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll b/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll new file mode 100644 index 0000000..929c693 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll @@ -0,0 +1,32 @@ +; Pin: MachineCopyPropagation must NOT eliminate `COPY $img = $reg` — +; that COPY actually expands to STA_DP $D? (a DP-memory store to an +; IMG slot). Libcalls (softDouble, softFloat) use those same DP +; slots for their own arg-save scratch, so dropping the COPY makes +; the subsequent LDA_DP read stale memory. Caught by `g = g/x` +; Newton loop: iter-1's saved x_ml at $D0 was never actually written +; because MCP dropped the COPY, so iter-2's call to __divdf3 read +; garbage as its x_ml argument. See feedback_jslpseudo_libcall_img_clobber.md. +; +; Fix: disable MachineCopyPropagation in addPostRegAlloc. 
+;
+; Symptom shape we pin: for an i64-first-arg double function that
+; calls a libcall, the entry must contain BOTH `stx 0xd?` AND `sta
+; 0xd?` (for I64FirstArg's Img16 arg-save dance) — and they must
+; survive to the asm output. Without the MCP-disable, only one of
+; those (or neither) appears.
+;
+; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
+
+declare double @ext_div(double %a, double %b)
+
+define double @div_chain(double %x) {
+; CHECK-LABEL: div_chain:
+; Img16 arg-save at function entry — both halves must reach asm:
+; CHECK: stx 0xd
+; CHECK: sta 0xd
+; CHECK: jsl ext_div
+; CHECK: rtl
+entry:
+  %r = call double @ext_div(double %x, double %x)
+  ret double %r
+}
diff --git a/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll b/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
new file mode 100644
index 0000000..2b02872
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
@@ -0,0 +1,28 @@
+; Pin: JSLpseudo declares Defs = [A, X, Y, DPF0].
+;
+; Without X, Y, DPF0 in the Defs list, an i64-returning libcall
+; (which returns lo16 in A, mid16 in X, hi16 in Y, hh16 in DPF0)
+; verifier-fails with "$y undefined" in math.c::floor. See
+; feedback_jslpseudo_caller_save.md.
+;
+; This test compiles a call to an i64-returning external function
+; with -verify-machineinstrs. If JSLpseudo's Defs were stripped, the
+; post-call X/Y/DPF0 reads would hit physregs the call never declared
+; as defined, and -verify-machineinstrs would fail.
+;
+; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s
+
+declare i64 @ext_i64(i64 %x)
+
+define i64 @i64_libcall_uses_xy(i64 %x) {
+; CHECK-LABEL: i64_libcall_uses_xy:
+; CHECK: jsl ext_i64
+; The post-call sequence stores the i64 return value (lo16 in A, mid16
+; in X, hi16 in Y, hh16 in DPF0) back to the caller's frame. If
+; JSLpseudo did not Def X, the txa here would verifier-fail because X
+; would be undefined after the call.
+; CHECK: txa
+; CHECK: rtl
+  %r = call i64 @ext_i64(i64 %x)
+  ret i64 %r
+}
diff --git a/src/llvm/test/CodeGen/W65816/lit.local.cfg b/src/llvm/test/CodeGen/W65816/lit.local.cfg
new file mode 100644
index 0000000..dce57e0
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'W65816' in config.root.targets:
+    config.unsupported = True
diff --git a/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll b/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
new file mode 100644
index 0000000..ee0d05a
--- /dev/null
+++ b/src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
@@ -0,0 +1,29 @@
+; Pin: SepRepCleanup's redundant-LDY elision must clear the now-stale
+; `killed $y` flag on the prior Y-user.
+;
+; Trigger: any sequence that emits two LDY_Imm16 #N back-to-back with
+; STA [dp],y between (e.g. an i32 store that splits into two i16
+; stores, each going through STAptr32 inserter which emits its own
+; LDY #0). Without the fix, the third peephole in SepRepCleanup
+; deletes the second LDY, but the first STA's `implicit killed $y`
+; annotation was set under the assumption that the second LDY was
+; about to redefine Y — leaving the second STA reading "dead" Y.
+;
+; The fix walks backward from the erased LDY to the most recent
+; Y-using operand and clears its kill flag. -verify-machineinstrs
+; catches the bug if it regresses.
+; +; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s + +define void @two_i32_stores_share_y(ptr %p) { +; CHECK-LABEL: two_i32_stores_share_y: +; The fix is invisible in asm output — both STAs emit identically with +; or without the kill-flag fix. The pin is `-verify-machineinstrs` +; not aborting. Match a minimal shape so the test still has structure. +; CHECK: ldy #0x0 +; CHECK: sta [0xe0 +; CHECK: sta [0xe0 +; CHECK: rtl + store i32 0, ptr %p + ret void +} diff --git a/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll b/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll new file mode 100644 index 0000000..d815330 --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll @@ -0,0 +1,41 @@ +; Pin: SIGN_EXTEND_INREG with i32 result and inner type i1 / i8 / i16 +; must Custom-lower to per-half ops. Without the Custom hook, the +; combiner emits `sext_inreg(REG_SEQUENCE(...), i1)` which has no +; tablegen pattern and isel aborts with "Cannot select". +; +; The i1 case shows up in CRC32 loops (`-(crc & 1ul)` reduces to +; sign_extend_inreg with i1). See feedback_sext_inreg_i32_isel_gap.md. +; +; Note: -verify-machineinstrs intentionally omitted because i32 store +; lowering still trips the i32-store-pair `implicit killed $y` +; concern in some chains; orthogonal to this fix. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +; The CRC32 idiom: -(x & 1) = sign_extend_inreg x, i1 (after combiner). +define i32 @neg_lowbit(i32 %x) { +; CHECK-LABEL: neg_lowbit: +; CHECK: and #0x1 +; CHECK: rtl + %a = and i32 %x, 1 + %b = sub i32 0, %a + ret i32 %b +} + +; (int32_t)(int8_t)x — sign-extend low byte to i32. +define i32 @sext_i8_to_i32(i32 %x) { +; CHECK-LABEL: sext_i8_to_i32: +; CHECK: rtl + %t = trunc i32 %x to i8 + %r = sext i8 %t to i32 + ret i32 %r +} + +; (int32_t)(int16_t)x — sign-extend low halfword to i32. +define i32 @sext_i16_to_i32(i32 %x) { +; CHECK-LABEL: sext_i16_to_i32: +; CHECK: rtl + %t = trunc i32 %x to i16 + %r = sext i16 %t to i32 + ret i32 %r +} diff --git a/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll b/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll new file mode 100644 index 0000000..f3ed61b --- /dev/null +++ b/src/llvm/test/CodeGen/W65816/wide32-phi-split.ll @@ -0,0 +1,32 @@ +; Pin: W65816LowerWide32 Pass 2b splits Wide32 PHIs. +; +; Without PHI splitting, an i32 phi (loop-carried 32-bit value) +; survives to RA, hits "Wide32 reload to non-pair reg" UNREACHABLE. +; softDouble at -O2 was the original repro (ma/mb mantissa loops). +; +; This test mimics the shape: an i32 carried across a loop. If +; LowerWide32 doesn't split the PHI, llc aborts. +; +; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s + +define i32 @sum_i32_loop(ptr %p, i16 %n) { +; CHECK-LABEL: sum_i32_loop: +; CHECK: rtl +entry: + %is_zero = icmp eq i16 %n, 0 + br i1 %is_zero, label %done, label %loop + +loop: + %i = phi i16 [ 0, %entry ], [ %i.next, %loop ] + %acc = phi i32 [ 0, %entry ], [ %acc.next, %loop ] + %addr = getelementptr inbounds i32, ptr %p, i16 %i + %v = load i32, ptr %addr + %acc.next = add i32 %acc, %v + %i.next = add i16 %i, 1 + %cond = icmp eq i16 %i.next, %n + br i1 %cond, label %done, label %loop + +done: + %r = phi i32 [ 0, %entry ], [ %acc.next, %loop ] + ret i32 %r +}
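+
+; Post-pass shape (illustrative sketch; vreg names are invented and
+; deliberately not FileCheck'd): LowerWide32 Pass 2b replaces the
+; single i32 PHI for %acc with one i16 PHI per half, e.g.
+;   %acc_lo:acc16 = PHI %zero_lo, %entry, %sum_lo, %loop
+;   %acc_hi:acc16 = PHI %zero_hi, %entry, %sum_hi, %loop
+; so only i16 vregs reach register allocation.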