Checkpoint
This commit is contained in:
parent
05fc37d323
commit
465f8ba947
35 changed files with 2496 additions and 201 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -9,6 +9,7 @@ tools/
|
|||
# runtime/src/*.s. The source files (.s, build.sh) are tracked.
|
||||
runtime/*.o
|
||||
runtime/*.o.bak
|
||||
runtime/*.o.tmp
|
||||
|
||||
# Editor / OS
|
||||
*.swp
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
|
||||
index 8aef55224..b6e467274 100644
|
||||
index 8aef55224..1ab00ce9f 100644
|
||||
--- a/llvm/lib/TargetParser/Triple.cpp
|
||||
+++ b/llvm/lib/TargetParser/Triple.cpp
|
||||
@@ -80,6 +80,8 @@ StringRef Triple::getArchTypeName(ArchType Kind) {
|
||||
|
|
@ -75,3 +75,12 @@ index 8aef55224..b6e467274 100644
|
|||
case Triple::nvptx64:
|
||||
case Triple::nvptx:
|
||||
case Triple::ppcle:
|
||||
@@ -2704,6 +2714,8 @@ ExceptionHandling Triple::getDefaultExceptionHandling() const {
|
||||
case Triple::xcore:
|
||||
case Triple::xtensa:
|
||||
return ExceptionHandling::DwarfCFI;
|
||||
+ case Triple::w65816:
|
||||
+ return ExceptionHandling::SjLj;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
|
||||
index 8837d2f91..b796d9e86 100644
|
||||
index 8837d2f91..920b8ac8e 100644
|
||||
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
|
||||
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
|
||||
@@ -582,6 +582,8 @@ std::string Triple::computeDataLayout(StringRef ABIName) const {
|
||||
|
|
@ -7,7 +7,7 @@ index 8837d2f91..b796d9e86 100644
|
|||
case Triple::msp430:
|
||||
return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16";
|
||||
+ case Triple::w65816:
|
||||
+ return "e-m:e-p:16:8-i16:16-i32:16-n8:16-S16";
|
||||
+ return "e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16";
|
||||
case Triple::ppc:
|
||||
case Triple::ppcle:
|
||||
case Triple::ppc64:
|
||||
|
|
|
|||
13
patches/0007-targetlowering-virtual-gettypeconversion.patch
Normal file
13
patches/0007-targetlowering-virtual-gettypeconversion.patch
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
|
||||
index 7c4c29fc3..7109a79fa 100644
|
||||
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
|
||||
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
|
||||
@@ -1144,7 +1144,7 @@ public:
|
||||
/// integer register, this contains one step in the expansion to get to the
|
||||
/// smaller register. For illegal floating point types, this returns the
|
||||
/// integer type to transform to.
|
||||
- LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;
|
||||
+ virtual LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;
|
||||
|
||||
/// Return how we should legalize values of this type, either it is already
|
||||
/// legal (return 'Legal') or we need to promote it to a larger type (return
|
||||
|
|
@ -6,6 +6,12 @@ set -euo pipefail
|
|||
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
LLVM_MC="$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc"
|
||||
CLANG="$PROJECT_ROOT/tools/llvm-mos-build/bin/clang"
|
||||
# Apply CPU/memory caps so a runaway backend bug can't OOM-kill the
|
||||
# entire tmux scope. Use `|| true` so when invoked from a parent that
|
||||
# has already lowered the limit (e.g. smokeTest.sh sets 90s), we keep
|
||||
# the parent's tighter cap rather than failing the build.
|
||||
ulimit -v $((10 * 1024 * 1024)) 2>/dev/null || true
|
||||
ulimit -t 1200 2>/dev/null || true
|
||||
|
||||
[ -x "$LLVM_MC" ] || { echo "llvm-mc not found at $LLVM_MC" >&2; exit 1; }
|
||||
[ -x "$CLANG" ] || { echo "clang not found at $CLANG" >&2; exit 1; }
|
||||
|
|
|
|||
|
|
@ -57,6 +57,22 @@ __start:
|
|||
lda 0xc083
|
||||
rep #0x20
|
||||
|
||||
; Persistent "current data bank" byte at DP $BE. The LDAptr/
|
||||
; STAptr/STBptr inserters load this into $E2 before each [dp],Y
|
||||
; deref so pointer-deref lands in the user's bank, matching where
|
||||
; DBR-relative absolute stores go. Under MAME (no Loader), DBR=0
|
||||
; and PBR=0 here, so $BE=0 — equivalent to the prior STZ $E2
|
||||
; behavior. Under GS/OS Loader, crt0Gsos.s sets it to PBR.
|
||||
;
|
||||
; $BE chosen because it's outside both the libcall scratch range
|
||||
; ($E0..$FF used by libgcc.s for i64 ops) and the IMG slot range
|
||||
; ($C0..$DE). PHK pushes 1 byte; PLA in M=8 to pull just 1 byte.
|
||||
sep #0x20
|
||||
phk
|
||||
pla ; A's low byte = current PBR
|
||||
sta 0xbe ; persistent data bank
|
||||
rep #0x20
|
||||
|
||||
; Zero BSS. X iterates from __bss_start to __bss_end; each
|
||||
; iteration writes one byte of zero at addr X (via DP=0 +
|
||||
; offset 0 — which is just X). STZ in M=8 stores 1 byte and
|
||||
|
|
|
|||
|
|
@ -41,6 +41,18 @@ __start:
|
|||
lda #0
|
||||
tcd
|
||||
|
||||
; Persistent "current data bank" byte at DP $BE. Set to PBR
|
||||
; (= our load bank) so the LDAptr/STAptr/STBptr inserters'
|
||||
; "LDA $BE; STA $E2" sequence puts pointer derefs in our bank,
|
||||
; matching DBR-relative absolute stores. $BE is outside the
|
||||
; libcall scratch range ($E0..$FF used by libgcc.s for i64 ops).
|
||||
; See crt0.s.
|
||||
sep #0x20
|
||||
phk
|
||||
pla
|
||||
sta 0xbe
|
||||
rep #0x20
|
||||
|
||||
; BSS zero-init. With DBR=our bank, `stz abs,X` writes to
|
||||
; ourBank:X — correct as long as __bss_start/__bss_end fit in
|
||||
; the segment's bank.
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ static void emitDec(int n) {
|
|||
|
||||
|
||||
__attribute__((noinline))
|
||||
__attribute__((optnone))
|
||||
static void emitULong(unsigned long n) {
|
||||
char buf[11];
|
||||
int i = 0;
|
||||
|
|
@ -122,7 +123,7 @@ static void emitULong(unsigned long n) {
|
|||
}
|
||||
|
||||
|
||||
__attribute__((noinline))
|
||||
__attribute__((noinline,optnone))
|
||||
static void emitSignedLong(long n) {
|
||||
// See emitDec: avoid the signed-overflow UB on LONG_MIN.
|
||||
if (n < 0) {
|
||||
|
|
@ -221,6 +222,12 @@ static void emitDouble(double v, int prec) {
|
|||
|
||||
|
||||
// fmt is arg0 (A register); see banner comment for why the order matters.
|
||||
// optnone: under ptr32 the regalloc reuses the same stack spill slot for
|
||||
// both the va_list pointer `ap` and the fmt-walking pointer, so a `va_arg`
|
||||
// after several fmt-character steps reads the wrong slot and gets 0
|
||||
// instead of the actual va_arg value. optnone forces fast regalloc which
|
||||
// keeps each vreg in its own slot. See feedback_snprintf_va_arg_slot_alias.md.
|
||||
__attribute__((optnone))
|
||||
static int format(const char *fmt, va_list ap) {
|
||||
while (*fmt) {
|
||||
char c = *fmt++;
|
||||
|
|
@ -295,6 +302,8 @@ static int format(const char *fmt, va_list ap) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
__attribute__((optnone))
|
||||
int snprintf(char *buf, size_t n, const char *fmt, ...) {
|
||||
gCur = buf;
|
||||
// n == 0 must NOT touch the buffer (C99 7.19.6.5). Setting
|
||||
|
|
|
|||
|
|
@ -127,14 +127,23 @@ u64 __adddf3(u64 a, u64 b) {
|
|||
// Right-shift first to bring an over-wide sum back in range; then
|
||||
// left-shift if subtraction left the lead below 55. Reverse order
|
||||
// would shift an over-wide value out of u64 range entirely.
|
||||
while (mr & ~((1ULL << 56) - 1)) {
|
||||
u64 sticky = mr & 1;
|
||||
mr = (mr >> 1) | sticky;
|
||||
ea++;
|
||||
// Use if + do-while because pure `while (cond) body` triggers a
|
||||
// ptr32 backend bug: PHP/PLP wrap pass mis-identifies the loop's
|
||||
// pre-test LDA reload as flag corruption and wraps the wrong
|
||||
// range, so the BEQ tests stale flags and the loop body never
|
||||
// fires. `do { } while (cond)` is unaffected (test-after-body).
|
||||
if (mr & ~((1ULL << 56) - 1)) {
|
||||
do {
|
||||
u64 sticky_bit = mr & 1;
|
||||
mr = (mr >> 1) | sticky_bit;
|
||||
ea++;
|
||||
} while (mr & ~((1ULL << 56) - 1));
|
||||
}
|
||||
while ((mr & (1ULL << 55)) == 0 && mr != 0) {
|
||||
mr <<= 1;
|
||||
ea--;
|
||||
if ((mr & (1ULL << 55)) == 0 && mr != 0) {
|
||||
do {
|
||||
mr <<= 1;
|
||||
ea--;
|
||||
} while ((mr & (1ULL << 55)) == 0 && mr != 0);
|
||||
}
|
||||
// Round to nearest, ties to even. Bits 0/1 are sticky+round, bit 2
|
||||
// is guard, bit 3 is mantissa LSB.
|
||||
|
|
@ -259,14 +268,26 @@ u64 __divdf3(u64 a, u64 b) {
|
|||
// Handle the leading quotient bit explicitly.
|
||||
u64 q = DMANT_LEAD;
|
||||
u64 r = ma - mb;
|
||||
// `volatile vmb`: forces mb to be re-read from memory inside the
|
||||
// loop. Without this, the W65816 codegen miscompiles `r >= mb` and
|
||||
// `r -= mb` when called as the 3rd+ chained `__divdf3` after prior
|
||||
// softDouble libcalls (sqrt3 Newton iter — 3rd iter returned 0.0
|
||||
// instead of 1.41421). Adding `volatile` to either `r` or `mb`
|
||||
// alone fixes it, suggesting the compiler is keeping one of them
|
||||
// in registers across loop iterations and a JSL inside the loop
|
||||
// (__ashlsi3 for `r <<= 1`) clobbers the held value. The real
|
||||
// fix lives in the W65816 backend's u64-shift lowering; volatile
|
||||
// here is the conservative workaround.
|
||||
volatile u64 vmb = mb;
|
||||
// Compute 52 more fractional bits via standard shift-test-subtract.
|
||||
for (int i = 51; i >= 0; i--) {
|
||||
r <<= 1;
|
||||
if (r >= mb) {
|
||||
r -= mb;
|
||||
if (r >= vmb) {
|
||||
r -= vmb;
|
||||
q |= (1ULL << i);
|
||||
}
|
||||
}
|
||||
mb = vmb; // resync in case below reads mb
|
||||
// Round to nearest, ties to even. Generate one extra bit (the
|
||||
// "guard"), examine the remainder for any non-zero "sticky" tail,
|
||||
// and round q up when guard=1 and (sticky || (q & 1)). Without
|
||||
|
|
|
|||
|
|
@ -33,44 +33,20 @@ double difftime(time_t end, time_t start) {
|
|||
return (double)(end - start);
|
||||
}
|
||||
|
||||
struct tm *gmtime_r(const time_t *t, struct tm *out);
|
||||
|
||||
// gmtime / localtime: convert seconds-since-1970 to broken-down time.
|
||||
// "local" is identical to "gm" — no timezone support.
|
||||
//
|
||||
// gmtime KNOWN-BROKEN under GS/OS Loader. The interface returns a
|
||||
// pointer to a static global (`__gmtimeBuf`). User code reads
|
||||
// `r->tm_field` which the W65816 backend lowers via [dp],Y with bank
|
||||
// forced to 0 (DBR-independent — see W65816ISelLowering's LDAptr/STAptr
|
||||
// inserter). But under Loader the buffer's IMM16 address gets cRELOC-
|
||||
// patched to a runtime offset that's only valid in the user's bank,
|
||||
// not bank 0 — so the user's reads land in unrelated bank-0 RAM.
|
||||
// Even arranging for gmtime to write via [dp],y bank=0 makes both
|
||||
// halves consistent at bank 0, but the cRELOC-patched address often
|
||||
// falls in the Language Card area where bank-0 reads/writes aren't
|
||||
// stable RAM. Real fix needs either 32-bit pointers, or DBR-relative
|
||||
// pointer-deref under Loader (incompatible with the bank-switch
|
||||
// idiom that smoke tests exercise).
|
||||
//
|
||||
// Stub: fill seconds/minutes/hours from modulo arithmetic (those fields
|
||||
// work because they're written-then-read by the same library). Date
|
||||
// fields stay at the 1970-01-01 sentinel. Workaround for users:
|
||||
// build a struct tm by hand (stack local) and pass to mktime/asctime/
|
||||
// strftime — those work because the buffer is the caller's, deref'd
|
||||
// the same way on both sides.
|
||||
// Returns a pointer to a static global (`__gmtimeBuf`). Under GS/OS
|
||||
// Loader (DBR != 0) caller-side pointer-deref reads need to land in
|
||||
// the same bank where gmtime wrote; this requires the runtime build
|
||||
// to enable `-mllvm -w65816-loader-bank-deref`, which makes
|
||||
// LDAptr/STAptr load the bank byte from DP $BE (set by crt0 from
|
||||
// PHK / current PBR). Without the flag, gmtime still works under
|
||||
// MAME / non-Loader runs where DBR=0 throughout.
|
||||
struct tm *gmtime(const time_t *t) {
|
||||
long secs = *t;
|
||||
int sec = (int)(secs % 60L); secs /= 60L;
|
||||
int min = (int)(secs % 60L); secs /= 60L;
|
||||
int hour = (int)(secs % 24L);
|
||||
__gmtimeBuf.tm_sec = sec;
|
||||
__gmtimeBuf.tm_min = min;
|
||||
__gmtimeBuf.tm_hour = hour;
|
||||
__gmtimeBuf.tm_mday = 1;
|
||||
__gmtimeBuf.tm_mon = 0;
|
||||
__gmtimeBuf.tm_year = 70; // 1970 sentinel — date decomp KNOWN-BROKEN
|
||||
__gmtimeBuf.tm_wday = 4; // Jan 1 1970 was Thursday
|
||||
__gmtimeBuf.tm_yday = 0;
|
||||
__gmtimeBuf.tm_isdst = -1;
|
||||
return &__gmtimeBuf;
|
||||
return gmtime_r(t, &__gmtimeBuf);
|
||||
}
|
||||
|
||||
struct tm *localtime(const time_t *t) {
|
||||
|
|
@ -82,13 +58,15 @@ struct tm *localtime(const time_t *t) {
|
|||
// is bank-0 in 65816 native mode regardless of DBR). This avoids the
|
||||
// bank-mismatch issue that breaks plain gmtime under Loader.
|
||||
//
|
||||
// PARTIAL: sec/min/hour/wday/yday work; year/mon/mday hit a W65816
|
||||
// regalloc/codegen issue at -O2 that mis-evaluates the date arithmetic
|
||||
// even when split across noinline helpers. Not yet fixed — needs deep
|
||||
// backend debugging of i32 compare / mixed-type subtract codegen.
|
||||
//
|
||||
// Recommended for time-of-day display; for date fields, build a
|
||||
// struct tm manually and pass to mktime/asctime/strftime.
|
||||
// Full broken-down time computation. Marked optnone because at -O2
|
||||
// LLVM's combined IR optimizations (loop rotation + reassociation +
|
||||
// induction-variable-simplify) mis-evaluate the year-increment loop's
|
||||
// `days >= 365L + (__isLeap(...) ? 1 : 0)` comparison, leaving the
|
||||
// loop body unexecuted and date fields stuck at the 1970 sentinel.
|
||||
// optnone preserves the per-statement structure and the loop runs
|
||||
// correctly. Verified end-to-end against 1710484245L → 2024-03-15
|
||||
// 06:30:45 UTC (Friday, day-of-year 74).
|
||||
__attribute__((optnone))
|
||||
struct tm *gmtime_r(const time_t *t, struct tm *out) {
|
||||
long secs = *t;
|
||||
int sec = (int)(secs % 60L); secs /= 60L;
|
||||
|
|
@ -98,14 +76,30 @@ struct tm *gmtime_r(const time_t *t, struct tm *out) {
|
|||
int wday = (int)((days + 4L) % 7L);
|
||||
if (wday < 0) wday += 7;
|
||||
|
||||
int year = 70; // years since 1900
|
||||
while (days >= 365L + (__isLeap(1900 + year) ? 1 : 0)) {
|
||||
days -= 365L + (__isLeap(1900 + year) ? 1 : 0);
|
||||
year++;
|
||||
}
|
||||
int yday = (int)days;
|
||||
int leap = __isLeap(1900 + year);
|
||||
int mon = 11;
|
||||
while (mon > 0) {
|
||||
int firstDayOfMon = __monthDays[mon] + (leap && mon > 1 ? 1 : 0);
|
||||
if ((int)days >= firstDayOfMon) break;
|
||||
mon--;
|
||||
}
|
||||
int firstDay = __monthDays[mon] + (leap && mon > 1 ? 1 : 0);
|
||||
int mday = (int)days - firstDay + 1;
|
||||
|
||||
out->tm_sec = sec;
|
||||
out->tm_min = min;
|
||||
out->tm_hour = hour;
|
||||
out->tm_mday = 1; // KNOWN-BROKEN — see header comment
|
||||
out->tm_mon = 0;
|
||||
out->tm_year = 70;
|
||||
out->tm_mday = mday;
|
||||
out->tm_mon = mon;
|
||||
out->tm_year = year;
|
||||
out->tm_wday = wday;
|
||||
out->tm_yday = 0;
|
||||
out->tm_yday = yday;
|
||||
out->tm_isdst = -1;
|
||||
return out;
|
||||
}
|
||||
|
|
|
|||
81
scripts/runFileCheckTests.sh
Executable file
81
scripts/runFileCheckTests.sh
Executable file
|
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env bash
# runFileCheckTests.sh - run W65816 backend regression tests.
#
# Walks src/llvm/test/CodeGen/W65816/*.ll and for each:
#   - reads RUN: lines from the test header (lit-compatible syntax)
#   - executes them with %s -> the test path
#   - any non-zero exit fails the run.
#
# Why not lit: the in-tree llvm-mos build is configured with
# LLVM_INCLUDE_TESTS=OFF (saves ~5 min from incremental rebuilds and
# ~2 GB of test artifacts). These regression tests are codegen-shape
# pins, not full lit-harness sweeps; FileCheck alone covers our needs.
#
# Usage:
#   scripts/runFileCheckTests.sh                 # run all
#   scripts/runFileCheckTests.sh foo.ll bar.ll   # run named (relative to dir)
#
# Exit status:
#   0  every executed test passed (skipped tests do not count as failures)
#   1  at least one test failed
#   2  a required tool (llc, FileCheck) is missing

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
TEST_DIR="$PROJECT_ROOT/src/llvm/test/CodeGen/W65816"
BIN_DIR="$PROJECT_ROOT/tools/llvm-mos-build/bin"
LLC="$BIN_DIR/llc"
FILECHECK="$BIN_DIR/FileCheck"
NOT="$BIN_DIR/not"

[ -x "$LLC" ]       || { echo "missing $LLC" >&2; exit 2; }
[ -x "$FILECHECK" ] || { echo "missing $FILECHECK; build with 'ninja FileCheck not'" >&2; exit 2; }
# `not` is only needed by tests that expect a failing command, so its
# absence is a warning rather than a hard error.
[ -x "$NOT" ] || echo "warning: missing $NOT; RUN lines using 'not' will fail" >&2

# Resolve llc / FileCheck / not from the in-tree build by putting its
# bin directory first on PATH. Textually rewriting the tool names inside
# the RUN line is fragile: it also rewrites matching substrings in the
# substituted test path and in flag values (for example the "not " in
# "--implicit-check-not foo").
export PATH="$BIN_DIR:$PATH"

if [ $# -gt 0 ]; then
    files=()
    for f in "$@"; do
        files+=("$TEST_DIR/$f")
    done
else
    mapfile -t files < <(find "$TEST_DIR" -maxdepth 1 -name '*.ll' | sort)
fi

pass=0
fail=0
failed=()
# The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
# older bash versions.
for f in ${files[@]+"${files[@]}"}; do
    [ -f "$f" ] || { echo "skip missing: $f"; continue; }
    name="$(basename "$f")"

    # Extract the command text after each "; RUN:" marker. A single
    # `sed -n ... p` is used instead of `grep | sed`: grep exits 1 when
    # nothing matches, which under `set -e -o pipefail` would abort the
    # whole script before the SKIP branch below could run.
    runs=$(sed -nE 's/^[[:space:]]*;[[:space:]]*RUN:[[:space:]]*//p' "$f")
    if [ -z "$runs" ]; then
        echo "SKIP $name (no RUN: line)"
        continue
    fi

    ok=1
    while IFS= read -r line; do
        [ -z "$line" ] && continue
        cmd=${line//%s/$f}
        # pipefail inside the child shell so a failing `llc` on the left
        # of `| FileCheck` is reported instead of being masked by the
        # exit status of the last pipeline stage.
        out=$(bash -o pipefail -c "$cmd" 2>&1) || {
            ok=0
            echo "FAIL $name"
            echo "  cmd: $cmd"
            echo "$out" | sed 's/^/    | /'
            break
        }
    done <<< "$runs"

    if [ $ok -eq 1 ]; then
        echo "PASS $name"
        pass=$((pass + 1))
    else
        fail=$((fail + 1))
        failed+=("$name")
    fi
done

echo
echo "==== W65816 FileCheck: $pass pass, $fail fail ===="
if [ $fail -gt 0 ]; then
    printf '  - %s\n' "${failed[@]}"
    exit 1
fi
|
||||
|
|
@ -3160,6 +3160,50 @@ EOF
|
|||
fi
|
||||
rm -f "$cTrFile" "$oTrFile" "$binTrFile"
|
||||
|
||||
log "check: MAME runs gmtime(1710484245) -> 2024-03-15 06:30:45 Fri (date math via real impl)"
|
||||
cGmFile="$(mktemp --suffix=.c)"
|
||||
oGmFile="$(mktemp --suffix=.o)"
|
||||
oGmTime="$(mktemp --suffix=.o)"
|
||||
binGmFile="$(mktemp --suffix=.bin)"
|
||||
cat > "$cGmFile" <<'EOF'
|
||||
typedef long time_t;
|
||||
struct tm {
|
||||
int tm_sec, tm_min, tm_hour;
|
||||
int tm_mday, tm_mon, tm_year;
|
||||
int tm_wday, tm_yday, tm_isdst;
|
||||
};
|
||||
extern struct tm *gmtime(const time_t *);
|
||||
__attribute__((noinline)) void switchToBank2(void) {
|
||||
__asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n");
|
||||
}
|
||||
int main(void) {
|
||||
time_t t = 1710484245L; // 2024-03-15 06:30:45 UTC, Friday, day 74
|
||||
struct tm *r = gmtime(&t);
|
||||
switchToBank2();
|
||||
*(volatile unsigned short *)0x5000 = r->tm_year; // 124
|
||||
*(volatile unsigned short *)0x5002 = r->tm_mon; // 2
|
||||
*(volatile unsigned short *)0x5004 = r->tm_mday; // 15
|
||||
*(volatile unsigned short *)0x5006 = r->tm_hour; // 6
|
||||
*(volatile unsigned short *)0x5008 = r->tm_min; // 30
|
||||
*(volatile unsigned short *)0x500a = r->tm_sec; // 45
|
||||
*(volatile unsigned short *)0x500c = r->tm_wday; // 5
|
||||
*(volatile unsigned short *)0x500e = r->tm_yday; // 74
|
||||
while (1) {}
|
||||
}
|
||||
EOF
|
||||
"$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cGmFile" -o "$oGmFile"
|
||||
"$CLANG" --target=w65816 -O2 -ffunction-sections \
|
||||
-c "$PROJECT_ROOT/runtime/src/timeExt.c" -o "$oGmTime"
|
||||
"$PROJECT_ROOT/tools/link816" -o "$binGmFile" --text-base 0x1000 \
|
||||
"$oCrt0F" "$oLibgccFile" "$oGmTime" "$oGmFile" >/dev/null 2>&1
|
||||
if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binGmFile" --check \
|
||||
0x025000=007c 0x025002=0002 0x025004=000f \
|
||||
0x025006=0006 0x025008=001e 0x02500a=002d \
|
||||
0x02500c=0005 0x02500e=004a >/dev/null 2>&1; then
|
||||
die "MAME: gmtime(1710484245) returned wrong date fields"
|
||||
fi
|
||||
rm -f "$cGmFile" "$oGmFile" "$oGmTime" "$binGmFile"
|
||||
|
||||
log "check: MAME runs udivmod(0x123...DEF, 0x10000, &m) → q=0x12345_6789AB m=0xCDEF (#69)"
|
||||
cUdmFile="$(mktemp --suffix=.c)"
|
||||
oUdmFile="$(mktemp --suffix=.o)"
|
||||
|
|
@ -5255,4 +5299,13 @@ print(f'OK: {nCreloc} cRELOC opcodes match sidecar')
|
|||
rm -f "$cR1" "$oR1" "$binR1" "$mapR1" "$relR1" "$omfR1"
|
||||
fi
|
||||
|
||||
# W65816 codegen-shape regression pins. Tiny FileCheck assertions on
# specific lowering behaviors that have broken before; runs in well
# under a second. See scripts/runFileCheckTests.sh.
#
# Output is captured in a private mktemp file rather than a fixed /tmp
# name, so concurrent smoke runs cannot overwrite each other's log. The
# log is shown only on failure and is removed on both paths.
log "check: W65816 FileCheck regressions pass"
fcOut="$(mktemp)"
if ! "$PROJECT_ROOT/scripts/runFileCheckTests.sh" >"$fcOut" 2>&1; then
    cat "$fcOut" >&2
    rm -f "$fcOut"
    die "W65816 FileCheck regressions failed"
fi
rm -f "$fcOut"
|
||||
|
||||
log "all smoke checks passed"
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ public:
|
|||
IntPtrType = SignedInt;
|
||||
PtrDiffType = SignedInt;
|
||||
SigAtomicType = SignedLong;
|
||||
resetDataLayout("e-m:e-p:16:8-i16:16-i32:16-n8:16-S16");
|
||||
resetDataLayout("e-m:e-p:16:16-i16:16-i32:16-i64:16-f32:16-f64:16-n8:16-S16");
|
||||
}
|
||||
|
||||
void getTargetDefines(const LangOptions &Opts,
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ add_llvm_target(W65816CodeGen
|
|||
W65816NegYIndY.cpp
|
||||
W65816PreSpillCrossCall.cpp
|
||||
W65816SjLjFinalize.cpp
|
||||
W65816LowerWide32.cpp
|
||||
W65816TargetMachine.cpp
|
||||
W65816AsmPrinter.cpp
|
||||
W65816MCInstLower.cpp
|
||||
|
|
|
|||
|
|
@ -116,6 +116,15 @@ FunctionPass *createW65816PreSpillCrossCall();
|
|||
// W65816SjLjFinalize.cpp.
|
||||
FunctionPass *createW65816SjLjFinalize();
|
||||
|
||||
// Pre-RA pass that lowers Wide32 register pairs into pairs of i16
|
||||
// vregs. Without this, greedy/basic regalloc can't fit the pair-
|
||||
// pressure of i64-via-2-i32-via-Wide32 traffic in i64-heavy
|
||||
// functions (RegAllocBase crashes during allocatePhysRegs). After
|
||||
// this pass, only i16 vregs reach regalloc, and the pair structure
|
||||
// lives only in the LDAptr32S / STAptr32S / STBptr32S pseudos which
|
||||
// take 2 i16 ptr operands directly.
|
||||
FunctionPass *createW65816LowerWide32();
|
||||
|
||||
void initializeW65816AsmPrinterPass(PassRegistry &);
|
||||
void initializeW65816DAGToDAGISelLegacyPass(PassRegistry &);
|
||||
void initializeW65816StackSlotCleanupPass(PassRegistry &);
|
||||
|
|
@ -128,6 +137,7 @@ void initializeW65816SpillToXPass(PassRegistry &);
|
|||
void initializeW65816NegYIndYPass(PassRegistry &);
|
||||
void initializeW65816PreSpillCrossCallPass(PassRegistry &);
|
||||
void initializeW65816SjLjFinalizePass(PassRegistry &);
|
||||
void initializeW65816LowerWide32Pass(PassRegistry &);
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
|
|
|
|||
|
|
@ -71,21 +71,52 @@ void W65816DAGToDAGISel::Select(SDNode *Node) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Custom selection: bare FrameIndex SDValue used as an i16 pointer
|
||||
// value (e.g. `&arr[0]` for a stack-allocated array). The
|
||||
// auto-generated selector has no pattern for `(i16 frameindex)`
|
||||
// because tablegen doesn't expose FrameIndex as a leaf type — so
|
||||
// ISel fails with "Cannot select: FrameIndex" before ever reaching
|
||||
// a load/store-context fold. Convert it to ADDframe (FI, 0); the
|
||||
// frame-index elimination pass turns ADDframe into TSC + CLC + ADC
|
||||
// #(offset+stackSize), producing SP+offset in A.
|
||||
// Custom selection: bare FrameIndex SDValue used as a pointer value
|
||||
// (e.g. `&arr[0]` for a stack-allocated array). The auto-generated
|
||||
// selector has no pattern for `(i16 frameindex)` because tablegen
|
||||
// doesn't expose FrameIndex as a leaf type — so ISel fails with
|
||||
// "Cannot select: FrameIndex" before ever reaching a load/store-
|
||||
// context fold. Convert to ADDframe (FI, 0); the frame-index
|
||||
// elimination pass turns ADDframe into TSC + CLC + ADC #(offset +
|
||||
// stackSize), producing SP+offset in A.
|
||||
//
|
||||
// ptr32 mode: a `(i32 frameindex)` is `&local` typed as a 32-bit
|
||||
// pointer (bank+addr). Lower as REG_SEQUENCE(ADDframe, sub_lo, 0,
|
||||
// sub_hi). Hi=0 reflects the program-bank assumption (stack lives
|
||||
// in bank 0 for our crt0 startup). Without this, ISel hits
|
||||
// "Cannot select: t# = FrameIndex<N>" and the pass crashes —
|
||||
// observed for softDouble's __adddf3 calling dclass(a, &sa, &ea,
|
||||
// &ma) where the latter three become i32 frameindex SDValues.
|
||||
if (Node->getOpcode() == ISD::FrameIndex) {
|
||||
SDLoc DL(Node);
|
||||
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
|
||||
EVT VT = Node->getValueType(0);
|
||||
SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
|
||||
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i16);
|
||||
CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero);
|
||||
return;
|
||||
SDValue Zero16 = CurDAG->getTargetConstant(0, DL, MVT::i16);
|
||||
if (VT == MVT::i16) {
|
||||
CurDAG->SelectNodeTo(Node, W65816::ADDframe, MVT::i16, TFI, Zero16);
|
||||
return;
|
||||
}
|
||||
if (VT == MVT::i32) {
|
||||
// Build (REG_SEQUENCE Wide32RC, ADDframe(FI,0), sub_lo, LDAi16imm(0),
|
||||
// sub_hi). ADDframe materialises lo as an i16 SDValue; the hi
|
||||
// half is the literal bank byte (0).
|
||||
SDNode *Lo = CurDAG->getMachineNode(W65816::ADDframe, DL,
|
||||
MVT::i16, TFI, Zero16);
|
||||
SDValue HiC = CurDAG->getTargetConstant(0, DL, MVT::i16);
|
||||
// For the high half, just materialise an i16 zero via LDAi16imm.
|
||||
SDNode *Hi = CurDAG->getMachineNode(W65816::LDAi16imm, DL,
|
||||
MVT::i16, HiC);
|
||||
SDValue RC = CurDAG->getTargetConstant(W65816::Wide32RegClassID,
|
||||
DL, MVT::i32);
|
||||
SDValue SubLo = CurDAG->getTargetConstant(llvm::sub_lo, DL, MVT::i32);
|
||||
SDValue SubHi = CurDAG->getTargetConstant(llvm::sub_hi, DL, MVT::i32);
|
||||
CurDAG->SelectNodeTo(Node, TargetOpcode::REG_SEQUENCE, MVT::i32,
|
||||
{RC, SDValue(Lo, 0), SubLo, SDValue(Hi, 0),
|
||||
SubHi});
|
||||
return;
|
||||
}
|
||||
report_fatal_error("W65816: FrameIndex selection: unsupported VT");
|
||||
}
|
||||
|
||||
// Defer to the auto-generated selector for everything else.
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -46,6 +46,26 @@ public:
|
|||
|
||||
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
|
||||
|
||||
// Lock i16 shift amounts to i16 (not i32) even when i32 is a legal
|
||||
// type. Without this, the DAG combiner promotes i16 shift amounts
|
||||
// to i32 once i32 is registered as legal, leaving (sra i16, i32:K)
|
||||
// with no matching pattern. Only narrow when LHS is i16; leave i32
|
||||
// shifts (which go to libcall via LowerShift) alone.
|
||||
MVT getScalarShiftAmountTy(const DataLayout &DL,
|
||||
EVT LHSTy) const override {
|
||||
if (LHSTy == MVT::i16 || LHSTy == MVT::i8) return MVT::i16;
|
||||
return TargetLoweringBase::getScalarShiftAmountTy(DL, LHSTy);
|
||||
}
|
||||
|
||||
// ptr32-mode hook: with patches/0007-targetlowering-virtual-
|
||||
// gettypeconversion making the base function virtual, this can be
|
||||
// overridden to force i64 to expand directly to i16 halves rather
|
||||
// than going through i32 (the next-smaller-legal type). Currently
|
||||
// not overridden — the override-calling-base passthrough caused
|
||||
// regressions in unrelated functions (likely due to subtle
|
||||
// de-virtualization changes when the function becomes virtual).
|
||||
// Future fix needs to test the override more carefully.
|
||||
|
||||
MachineBasicBlock *
|
||||
EmitInstrWithCustomInserter(MachineInstr &MI,
|
||||
MachineBasicBlock *MBB) const override;
|
||||
|
|
@ -147,6 +167,23 @@ public:
|
|||
return TargetLowering::isTypeDesirableForOp(Opc, VT);
|
||||
}
|
||||
|
||||
// Disallow merging stores into wider ones. With ptr32 active and i32
|
||||
// a Custom-lowered op, the SDAG combiner's MergeConsecutiveStores
|
||||
// takes our LowerStore-split pair (2x i16 stores at &t and &t+2) and
|
||||
// merges them back into a single i32 store, which re-enters
|
||||
// LowerStore, splits again, and loops forever — observed as
|
||||
// "LLVM ERROR: out of memory" on `*t = K` for any K (including 0
|
||||
// when the SDAG state lets the combiner pick the merge ahead of any
|
||||
// STZ-pattern simplification). Anything wider than i16 has no
|
||||
// legal ptr-store pattern in our backend anyway, so merging into
|
||||
// wider VTs is purely counterproductive.
|
||||
bool canMergeStoresTo(unsigned AS, EVT MemVT,
|
||||
const MachineFunction &MF) const override {
|
||||
if (MemVT.isInteger() && MemVT.getSizeInBits() > 16)
|
||||
return false;
|
||||
return TargetLowering::canMergeStoresTo(AS, MemVT, MF);
|
||||
}
|
||||
|
||||
private:
|
||||
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
|
@ -156,6 +193,31 @@ private:
|
|||
SDValue LowerSignExtend(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerDynamicStackalloc(SDValue Op, SelectionDAG &DAG) const;
|
||||
// Foundation hooks for ptr32 mode. In ptr16 mode (current default),
|
||||
// both return SDValue() so the legalizer falls through to the default
|
||||
// i16-pointer LDAptr/STAptr selection. When ptr32 mode is enabled
|
||||
// (PointerWidth=32 + Wide32 added as i32 reg class), they detect i32
|
||||
// addresses and wrap the load/store in W65816ISD::LD_PTR / ST_PTR /
|
||||
// STB_PTR so the [dp],Y inserter takes the bank byte from the
|
||||
// pointer's hi half instead of forcing 0.
|
||||
SDValue LowerLoad(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const;
|
||||
// ZERO/SIGN/ANY_EXTEND i16 -> i32 and TRUNCATE i32 -> i16 lowering
|
||||
// via REG_SEQUENCE / EXTRACT_SUBREG on the sub_lo/sub_hi indexes of
|
||||
// the Wide32 register class. Active once i32 is registered as a
|
||||
// legal type.
|
||||
SDValue LowerExtend(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerTruncate(SDValue Op, SelectionDAG &DAG) const;
|
||||
// SIGN_EXTEND_INREG i32 with inner type i1 / i8 / i16: sign-extend
|
||||
// the low N bits of the i32 input to fill all 32 bits. Splits to
|
||||
// (sext_inreg lo, innerVT) for the low half and SRA #15 of the
|
||||
// resulting i16 for the high half.
|
||||
SDValue LowerSignExtendInReg(SDValue Op, SelectionDAG &DAG) const;
|
||||
// ADD/SUB/AND/OR/XOR i32 split into per-half i16 ops. The carry-
|
||||
// chain ADDC/ADDE pseudos handle the cross-half link for ADD/SUB.
|
||||
SDValue LowerI32Bin(SDValue Op, SelectionDAG &DAG) const;
|
||||
// i32 ConstantNode: split into two i16 constants and REG_SEQUENCE.
|
||||
SDValue LowerI32Constant(SDValue Op, SelectionDAG &DAG) const;
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
|
|
|||
|
|
@ -100,6 +100,30 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(dstImg);
|
||||
return;
|
||||
}
|
||||
// SP -> A via TSC. Used by alloca / setjmp asm machinery.
|
||||
if (DestReg == W65816::A && SrcReg == W65816::SP) {
|
||||
BuildMI(MBB, I, DL, get(W65816::TSC));
|
||||
return;
|
||||
}
|
||||
// A -> SP via TCS.
|
||||
if (DestReg == W65816::SP && SrcReg == W65816::A) {
|
||||
BuildMI(MBB, I, DL, get(W65816::TCS));
|
||||
return;
|
||||
}
|
||||
// X <-> Y via A: 65816 has no direct X<->Y transfer; bridge through
|
||||
// A. Caller is responsible for ensuring A is dead at this program
|
||||
// point (regalloc arranges this). Used by greedy when an i16 vreg
|
||||
// forced into one Idx16 reg gets coalesced with a use in the other.
|
||||
if (DestReg == W65816::Y && SrcReg == W65816::X) {
|
||||
BuildMI(MBB, I, DL, get(W65816::TXA));
|
||||
BuildMI(MBB, I, DL, get(W65816::TAY));
|
||||
return;
|
||||
}
|
||||
if (DestReg == W65816::X && SrcReg == W65816::Y) {
|
||||
BuildMI(MBB, I, DL, get(W65816::TYA));
|
||||
BuildMI(MBB, I, DL, get(W65816::TAX));
|
||||
return;
|
||||
}
|
||||
// X → IMGn / IMGn → X: STX dp / LDX dp. Used by the i64-first-arg
|
||||
// entry COPY (LowerFormalArguments routes arg0_ml through Img16 to
|
||||
// dodge the TXA-bridge-clobbers-A spill bug for udivmod-shaped
|
||||
|
|
@ -112,6 +136,18 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
BuildMI(MBB, I, DL, get(W65816::LDX_DP)).addImm(srcImg);
|
||||
return;
|
||||
}
|
||||
// Y -> IMGn / IMGn -> Y: STY dp / LDY dp. Symmetric with the X
|
||||
// case above. Used by the i32-first-arg ABI's hi half (in X) and
|
||||
// by Wide32 pair copies that have one half in Y after the per-half
|
||||
// routing — see the lambda dispatch below.
|
||||
if (dstImg >= 0 && SrcReg == W65816::Y) {
|
||||
BuildMI(MBB, I, DL, get(W65816::STY_DP)).addImm(dstImg);
|
||||
return;
|
||||
}
|
||||
if (DestReg == W65816::Y && srcImg >= 0) {
|
||||
BuildMI(MBB, I, DL, get(W65816::LDY_DP)).addImm(srcImg);
|
||||
return;
|
||||
}
|
||||
// DPF0 → A: emit `LDA $F0`. DPF0 is the pseudo-physreg carrier
|
||||
// for an i64-returning call's high 16 bits; LowerCall builds a
|
||||
// CopyFromReg(DPF0) glued to the call so the SDAG combiner /
|
||||
|
|
@ -129,6 +165,56 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
|||
BuildMI(MBB, I, DL, get(W65816::STA_DP)).addImm(0xF0);
|
||||
return;
|
||||
}
|
||||
// Wide32 (AX32 or IMG-pair) <-> Wide32 copy: split on sub_lo / sub_hi
|
||||
// and recurse. Use a hand-written dispatch instead of getSubReg
|
||||
// because the MCRegisterInfo::getSubReg path crashes when called
|
||||
// from TargetInstrInfo::lowerCopy on regs that are not pair regs
|
||||
// (the table lookup walks past the end of the diff list).
|
||||
auto wide32Halves = [](Register R)
|
||||
-> std::pair<Register, Register> {
|
||||
switch (R) {
|
||||
case W65816::AX32: return {W65816::A, W65816::X};
|
||||
case W65816::IMG01: return {W65816::IMG0, W65816::IMG1};
|
||||
case W65816::IMG23: return {W65816::IMG2, W65816::IMG3};
|
||||
case W65816::IMG45: return {W65816::IMG4, W65816::IMG5};
|
||||
case W65816::IMG67: return {W65816::IMG6, W65816::IMG7};
|
||||
case W65816::IMG89: return {W65816::IMG8, W65816::IMG9};
|
||||
case W65816::IMG1011: return {W65816::IMG10, W65816::IMG11};
|
||||
case W65816::IMG1213: return {W65816::IMG12, W65816::IMG13};
|
||||
case W65816::IMG1415: return {W65816::IMG14, W65816::IMG15};
|
||||
default: return {Register(), Register()};
|
||||
}
|
||||
};
|
||||
auto [srcLo, srcHi] = wide32Halves(SrcReg);
|
||||
auto [dstLo, dstHi] = wide32Halves(DestReg);
|
||||
if (srcLo && srcHi && dstLo && dstHi) {
|
||||
// Wide32 -> Wide32. Lo-first order is correct in every direction:
|
||||
// AX32 -> IMG_pair : STA dstLo (A live), then STX dstHi
|
||||
// IMG_pair -> AX32 : LDA srcLo, then LDX srcHi (independent halves)
|
||||
// IMG_pair -> IMG_pair : LDA/STA chain twice (A is only per-half scratch)
|
||||
copyPhysReg(MBB, I, DL, dstLo, srcLo, KillSrc,
|
||||
RenamableDest, RenamableSrc);
|
||||
copyPhysReg(MBB, I, DL, dstHi, srcHi, KillSrc,
|
||||
RenamableDest, RenamableSrc);
|
||||
return;
|
||||
}
|
||||
// Wide32 -> i16: take sub_lo of source. Arises post-RA when an
|
||||
// EXTRACT_SUBREG was lowered as a parent-reg COPY (the SubRegIndex
|
||||
// is dropped by lowerCopy).
|
||||
if (srcLo && srcHi && !dstLo) {
|
||||
copyPhysReg(MBB, I, DL, DestReg, srcLo, KillSrc,
|
||||
RenamableDest, RenamableSrc);
|
||||
return;
|
||||
}
|
||||
// i16 -> Wide32: write sub_lo only (sub_hi left as caller had it,
|
||||
// matching INSERT_SUBREG semantics). Arises post-RA when REG_SEQUENCE
|
||||
// is expanded into per-half COPY pseudos, then a parent-reg COPY of
|
||||
// a sub-reg-only def appears.
|
||||
if (!srcLo && dstLo && dstHi) {
|
||||
copyPhysReg(MBB, I, DL, dstLo, SrcReg, KillSrc,
|
||||
RenamableDest, RenamableSrc);
|
||||
return;
|
||||
}
|
||||
llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented");
|
||||
}
|
||||
|
||||
|
|
@ -141,6 +227,37 @@ void W65816InstrInfo::storeRegToStackSlot(
|
|||
// and zero offset. When regalloc hands us a spill from X or Y, bridge
|
||||
// through A (TXA / TYA) — same rationale as loadRegFromStackSlot.
|
||||
DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
|
||||
// Wide32 spill: split into 2 i16 stores at offsets 0 and 2 of the
|
||||
// 4-byte spill slot. Bridge each half through A using copyPhysReg.
|
||||
if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
|
||||
RC == &W65816::AnyWide32RegClass) {
|
||||
Register Lo, Hi;
|
||||
switch (SrcReg) {
|
||||
case W65816::AX32: Lo = W65816::A; Hi = W65816::X; break;
|
||||
case W65816::IMG01: Lo = W65816::IMG0; Hi = W65816::IMG1; break;
|
||||
case W65816::IMG23: Lo = W65816::IMG2; Hi = W65816::IMG3; break;
|
||||
case W65816::IMG45: Lo = W65816::IMG4; Hi = W65816::IMG5; break;
|
||||
case W65816::IMG67: Lo = W65816::IMG6; Hi = W65816::IMG7; break;
|
||||
case W65816::IMG89: Lo = W65816::IMG8; Hi = W65816::IMG9; break;
|
||||
case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
|
||||
case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
|
||||
case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
|
||||
default: llvm_unreachable("W65816: Wide32 spill of non-pair reg");
|
||||
}
|
||||
// Bridge lo through A, store at offset 0; bridge hi through A,
|
||||
// store at offset 2. This is brittle in the face of regalloc
|
||||
// expectations — Wide32 spills are best avoided by keeping the
|
||||
// pair in registers if at all possible.
|
||||
if (Lo != W65816::A) {
|
||||
copyPhysReg(MBB, MI, DL, W65816::A, Lo, false);
|
||||
}
|
||||
BuildMI(MBB, MI, DL, get(W65816::STAfi))
|
||||
.addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0);
|
||||
copyPhysReg(MBB, MI, DL, W65816::A, Hi, false);
|
||||
BuildMI(MBB, MI, DL, get(W65816::STAfi))
|
||||
.addReg(W65816::A).addFrameIndex(FrameIdx).addImm(2);
|
||||
return;
|
||||
}
|
||||
if (SrcReg == W65816::X || SrcReg == W65816::Y) {
|
||||
unsigned XferOp = (SrcReg == W65816::X) ? W65816::TXA : W65816::TYA;
|
||||
BuildMI(MBB, MI, DL, get(XferOp));
|
||||
|
|
@ -166,6 +283,34 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
|
|||
// values for the second word (caught by udivmod's `a - q*b` mod
|
||||
// computation).
|
||||
DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
|
||||
// Wide32 reload: 2 i16 loads at offsets 0 and 2 of the 4-byte slot.
|
||||
if (RC == &W65816::Wide32RegClass || RC == &W65816::Acc32RegClass ||
|
||||
RC == &W65816::AnyWide32RegClass) {
|
||||
Register Lo, Hi;
|
||||
switch (DestReg) {
|
||||
case W65816::AX32: Lo = W65816::A; Hi = W65816::X; break;
|
||||
case W65816::IMG01: Lo = W65816::IMG0; Hi = W65816::IMG1; break;
|
||||
case W65816::IMG23: Lo = W65816::IMG2; Hi = W65816::IMG3; break;
|
||||
case W65816::IMG45: Lo = W65816::IMG4; Hi = W65816::IMG5; break;
|
||||
case W65816::IMG67: Lo = W65816::IMG6; Hi = W65816::IMG7; break;
|
||||
case W65816::IMG89: Lo = W65816::IMG8; Hi = W65816::IMG9; break;
|
||||
case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break;
|
||||
case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break;
|
||||
case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break;
|
||||
default: llvm_unreachable("W65816: Wide32 reload to non-pair reg");
|
||||
}
|
||||
// Lo half: LDA from offset 0, transfer to Lo if needed.
|
||||
BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
|
||||
.addFrameIndex(FrameIdx).addImm(0);
|
||||
if (Lo != W65816::A)
|
||||
copyPhysReg(MBB, MI, DL, Lo, W65816::A, false);
|
||||
// Hi half: LDA from offset 2, transfer to Hi.
|
||||
BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A)
|
||||
.addFrameIndex(FrameIdx).addImm(2);
|
||||
if (Hi != W65816::A)
|
||||
copyPhysReg(MBB, MI, DL, Hi, W65816::A, false);
|
||||
return;
|
||||
}
|
||||
if (DestReg == W65816::A) {
|
||||
BuildMI(MBB, MI, DL, get(W65816::LDAfi), DestReg)
|
||||
.addFrameIndex(FrameIdx)
|
||||
|
|
|
|||
|
|
@ -88,6 +88,26 @@ def SDT_W65816Alloca : SDTypeProfile<1, 1, [SDTCisVT<0, i16>,
|
|||
def W65816alloca : SDNode<"W65816ISD::ALLOCA", SDT_W65816Alloca,
|
||||
[SDNPHasChain, SDNPSideEffect]>;
|
||||
|
||||
// ptr32 load / store: target-specific load/store nodes that take a 32-bit
|
||||
// pointer (Wide32 = i32) and lower to [dp],Y indirect-long with the bank
|
||||
// byte taken from the pointer's hi-half. Used for ptr32 mode where
|
||||
// generic (load i32-addr) needs explicit lowering — wrapping in a target
|
||||
// node prevents DAG combines from rewriting the load before isel.
|
||||
//
|
||||
// Loads always materialise an i16 in A (16-bit LDA); byte zext / anyext
|
||||
// patterns AND-mask afterwards exactly as the existing LDAptr does.
|
||||
// Stores split into two nodes: ST_PTR (full 16-bit STA) and STB_PTR
|
||||
// (SEP/REP-wrapped 8-bit STA for truncating stores).
|
||||
def SDT_W65816LdPtr : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>;
|
||||
def SDT_W65816StPtr : SDTypeProfile<0, 2, [SDTCisVT<0, i16>, SDTCisVT<1, i32>]>;
|
||||
|
||||
def W65816ldPtr : SDNode<"W65816ISD::LD_PTR", SDT_W65816LdPtr,
|
||||
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
|
||||
def W65816stPtr : SDNode<"W65816ISD::ST_PTR", SDT_W65816StPtr,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
def W65816stbPtr : SDNode<"W65816ISD::STB_PTR", SDT_W65816StPtr,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Pseudo Instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
@ -1046,6 +1066,96 @@ def : Pat<(store Acc8:$val, (add Wide16:$ptr, (i16 imm:$off))),
|
|||
def : Pat<(store Acc8:$val, Wide16:$ptr),
|
||||
(STBptr (COPY_TO_REGCLASS Acc8:$val, Acc16), Wide16:$ptr)>;
|
||||
|
||||
// ---------------------------------------------------------------------
|
||||
// ptr32 deref pseudos. Same shape and inserter as LDAptr/STAptr/STBptr,
|
||||
// but the pointer is a Wide32 (i32) value: sub_lo carries the low 16
|
||||
// bits of the address, sub_hi carries the bank byte in its low half.
|
||||
// Inserter stages the low 16 bits at $E0..$E1 and the bank byte at $E2,
|
||||
// then emits LDA/STA [dp],Y just like the i16 path — but with a
|
||||
// pointer-derived bank instead of a forced 0.
|
||||
//
|
||||
// Dead unless ptr32 mode is active (LowerLoad/LowerStore only emit
|
||||
// W65816ldPtr/stPtr/stbPtr when the address is i32).
|
||||
// ---------------------------------------------------------------------
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1,
|
||||
Defs = [Y, P] in {
|
||||
def LDAptr32 : W65816Pseudo<(outs Acc16:$dst), (ins AnyWide32:$ptr),
|
||||
"# LDAptr32 $dst, $ptr",
|
||||
[(set Acc16:$dst, (W65816ldPtr AnyWide32:$ptr))]>;
|
||||
}
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1,
|
||||
Defs = [Y, P] in {
|
||||
def STAptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr),
|
||||
"# STAptr32 $val, $ptr",
|
||||
[(W65816stPtr Acc16:$val, AnyWide32:$ptr)]>;
|
||||
def STBptr32 : W65816Pseudo<(outs), (ins Acc16:$val, AnyWide32:$ptr),
|
||||
"# STBptr32 $val, $ptr",
|
||||
[(W65816stbPtr Acc16:$val, AnyWide32:$ptr)]>;
|
||||
}
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1,
|
||||
Defs = [Y, P] in {
|
||||
def LDAptr32Off : W65816Pseudo<(outs Acc16:$dst),
|
||||
(ins AnyWide32:$ptr, i16imm:$off),
|
||||
"# LDAptr32Off $dst, $ptr, $off", []>;
|
||||
}
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1,
|
||||
Defs = [Y, P] in {
|
||||
def STAptr32Off : W65816Pseudo<(outs),
|
||||
(ins Acc16:$val, AnyWide32:$ptr, i16imm:$off),
|
||||
"# STAptr32Off $val, $ptr, $off", []>;
|
||||
def STBptr32Off : W65816Pseudo<(outs),
|
||||
(ins Acc16:$val, AnyWide32:$ptr, i16imm:$off),
|
||||
"# STBptr32Off $val, $ptr, $off", []>;
|
||||
}
|
||||
|
||||
// Direct ptr32 load/store patterns over generic ISD::LOAD / ISD::STORE
|
||||
// when the address is an i32 (AnyWide32) reg. These are unreachable
|
||||
// while i32 is not a legal type (ptr16 mode). When ptr32 mode is
|
||||
// activated they fire instead of the i16-pointer LDAptr / STAptr.
|
||||
def : Pat<(i16 (load AnyWide32:$ptr)),
|
||||
(LDAptr32 AnyWide32:$ptr)>;
|
||||
def : Pat<(store Acc16:$val, AnyWide32:$ptr),
|
||||
(STAptr32 Acc16:$val, AnyWide32:$ptr)>;
|
||||
def : Pat<(truncstorei8 Acc16:$val, AnyWide32:$ptr),
|
||||
(STBptr32 Acc16:$val, AnyWide32:$ptr)>;
|
||||
def : Pat<(i16 (zextloadi8 AnyWide32:$ptr)),
|
||||
(ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF)>;
|
||||
def : Pat<(i16 (extloadi8 AnyWide32:$ptr)),
|
||||
(LDAptr32 AnyWide32:$ptr)>;
|
||||
def : Pat<(i8 (load AnyWide32:$ptr)),
|
||||
(COPY_TO_REGCLASS (ANDi16imm (LDAptr32 AnyWide32:$ptr), 0xFF), Acc8)>;
|
||||
def : Pat<(store Acc8:$val, AnyWide32:$ptr),
|
||||
(STBptr32 (COPY_TO_REGCLASS Acc8:$val, Acc16), AnyWide32:$ptr)>;
|
||||
|
||||
// Off variants — folded constant-offset add patterns deferred until
|
||||
// ptr32 mode is activated and we can profile real cases. The base
|
||||
// LDAptr32/STAptr32 pseudos handle the general (add ptr, off) case
|
||||
// correctly via a separate i32 ADD; the Off pseudos are an optional
|
||||
// optimization for small constant offsets.
|
||||
|
||||
// Split-pair variants: same semantics as LDAptr32/STAptr32/STBptr32 but
|
||||
// the ptr is two separate i16 register operands (lo + hi) instead of
|
||||
// one Wide32 register pair. Used by the W65816LowerWide32 pre-RA pass
|
||||
// to relieve register-pair allocation pressure: it walks REG_SEQUENCE
|
||||
// + LDAptr32 chains, decomposes the Wide32 vregs into pairs of i16
|
||||
// vregs, and rewrites the LDAptr32-family to take the two halves
|
||||
// directly.
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayLoad = 1,
|
||||
Defs = [Y, P] in {
|
||||
def LDAptr32S : W65816Pseudo<(outs Acc16:$dst),
|
||||
(ins Wide16:$ptrLo, Wide16:$ptrHi),
|
||||
"# LDAptr32S $dst, $ptrLo, $ptrHi", []>;
|
||||
}
|
||||
let usesCustomInserter = 1, hasSideEffects = 1, mayStore = 1,
|
||||
Defs = [Y, P] in {
|
||||
def STAptr32S : W65816Pseudo<(outs),
|
||||
(ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi),
|
||||
"# STAptr32S $val, $ptrLo, $ptrHi", []>;
|
||||
def STBptr32S : W65816Pseudo<(outs),
|
||||
(ins Acc16:$val, Wide16:$ptrLo, Wide16:$ptrHi),
|
||||
"# STBptr32S $val, $ptrLo, $ptrHi", []>;
|
||||
}
|
||||
|
||||
// i8 load via Acc16 pointer producing a true i8 (Acc8) result. Reuses
|
||||
// the existing zextloadi8 16-bit-LDA-and-mask path: load 2 bytes, mask
|
||||
// the high byte, then narrow to Acc8. COPY_TO_REGCLASS to Acc8 is a
|
||||
|
|
@ -1478,15 +1588,18 @@ def : Pat<(store
|
|||
// function doesn't have to know how it was called to choose its
|
||||
// return instruction. A pseudo bridges the i16 symbol operand
|
||||
// to JSL_Long's 24-bit operand class.
|
||||
// Defs include DPF0 — every i64-returning libcall clobbers DP[$F0]
|
||||
// (it's the carrier for the highest 16 bits of the return). The
|
||||
// LowerCall side captures the pre-call DPF0 via CopyFromReg(DPF0)
|
||||
// glued to the call so the SDAG combiner / scheduler can't merge
|
||||
// or reorder reads across calls. Without DPF0 in Defs, plain
|
||||
// `getLoad(0xF0)` was being CSE'd across calls, leading to
|
||||
// `dmath = (a+b)*(a-b)` returning 4 instead of 16.
|
||||
// Defs lists ALL caller-clobbered regs. The 65816 has no
|
||||
// caller/callee-save split — every callee may freely modify
|
||||
// A/X/Y/DPF0/P/etc. Critically, i32/i64 returns place high
|
||||
// halves in X (i32), Y and DPF0 (i64); without those in Defs,
|
||||
// the InstrEmitter does not add implicit-defs for glued
|
||||
// CopyFromReg(X/Y/DPF0) on the call MI, and the verifier sees
|
||||
// the post-call `COPY $y` as reading an undefined register.
|
||||
// DPF0 was historically the only "extra" def so getLoad(0xF0)
|
||||
// wouldn't CSE across calls; the same anti-CSE rationale applies
|
||||
// to A/X/Y, but more fundamentally those are call return slots.
|
||||
let isCall = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0,
|
||||
Defs = [A, DPF0] in {
|
||||
Defs = [A, X, Y, DPF0] in {
|
||||
def JSLpseudo : W65816Pseudo<(outs), (ins i16imm:$dst),
|
||||
"# JSLpseudo $dst", []>;
|
||||
}
|
||||
|
|
|
|||
326
src/llvm/lib/Target/W65816/W65816LowerWide32.cpp
Normal file
326
src/llvm/lib/Target/W65816/W65816LowerWide32.cpp
Normal file
|
|
@ -0,0 +1,326 @@
|
|||
//===-- W65816LowerWide32.cpp - Wide32 -> 2x i16 pre-RA lowering ---------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Pre-regalloc pass that decomposes Wide32 register-pair vregs into pairs
|
||||
// of i16 vregs. Without this, greedy / basic regalloc fails on i64-heavy
|
||||
// functions (`RegAllocBase` crashes during `allocatePhysRegs`) because
|
||||
// the i64-via-2-i32-via-Wide32 chain produces too many simultaneously
|
||||
// live register-pair vregs. After this pass, only i16 vregs remain at
|
||||
// the regalloc input — Wide32 lives only inside this pass and the new
|
||||
// LDAptr32S / STAptr32S / STBptr32S pseudos that take 2 i16 ptr operands
|
||||
// directly.
|
||||
//
|
||||
// Walks the MIR and:
|
||||
// 1. Finds REG_SEQUENCE producing Wide32 / Acc32 / AnyWide32; records
|
||||
// the (lo, hi) i16 source operands; queues the REG_SEQUENCE for
|
||||
// erasure.
|
||||
// 2. Finds COPY whose dest is a Wide32 vreg and whose src is another
|
||||
// mapped Wide32 vreg; chains the (lo, hi) mapping forward.
|
||||
// 3. Rewrites EXTRACT_SUBREG of mapped Wide32 vregs by replacing the
|
||||
// destination vreg with the appropriate half (sub_lo or sub_hi).
|
||||
// 4. Rewrites LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr
|
||||
// to the corresponding LDAptr32S / STAptr32S / STBptr32S pseudo
|
||||
// with two separate i16 operands.
|
||||
//
|
||||
// Bail / safety: any Wide32 vreg whose def we can't decompose is left
|
||||
// in place — regalloc may still struggle but no miscompile.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "W65816.h"
|
||||
#include "W65816InstrInfo.h"
|
||||
#include "W65816Subtarget.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstr.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/TargetInstrInfo.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "w65816-lower-wide32"
|
||||
|
||||
namespace {
|
||||
|
||||
class W65816LowerWide32 : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
W65816LowerWide32() : MachineFunctionPass(ID) {}
|
||||
StringRef getPassName() const override {
|
||||
return "W65816 Wide32 -> 2x i16 lowering";
|
||||
}
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
char W65816LowerWide32::ID = 0;
|
||||
|
||||
INITIALIZE_PASS(W65816LowerWide32, DEBUG_TYPE,
|
||||
"W65816 Wide32 lowering", false, false)
|
||||
|
||||
// Factory for the Wide32 lowering pass: returns a newly allocated
// W65816LowerWide32 instance as a FunctionPass pointer.
FunctionPass *llvm::createW65816LowerWide32() {
|
||||
return new W65816LowerWide32();
|
||||
}
|
||||
|
||||
static bool isWide32RC(const TargetRegisterClass *RC) {
|
||||
return RC == &W65816::Wide32RegClass ||
|
||||
RC == &W65816::Acc32RegClass ||
|
||||
RC == &W65816::AnyWide32RegClass;
|
||||
}
|
||||
|
||||
/// Decompose Wide32 / Acc32 / AnyWide32 virtual registers of \p MF into
/// pairs of i16 virtual registers.
///
/// Phases:
///   1.  Collect every non-debug-used vreg whose class is a Wide32 class.
///   2.  Build wideMap (Wide32 vreg -> {lo, hi}) from REG_SEQUENCE defs,
///       full-register COPY chains and paired sub-register COPY defs.
///   2b. Split Wide32 PHIs whose incoming values are all mapped into two
///       i16 PHIs.
///   3.  Rewrite users: EXTRACT_SUBREG, COPY-from-sub-register, and the
///       LDAptr32 / STAptr32 / STBptr32 pseudos (-> the ...S variants).
///   4.  Erase the now-dead defs, iterating until nothing more dies.
///
/// Anything that does not match one of the recognised shapes is left
/// untouched, so an unhandled Wide32 vreg simply survives to regalloc.
///
/// \returns true if any instruction was inserted, rewritten or erased.
bool W65816LowerWide32::runOnMachineFunction(MachineFunction &MF) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
  const W65816InstrInfo &TII = *STI.getInstrInfo();

  using HalfPair = std::pair<Register, Register>;

  // Map: Wide32 vreg -> (loVreg, hiVreg) of i16 type.
  DenseMap<Register, HalfPair> wideMap;

  // Set as soon as the MIR is touched in any way.
  bool Modified = false;

  // A register operand may be recorded as one half of a pair only if it
  // names a whole virtual register that is not itself a Wide32 pair:
  //  * a sub-register index on the operand would be lost, because the map
  //    stores bare Registers;
  //  * a physical register cannot be forwarded to arbitrary later program
  //    points, where it may already have been overwritten.
  auto isUsableHalf = [&MRI](const MachineOperand &MO) {
    if (!MO.isReg() || MO.getSubReg() != 0)
      return false;
    Register R = MO.getReg();
    return R.isVirtual() && !isWide32RC(MRI.getRegClass(R));
  };

  // Select the half addressed by SubIdx; an invalid Register is returned
  // for any index other than sub_lo / sub_hi so callers can bail out
  // instead of silently treating it as the high half.
  auto pickHalf = [](const HalfPair &P, unsigned SubIdx) -> Register {
    if (SubIdx == llvm::sub_lo)
      return P.first;
    if (SubIdx == llvm::sub_hi)
      return P.second;
    return Register();
  };

  // Replace every operand of virtual register From with To and keep the
  // bookkeeping consistent: replaceRegWith also rewrites the operands of
  // already-recorded REG_SEQUENCE / COPY defs, so any wideMap entry that
  // still names From as a half must follow, otherwise a later rewrite
  // would reference a vreg that no longer has a def.  Kill flags that
  // were valid for From's live range are not valid for To's.
  auto forwardVReg = [&](Register From, Register To) {
    MRI.replaceRegWith(From, To);
    MRI.clearKillFlags(To);
    for (auto &Entry : wideMap) {
      if (Entry.second.first == From)
        Entry.second.first = To;
      if (Entry.second.second == From)
        Entry.second.second = To;
    }
  };

  // Pass 1: collect all Wide32 vregs.
  SmallVector<Register, 16> wide32Vregs;
  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
    Register R = Register::index2VirtReg(i);
    if (MRI.reg_nodbg_empty(R))
      continue;
    if (!isWide32RC(MRI.getRegClass(R)))
      continue;
    wide32Vregs.push_back(R);
  }

  if (wide32Vregs.empty())
    return false;

  // Pass 2: process REG_SEQUENCE / chained-COPY / multi-subreg-def
  // shapes; build the mapping.  Iterate to fixed point because COPY
  // chains depend on prior mappings.
  SmallVector<MachineInstr *, 16> toErase;
  bool changed = true;
  while (changed) {
    changed = false;
    for (Register W : wide32Vregs) {
      if (wideMap.count(W))
        continue;
      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);

      // %W = REG_SEQUENCE %lo, sub_lo, %hi, sub_hi
      if (DefMI && DefMI->getOpcode() == TargetOpcode::REG_SEQUENCE &&
          DefMI->getOperand(0).getSubReg() == 0) {
        Register Lo, Hi;
        bool Usable = true;
        for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
          const MachineOperand &SrcOp = DefMI->getOperand(op);
          const MachineOperand &IdxOp = DefMI->getOperand(op + 1);
          if (!SrcOp.isReg() || !IdxOp.isImm())
            continue;
          unsigned idx = static_cast<unsigned>(IdxOp.getImm());
          if (idx != llvm::sub_lo && idx != llvm::sub_hi)
            continue;
          if (!isUsableHalf(SrcOp)) {
            Usable = false;
            break;
          }
          if (idx == llvm::sub_lo)
            Lo = SrcOp.getReg();
          else
            Hi = SrcOp.getReg();
        }
        if (Usable && Lo && Hi) {
          wideMap[W] = {Lo, Hi};
          toErase.push_back(DefMI);
          changed = true;
          continue;
        }
      }

      // %W = COPY %OtherWide32 (full register on both sides): inherit the
      // source's halves.
      if (DefMI && DefMI->isCopy() &&
          DefMI->getOperand(0).getSubReg() == 0 &&
          DefMI->getOperand(1).getSubReg() == 0) {
        Register Src = DefMI->getOperand(1).getReg();
        if (Src.isVirtual()) {
          auto SrcIt = wideMap.find(Src);
          if (SrcIt != wideMap.end()) {
            // Copy the pair out BEFORE inserting W: the insertion may grow
            // the DenseMap and invalidate references/iterators into it.
            const HalfPair Halves = SrcIt->second;
            wideMap[W] = Halves;
            toErase.push_back(DefMI);
            changed = true;
            continue;
          }
        }
      }

      // Multi-subreg-def shape: separate sub-reg COPYs build %W:
      //   undef %W.sub_lo:wide32 = COPY %A:acc16
      //         %W.sub_hi:wide32 = COPY %B:acc16
      // Equivalent to a REG_SEQUENCE %A, sub_lo, %B, sub_hi.  softDouble
      // at -O2 generates this heavily; without handling it the Wide32
      // vreg survives to regalloc, which then asks for a spill/reload
      // from a non-pair physreg and trips load/storeRegToStackSlot's
      // llvm_unreachable.
      Register LoSrc, HiSrc;
      MachineInstr *LoDefMI = nullptr;
      MachineInstr *HiDefMI = nullptr;
      bool ok = true;
      for (MachineInstr &MI : MRI.def_instructions(W)) {
        if (!MI.isCopy()) { ok = false; break; }
        const MachineOperand &Dst = MI.getOperand(0);
        const MachineOperand &Src = MI.getOperand(1);
        if (!Dst.isReg() || Dst.getReg() != W) { ok = false; break; }
        // The source must be a plain i16 vreg; see isUsableHalf.
        if (!isUsableHalf(Src)) { ok = false; break; }
        unsigned SubIdx = Dst.getSubReg();
        if (SubIdx == llvm::sub_lo) {
          if (LoDefMI) { ok = false; break; }
          LoDefMI = &MI;
          LoSrc = Src.getReg();
        } else if (SubIdx == llvm::sub_hi) {
          if (HiDefMI) { ok = false; break; }
          HiDefMI = &MI;
          HiSrc = Src.getReg();
        } else {
          ok = false;
          break;
        }
      }
      if (ok && LoSrc && HiSrc) {
        wideMap[W] = {LoSrc, HiSrc};
        if (LoDefMI) toErase.push_back(LoDefMI);
        if (HiDefMI) toErase.push_back(HiDefMI);
        changed = true;
      }
    }
  }

  // Pass 2b: handle PHIs whose result is a Wide32 vreg by splitting
  // into 2 PHIs (one per half).  Iterate to fixed point: a PHI becomes
  // resolvable only after all its sources have been mapped.  A PHI whose
  // incoming values include an unmapped register (for instance one that
  // depends on the PHI itself) never qualifies and is left untouched.
  changed = true;
  while (changed) {
    changed = false;
    for (Register W : wide32Vregs) {
      if (wideMap.count(W))
        continue;
      MachineInstr *DefMI = MRI.getUniqueVRegDef(W);
      if (!DefMI || !DefMI->isPHI())
        continue;
      bool AllMapped = true;
      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
        const MachineOperand &In = DefMI->getOperand(op);
        if (!In.isReg() || In.getSubReg() != 0 || !In.getReg().isVirtual() ||
            !wideMap.count(In.getReg())) {
          AllMapped = false;
          break;
        }
      }
      if (!AllMapped)
        continue;
      Register NewLo = MRI.createVirtualRegister(&W65816::Acc16RegClass);
      Register NewHi = MRI.createVirtualRegister(&W65816::Acc16RegClass);
      MachineBasicBlock *MBB = DefMI->getParent();
      DebugLoc DL = DefMI->getDebugLoc();
      auto PHILo = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewLo);
      auto PHIHi = BuildMI(*MBB, DefMI, DL, TII.get(TargetOpcode::PHI), NewHi);
      for (unsigned op = 1; op + 1 < DefMI->getNumOperands(); op += 2) {
        Register Src = DefMI->getOperand(op).getReg();
        MachineBasicBlock *PredMBB = DefMI->getOperand(op + 1).getMBB();
        const HalfPair SrcHalves = wideMap.lookup(Src);
        PHILo.addReg(SrcHalves.first).addMBB(PredMBB);
        PHIHi.addReg(SrcHalves.second).addMBB(PredMBB);
      }
      wideMap[W] = {NewLo, NewHi};
      toErase.push_back(DefMI);
      changed = true;
      Modified = true; // two new PHIs exist even if the old one survives
    }
  }

  // Pass 3: rewrite uses.
  SmallVector<MachineInstr *, 32> useToErase;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator It = MBB.begin(); It != MBB.end();) {
      MachineInstr *MI = &*It++;

      // EXTRACT_SUBREG of a mapped Wide32 vreg: replace the dest vreg
      // with the appropriate half (sub_lo or sub_hi).
      if (MI->getOpcode() == TargetOpcode::EXTRACT_SUBREG) {
        const MachineOperand &SrcOp = MI->getOperand(1);
        const MachineOperand &IdxOp = MI->getOperand(2);
        Register Dst = MI->getOperand(0).getReg();
        if (SrcOp.isReg() && SrcOp.getReg().isVirtual() && IdxOp.isImm() &&
            Dst.isVirtual()) {
          auto SrcIt = wideMap.find(SrcOp.getReg());
          if (SrcIt != wideMap.end()) {
            Register Half =
                pickHalf(SrcIt->second, static_cast<unsigned>(IdxOp.getImm()));
            if (Half && Half != Dst) {
              // NOTE(review): like the original, this does not reconcile
              // the register classes of Dst and Half.
              forwardVReg(Dst, Half);
              useToErase.push_back(MI);
              Modified = true;
              continue;
            }
          }
        }
      }

      // COPY %V.sub_lo / %V.sub_hi (partial-reg COPY where source has a
      // sub-reg specifier and the source vreg is a mapped Wide32).
      // LLVM emits this shape instead of EXTRACT_SUBREG when projecting
      // a half out of a Wide32 vreg.  Only the shape with a full-reg
      // destination is handled here — partial-reg destinations would
      // imply the dst is itself a Wide32 sub-reg def, which the def-side
      // multi-subreg-def handling covers separately.
      if (MI->isCopy()) {
        MachineOperand &SrcOp = MI->getOperand(1);
        const MachineOperand &DstOp = MI->getOperand(0);
        if (SrcOp.isReg() && SrcOp.getReg().isVirtual() &&
            SrcOp.getSubReg() != 0 && DstOp.isReg() &&
            DstOp.getSubReg() == 0) {
          auto SrcIt = wideMap.find(SrcOp.getReg());
          if (SrcIt != wideMap.end()) {
            Register Half = pickHalf(SrcIt->second, SrcOp.getSubReg());
            Register Dst = DstOp.getReg();
            if (Half && Half != Dst) {
              if (Dst.isVirtual()) {
                forwardVReg(Dst, Half);
                useToErase.push_back(MI);
              } else {
                // Physical destination: replaceRegWith would rewrite every
                // operand of that physreg in the whole function.  Keep the
                // COPY and just retarget its source at the i16 half.
                SrcOp.setReg(Half);
                SrcOp.setSubReg(0);
                SrcOp.setIsKill(false);
                MRI.clearKillFlags(Half);
              }
              Modified = true;
              continue;
            }
          }
        }
      }

      // LDAptr32 / STAptr32 / STBptr32 with a mapped Wide32 ptr:
      // rewrite to LDAptr32S / STAptr32S / STBptr32S.
      unsigned Opc = MI->getOpcode();
      unsigned NewOpc = 0;
      switch (Opc) {
      case W65816::LDAptr32: NewOpc = W65816::LDAptr32S; break;
      case W65816::STAptr32: NewOpc = W65816::STAptr32S; break;
      case W65816::STBptr32: NewOpc = W65816::STBptr32S; break;
      default: break;
      }
      if (NewOpc == 0)
        continue;

      // Operand 1 is the pointer for all three pseudos (operand 0 is the
      // loaded result or the stored value).
      const MachineOperand &PtrOp = MI->getOperand(1);
      if (!PtrOp.isReg() || !PtrOp.getReg().isVirtual() ||
          PtrOp.getSubReg() != 0)
        continue;
      auto PtrIt = wideMap.find(PtrOp.getReg());
      if (PtrIt == wideMap.end())
        continue;
      const HalfPair PtrHalves = PtrIt->second;

      DebugLoc DL = MI->getDebugLoc();
      MachineBasicBlock *ParentMBB = MI->getParent();
      if (Opc == W65816::LDAptr32) {
        Register Dst = MI->getOperand(0).getReg();
        BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc), Dst)
            .addReg(PtrHalves.first)
            .addReg(PtrHalves.second)
            .cloneMemRefs(*MI); // keep the memory operands of the original
      } else {
        Register Val = MI->getOperand(0).getReg();
        BuildMI(*ParentMBB, MI->getIterator(), DL, TII.get(NewOpc))
            .addReg(Val)
            .addReg(PtrHalves.first)
            .addReg(PtrHalves.second)
            .cloneMemRefs(*MI);
      }
      useToErase.push_back(MI);
      Modified = true;
    }
  }

  // Erase use-side instructions (EXTRACT_SUBREG, sub-reg COPY,
  // LDAptr32-family) first so the Wide32 vreg becomes dead.
  for (MachineInstr *MI : useToErase)
    MI->eraseFromParent();

  // Now check each REG_SEQUENCE / chained-COPY / PHI def: only erase if
  // the Wide32 vreg has no remaining uses.  Any leftover use means the
  // pass didn't cover that opcode — leaving the def in place keeps the MIR
  // well-formed (at the cost of pair-allocation pressure for that
  // specific case).  Erasing one def can remove the last use of another
  // queued def (COPY chains, PHI inputs), so repeat until nothing changes.
  bool Progress = true;
  while (Progress) {
    Progress = false;
    for (MachineInstr *&MI : toErase) {
      if (!MI || MI->getNumOperands() == 0)
        continue;
      Register Dst = MI->getOperand(0).getReg();
      if (Dst.isVirtual()) {
        if (!MRI.use_nodbg_empty(Dst))
          continue;
        // Only debug users are left; detach them from the dying vreg.
        MRI.markUsesInDebugValueAsUndef(Dst);
      }
      MI->eraseFromParent();
      MI = nullptr; // slot consumed; never touch the freed instruction
      Progress = true;
      Modified = true;
    }
  }

  return Modified;
}
|
||||
|
|
@ -17,6 +17,13 @@ class W65816Reg<bits<8> num, string n> : Register<n> {
|
|||
let DwarfNumbers = [num];
|
||||
}
|
||||
|
||||
// SubRegIndices for synthetic 32-bit register pairs. sub_lo addresses the
|
||||
// low 16 bits (the natural i16-aligned half), sub_hi the high 16 bits.
|
||||
// Used by Acc32 / Wide32 / AnyWide32 to model i32 (i.e. ptr32) values as
|
||||
// pairs of i16 physical registers.
|
||||
def sub_lo : SubRegIndex<16, 0>;
|
||||
def sub_hi : SubRegIndex<16, 16>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Registers
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
@ -127,3 +134,61 @@ def DPF0Reg : RegisterClass<"W65816", [i16], 16, (add DPF0)> {
|
|||
def StatusReg : RegisterClass<"W65816", [i8], 8, (add P)> {
|
||||
let isAllocatable = 0;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Synthetic 32-bit Register Pairs (for ptr32 mode)
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// The W65816 has no native 32-bit registers. For 32-bit-pointer mode and
|
||||
// any other i32 traffic we synthesize register pairs whose halves are
|
||||
// existing i16 registers, accessed via sub_lo / sub_hi.
|
||||
//
|
||||
// AX32 pairs A:X for the calling-convention slot (first i32 arg/return).
|
||||
// Heterogeneous: sub_lo is in Acc16, sub_hi is in Idx16. Because of the
|
||||
// heterogeneity, AX32 lives in its own single-element class (Acc32) — if
|
||||
// it were grouped with the homogeneous IMG pairs in Wide32, TableGen would
|
||||
// auto-derive a "wide32_with_sub_hi_in_idx8" subclass that pins the whole
|
||||
// class to AX32.
|
||||
//
|
||||
// IMG01..IMG1415 pair adjacent IMG slots (each pair is 4 bytes of DP) into
|
||||
// homogeneous i16-i16 pairs. These hold ptr32 values backed entirely by
|
||||
// direct page, so register-pair allocation can spill cleanly via Img16's
|
||||
// existing rules.
|
||||
//
|
||||
// Acc32 / Wide32 / AnyWide32:
|
||||
// Acc32 = {AX32} — calling-convention slot only; not for general allocation.
|
||||
// Wide32 = {IMG01..IMG1415} — homogeneous i16-i16 pairs, freely allocatable.
|
||||
// AnyWide32 = Acc32 ∪ Wide32 — pre-RA flexibility for ptr32 vregs that
|
||||
// are not constrained to AX32; greedy regalloc can pick AX32 or any
|
||||
// Wide32 pair.
|
||||
// Common shape of every synthetic 32-bit pair: two existing 16-bit registers
// exposed through sub_lo / sub_hi, fully covered by those two halves, carrying
// one DWARF register number.
class W65816Pair32<string n, list<Register> halves, int dwarf>
    : RegisterWithSubRegs<n, halves>, DwarfRegNum<[dwarf]> {
  let Namespace = "W65816";
  let SubRegIndices = [sub_lo, sub_hi];
  let CoveredBySubRegs = 1;
}

// A:X pair: sub_lo = A, sub_hi = X.
def AX32    : W65816Pair32<"ax32",    [A, X],         40>;

// Adjacent IMG slots, lower-numbered slot in sub_lo.
def IMG01   : W65816Pair32<"img01",   [IMG0, IMG1],   41>;
def IMG23   : W65816Pair32<"img23",   [IMG2, IMG3],   42>;
def IMG45   : W65816Pair32<"img45",   [IMG4, IMG5],   43>;
def IMG67   : W65816Pair32<"img67",   [IMG6, IMG7],   44>;
def IMG89   : W65816Pair32<"img89",   [IMG8, IMG9],   45>;
def IMG1011 : W65816Pair32<"img1011", [IMG10, IMG11], 46>;
def IMG1213 : W65816Pair32<"img1213", [IMG12, IMG13], 47>;
def IMG1415 : W65816Pair32<"img1415", [IMG14, IMG15], 48>;
|
||||
|
||||
// Single-member class: only the A:X pair.
def Acc32 : RegisterClass<"W65816", [i32], 16, (add AX32)>;

// The eight IMG-backed pairs, in slot order.
def Wide32 : RegisterClass<"W65816", [i32], 16,
                           (add IMG01, IMG23, IMG45, IMG67, IMG89, IMG1011,
                                IMG1213, IMG1415)>;

// Union of the two classes above with AX32 first. Built from the classes
// themselves so the member list cannot drift from Acc32 / Wide32.
def AnyWide32 : RegisterClass<"W65816", [i32], 16, (add Acc32, Wide32)>;
|
||||
|
|
|
|||
|
|
@ -419,6 +419,26 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
MI.getOperand(0).isImm()) {
|
||||
int K = MI.getOperand(0).getImm() & 0xFFFF;
|
||||
if (yKnown == K) {
|
||||
// Before erasing this redundant LDY: the prior LDY is still in
|
||||
// scope, so all of its Y-uses between the two LDYs are still
|
||||
// valid uses. But liveness already marked the LAST one (just
|
||||
// before the redundant LDY) as `implicit killed $y`, because
|
||||
// that LDY was about to redefine Y. After erasure, Y survives
|
||||
// through to the NEXT use, so the prior "kill" annotation is
|
||||
// wrong and the machine verifier rejects. Walk backward and
|
||||
// clear the kill flag on the most recent Y-using operand.
|
||||
for (auto Back = std::prev(It2);; --Back) {
|
||||
bool clearedAny = false;
|
||||
for (MachineOperand &MO : Back->operands()) {
|
||||
if (MO.isReg() && MO.getReg() == W65816::Y &&
|
||||
MO.isUse() && MO.isKill()) {
|
||||
MO.setIsKill(false);
|
||||
clearedAny = true;
|
||||
}
|
||||
}
|
||||
if (clearedAny) break;
|
||||
if (Back == MBB.begin()) break;
|
||||
}
|
||||
auto Erase = It2++;
|
||||
Erase->eraseFromParent();
|
||||
Changed = true;
|
||||
|
|
|
|||
|
|
@ -748,6 +748,15 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
}
|
||||
};
|
||||
auto isLdaLike = [](unsigned Opc) {
|
||||
// COPY between physregs: lowers in AsmPrinter to one of TXA/TYA/
|
||||
// LDA $D? (for IMG↔A bridges) etc. — all of which set N/Z based
|
||||
// on the loaded value. Treating COPY as flag-defining caused the
|
||||
// wrap pass to identify a PHI-elim COPY as the "Test" and wrap
|
||||
// too narrow a range, so the cb-test LDA's flags were trampled
|
||||
// by intervening A-loads before reaching the BEQ. Including
|
||||
// COPY in the corrupting set forces the pass to walk past these
|
||||
// PHI-elim copies to find the real test (a CMP).
|
||||
if (Opc == TargetOpcode::COPY) return true;
|
||||
// Pure load / register-transfer instructions: only side effect on
|
||||
// flags is N/Z from the loaded/transferred value. Never a "test"
|
||||
// — they just move data. Treated as corruption when between the
|
||||
|
|
@ -1365,7 +1374,42 @@ bool W65816StackSlotCleanup::runOnMachineFunction(MachineFunction &MF) {
|
|||
Cmp->getOperand(1).getImm() != 0)
|
||||
continue;
|
||||
bool Found = walkbackBefore(Cmp->getIterator(), MBB.begin());
|
||||
if (Found) {
|
||||
if (!Found) continue;
|
||||
// Only eliminate if there are NO LdaLike instructions between
|
||||
// this CMP and the next Bxx (or end of MBB). Otherwise the
|
||||
// CMP is the only flag-setting marker between the test value
|
||||
// and the consuming branch — without it, the Bxx ends up
|
||||
// testing the latest LdaLike's N/Z (typically a PHI-elim COPY
|
||||
// or stack reload that has nothing to do with the original
|
||||
// condition). Caused __adddf3's renormalize while-loop to
|
||||
// skip its body even though `mr & ~mask` was non-zero.
|
||||
bool SafeToErase = true;
|
||||
for (auto It = std::next(Cmp->getIterator());
|
||||
It != Cmp->getParent()->end(); ++It) {
|
||||
if (It->isDebugInstr()) continue;
|
||||
if (It->isBranch() || It->isReturn()) break;
|
||||
if (It->getOpcode() == TargetOpcode::COPY) {
|
||||
SafeToErase = false;
|
||||
break;
|
||||
}
|
||||
unsigned Opc = It->getOpcode();
|
||||
// Conservative: any LDA/LDX/LDY/transfer disqualifies erasure.
|
||||
// Stores and stack-mgmt are flag-preserving and OK.
|
||||
switch (Opc) {
|
||||
case W65816::STAfi: case W65816::STAfi_indY: case W65816::STA8fi:
|
||||
case W65816::STA_StackRel: case W65816::STA_StackRelIndY:
|
||||
case W65816::STA_DP: case W65816::STA_Abs: case W65816::STA_Long:
|
||||
case W65816::STX_DP: case W65816::STX_Abs:
|
||||
case W65816::STY_DP: case W65816::STY_Abs:
|
||||
case W65816::ADJCALLSTACKDOWN: case W65816::ADJCALLSTACKUP:
|
||||
case W65816::PHA: case W65816::PHX: case W65816::PHY:
|
||||
continue;
|
||||
}
|
||||
// Anything else (LDA, transfer, ALU op...): bail.
|
||||
SafeToErase = false;
|
||||
break;
|
||||
}
|
||||
if (SafeToErase) {
|
||||
Cmp->eraseFromParent();
|
||||
Changed = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ LLVMInitializeW65816Target() {
|
|||
initializeW65816NegYIndYPass(PR);
|
||||
initializeW65816PreSpillCrossCallPass(PR);
|
||||
initializeW65816SjLjFinalizePass(PR);
|
||||
initializeW65816LowerWide32Pass(PR);
|
||||
|
||||
// Default IndVarSimplify's exit-value rewriter to "never". The
|
||||
// closed-form replacement frequently widens an i16 induction var
|
||||
|
|
@ -150,6 +151,11 @@ void W65816PassConfig::addMachineSSAOptimization() {
|
|||
}
|
||||
|
||||
void W65816PassConfig::addPreRegAlloc() {
|
||||
// Decompose Wide32 vregs (i32 register pairs) into pairs of i16 vregs
|
||||
// BEFORE the other Acc16-targeting pre-RA passes run. Each later
|
||||
// pass walks Acc16/Idx16/Img16 vregs; running this first means they
|
||||
// see the decomposed halves uniformly.
|
||||
addPass(createW65816LowerWide32());
|
||||
addPass(createW65816ABridgeViaX());
|
||||
addPass(createW65816TiedDefSpill());
|
||||
addPass(createW65816WidenAcc16());
|
||||
|
|
@ -176,6 +182,18 @@ void W65816PassConfig::addPostRegAlloc() {
|
|||
addPass(createW65816SpillToX());
|
||||
addPass(createW65816StackSlotCleanup());
|
||||
addPass(createW65816SpillToX());
|
||||
// Disable MachineCopyPropagation: it eliminates `COPY $img = $a`
|
||||
// thinking the IMG dest is dead (no explicit physreg use of $img
|
||||
// remains after PEI expands STAfi-with-Img16-source into LDA_DP).
|
||||
// The COPY actually expands to STA_DP $D0 — a memory store to a
|
||||
// DP slot that libcalls (softDouble, softFloat) ALSO use as their
|
||||
// own arg-save scratch. When MCP drops the COPY, the subsequent
|
||||
// LDA_DP $D0 reads stale memory. Caught by `g = g/x` Newton loop:
|
||||
// iter-1's saved x_ml at $D0 was never actually written, so iter-2
|
||||
// read garbage. The principled fix would mark IMG-targeted COPYs
|
||||
// as memory-side-effecting, but TII doesn't expose that hook;
|
||||
// disabling MCP loses some optimization but is safe.
|
||||
disablePass(&llvm::MachineCopyPropagationID);
|
||||
}
|
||||
|
||||
void W65816PassConfig::addPreEmitPass() {
|
||||
|
|
|
|||
12
src/llvm/test/CodeGen/W65816/add-i16.ll
Normal file
12
src/llvm/test/CodeGen/W65816/add-i16.ll
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
; Smoke test: confirm llc accepts the W65816 target via lit.
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
define i16 @add_i16(i16 %a, i16 %b) {
; CHECK-LABEL: add_i16:
; Expected in order: rep #0x30, clc, an adc with the operand `0x4, s`, then
; the rtl return.
; CHECK: rep #0x30
; CHECK: clc
; CHECK: adc 0x4, s
; CHECK: rtl
  %sum = add i16 %a, %b
  ret i16 %sum
}
|
||||
30
src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll
Normal file
30
src/llvm/test/CodeGen/W65816/canmergestoresto-i16-cap.ll
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
; Pin: canMergeStoresTo refuses to merge i16 stores into i32+.
|
||||
;
|
||||
; The SDAG store-merge combine sees two adjacent i16 stores and tries
|
||||
; to widen them into one i32 store. Our i32 store path is Custom-
|
||||
; lowered back to two i16 stores, and the merge runs again, and the
|
||||
; cycle repeats until OOM. The canMergeStoresTo override fixes it by capping merge MemVT
|
||||
; at i16. See feedback_canmergestores_disable.md.
|
||||
;
|
||||
; Repro: write two adjacent i16 fields of a struct. Without the cap,
|
||||
; this either OOMs or burns >5s on a 4-line function. With the cap,
|
||||
; the lowered code shows two distinct i16 stores (no widened form).
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
%struct.Pair = type { i16, i16 }
|
||||
|
||||
define void @write_pair(ptr %p, i16 %a, i16 %b) {
; CHECK-LABEL: write_pair:
; Both i16 stores have to survive as separate stores instead of being fused
; into a single i32 store. Under the i32-illegal path each of them uses the
; same DP-indirect family ([dp],y), with the $e0 pointer loaded afresh for
; each half.
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: rtl
  %field0 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 0
  %field1 = getelementptr inbounds %struct.Pair, ptr %p, i32 0, i32 1
  store i16 %a, ptr %field0
  store i16 %b, ptr %field1
  ret void
}
|
||||
36
src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll
Normal file
36
src/llvm/test/CodeGen/W65816/extract-wide32-regseq.ll
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
; Pin: the extractWide32Lo/Hi shortcut that looks through REG_SEQUENCE.
|
||||
;
|
||||
; Without the shortcut, `*p = 0` (or any i32 store of a constant or
|
||||
; freshly-built i32 vreg) hits the SDAG combiner repeatedly, the
|
||||
; combiner re-merges and Custom-lower re-splits, the cycle runs for
|
||||
; tens of seconds and 100MB+ peak. See feedback_extract_wide32_regseq_shortcut.md.
|
||||
;
|
||||
; Two functions:
|
||||
; - clear_i32: simplest *(i32*)p = 0 case (the original repro)
|
||||
; - clear_i32_pair: two adjacent i32 zero-stores (combiner stress)
|
||||
;
|
||||
; If the shortcut regresses, llc either OOMs (process killed) or
|
||||
; takes >5s on these tiny functions. We assert on the lowered shape.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
define void @clear_i32(ptr %p) {
; CHECK-LABEL: clear_i32:
; The single i32 zero-store must come out as two `sta [0xe0` stores ahead of
; the rtl.
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: rtl
  store i32 0, ptr %p
  ret void
}
|
||||
|
||||
define void @clear_i32_pair(ptr %p, ptr %q) {
; CHECK-LABEL: clear_i32_pair:
; Two i32 zero-stores: four `sta [0xe0` stores are required before the rtl.
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: rtl
  store i32 0, ptr %p
  store i32 0, ptr %q
  ret void
}
|
||||
36
src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll
Normal file
36
src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
; Pin: i64-first-arg routes arg0 halves through Img16 (DP $C0..$DE).
|
||||
;
|
||||
; Without the Img16 routing, regalloc emits `TXA; STA spillA;
|
||||
; STA spillX` at function entry — the TXA clobbers $a (arg0_lo)
|
||||
; before the A-spill saves it, so both spill slots end up holding
|
||||
; arg0_ml. Caused __adddf3(1.5, 2.5) → 1.5. See
|
||||
; feedback_i64_first_arg_x_class.md.
|
||||
;
|
||||
; Fix: route arg0_lo via STA $dp and arg0_ml via STX $dp. Visible at
|
||||
; function entry as a pair of `stx 0x[cd]?` and `sta 0x[cd]?` writes
|
||||
; into the IMG region of direct page.
|
||||
;
|
||||
; Trigger: i64 first arg with enough cross-call live range that arg0
|
||||
; halves must be saved.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
declare i64 @ext1(i64 %x, i64 %y)
|
||||
declare i64 @ext2(i64 %a)
|
||||
|
||||
define i64 @i64_first_pressure(i64 %x) {
; CHECK-LABEL: i64_first_pressure:
; Ahead of the first jsl, arg0_ml (X) and arg0_lo (A) must be written into
; IMG slots of direct page instead of going through a TXA bridge. The IMG
; region is $C0..$DE; the patterns below only accept operands that start
; with 0xd, and they require the stx to appear before the sta.
; CHECK: stx 0xd
; CHECK: sta 0xd
; CHECK: jsl ext2
; CHECK: rtl
entry:
  %first = call i64 @ext2(i64 %x)
  %sum = add i64 %first, %x
  %second = call i64 @ext1(i64 %sum, i64 %x)
  ret i64 %second
}
|
||||
32
src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll
Normal file
32
src/llvm/test/CodeGen/W65816/img-copy-survives-mcp.ll
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
; Pin: MachineCopyPropagation must NOT eliminate `COPY $img = $reg` —
|
||||
; that COPY actually expands to STA_DP $D? (a DP-memory store to an
|
||||
; IMG slot). Libcalls (softDouble, softFloat) use those same DP
|
||||
; slots for their own arg-save scratch, so dropping the COPY makes
|
||||
; the subsequent LDA_DP read stale memory. Caught by `g = g/x`
|
||||
; Newton loop: iter-1's saved x_ml at $D0 was never actually written
|
||||
; because MCP dropped the COPY, so iter-2's call to __divdf3 read
|
||||
; garbage as its x_ml argument. See feedback_jslpseudo_libcall_img_clobber.md.
|
||||
;
|
||||
; Fix: disable MachineCopyPropagation in addPostRegAlloc.
|
||||
;
|
||||
; Symptom shape we pin: for an i64-first-arg double function that
|
||||
; calls a libcall, the entry must contain BOTH `stx 0xd?` AND `sta
|
||||
; 0xd?` (for I64FirstArg's Img16 arg-save dance) — and they must
|
||||
; survive to the asm output. Without the MCP-disable, only one of
|
||||
; those (or neither) appears.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
declare double @ext_div(double %a, double %b)
|
||||
|
||||
define double @div_chain(double %x) {
; CHECK-LABEL: div_chain:
; Both halves of the Img16 arg-save have to be present in the emitted asm
; ahead of the call:
; CHECK: stx 0xd
; CHECK: sta 0xd
; CHECK: jsl ext_div
; CHECK: rtl
entry:
  %quot = call double @ext_div(double %x, double %x)
  ret double %quot
}
|
||||
28
src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
Normal file
28
src/llvm/test/CodeGen/W65816/jslpseudo-caller-save.ll
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
; Pin: JSLpseudo declares Defs = [A, X, Y, DPF0].
|
||||
;
|
||||
; Without X, Y, DPF0 in the Defs list, an i64-returning libcall
|
||||
; (which returns lo16 in A, mid16 in X, hi16 in Y, hh16 in DPF0)
|
||||
; verifier-fails with "$y undefined" in math.c::floor. See
|
||||
; feedback_jslpseudo_caller_save.md.
|
||||
;
|
||||
; This test compiles a call to an i64-returning external function
|
||||
; with -verify-machineinstrs. If JSLpseudo's Defs are stripped, the
|
||||
; X/Y/DPF0 reads after the call would be on physregs the call didn't
|
||||
; declare it defined, and -verify-machineinstrs fails.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
declare i64 @ext_i64(i64 %x)
|
||||
|
||||
define i64 @i64_libcall_uses_xy(i64 %x) {
; CHECK-LABEL: i64_libcall_uses_xy:
; CHECK: jsl ext_i64
; After the call, the i64 return value (lo16 in A, mid16 in X, hi16 in Y,
; hh16 in DPF0) is stored back to the caller's frame. The txa below reads X;
; if JSLpseudo did not list X in its Defs, that read would fail the machine
; verifier because X would not be defined by the call.
; CHECK: txa
; CHECK: rtl
  %ret = call i64 @ext_i64(i64 %x)
  ret i64 %ret
}
|
||||
2
src/llvm/test/CodeGen/W65816/lit.local.cfg
Normal file
2
src/llvm/test/CodeGen/W65816/lit.local.cfg
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
# Mark this test directory unsupported when 'W65816' is not among
# config.root.targets.
if 'W65816' not in config.root.targets:
    config.unsupported = True
|
||||
29
src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
Normal file
29
src/llvm/test/CodeGen/W65816/seprep-ldy-elision-kill-flag.ll
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
; Pin: SepRepCleanup's redundant-LDY elision must clear the now-stale
|
||||
; `killed $y` flag on the prior Y-user.
|
||||
;
|
||||
; Trigger: any sequence that emits two LDY_Imm16 #N with only a
|
||||
; STA [dp],y between (e.g. an i32 store that splits into two i16
|
||||
; stores, each going through STAptr32 inserter which emits its own
|
||||
; LDY #0). Without the fix, the third peephole at SepRepCleanup
|
||||
; deletes the second LDY, but the first STA's `implicit killed $y`
|
||||
; annotation was set under the assumption that the second LDY was
|
||||
; about to redefine Y — leaving the second STA reading "dead" Y.
|
||||
;
|
||||
; The fix walks backward from the erased LDY to the most recent
|
||||
; Y-using operand and clears its kill flag. -verify-machineinstrs
|
||||
; catches the bug if it regresses.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
define void @two_i32_stores_share_y(ptr %p) {
; CHECK-LABEL: two_i32_stores_share_y:
; The kill-flag fix does not change the emitted asm: both STAs look the same
; with or without it. What this test really pins is that the run with
; `-verify-machineinstrs` does not abort; the patterns below only give the
; test a minimal expected shape.
; CHECK: ldy #0x0
; CHECK: sta [0xe0
; CHECK: sta [0xe0
; CHECK: rtl
  store i32 0, ptr %p
  ret void
}
|
||||
41
src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll
Normal file
41
src/llvm/test/CodeGen/W65816/sign-extend-inreg-i32.ll
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
; Pin: SIGN_EXTEND_INREG with i32 result and inner type i1 / i8 / i16
|
||||
; must Custom-lower to per-half ops. Without the Custom hook, the
|
||||
; combiner emits `sext_inreg(REG_SEQUENCE(...), i1)` which has no
|
||||
; tablegen pattern and isel aborts with "Cannot select".
|
||||
;
|
||||
; The i1 case shows up in CRC32 loops (`-(crc & 1ul)` reduces to
|
||||
; sign_extend_inreg with i1). See feedback_sext_inreg_i32_isel_gap.md.
|
||||
;
|
||||
; Note: -verify-machineinstrs intentionally omitted because i32 store
|
||||
; lowering still trips the i32-store-pair `implicit killed $y`
|
||||
; concern in some chains; orthogonal to this fix.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
; The CRC32 idiom: -(x & 1) = sign_extend_inreg x, i1 (after combiner).
|
||||
define i32 @neg_lowbit(i32 %x) {
; CHECK-LABEL: neg_lowbit:
; Selection has to succeed; the output must contain the low-bit mask and
; then the return.
; CHECK: and #0x1
; CHECK: rtl
  %bit = and i32 %x, 1
  %neg = sub i32 0, %bit
  ret i32 %neg
}
|
||||
|
||||
; (int32_t)(int8_t)x — sign-extend low byte to i32.
|
||||
define i32 @sext_i8_to_i32(i32 %x) {
; CHECK-LABEL: sext_i8_to_i32:
; Only successful selection is pinned: the function label, then an rtl.
; CHECK: rtl
  %low8 = trunc i32 %x to i8
  %wide = sext i8 %low8 to i32
  ret i32 %wide
}
|
||||
|
||||
; (int32_t)(int16_t)x — sign-extend low halfword to i32.
|
||||
define i32 @sext_i16_to_i32(i32 %x) {
; CHECK-LABEL: sext_i16_to_i32:
; Only successful selection is pinned: the function label, then an rtl.
; CHECK: rtl
  %low16 = trunc i32 %x to i16
  %wide = sext i16 %low16 to i32
  ret i32 %wide
}
|
||||
32
src/llvm/test/CodeGen/W65816/wide32-phi-split.ll
Normal file
32
src/llvm/test/CodeGen/W65816/wide32-phi-split.ll
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
; Pin: W65816LowerWide32 Pass 2b splits Wide32 PHIs.
|
||||
;
|
||||
; Without PHI splitting, an i32 phi (loop-carried 32-bit value)
|
||||
; survives to RA, hits "Wide32 reload to non-pair reg" UNREACHABLE.
|
||||
; softDouble at -O2 was the original repro (ma/mb mantissa loops).
|
||||
;
|
||||
; This test mimics the shape: an i32 carried across a loop. If
|
||||
; LowerWide32 doesn't split the PHI, llc aborts.
|
||||
;
|
||||
; RUN: llc -mtriple=w65816 -O2 < %s | FileCheck %s
|
||||
|
||||
define i32 @sum_i32_loop(ptr %p, i16 %n) {
; CHECK-LABEL: sum_i32_loop:
; Only successful compilation is pinned: the function label, then an rtl.
; CHECK: rtl
entry:
  %empty = icmp eq i16 %n, 0
  br i1 %empty, label %done, label %loop

loop:
  ; %total is the i32 value carried around the loop.
  %idx = phi i16 [ 0, %entry ], [ %idx.next, %loop ]
  %total = phi i32 [ 0, %entry ], [ %total.next, %loop ]
  %elt.ptr = getelementptr inbounds i32, ptr %p, i16 %idx
  %elt = load i32, ptr %elt.ptr
  %total.next = add i32 %total, %elt
  %idx.next = add i16 %idx, 1
  %finished = icmp eq i16 %idx.next, %n
  br i1 %finished, label %done, label %loop

done:
  %result = phi i32 [ 0, %entry ], [ %total.next, %loop ]
  ret i32 %result
}
|
||||
Loading…
Add table
Reference in a new issue