From c4da4b77b361d877cd501a3af81a4a235dc53d77 Mon Sep 17 00:00:00 2001 From: Scott Duensing Date: Sat, 2 May 2026 20:41:19 -0500 Subject: [PATCH] Checkpoint --- STATUS.md | 77 +++++---- runtime/build.sh | 2 + runtime/src/iigsGsos.s | 78 ++++++--- runtime/src/iigsGsosStub.s | 18 +++ runtime/src/libcxxabi.c | 148 ++++++++++++++++++ scripts/runInMameWithGsosStub.sh | 122 +++++++++++++++ scripts/smokeTest.sh | 124 +++++++++++++++ .../lib/Target/W65816/W65816TargetMachine.cpp | 15 ++ 8 files changed, 532 insertions(+), 52 deletions(-) create mode 100644 runtime/src/iigsGsosStub.s create mode 100644 runtime/src/libcxxabi.c create mode 100755 scripts/runInMameWithGsosStub.sh diff --git a/STATUS.md b/STATUS.md index de6f334..7171bfc 100644 --- a/STATUS.md +++ b/STATUS.md @@ -87,10 +87,14 @@ which runs correctly under MAME (apple2gs). with one shared Base subobject), virtual functions, polymorphism via base-class pointer arrays, virtual dtors, this-pointer adjustment for non-leftmost bases, vbase offset - tables. Compile with `clang++ -fno-exceptions -fno-rtti`. - Full RTTI (`dynamic_cast`, `typeid`) and exceptions remain - out of scope — those need libcxxabi (`__dynamic_cast`, - `__cxa_throw`, unwind tables, personality routine). + tables. RTTI / `dynamic_cast` works (downcast, MI cross-cast, + virtual-base sibling cast) via a minimal libcxxabi shim + (`runtime/src/libcxxabi.c`) that provides `__dynamic_cast` + + the three typeinfo class vtables (`__class_type_info`, + `__si_class_type_info`, `__vmi_class_type_info`) + sized + `operator delete` + `__cxa_pure_virtual`. Compile with + `clang++ -fno-exceptions` (RTTI can stay on; exceptions + remain out of scope — see "Yet to come"). **Toolchain:** @@ -110,7 +114,7 @@ which runs correctly under MAME (apple2gs). image addresses. - `runtime/build.sh` builds crt0, libc, soft-float, soft-double, libgcc into linkable objects. -- `scripts/smokeTest.sh` runs 120 end-to-end checks at -O2: +- `scripts/smokeTest.sh` runs 122 end-to-end checks at -O2: scalar ops, control flow, calling conventions, MAME execution regressions, link816 bss-base safety + weak-symbol resolution + heap_end-vs-heap_start sanity, iigs/toolbox.h compile + link, @@ -123,18 +127,28 @@ which runs correctly under MAME (apple2gs). JSL via `__jsl_indir`), memory-backed file I/O (mfsRegister + fopen/fread/fwrite/fseek/fprintf), C++ polymorphism (single inheritance), C++ multiple inheritance (Drawable+Movable), - C++ virtual base diamond, wchar / signal core APIs, hex dumper - writing through fprintf, JSON tokenizer state machine, - hash-table command shell (parser + dispatch + chained - collisions over fprintf-to-mfs), scripts/bench.sh size-vs- - Calypsi harness. 100% pass. + C++ virtual base diamond, C++ dynamic_cast (SI + MI cross-cast + + virtual-base sibling cast through libcxxabi shim), GS/OS wrapper + round-trip via stub dispatcher pre-loaded at $E100A8 (validates + PHA + PEA 0 + JSL + post-call SP-fixup contract end-to-end), + wchar / signal core APIs, hex dumper writing through fprintf, + JSON tokenizer state machine, hash-table command shell (parser + + dispatch + chained collisions over fprintf-to-mfs), + scripts/bench.sh size-vs-Calypsi harness. 100% pass. - `scripts/bench.sh` compiles a microbenchmark suite with both clang (this toolchain) and Calypsi cc65816, comparing emitted - text-section size. Current ratio: ~2.2x (clang generates more - bytes than Calypsi on average; sumOfSquares is the worst case - at 6.45x because of __mulsi3 dispatch). Eight benchmarks - shipped under `benchmarks/`. + text-section size. Current ratio: ~1.9x (down from 2.2x once + the W65816 target started overriding `replexitval` to "never" + by default in `LLVMInitializeW65816Target`; SCEV's closed-form + rewrite was promoting i16 induction expressions to i64 and + hitting `__muldi3`, which on a 16-bit target is dramatically + bigger than the loop it replaces). sumOfSquares went 335B → + 128B, a 2.6x shrink with no other benchmark affected. Eight + benchmarks shipped under `benchmarks/`. Remaining gap is + structural: Calypsi uses `(sr,s),Y` for stack-relative + pointer indirection where we route through DP $E0 indirect- + long for bank safety. **Backend register allocation:** @@ -205,24 +219,19 @@ RAM through $FFFF, gaining 8KB of bank-0 space.) ## Yet to come -- **C++ full RTTI + exceptions** — multi-inheritance and virtual - base diamonds work; `dynamic_cast` and `throw`/`try`/`catch` - do not. Both need libcxxabi (`__dynamic_cast` walks the - type_info hierarchy; `__cxa_throw`/_Unwind_*/personality - routine drive stack unwinding). Reasonable to defer until - someone wants exception-based code on the IIgs. +- **C++ exceptions** — `dynamic_cast` works (via libcxxabi shim, + see "What works"); `throw`/`try`/`catch` does not. Implementing + exceptions needs the full Itanium unwind ABI: `__cxa_throw`, + `__cxa_allocate_exception`, `_Unwind_RaiseException`, a + personality routine, and DWARF `.eh_frame` data the unwinder + consumes to restore registers per-frame. The 65816's lack of + any existing unwinder makes this a real project — defer until + someone needs exception-based code on the IIgs. -- **Close the size gap to Calypsi further** — `scripts/bench.sh` - shows clang at ~2.2x Calypsi text size on the microbenchmarks, - sumOfSquares worst at 6.45x (__mulsi3 dispatch). Calypsi's - edge is structural: it uses `(sr,s),Y` for stack-relative - indirection where we route through DP $E0 indirect-long for - bank safety. Targeted opportunities: inline 16x16→32 - multiply for small operands; widen IMG-slot heuristic so - greedy reaches further before spilling. - -- **GS/OS file I/O exercised under MAME** — wrappers - (`runtime/include/iigs/gsos.h` + `runtime/src/iigsGsos.s`) - compile and link, but the smoke harness can't drive them - (no ProDOS volume mounted). Validating end-to-end needs a - 2img/po/dsk launched as a MAME hard disk plus toolbox init. +- **GS/OS validated against a real ProDOS volume** — the wrapper + contract (PHA + PEA 0 + LDX + JSL $E100A8 + post-call SP fixup) + is verified end-to-end in MAME against a stub dispatcher + (`scripts/runInMameWithGsosStub.sh`). Validating against an + actual GS/OS-loaded volume needs a bootable system disk image + attached as a MAME smartport hard disk and Tool Locator init — + out of scope for an automated CI smoke. diff --git a/runtime/build.sh b/runtime/build.sh index d3421cd..82eb76a 100755 --- a/runtime/build.sh +++ b/runtime/build.sh @@ -41,6 +41,8 @@ cc "$SRC/extras.c" cc "$SRC/strtok.c" cc "$SRC/math.c" cc "$SRC/softFloat.c" +cc "$SRC/libcxxabi.c" +asm "$SRC/iigsGsos.s" # softDouble.c builds at -O1: __muldf3's u64 live-range pressure # overflows the greedy allocator at -O2. dpack is already noinline # to reduce pressure, but dclass MUST stay inline (its pointer-arg diff --git a/runtime/src/iigsGsos.s b/runtime/src/iigsGsos.s index bf40129..93c7043 100644 --- a/runtime/src/iigsGsos.s +++ b/runtime/src/iigsGsos.s @@ -1,17 +1,29 @@ ; iigsGsos.s — GS/OS class-1 dispatch wrappers. ; ; Each wrapper takes a 16-bit pointer to a class-1 parm block in A -; (the C ABI). The dispatcher convention is: -; PEA ; push 16-bit ptr (low half of 32-bit -; PEA 0 ; long pointer; high half is 0) -; LDX # -; JSL $E100A8 -; Returns the call status in A (0 = success, non-zero = error code). +; (the C ABI). The GS/OS convention is: +; PHA / PEA 0 ; push 32-bit parm-block pointer +; ; (low 16 = caller's bank-0 ptr, +; ; high 16 = 0 since parm blocks live +; ; in bank 0) +; LDX # ; class-1 call number ($20xx) +; JSL $E100A8 ; dispatcher +; ; caller-cleans (CALLER's responsibility) +; The dispatcher returns the call status in A (0 = success, non-zero +; = error code). The dispatcher clobbers X, Y, P; A holds the status. ; -; All wrappers preserve nothing — the GS/OS dispatcher clobbers A, -; X, Y, P. Each takes the parm-block pointer in A (i16) and pushes -; it as a 32-bit pointer (low half = the in-bank ptr, high half = 0 -; for bank-0 parm blocks, which is what we always use). +; CRITICAL: GS/OS does NOT pop the parm-block pointer. The caller +; must clean up the 4 pushed bytes BEFORE its own RTL — otherwise +; the RTL pops parm-pointer bytes as a return address and the CPU +; jumps into garbage (typically $00:0000 = BRK loop). See the bug +; that motivated this comment. +; +; Each wrapper: +; 1. PHA + PEA 0 (push 4-byte parm-block long ptr) +; 2. LDX #call# +; 3. JSL $E100A8 +; 4. Stash A (status) at DP $E4, slide SP up 4 bytes, restore A +; 5. RTL .text .globl gsosOpen @@ -21,17 +33,17 @@ .globl gsosGetEOF .globl gsosSetEOF -; Common dispatch helper macro: arg in A, call number in X. -; Pushes the 32-bit parm-block pointer, JSLs the dispatcher, returns -; status in A. All wrappers below follow the same shape — copy/paste -; rather than macro because the assembler doesn't have a portable -; macro syntax we rely on. - gsosOpen: - pha ; push parm-block low - pea 0 ; push parm-block high (0 for bank 0) + pha + pea 0 ldx #0x2010 jsl 0xe100a8 + sta 0xe4 ; stash status (A) in DP scratch + tsc + clc + adc #4 + tcs ; SP += 4 (pop the long ptr) + lda 0xe4 ; restore status to A rtl gsosRead: @@ -39,6 +51,12 @@ gsosRead: pea 0 ldx #0x2012 jsl 0xe100a8 + sta 0xe4 + tsc + clc + adc #4 + tcs + lda 0xe4 rtl gsosWrite: @@ -46,6 +64,12 @@ gsosWrite: pea 0 ldx #0x2013 jsl 0xe100a8 + sta 0xe4 + tsc + clc + adc #4 + tcs + lda 0xe4 rtl gsosClose: @@ -53,6 +77,12 @@ gsosClose: pea 0 ldx #0x2014 jsl 0xe100a8 + sta 0xe4 + tsc + clc + adc #4 + tcs + lda 0xe4 rtl gsosGetEOF: @@ -60,6 +90,12 @@ gsosGetEOF: pea 0 ldx #0x2019 jsl 0xe100a8 + sta 0xe4 + tsc + clc + adc #4 + tcs + lda 0xe4 rtl gsosSetEOF: @@ -67,4 +103,10 @@ gsosSetEOF: pea 0 ldx #0x2018 jsl 0xe100a8 + sta 0xe4 + tsc + clc + adc #4 + tcs + lda 0xe4 rtl diff --git a/runtime/src/iigsGsosStub.s b/runtime/src/iigsGsosStub.s new file mode 100644 index 0000000..44ca05c --- /dev/null +++ b/runtime/src/iigsGsosStub.s @@ -0,0 +1,18 @@ +; Minimal GS/OS dispatcher stub at $E100A8. Native, M=0, X=0. +; Stack at entry: S+1=PCL, S+2=PCH, S+3=PBR, S+4..5=bank (=0), +; S+9=parm ptr low 16, S+10=high. We only use the low 16 (bank-0 +; parm blocks). Writes $42 to *parm and returns A=0. + .text + php ; save P + pha ; save A (16-bit) + lda 9, s ; A = parm ptr 16 + sta 0xe4 ; DP $E4..$E5 + ldy #0 ; X=0 here, so 3-byte encoding + sep #0x20 ; M=8 for the 1-byte store + .byte 0xa9, 0x42 ; lda #$42 (8-bit imm under M=8) + sta (0xe4), y ; *parm = $42 + rep #0x20 ; M=16 + pla ; restore A + plp + lda #0 ; status = 0 + rtl diff --git a/runtime/src/libcxxabi.c b/runtime/src/libcxxabi.c new file mode 100644 index 0000000..c8b9169 --- /dev/null +++ b/runtime/src/libcxxabi.c @@ -0,0 +1,148 @@ +// Minimal libcxxabi shim for the W65816 backend. +// +// Provides just enough of the Itanium C++ ABI to make `dynamic_cast` +// link and run for the inheritance shapes the smoke covers (single, +// multi, virtual-base diamond). Does NOT provide exception support +// (`__cxa_throw`, `_Unwind_*`, personality routine) — that needs a +// stack unwinder driven by DWARF .eh_frame, which we don't carry on +// this target. Compile your C++ with `clang++ -fno-exceptions` (RTTI +// can stay on). +// +// Layout used by clang's RTTI emit on a 16-bit target: +// __class_type_info : { vptr, name } (4 bytes) +// __si_class_type_info : { vptr, name, base_typeinfo } (6 bytes) +// __vmi_class_type_info: { vptr, name, flags, count, +// { base_typeinfo, offset_flags } * count } +// with base entry = 2 + 4 = 6 bytes +// +// Vtable layout: +// { offset_to_top:i16, typeinfo_ptr:i16, vfunc[0..]:i16 ... } +// The vptr stored in an object points at vfunc[0] (i.e. base+4). So: +// typeinfo = *(TypeInfo **)(vptr - 2) +// offset_to_top = *(int16_t *)(vptr - 4) +// +// offset_flags in vmi base entries (Itanium ABI): +// bit 0 (0x01) = virtual base +// bit 1 (0x02) = public base +// bits >> 8 = signed offset (bytes for non-virtual; vtable offset +// for virtual — read i16 at *vptr + that offset) + +#include +#include + +extern void free(void *); + +typedef struct TypeInfo { + const void *vptr; + const char *name; +} TypeInfo; + +typedef struct SiTypeInfo { + TypeInfo base; + const TypeInfo *baseType; +} SiTypeInfo; + +typedef struct VmiBase { + const TypeInfo *baseType; + int32_t offsetFlags; +} VmiBase; + +typedef struct VmiTypeInfo { + TypeInfo base; + uint16_t flags; + uint16_t baseCount; + VmiBase bases[1]; // baseCount entries +} VmiTypeInfo; + +// Forward decls. +static void *findBaseInObject(void *obj, const TypeInfo *cur, const TypeInfo *target); + +// The three typeinfo class vtables. Their *addresses* are what +// __dynamic_cast compares against to discriminate the three layouts — +// their contents are never executed. Six bytes each (offset_to_top +// + typeinfo + at least one slot so &sym+4 remains in-bounds). +const void *abiClassTypeInfoVtable[3] __asm__("_ZTVN10__cxxabiv117__class_type_infoE") = { 0, 0, 0 }; +const void *abiSiClassTypeInfoVtable[3] __asm__("_ZTVN10__cxxabiv120__si_class_type_infoE") = { 0, 0, 0 }; +const void *abiVmiClassTypeInfoVtable[3] __asm__("_ZTVN10__cxxabiv121__vmi_class_type_infoE") = { 0, 0, 0 }; + +// Itanium ABI: void *__dynamic_cast(const void *src, +// const TypeInfo *staticSrcType, +// const TypeInfo *dstType, +// ptrdiff_t src2dstHint); +// +// staticSrcType is unused in our search (we walk from the most-derived +// type instead); src2dstHint could short-circuit some cases but we +// ignore it for simplicity. +void *abiDynamicCast(const void *src, + const TypeInfo *staticSrcType, + const TypeInfo *dstType, + int32_t src2dstHint) __asm__("__dynamic_cast"); +void *abiDynamicCast(const void *src, + const TypeInfo *staticSrcType, + const TypeInfo *dstType, + int32_t src2dstHint) { + (void)staticSrcType; + (void)src2dstHint; + if (!src) { + return 0; + } + const void *vptr = *(const void * const *)src; + const TypeInfo *mostDerivedType = *(const TypeInfo * const *)((const char *)vptr - 2); + int16_t offsetToTop = *(const int16_t *)((const char *)vptr - 4); + void *mostDerived = (char *)src + offsetToTop; + return findBaseInObject(mostDerived, mostDerivedType, dstType); +} + +static void *findBaseInObject(void *obj, const TypeInfo *cur, const TypeInfo *target) { + if (cur == target) { + return obj; + } + const void *kindVtable = cur->vptr; + if (kindVtable == &abiSiClassTypeInfoVtable[2]) { + const SiTypeInfo *si = (const SiTypeInfo *)cur; + return findBaseInObject(obj, si->baseType, target); + } + if (kindVtable == &abiVmiClassTypeInfoVtable[2]) { + const VmiTypeInfo *vmi = (const VmiTypeInfo *)cur; + for (uint16_t i = 0; i < vmi->baseCount; i++) { + const VmiBase *b = &vmi->bases[i]; + int32_t off = (int32_t)b->offsetFlags >> 8; + void *baseObj; + if ((b->offsetFlags & 0x01) != 0) { + // Virtual base: offset lives in the object's vtable. + const void *vp = *(const void * const *)obj; + int16_t vbaseOff = *(const int16_t *)((const char *)vp + (int16_t)off); + baseObj = (char *)obj + vbaseOff; + } else { + baseObj = (char *)obj + (int16_t)off; + } + void *r = findBaseInObject(baseObj, b->baseType, target); + if (r) { + return r; + } + } + } + return 0; +} + +// operator delete(void *, unsigned int) — sized-delete form clang +// emits for virtual destructors on a 16-bit target. Route to free. +void abiOperatorDelete(void *p, unsigned int sz) __asm__("_ZdlPvj"); +void abiOperatorDelete(void *p, unsigned int sz) { + (void)sz; + free(p); +} + +// Plain operator delete(void *) — for non-virtual delete sites. +void abiOperatorDeletePv(void *p) __asm__("_ZdlPv"); +void abiOperatorDeletePv(void *p) { + free(p); +} + +// __cxa_pure_virtual — called if a pure virtual is somehow invoked +// (typically a bug). Spin so it's catchable in a debugger. +void abiPureVirtual(void) __asm__("__cxa_pure_virtual"); +void abiPureVirtual(void) { + while (1) { + } +} diff --git a/scripts/runInMameWithGsosStub.sh b/scripts/runInMameWithGsosStub.sh new file mode 100755 index 0000000..4083c86 --- /dev/null +++ b/scripts/runInMameWithGsosStub.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# Run a 65816 binary in MAME after pre-loading a stub GS/OS +# dispatcher at $E100A8. This lets the smoke test exercise the +# wrapper-to-dispatcher contract end-to-end without needing a +# real GS/OS-bootable disk image. +# +# The stub is the assembled bytes of /tmp/gsosStub.s (rebuilt +# from runtime/src/iigsGsosStub.s as part of this script — kept +# in-tree so it's reproducible). It writes byte $42 to *parm +# and returns A=0 (success) for any call number. That is enough +# to verify the wrapper: +# 1. pushes the parm-block low ptr (PHA), +# 2. pushes a zero bank (PEA 0), +# 3. JSLs $E100A8 with the right return convention, +# 4. returns A correctly to the caller. +# +# Usage: runInMameWithGsosStub.sh +# runInMameWithGsosStub.sh --check =... + +set -euo pipefail +source "$(dirname "$0")/common.sh" + +BIN="$1" +shift +SECS=3 + +# 23-byte stub bytes (see runtime/src/iigsGsosStub.s for source). +# Hand-assembled to avoid relying on llvm-mc tracking M-flag state. +STUB_HEX="0848 a309 85e4 a000 00e2 20a9 4291 e4c2 2068 28a9 0000 6b" + +LUA_CHECKS="" +EXPECT_LIST=() +ADDR_LIST=() +if [ "$1" = "--check" ]; then + shift + for pair in "$@"; do + ADDR="${pair%=*}" + EXP="${pair#*=}" + ADDR_LIST+=("$ADDR") + EXPECT_LIST+=("$EXP") + LUA_CHECKS="$LUA_CHECKS print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))"$'\n' + done +else + ADDR="$1" + EXP="$2" + ADDR_LIST+=("$ADDR") + EXPECT_LIST+=("$EXP") + LUA_CHECKS="print(string.format('MAME-READ addr=0x%06x val=0x%04x', $ADDR, mem:read_u16($ADDR)))" +fi + +[ -f "$BIN" ] || die "binary not found: $BIN" +LUA_PATH=$(mktemp --suffix=.lua) +trap 'rm -f "$LUA_PATH"' EXIT + +# Build the stub-write Lua statement (one mem:write_u8 per byte). +STUB_BYTES=$(echo "$STUB_HEX" | tr -d ' ') +STUB_LUA="" +i=0 +while [ $i -lt ${#STUB_BYTES} ]; do + byte="${STUB_BYTES:$i:2}" + addr=$(( 0xe100a8 + i / 2 )) + STUB_LUA="${STUB_LUA} mem:write_u8(${addr}, 0x${byte})"$'\n' + i=$(( i + 2 )) +done + +cat > "$LUA_PATH" <= 0x00C000 and addr < 0x00D000) then + mem:write_u8(addr, data:byte(i)) + end + end + -- Install GS/OS dispatcher stub at \$E100A8 (bank E1 RAM). +$STUB_LUA + loaded = true + cpu.state["PC"].value = 0x1000 + cpu.state["PB"].value = 0x00 + cpu.state["DB"].value = 0x00 + cpu.state["D"].value = 0x00 + cpu.state["P"].value = 0x34 + cpu.state["E"].value = 0 + cpu.state["S"].value = 0x01FF + print("MAME-LOADED bytes=" .. #data .. " stub=$((${#STUB_BYTES}/2))") + end + if frame == 60 then + local cpu = manager.machine.devices[":maincpu"] + local mem = cpu.spaces["program"] +$LUA_CHECKS + manager.machine:exit() + end +end) +EOF + +OUT=$(timeout 30 mame apple2gs \ + -rompath "$PROJECT_ROOT/tools/mame/roms" \ + -plugins -autoboot_script "$LUA_PATH" \ + -window -sound none -nothrottle -seconds_to_run "$SECS" 2>&1 | grep "^MAME-") + +echo "$OUT" +mapfile -t GOT_LIST < <(printf '%s\n' "$OUT" | grep -oE 'val=0x[0-9a-f]+' | sed 's/val=0x//') +ok=1 +for i in "${!EXPECT_LIST[@]}"; do + if [ "${GOT_LIST[$i]:-}" != "${EXPECT_LIST[$i]}" ]; then + warn "MAME mismatch at ${ADDR_LIST[$i]}: got 0x${GOT_LIST[$i]:-MISSING} expected 0x${EXPECT_LIST[$i]}" + ok=0 + fi +done +if [ $ok -eq 1 ]; then + log "MAME (gsos-stub) OK: ${#EXPECT_LIST[@]} reads matched" + exit 0 +fi +exit 1 diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 75ac5ae..616af0d 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -3921,6 +3921,81 @@ EOF fi rm -f "$cppVbFile" "$oCppVbFile" "$binCppVbFile" + # C++ dynamic_cast across all three inheritance shapes. Links + # in our minimal libcxxabi shim (runtime/src/libcxxabi.c) which + # provides __dynamic_cast + the three typeinfo class vtables + # (__class_type_info / __si_class_type_info / __vmi_class_type + # _info). Compile WITHOUT -fno-rtti so clang emits typeinfo + # objects. Exception support is still excluded (-fno-exceptions + # required — DWARF unwinder + personality routine are not + # implemented for this target). + log "check: MAME runs C++ dynamic_cast (SI + MI + virtual base)" + cppRttiFile="$(mktemp --suffix=.cpp)" + oCppRttiFile="$(mktemp --suffix=.o)" + oCxxAbiFile="$(mktemp --suffix=.o)" + binCppRttiFile="$(mktemp --suffix=.bin)" + "$CLANG" --target=w65816 -O2 -ffunction-sections \ + -I"$PROJECT_ROOT/runtime/include" \ + -c "$PROJECT_ROOT/runtime/src/libcxxabi.c" -o "$oCxxAbiFile" + cat > "$cppRttiFile" <<'EOF' +extern "C" __attribute__((noinline)) void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +class Animal { public: virtual ~Animal() {} virtual int kind() = 0; }; +class Dog : public Animal { public: int kind() override { return 1; } }; +class Cat : public Animal { public: int kind() override { return 2; } }; +class Drawable { public: virtual int draw() = 0; virtual ~Drawable() {} }; +class Movable { public: virtual int move() = 0; virtual ~Movable() {} }; +class Sprite : public Drawable, public Movable { +public: + int draw() override { return 1; } + int move() override { return 2; } +}; +class Base { public: int b; Base(int x):b(x){} virtual int who() = 0; virtual ~Base() {} }; +class A : public virtual Base { public: A(int x):Base(x){} int who() override { return 10; } }; +class B : public virtual Base { public: B(int x):Base(x){} int who() override { return 20; } }; +class Diamond : public A, public B { +public: + Diamond(int x) : Base(x), A(x), B(x) {} + int who() override { return 99; } +}; +extern "C" int main(void) { + Dog dog; Cat cat; + Animal *a = &dog; + int ok = 0; + if (dynamic_cast(a) != 0) ok |= 0x001; + if (dynamic_cast(a) == 0) ok |= 0x002; + a = &cat; + if (dynamic_cast(a) != 0) ok |= 0x004; + if (dynamic_cast(a) == 0) ok |= 0x008; + Sprite s; + Drawable *d = &s; + if (dynamic_cast(d) != 0) ok |= 0x010; + if (dynamic_cast(d) != 0) ok |= 0x020; + Diamond di(42); + A *ap = &di; + if (dynamic_cast(ap) != 0) ok |= 0x040; + if (dynamic_cast(ap) != 0) ok |= 0x080; + switchToBank2(); + *(volatile unsigned short *)0x5000 = (unsigned short)ok; + while (1) {} +} +EOF + "$PROJECT_ROOT/tools/llvm-mos-build/bin/clang++" --target=w65816 -O2 \ + -ffunction-sections -fno-exceptions \ + -c "$cppRttiFile" -o "$oCppRttiFile" 2>/dev/null + # libc/extras pulled in for free() (operator delete routes there + # for virtual-dtor sites). + "$PROJECT_ROOT/tools/link816" -o "$binCppRttiFile" --text-base 0x1000 \ + "$oCrt0F" "$oLibgccFile" "$oLibcF" \ + "$oCxxAbiFile" "$oCppRttiFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMame.sh" "$binCppRttiFile" --check \ + 0x025000=00ff >/dev/null 2>&1; then + die "MAME: dynamic_cast != 0xFF (libcxxabi __dynamic_cast regression)" + fi + rm -f "$cppRttiFile" "$oCppRttiFile" "$oCxxAbiFile" "$binCppRttiFile" + # Real-world: hex dumper using memory-backed file I/O. Reads # 16 bytes from a registered "in" file, writes a hex+ASCII # dump to a registered "out" file via fprintf. Verifies the @@ -4387,6 +4462,55 @@ EOF fi rm -f "$cGsFile" "$oGsFile" "$oGsAsm" "$oGsLibc" "$oGsSnp" "$oGsSf" "$oGsSd" "$oGsCrt0" "$binGs" + # GS/OS wrapper round-trip in MAME against a stub dispatcher + # pre-loaded at $E100A8. The stub writes byte $42 to *parm and + # returns A=0; the test calls gsosOpen with parm pointing at a + # 1-byte slot in BSS and verifies the byte was written and A + # returned correctly. This is the smallest possible end-to-end + # check that the full wrapper-to-dispatcher contract works (PHA + # of parm low, PEA 0 of bank, JSL $E100A8, AND the wrapper's + # post-call SP fixup that pops the 4-byte parm pointer before + # RTL — without that fixup the RTL pops parm bytes as a return + # address and the CPU jumps into garbage). + log "check: MAME runs GS/OS wrapper round-trip via stub dispatcher" + cGsRtFile="$(mktemp --suffix=.c)" + oGsRtFile="$(mktemp --suffix=.o)" + oGsRtCrt0="$(mktemp --suffix=.o)" + oGsRtLibgcc="$(mktemp --suffix=.o)" + oGsRtAsm="$(mktemp --suffix=.o)" + binGsRt="$(mktemp --suffix=.bin)" + cat > "$cGsRtFile" <<'EOF' +extern unsigned short gsosOpen(void *p); +__attribute__((noinline)) static void switchToBank2(void) { + __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); +} +static unsigned char parm[1] = { 0 }; +int main(void) { + switchToBank2(); + *(volatile unsigned short *)0x5000 = 0xaaaa; + unsigned short rc = gsosOpen(parm); + *(volatile unsigned short *)0x5002 = 0xbbbb; + *(volatile unsigned short *)0x5004 = rc; + *(volatile unsigned short *)0x5006 = (unsigned short)parm[0]; + while (1) {} +} +EOF + "$CLANG" --target=w65816 -O2 -ffunction-sections -c "$cGsRtFile" -o "$oGsRtFile" + "$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" -arch=w65816 -filetype=obj \ + "$PROJECT_ROOT/runtime/src/crt0.s" -o "$oGsRtCrt0" + "$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" -arch=w65816 -filetype=obj \ + "$PROJECT_ROOT/runtime/src/libgcc.s" -o "$oGsRtLibgcc" + "$PROJECT_ROOT/tools/llvm-mos-build/bin/llvm-mc" -arch=w65816 -filetype=obj \ + "$PROJECT_ROOT/runtime/src/iigsGsos.s" -o "$oGsRtAsm" + "$PROJECT_ROOT/tools/link816" -o "$binGsRt" --text-base 0x1000 \ + "$oGsRtCrt0" "$oGsRtLibgcc" "$oGsRtAsm" "$oGsRtFile" \ + >/dev/null 2>&1 + if ! bash "$PROJECT_ROOT/scripts/runInMameWithGsosStub.sh" "$binGsRt" --check \ + 0x025000=aaaa 0x025002=bbbb 0x025004=0000 0x025006=0042 >/dev/null 2>&1; then + die "MAME: GS/OS wrapper round-trip failed (wrapper SP-fixup or stub regression)" + fi + rm -f "$cGsRtFile" "$oGsRtFile" "$oGsRtCrt0" "$oGsRtLibgcc" "$oGsRtAsm" "$binGsRt" + # stdint.h / stddef.h / limits.h / inttypes.h: standalone # replacements for clang's bundled versions (which try to include # glibc bits/* headers and break the build). Compile a small diff --git a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp index c2d0966..3981feb 100644 --- a/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp +++ b/src/llvm/lib/Target/W65816/W65816TargetMachine.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include @@ -46,6 +47,20 @@ LLVMInitializeW65816Target() { initializeW65816SpillToXPass(PR); initializeW65816NegYIndYPass(PR); initializeW65816PreSpillCrossCallPass(PR); + + // Default IndVarSimplify's exit-value rewriter to "never". The + // closed-form replacement frequently widens an i16 induction var + // expression to i64 to avoid overflow proofs, then lowers the + // multiply to __muldi3. On a 16-bit target the libcall costs + // dramatically more than the natural loop it replaces — sumOfSquares + // shrinks from 335B (with __muldi3) to 128B (with __mulsi3 in the + // loop) just by suppressing this rewrite, with no other benchmark + // affected. We do this by name through the cl::opt registry so + // it doesn't require patching upstream llvm-mos. + auto &Opts = cl::getRegisteredOptions(); + if (auto *Opt = Opts.lookup("replexitval")) { + Opt->addOccurrence(0, "replexitval", "never"); + } } static Reloc::Model getEffectiveRelocModel(std::optional RM) {