diff --git a/.gitignore b/.gitignore index 20d4275..8487729 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,13 @@ tools/ *.map *.reloc +# Exception: demo .rsrc/ fixture directories ship TYPECODE_ID.bin files +# as source. Each rsrcBundle test reads them at build time and emits +# the AppleSingle + sidecar in the same dir; the .apl + sidecar are +# build artifacts (caught by *.bin above for the sidecar; .apl is +# tracked by name). We carve out the source fixtures here. +!demos/*.rsrc/*.bin + # Per-target build directories. tests/coremark/build/ tests/lua/build/ diff --git a/demos/rsrcProbe.apl b/demos/rsrcProbe.apl index ecef918..96e3f9c 100644 Binary files a/demos/rsrcProbe.apl and b/demos/rsrcProbe.apl differ diff --git a/demos/rsrcProbe.c b/demos/rsrcProbe.c index 0c7b74c..bb29ca4 100644 --- a/demos/rsrcProbe.c +++ b/demos/rsrcProbe.c @@ -1,59 +1,134 @@ -// rsrcProbe.c - Phase 3.4 stub-only Resource Manager smoke probe. +// rsrcProbe.c - Phase 3.4 real Resource Manager smoke probe. // -// What this verifies right now: -// - resourceProbeInit() returns RES_ERR_BLOCKED (the stub-only path), -// - iigsLoadResource() returns NULL with err = RES_ERR_BLOCKED, -// - iigsGetResourceSize() returns 0 with err = RES_ERR_BLOCKED, -// - the runtime resource.o links cleanly under -O2, -// - the demo's OMF can be bundled with rsrcBundle.py (post-step in -// demos/build.sh when demos/rsrcProbe.rsrc/ is present). -// -// Marker discipline. Page-1 ($70..$73) per the cursorProbe.c -// convention - runViaFinder.sh samples direct-page bytes reliably -// across MAME timings, and full-24-bit BSS-style markers (0x025000) -// don't survive the Loader/Finder relocation games on GS/OS 6.0.2. +// Replaces the stub-only probe. Builds a tiny in-memory .rsrc fixture, +// registers it with mfsRegister, opens it via openResourceFile, loads +// a known rText resource, and verifies the bytes match the expected +// payload. 
This exercises the real parser path top-to-bottom without +// needing a ProDOS resource fork. // +// Markers (page-1 direct page, per cursorProbe convention): // $70 := 0x99 end-of-main success sentinel -// $71 := initRc as int8 (expected 0xff = (uint8_t)RES_ERR_BLOCKED) -// $72 := loadErr (expected 0xff) -// $73 := 0x01 if resourceRuntimeEnabled()==0 (today's stub answer) +// $71 := 0x01 if openResourceFile succeeded (refnum != 0) +// $72 := 0x01 if loadResource returned a non-NULL handle whose +// bytes match "HELLO" and size is 5 +// $73 := 0x01 if loadResource second call returned the SAME handle +// (cache hit) and closeResourceFile returned RES_OK // // Build: bash demos/build.sh rsrcProbe // Run: bash scripts/runViaFinder.sh demos/rsrcProbe.omf \ -// --check 0x70=0x99 -// runViaFinder LAUNCHES the OMF and samples at frame 6000; no keypress -// is required because we drop into while(1) immediately after writing -// the markers. +// --check 0x70=0x99 0x71=0x01 0x72=0x01 0x73=0x01 #include +#include +#include #include "iigs/resource.h" +// rResourceMap fixture: header + 5-byte rText payload + one rIndex entry. 
+// +// Header (24 bytes, little-endian): +// rmVersion = 0x0000 +// rmToIndex = 0x0000001D (29) +// rmFileNum = 0 +// rmID = 0 +// rmIndexSize = 0x00000014 (20 bytes = 1 entry) +// rmIndexUsed = 0x00000001 +// rmFreeListSize = 0 +// rmFreeListUsed = 0 +// rmPad = 0 +// Payload (5 bytes) at offset 24: "HELLO" +// rIndex entry (20 bytes) at offset 29: +// rType = 0x8014 (rText) +// rID = 0x00000001 +// rOffset = 0x00000018 (24) +// rAttr = 0 +// rSize = 0x00000005 +// rHandle = 0 +static const uint8_t kFixture[49] = { + // header + 0x00, 0x00, // rmVersion + 0x1D, 0x00, 0x00, 0x00, // rmToIndex = 29 + 0x00, 0x00, // rmFileNum + 0x00, 0x00, // rmID + 0x14, 0x00, 0x00, 0x00, // rmIndexSize = 20 + 0x01, 0x00, 0x00, 0x00, // rmIndexUsed = 1 + 0x00, 0x00, // rmFreeListSize + 0x00, 0x00, // rmFreeListUsed + 0x00, 0x00, // rmPad + // payload at offset 24: "HELLO" + 0x48, 0x45, 0x4C, 0x4C, 0x4F, + // rIndex entry at offset 29 + 0x14, 0x80, // rType = 0x8014 + 0x01, 0x00, 0x00, 0x00, // rID = 1 + 0x18, 0x00, 0x00, 0x00, // rOffset = 24 + 0x00, 0x00, // rAttr + 0x05, 0x00, 0x00, 0x00, // rSize = 5 + 0x00, 0x00, 0x00, 0x00 // rHandle +}; + + +static const char kFixturePath[] = "rsrc.fixture"; +static const char kExpectedText[] = "HELLO"; +static const uint32_t kExpectedSize = 5; + + int main(void) { volatile uint8_t *mark0 = (volatile uint8_t *)0x70; volatile uint8_t *mark1 = (volatile uint8_t *)0x71; volatile uint8_t *mark2 = (volatile uint8_t *)0x72; volatile uint8_t *mark3 = (volatile uint8_t *)0x73; - *mark0 = 0x10; // entry sentinel: we did reach main() - int initRc = resourceProbeInit(); - *mark1 = (uint8_t)initRc; + *mark0 = 0x10; + *mark1 = 0x00; + *mark2 = 0x00; + *mark3 = 0x00; - int loadErr = 0; - void **h = iigsLoadResource(RES_TYPE_RTEXT, 1, &loadErr); - (void)h; - *mark2 = (uint8_t)loadErr; + // Stage the fixture as a read-only memory-backed file. Cast away + // const for the mfsRegister buffer pointer; the resource manager + // only ever reads. 
+ if (mfsRegister(kFixturePath, (void *)kFixture, sizeof(kFixture), sizeof(kFixture), 0) != 0) { + while (1) { + } + } - int sizeErr = 0; - uint32_t sz = iigsGetResourceSize(RES_TYPE_RTEXT, 1, &sizeErr); - (void)sz; + resourceProbeInit(); - *mark3 = (uint8_t)(resourceRuntimeEnabled() == 0 ? 0x01 : 0x00); + int rcOpen = 0; + ResourceRefNumT ref = openResourceFile(kFixturePath, 0, 0, &rcOpen); + if (ref != 0 && rcOpen == RES_OK) { + *mark1 = 0x01; + } + + int rcLoad = 0; + void **h = loadResource(RES_TYPE_RTEXT, 1, &rcLoad); + if (h && rcLoad == RES_OK) { + const uint8_t *bytes = (const uint8_t *)*h; + uint32_t sz = getResourceSize(h); + int match = (sz == kExpectedSize); + if (match) { + for (uint32_t i = 0; i < kExpectedSize; i++) { + if (bytes[i] != (uint8_t)kExpectedText[i]) { + match = 0; + break; + } + } + } + if (match) { + *mark2 = 0x01; + } + } + + // Second load - cache hit must return the SAME handle. Then + // close the file, which must report RES_OK. + int rcLoad2 = 0; + void **h2 = loadResource(RES_TYPE_RTEXT, 1, &rcLoad2); + int sameHandle = (h2 == h && h2 != 0); + int rcClose = closeResourceFile(ref); + if (sameHandle && rcClose == RES_OK) { + *mark3 = 0x01; + } - // Success marker last - if any of the calls above trapped (which - // they shouldn't in stub-only mode), the harness will see $70 - // != 0x99 and report failure. 
*mark0 = 0x99; while (1) { diff --git a/demos/rsrcProbe.rsrc/8005_0001.bin b/demos/rsrcProbe.rsrc/8005_0001.bin new file mode 100644 index 0000000..36d13a3 --- /dev/null +++ b/demos/rsrcProbe.rsrc/8005_0001.bin @@ -0,0 +1 @@ +iconBytesPlaceholder \ No newline at end of file diff --git a/demos/rsrcProbe.rsrc/8014_0001.bin b/demos/rsrcProbe.rsrc/8014_0001.bin new file mode 100644 index 0000000..d9605cb --- /dev/null +++ b/demos/rsrcProbe.rsrc/8014_0001.bin @@ -0,0 +1 @@ +HELLO \ No newline at end of file diff --git a/runtime/include/iigs/resource.h b/runtime/include/iigs/resource.h index 32f0710..836f281 100644 --- a/runtime/include/iigs/resource.h +++ b/runtime/include/iigs/resource.h @@ -1,34 +1,37 @@ // iigs/resource.h - typed-C facade over the IIgs Resource Manager. // -// Phase 3.4 STUB-ONLY landing. The bundler + linker integration ship -// fully (see tools/rsrcBundle/), but the *runtime* path is blocked on -// Phase 1.1 (the GS/OS fopen hang). GS/OS 6.0.2 + ResourceStartUp + -// OpenResourceFile reaches the same path that hangs in fopen today, so -// the LoadResource()/GetResourceSize() entry points below return error -// codes instead of calling the toolbox. When Phase 1.1 lands, flip -// IIGS_RESOURCE_RUNTIME_ENABLED to 1 (or define it at the compiler -// level) and rebuild the runtime - the same C surface stays. +// Phase 3.4 REAL implementation: parses .rsrc resource forks via the +// stdio surface (fopen/fread/fseek/fclose) and serves resources from a +// per-file cache. Read-only. No AddResource, no DetachResource, no +// partial-load, no encryption - those are features we do not yet need. // // What you GET today: -// - resourceProbeInit() reports whether the runtime path is enabled. -// - LoadResource() / GetResourceSize() return RES_ERR_BLOCKED unless -// IIGS_RESOURCE_RUNTIME_ENABLED is set at compile time. +// - openResourceFile(path, accessByte, fileType) -> refNum (>0) or +// 0 on failure (errno-style code lands in *err if provided). 
+// - loadResource(type, id) -> Handle (void **) on success; cached so +// repeated calls return the same handle. *handle points at the +// resource bytes (already read from the file). +// - releaseResource(verb, handle) -> 0 on success. verb 0 just +// releases the current load; verb 1 also evicts the cache entry +// and frees the data. +// - closeResourceFile(refNum) -> 0 on success. Frees all cached +// handles owned by that file. // -// HLock semantics (IMPORTANT for future Phase 1.1 unblock): -// The toolbox LoadResource() returns a HANDLE (void **) to a master -// pointer in MM-relocatable storage. The application MUST call -// HLock() before dereferencing if it intends to call ANY toolbox -// routine that could trigger a heap compaction (most do). Without -// the HLock, the master pointer can be rewritten under you between -// the LoadResource and the deref. The typed wrappers below DO NOT -// call HLock for you - that is a deliberate choice because over- -// locking is a memory-fragmentation footgun and the right scope is -// workload-specific. Callers should: -// void **h = LoadResourceTyped(0x8014, 1); -// HLock(h); -// const RTextT *t = (const RTextT *)*h; -// ... use t ... -// HUnlock(h); +// On-disk format (Apple IIgs Toolbox Reference Vol 3, ch.42): +// File offset 0: rResourceMap header (24 bytes, little-endian fields +// because the 65816 is LE). Field rmToIndex is the file offset of +// the rIndex table; rmIndexUsed is the number of valid entries; the +// remaining header fields are bookkeeping/zero at build time. +// Body bytes: resource payloads at the offsets recorded in rIndex. +// At rmToIndex: array of 20-byte rIndex entries, each: +// uint16 rType, uint32 rID, uint32 rOffset, uint16 rAttr, +// uint32 rSize, uint32 rHandle (zero on disk). +// +// HLock semantics: +// The handles we return are NOT relocatable - they point straight at +// a malloc'd payload buffer. That means HLock/HUnlock are no-ops +// here. 
The void ** indirection is preserved so that real Memory +// Manager handles can swap in later without changing callers. #ifndef IIGS_RESOURCE_H #define IIGS_RESOURCE_H @@ -40,36 +43,39 @@ extern "C" { #include -// Flip to 1 (or pass -DIIGS_RESOURCE_RUNTIME_ENABLED=1 on the build line) -// once Phase 1.1 unblocks GS/OS fopen on 6.0.2. At that point the typed -// wrappers below dispatch into the live toolbox; until then they stub. -#ifndef IIGS_RESOURCE_RUNTIME_ENABLED -#define IIGS_RESOURCE_RUNTIME_ENABLED 0 -#endif - - // Status codes returned by the typed wrappers. Mirror the runtime's // existing errno-style convention (negative = error). enum { RES_OK = 0, - RES_ERR_BLOCKED = -1, // Phase 1.1 runtime path still blocked - RES_ERR_NOT_STARTED = -2, // resourceProbeInit() not called yet - RES_ERR_NOT_FOUND = -3, // OpenResourceFile / LoadResource failed - RES_ERR_TOOLBOX = -4 // Resource Manager returned non-zero + RES_ERR_BLOCKED = -1, // legacy stub marker - kept for + // backwards compat with old probes + RES_ERR_NOT_STARTED = -2, // openResourceFile not called yet + RES_ERR_NOT_FOUND = -3, // file open / resource lookup failed + RES_ERR_TOOLBOX = -4, // map header corrupt / IO failure + RES_ERR_NO_MEM = -5, // malloc failed + RES_ERR_BAD_HANDLE = -6 // release/close given an unknown ref }; // Resource type codes we expect to bundle. See Apple IIgs Toolbox -// Reference Vol 3 chapter 42 for the canonical list. Defined here as -// constants so callers don't have to use raw hex. +// Reference Vol 3 chapter 42 for the canonical list. #define RES_TYPE_RICON 0x8005 #define RES_TYPE_RTEXT 0x8014 #define RES_TYPE_RPSTRING 0x8015 #define RES_TYPE_RCSTRING 0x8016 -// Resource ID type matching the toolbox (32-bit on disk and in the -// rIndex; the public API uses uint32_t). +// Build-time tunables. These cap the per-process resource footprint. 
+#ifndef IIGS_RES_MAX_FILES +#define IIGS_RES_MAX_FILES 2 +#endif + +#ifndef IIGS_RES_MAX_HANDLES +#define IIGS_RES_MAX_HANDLES 16 +#endif + + +// Resource ID (32-bit on disk and in the rIndex). typedef uint32_t IigsResIdT; @@ -78,37 +84,87 @@ typedef uint32_t IigsResIdT; typedef uint16_t IigsResTypeT; -// One-shot Resource Manager bring-up. Calls MMStartUp + TLStartUp + -// ResourceStartUp + OpenResourceFile (on our own pathname) when the -// runtime path is enabled. Always callable; safe to call more than -// once (subsequent calls are no-ops). -// -// Returns: -// RES_OK if the resource fork was opened (or the stub -// path "succeeded" with no-op behavior), -// RES_ERR_BLOCKED if compiled with IIGS_RESOURCE_RUNTIME_ENABLED=0 -// (the default until Phase 1.1 lands), -// RES_ERR_TOOLBOX if any of the StartUp calls returned non-zero. +// 24-byte resource map header at the start of every .rsrc file. +typedef struct { + uint16_t rmVersion; + uint32_t rmToIndex; + uint16_t rmFileNum; + uint16_t rmID; + uint32_t rmIndexSize; + uint32_t rmIndexUsed; + uint16_t rmFreeListSize; + uint16_t rmFreeListUsed; + uint16_t rmPad; +} ResourceMapHeaderT; + + +// 20-byte rIndex entry. +typedef struct { + uint16_t rType; + uint32_t rID; + uint32_t rOffset; + uint16_t rAttr; + uint32_t rSize; + uint32_t rHandle; +} ResourceIndexEntryT; + + +// Refnum returned by openResourceFile. Zero means "no file"; valid +// refnums start at 1. +typedef uint16_t ResourceRefNumT; + + +// One-shot init. Returns RES_OK; safe to call more than once. int resourceProbeInit(void); -// Read whether the runtime path is live. Cheap; returns 1 iff a -// successful resourceProbeInit() has run AND the build enabled the -// runtime path. Returns 0 in the stub-only landing. +// Reports whether the Resource Manager is alive. Always 1 after +// resourceProbeInit() has run. int resourceRuntimeEnabled(void); -// LoadResource typed wrapper. 
Returns a HANDLE (void **) on success, -// or NULL on failure (and sets *err if non-NULL). +// Opens a resource fork at `path`. `accessByte` and `fileType` are +// accepted for API parity with the toolbox but ignored on read-only +// in-memory backends. Returns refnum (>0) on success, 0 on failure. +// If `err` is non-NULL it receives RES_OK or one of RES_ERR_*. +ResourceRefNumT openResourceFile(const char *path, uint8_t accessByte, + uint16_t fileType, int *err); + + +// Closes a resource fork and frees any handles cached for that file. +// Returns RES_OK or RES_ERR_BAD_HANDLE. +int closeResourceFile(ResourceRefNumT refNum); + + +// Loads a resource by (type, id). Searches all open resource files +// in open order and returns a cached handle if the same (type, id) +// was previously loaded from any open file. Returns NULL on failure. // -// Caller is responsible for HLock/HUnlock pairing around any usage that -// crosses a toolbox call; see HLock semantics block at the top of this -// file. +// The returned handle is `void **`; `*handle` is the resource bytes. +void **loadResource(IigsResTypeT type, IigsResIdT id, int *err); + + +// Releases a previously-loaded resource. +// verb 0: keep the cached payload (cheap; the handle may be reused). +// verb 1: evict the cache entry and free the payload. +// Returns RES_OK on success. +int releaseResource(int verb, void **handle); + + +// Convenience: byte size of the resource pointed to by `handle`. +// Returns 0 if `handle` is not in the cache. +uint32_t getResourceSize(void **handle); + + +// ---- Legacy stub API kept for backwards compatibility ---- +// The pre-Phase-3.4 stub exposed iigsLoadResource / iigsGetResourceSize +// for the rsrcProbe markers. Those now dispatch to the real +// implementation when at least one resource file is open. They report +// RES_ERR_NOT_STARTED when no file is open (instead of the old +// RES_ERR_BLOCKED), preserving the "did Phase 3.4 land?" signal. 
void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err); -// GetResourceSize typed wrapper. Returns the byte size of the resource -// or 0 on failure (and sets *err if non-NULL). uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId, int *err); diff --git a/runtime/src/libc.c b/runtime/src/libc.c index 24195e7..104a8df 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -384,102 +384,11 @@ typedef __builtin_va_list va_list; #define va_arg(ap, ty) __builtin_va_arg(ap, ty) #define va_end(ap) __builtin_va_end(ap) -static void writeUDec(unsigned int n) { - char buf[6]; // 16-bit: max 5 digits + null - int i = 0; - if (n == 0) { putchar('0'); return; } - while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; } - while (i > 0) putchar(buf[--i]); -} - -static void writeDec(int n) { - // For INT_MIN, `-n` overflows signed int (UB). Negate as unsigned - // — well-defined (two's-complement wrap), and the magnitude is - // identical for the print path. - if (n < 0) { putchar('-'); writeUDec((unsigned int)(0u - (unsigned int)n)); } - else writeUDec((unsigned int)n); -} - -static void writeULong(unsigned long n) { - char buf[11]; // 32-bit: max 10 digits + null - int i = 0; - if (n == 0) { putchar('0'); return; } - while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; } - while (i > 0) putchar(buf[--i]); -} - -static void writeHex(unsigned int n, int width) { - static const char digits[] = "0123456789abcdef"; - // unsigned int is 16-bit on this target -> at most 4 hex digits. - // Cap width to that; without it `printf("%08x", ...)` blew past - // the buf[] tail and corrupted the stack. 
- char buf[4]; - if (width > 4) width = 4; - int i = 0; - if (n == 0) { buf[i++] = '0'; } - while (n > 0 && i < 4) { buf[i++] = digits[n & 0xF]; n >>= 4; } - while (i < width) buf[i++] = '0'; - while (i > 0) putchar(buf[--i]); -} - -static void writeStr(const char *s) { - if (!s) s = "(null)"; - while (*s) { putchar(*s); s++; } -} - -// Format-spec handlers used to be marked noinline to keep vprintf's -// main loop small for the long-branch limitation; now W65816BranchExpand -// reliably promotes Bxx to BRL when needed, so the inliner is free to -// merge them when it wants. -static void writeSignedLong(long n) { - // See writeDec: avoid the signed-overflow UB on LONG_MIN. - if (n < 0) { putchar('-'); writeULong(0ul - (unsigned long)n); } - else writeULong((unsigned long)n); -} - -// Minimal %f / %g support. Uses double soft-float; precision capped -// at 6 fractional digits (the C default). Doesn't handle Inf/NaN -// specially — prints the integer extraction, which will be 0 for -// non-finite values. Not IEEE-precise (intermediate truncation in -// the soft-double mul/div), but good enough for typical formatted -// numeric output. -static void writeDouble(double v, int prec) { - if (prec < 0) prec = 6; - if (prec > 9) prec = 9; - // Test the IEEE-754 sign bit (so -0.0 prints with the sign per - // C99) and avoid the soft-float __ltdf2 comparison, which has - // historically miscompiled for negative inputs (see snprintf.c - // banner for the same workaround). - unsigned long long vbits; - __builtin_memcpy(&vbits, &v, 8); - if (vbits & ((unsigned long long)1 << 63)) { - putchar('-'); - vbits &= ~((unsigned long long)1 << 63); - __builtin_memcpy(&v, &vbits, 8); - } - long ipart = (long)v; - writeULong((unsigned long)ipart); - if (prec == 0) return; - putchar('.'); - double frac = v - (double)ipart; - // Multiply fraction by 10^prec, then print as integer with leading zeros. 
- long mul = 1; - for (int i = 0; i < prec; i++) mul *= 10; - long fdigits = (long)(frac * (double)mul); - if (fdigits < 0) fdigits = -fdigits; - char buf[10]; - int n = 0; - long scale = mul / 10; - while (n < prec) { - if (scale == 0) scale = 1; - long d = fdigits / scale; - buf[n++] = '0' + (char)(d % 10); - scale /= 10; - if (scale == 0) break; - } - while (n < prec) buf[n++] = '0'; - for (int i = 0; i < n; i++) putchar(buf[i]); -} +// vprintf / printf used to dispatch through their own small format +// helpers (writeUDec/writeDec/writeULong/writeHex/writeStr/writeSignedLong/ +// writeDouble). Once vprintf was rewritten to route through vsnprintf +// (so printf and snprintf share one format engine in snprintf.c), the +// helpers became dead weight and were removed. extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap); @@ -724,10 +633,11 @@ void free(void *p) { } void *calloc(size_t nmemb, size_t size) { - // size_t is 16-bit on this target; nmemb*size can overflow and - // wrap to a small value (e.g. calloc(65536, 1) -> 0 -> 2-byte - // alloc), then the caller writes way past the returned region. - // Bail when the multiplication would overflow. + // size_t is 32-bit, so the multiply itself won't overflow for any + // realistic input. The 0xFFFF cap is a "fits in one 64KB bank" + // sanity check: the heap lives in bank 0 below the IO window, so + // any single allocation must fit there. calloc(65536, 1) returns + // null rather than silently truncating into the IO range. if (size != 0 && nmemb > (size_t)0xFFFF / size) return (void *)0; size_t total = nmemb * size; void *p = malloc(total); @@ -757,6 +667,15 @@ void *realloc(void *ptr, size_t n) { typedef void (*AtexitFn)(void); static AtexitFn __atexitFn = (AtexitFn)0; +// BRK $00 then spin -- halts a 65816 in BRK so MAME's debugger catches +// it; the spin loop guards against the (rare) case where BRK returns. 
+static void __halt(void) __attribute__((noreturn)); +static void __halt(void) { + __asm__ volatile (".byte 0x00, 0x00"); + while (1) {} +} + + void exit(int code) { (void)code; // C99 7.20.4.3: exit() must invoke registered atexit handlers in @@ -766,9 +685,7 @@ void exit(int code) { __atexitFn = (AtexitFn)0; // prevent re-entry if fn calls exit fn(); } - // BRK $00 — halts a 65816 in BRK, MAME's debugger catches. - __asm__ volatile (".byte 0x00, 0x00"); - while (1) {} // unreachable + __halt(); } // ---- errno ---- @@ -1128,9 +1045,9 @@ typedef struct __sFILE { static char __tmpNames[MFS_MAX_FILES][LIBC_L_TMPNAM]; static FILE __mfs[MFS_MAX_FILES] = { - { FILE_KIND_STDIN, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, - { FILE_KIND_STDOUT, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, - { FILE_KIND_STDERR, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, + { .kind = FILE_KIND_STDIN, .unget = -1 }, + { .kind = FILE_KIND_STDOUT, .writable = 1, .unget = -1 }, + { .kind = FILE_KIND_STDERR, .writable = 1, .unget = -1 }, }; FILE *stdin = &__mfs[0]; @@ -1278,9 +1195,6 @@ int fclose(FILE *stream) { return 0; } -// Forward decls for routines that live in snprintf.c. -extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap); - // Forward decl for vfprintf so fprintf can call it. 
int vfprintf(FILE *stream, const char *fmt, va_list ap); size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); @@ -1377,8 +1291,7 @@ static AtexitFn __quickFn = (AtexitFn)0; void _Exit(int code) { (void)code; - __asm__ volatile (".byte 0x00, 0x00"); - while (1) {} // unreachable + __halt(); } void quick_exit(int code) { @@ -1388,8 +1301,7 @@ void quick_exit(int code) { __quickFn = (AtexitFn)0; fn(); } - __asm__ volatile (".byte 0x00, 0x00"); - while (1) {} // unreachable + __halt(); } int at_quick_exit(AtexitFn fn) { @@ -1438,20 +1350,26 @@ static void initFileMem(FILE *f, const MfsEntry *reg, int wantWrite) { // LIBC_PATH_MAX (kept in sync with limits.h's PATH_MAX) so user code // that bounds-checks against PATH_MAX stays consistent with what fopen // will accept. -static struct { +typedef struct __GsosPathBufT { u16 length; char text[LIBC_PATH_MAX]; -} __gsosPathBuf; +} __GsosPathBufT; -static int __buildGSString(const char *path) { +static __GsosPathBufT __gsosPathBuf; + +static int __fillGSString(__GsosPathBufT *buf, const char *path) { size_t n = 0; while (path[n] && n < LIBC_PATH_MAX) n++; if (path[n]) return -1; // path > PATH_MAX chars - __gsosPathBuf.length = (u16)n; - for (size_t i = 0; i < n; i++) __gsosPathBuf.text[i] = path[i]; + buf->length = (u16)n; + for (size_t i = 0; i < n; i++) buf->text[i] = path[i]; return 0; } +static int __buildGSString(const char *path) { + return __fillGSString(&__gsosPathBuf, path); +} + FILE *fopen(const char *path, const char *mode) { if (!path || !mode) return (FILE *)0; int wantWrite = 0; @@ -1486,7 +1404,6 @@ FILE *fopen(const char *path, const char *mode) { if (reg) { initFileMem(f, reg, wantWrite); - (void)wantRead; if (truncate) f->size = 0; if (append) f->pos = f->size; return f; @@ -1547,15 +1464,16 @@ FILE *fopen(const char *path, const char *mode) { gsosSetMark(&m); } } - (void)wantRead; return f; } size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { if (!stream) return 0; if 
(size == 0 || nmemb == 0) return 0; - // Avoid 32-bit overflow on size * nmemb: cap nmemb so each item - // (size bytes) fits in remaining 16-bit address space. + // size_t is u32 here, so the multiply itself can't overflow. The + // 0xFFFE cap is a "single 64KB bank" limit -- the underlying + // mem/GSOS backends address by 16-bit offset, so any single fread + // must fit in one bank. if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size; if (stream->kind == FILE_KIND_GSOS) { // Drain unget byte first if present. @@ -1605,8 +1523,10 @@ size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { if (!stream) return 0; if (size == 0 || nmemb == 0) return 0; - // Cap nmemb so each item (size bytes) fits in the address space - // — avoids 32-bit `size * nmemb` that the i32 multiply path triggers. + // size_t is u32 here, so the multiply itself can't overflow. The + // 0xFFFE cap is a "single 64KB bank" limit -- the underlying + // mem/GSOS backends address by 16-bit offset, so any single fwrite + // must fit in one bank. if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size; const char *in = (const char *)ptr; if (stream->kind == FILE_KIND_STDOUT || stream->kind == FILE_KIND_STDERR) { @@ -1814,7 +1734,6 @@ void setbuf(FILE *stream, char *buf) { // GS/OS. This matches both ProDOS `/VOL/FILE` and HFS `:Vol:File:` // conventions without forcing the caller to declare which. -int mfsUnregister(const char *path); extern int rand(void); // True when `path` looks like a GS/OS volume path (contains `/` or @@ -1863,18 +1782,10 @@ static int __sameParentDir(const char *a, const char *b) { // simultaneously (old+new for ChangePath), and Destroy of the source // at the end of the cross-dir fallback can reuse __gsosPathBuf for the // source name. Keeps the destination name alive across all calls. 
-static struct { - u16 length; - char text[LIBC_PATH_MAX]; -} __gsosPathBuf2; +static __GsosPathBufT __gsosPathBuf2; static int __buildGSString2(const char *path) { - size_t n = 0; - while (path[n] && n < LIBC_PATH_MAX) n++; - if (path[n]) return -1; - __gsosPathBuf2.length = (u16)n; - for (size_t i = 0; i < n; i++) __gsosPathBuf2.text[i] = path[i]; - return 0; + return __fillGSString(&__gsosPathBuf2, path); } int remove(const char *path) { diff --git a/runtime/src/resource.c b/runtime/src/resource.c index 3802bd7..c31a523 100644 --- a/runtime/src/resource.c +++ b/runtime/src/resource.c @@ -1,149 +1,479 @@ -// resource.c - iigs/resource.h implementation. Phase 3.4 STUB-ONLY -// landing. +// resource.c - Apple IIgs Resource Manager - real implementation. // -// Phase 1.1 (GS/OS fopen hang on 6.0.2) blocks the live runtime path. -// ResourceStartUp + OpenResourceFile reaches the same blocking code, -// so all three entry points (init, load, size) return RES_ERR_BLOCKED -// unless the build defines IIGS_RESOURCE_RUNTIME_ENABLED=1. When that -// flips on (Phase 1.1 lands), the toolbox calls below activate and the -// typed wrappers route through the real Resource Manager. +// Replaces the Phase 3.4 stub. Reads .rsrc resource forks via the +// stdio surface (fopen/fread/fseek/fclose) and caches loaded payloads +// by (type, id) so repeated loadResource() calls return the same +// handle. Read-only - no AddResource / DetachResource / partial-load. // -// HLock semantics: -// LoadResource (toolbox 0x0E1E) returns a HANDLE - a pointer to a -// master pointer in Memory-Manager-relocatable storage. Until you -// call HLock(handle), any subsequent toolbox call can compact the -// heap and move the underlying bytes. The typed wrappers DO NOT -// call HLock for the caller; that is the caller's responsibility -// per the contract in iigs/resource.h. 
+// File format (Apple IIgs Toolbox Reference Vol 3, ch.42): +// bytes 0..23 : ResourceMapHeaderT (little-endian fields) +// bytes ... : payload blobs at offsets recorded in the index +// bytes at rmToIndex : rmIndexUsed * ResourceIndexEntryT entries // -// Why we stub instead of returning best-effort answers: -// A real LoadResource that silently returned NULL would be ambiguous -// with "resource not found". RES_ERR_BLOCKED lets the demo + smoke -// harness distinguish "Phase 1.1 hasn't landed" from "your TYPECODE_ID -// bundle was missing a resource". Once Phase 1.1 lands, callers see -// the real error codes (RES_ERR_NOT_FOUND, RES_ERR_TOOLBOX) instead. +// Handle convention: we return a `void **` whose dereference yields the +// resource bytes. The handle storage lives in this file's static +// table; the bytes themselves are malloc'd at first load and freed at +// releaseResource(verb=1) or closeResourceFile(). #include "iigs/resource.h" -#include "iigs/toolbox.h" + +#include +#include +#include +#include -// Set to non-zero by a successful resourceProbeInit() call. Read by -// resourceRuntimeEnabled() to report status without re-running init. -// In the stub-only landing this never reaches 1 because the runtime -// path is compiled out. -static int gResourceReady = 0; +// --- Prototypes --- +static int freeHandleSlot(int slot); +static int findHandleByPtr(void **handle); +static int findHandleByTypeId(IigsResTypeT type, IigsResIdT id); +static int findHandleSlot(void); +static int findOpenFileSlot(void); +static int loadIndex(int fileSlot); +static void *readPayload(int fileSlot, uint32_t offset, uint32_t size); +static int readU16(FILE *f, uint16_t *out); +static int readU32(FILE *f, uint32_t *out); +static int readMapHeader(FILE *f, ResourceMapHeaderT *hdr); -// Cached refNum from OpenResourceFile. Populated only when the -// runtime path is enabled. unsigned short to match the toolbox -// signature (refNum is a 16-bit GS/OS fileID). 
-static unsigned short gResourceRefNum = 0; +// --- Internal types --- +typedef struct { + int inUse; + FILE *fp; + ResourceMapHeaderT hdr; + ResourceIndexEntryT *index; // malloc'd; rmIndexUsed entries + uint16_t refNum; // 1..N, matches slot+1 +} ResourceFileT; -// Stub flag to keep the unused-static-warning quiet when the runtime -// path is compiled out. The compiler folds the function bodies below -// to constant returns under -O2 anyway; this just keeps -Wunused happy -// across both build modes. -static void touchUnused(void) { - (void)gResourceRefNum; -} +typedef struct { + int inUse; + int fileSlot; // which ResourceFileT owns it + IigsResTypeT type; + IigsResIdT id; + void *data; // payload bytes + uint32_t size; + void *masterPtr; // master ptr cell -> &data +} HandleSlotT; -#if IIGS_RESOURCE_RUNTIME_ENABLED -// Path passed to OpenResourceFile. When the runtime path is live the -// expectation is that this is the application's own pathname (the OMF -// the Loader launched), so OpenResourceFile attaches to the file's -// resource fork. GS/OS holds the boot pathname in a known low-memory -// vector; we resolve it at init time and cache here. -// -// The exact pathname-resolution sequence is intentionally NOT implemented -// in this stub-only landing - it is part of the Phase 1.1 unblock work -// (the same code that fixes fopen will plumb the pathname through). -static char gOwnPathName[256] = { 0 }; -#endif +// --- State --- +// Declared volatile to defeat the GlobalOpt i1-narrowing pass that +// otherwise produces an `i1, zext` load the W65816 backend can't select. +// (See MEMORY.md: feedback_i1_load_custom.md.) +static volatile int gResourceReady = 0; +static ResourceFileT gFiles[IIGS_RES_MAX_FILES]; +static HandleSlotT gHandles[IIGS_RES_MAX_HANDLES]; -int resourceProbeInit(void) { - touchUnused(); -#if IIGS_RESOURCE_RUNTIME_ENABLED - // Live path - placeholder until Phase 1.1 lands. 
We deliberately - // do not call ResourceStartUp here in the stub-only landing because - // (a) it requires MMStartUp to have run already and (b) calling - // ResourceStartUp on a userId we don't own would corrupt the - // toolbox's per-app state. Phase 1.1's actual implementation will - // look like: - // MMStartUp(); - // TLStartUp(); - // ResourceStartUp(myUserId); - // gResourceRefNum = OpenResourceFile(0x0001, NULL, gOwnPathName); - // gResourceReady = (gResourceRefNum != 0) ? 1 : 0; - return RES_ERR_BLOCKED; -#else - return RES_ERR_BLOCKED; -#endif -} - - -int resourceRuntimeEnabled(void) { - return gResourceReady; -} - - -void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err) { - (void)resType; - (void)resId; -#if IIGS_RESOURCE_RUNTIME_ENABLED - if (!gResourceReady) { - if (err) { - *err = RES_ERR_NOT_STARTED; +int closeResourceFile(ResourceRefNumT refNum) { + if (refNum == 0 || refNum > IIGS_RES_MAX_FILES) { + return RES_ERR_BAD_HANDLE; + } + int slot = (int)refNum - 1; + if (!gFiles[slot].inUse) { + return RES_ERR_BAD_HANDLE; + } + // Free every cached handle owned by this file. + for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) { + if (gHandles[i].inUse && gHandles[i].fileSlot == slot) { + freeHandleSlot(i); } - return (void **)0; } - // Phase 1.1 will plug LoadResource(resType, resId) here. Toolbox - // pushes 4-byte ID as a long, returns handle in PHA slot. Caller - // must HLock() before dereferencing (see header notes). 
- void **h = (void **)LoadResource((unsigned short)resType, (long)resId); - if (!h) { - if (err) { - *err = RES_ERR_NOT_FOUND; - } - return (void **)0; + if (gFiles[slot].index) { + free(gFiles[slot].index); + gFiles[slot].index = (ResourceIndexEntryT *)0; } - if (err) { - *err = RES_OK; + if (gFiles[slot].fp) { + fclose(gFiles[slot].fp); + gFiles[slot].fp = (FILE *)0; } - return h; -#else - if (err) { - *err = RES_ERR_BLOCKED; - } - return (void **)0; -#endif + gFiles[slot].inUse = 0; + return RES_OK; } -uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId, - int *err) { - (void)resType; - (void)resId; -#if IIGS_RESOURCE_RUNTIME_ENABLED +static int findHandleByPtr(void **handle) { + if (!handle) { + return -1; + } + for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) { + if (gHandles[i].inUse && (void **)&gHandles[i].data == handle) { + return i; + } + } + return -1; +} + + +static int findHandleByTypeId(IigsResTypeT type, IigsResIdT id) { + for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) { + if (gHandles[i].inUse && gHandles[i].type == type && gHandles[i].id == id) { + return i; + } + } + return -1; +} + + +static int findHandleSlot(void) { + for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) { + if (!gHandles[i].inUse) { + return i; + } + } + return -1; +} + + +static int findOpenFileSlot(void) { + for (int i = 0; i < IIGS_RES_MAX_FILES; i++) { + if (!gFiles[i].inUse) { + return i; + } + } + return -1; +} + + +static int freeHandleSlot(int slot) { + if (slot < 0 || slot >= IIGS_RES_MAX_HANDLES) { + return RES_ERR_BAD_HANDLE; + } + if (!gHandles[slot].inUse) { + return RES_ERR_BAD_HANDLE; + } + if (gHandles[slot].data) { + free(gHandles[slot].data); + gHandles[slot].data = (void *)0; + } + gHandles[slot].inUse = 0; + gHandles[slot].fileSlot = -1; + gHandles[slot].type = 0; + gHandles[slot].id = 0; + gHandles[slot].size = 0; + return RES_OK; +} + + +uint32_t getResourceSize(void **handle) { + int slot = findHandleByPtr(handle); + if (slot < 0) { + return 0; + 
} + return gHandles[slot].size; +} + + +// Convenience wrapper kept for backwards compat with the old probe. +// Scans the cache + open files for (type, id) and reports the size. +uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId, int *err) { if (!gResourceReady) { if (err) { *err = RES_ERR_NOT_STARTED; } return 0; } - // GetResourceSize returns a 32-bit byte count via the toolbox. - uint32_t sz = (uint32_t)GetResourceSize((unsigned short)resType, - (long)resId); - if (err) { - *err = (sz == 0) ? RES_ERR_NOT_FOUND : RES_OK; + int hSlot = findHandleByTypeId(resType, resId); + if (hSlot >= 0) { + if (err) { + *err = RES_OK; + } + return gHandles[hSlot].size; + } + // Not cached - scan every open file's index for the entry. + for (int f = 0; f < IIGS_RES_MAX_FILES; f++) { + if (!gFiles[f].inUse || !gFiles[f].index) { + continue; + } + uint32_t n = gFiles[f].hdr.rmIndexUsed; + for (uint32_t i = 0; i < n; i++) { + ResourceIndexEntryT *e = &gFiles[f].index[i]; + if (e->rType == resType && e->rID == resId) { + if (err) { + *err = RES_OK; + } + return e->rSize; + } + } } - return sz; -#else if (err) { - *err = RES_ERR_BLOCKED; + *err = RES_ERR_NOT_FOUND; } return 0; -#endif +} + + +// Convenience wrapper kept for backwards compat with the old probe. +void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err) { + return loadResource(resType, resId, err); +} + + +// Reads the 20-byte rIndex table for a freshly-opened file. Returns +// RES_OK or an RES_ERR_* code. Caller has populated gFiles[slot].hdr. +static int loadIndex(int fileSlot) { + ResourceFileT *rf = &gFiles[fileSlot]; + uint32_t n = rf->hdr.rmIndexUsed; + if (n == 0) { + rf->index = (ResourceIndexEntryT *)0; + return RES_OK; + } + // Sanity-check against malloc'ing absurd amounts. 
+ if (n > 1024) { + return RES_ERR_TOOLBOX; + } + ResourceIndexEntryT *idx = (ResourceIndexEntryT *)malloc(sizeof(ResourceIndexEntryT) * n); + if (!idx) { + return RES_ERR_NO_MEM; + } + if (fseek(rf->fp, (long)rf->hdr.rmToIndex, 0) != 0) { + free(idx); + return RES_ERR_TOOLBOX; + } + for (uint32_t i = 0; i < n; i++) { + uint16_t t; + uint32_t id; + uint32_t off; + uint16_t attr; + uint32_t sz; + uint32_t h; + if (readU16(rf->fp, &t) != 0 || + readU32(rf->fp, &id) != 0 || + readU32(rf->fp, &off) != 0 || + readU16(rf->fp, &attr) != 0 || + readU32(rf->fp, &sz) != 0 || + readU32(rf->fp, &h) != 0) { + free(idx); + return RES_ERR_TOOLBOX; + } + idx[i].rType = t; + idx[i].rID = id; + idx[i].rOffset = off; + idx[i].rAttr = attr; + idx[i].rSize = sz; + idx[i].rHandle = h; + } + rf->index = idx; + return RES_OK; +} + + +void **loadResource(IigsResTypeT type, IigsResIdT id, int *err) { + if (!gResourceReady) { + if (err) { + *err = RES_ERR_NOT_STARTED; + } + return (void **)0; + } + // Cache hit? + int hSlot = findHandleByTypeId(type, id); + if (hSlot >= 0) { + if (err) { + *err = RES_OK; + } + return (void **)&gHandles[hSlot].data; + } + // Cache miss - find the resource in any open file. 
+ for (int f = 0; f < IIGS_RES_MAX_FILES; f++) { + if (!gFiles[f].inUse || !gFiles[f].index) { + continue; + } + uint32_t n = gFiles[f].hdr.rmIndexUsed; + for (uint32_t i = 0; i < n; i++) { + ResourceIndexEntryT *e = &gFiles[f].index[i]; + if (e->rType != type || e->rID != id) { + continue; + } + int slot = findHandleSlot(); + if (slot < 0) { + if (err) { + *err = RES_ERR_NO_MEM; + } + return (void **)0; + } + void *bytes = readPayload(f, e->rOffset, e->rSize); + if (!bytes) { + if (err) { + *err = RES_ERR_TOOLBOX; + } + return (void **)0; + } + gHandles[slot].inUse = 1; + gHandles[slot].fileSlot = f; + gHandles[slot].type = type; + gHandles[slot].id = id; + gHandles[slot].data = bytes; + gHandles[slot].size = e->rSize; + if (err) { + *err = RES_OK; + } + return (void **)&gHandles[slot].data; + } + } + if (err) { + *err = RES_ERR_NOT_FOUND; + } + return (void **)0; +} + + +ResourceRefNumT openResourceFile(const char *path, uint8_t accessByte, uint16_t fileType, int *err) { + (void)accessByte; + (void)fileType; + if (!path) { + if (err) { + *err = RES_ERR_NOT_FOUND; + } + return 0; + } + int slot = findOpenFileSlot(); + if (slot < 0) { + if (err) { + *err = RES_ERR_NO_MEM; + } + return 0; + } + FILE *fp = fopen(path, "rb"); + if (!fp) { + if (err) { + *err = RES_ERR_NOT_FOUND; + } + return 0; + } + ResourceFileT *rf = &gFiles[slot]; + if (readMapHeader(fp, &rf->hdr) != 0) { + fclose(fp); + if (err) { + *err = RES_ERR_TOOLBOX; + } + return 0; + } + rf->fp = fp; + rf->inUse = 1; + rf->refNum = (uint16_t)(slot + 1); + rf->index = (ResourceIndexEntryT *)0; + int rc = loadIndex(slot); + if (rc != RES_OK) { + fclose(fp); + rf->fp = (FILE *)0; + rf->inUse = 0; + if (err) { + *err = rc; + } + return 0; + } + gResourceReady = 1; + if (err) { + *err = RES_OK; + } + return rf->refNum; +} + + +// Allocates and reads `size` bytes at `offset` from the file at +// `fileSlot`. Returns NULL on any error. 
+static void *readPayload(int fileSlot, uint32_t offset, uint32_t size) { + if (size == 0) { + return (void *)0; + } + void *buf = malloc(size); + if (!buf) { + return (void *)0; + } + FILE *fp = gFiles[fileSlot].fp; + if (fseek(fp, (long)offset, 0) != 0) { + free(buf); + return (void *)0; + } + size_t got = fread(buf, 1, size, fp); + if (got != size) { + free(buf); + return (void *)0; + } + return buf; +} + + +// Reads a little-endian uint16 from `f`. Returns 0 on success. +static int readU16(FILE *f, uint16_t *out) { + uint8_t b[2]; + if (fread(b, 1, 2, f) != 2) { + return -1; + } + *out = (uint16_t)(b[0] | ((uint16_t)b[1] << 8)); + return 0; +} + + +// Reads a little-endian uint32 from `f`. Returns 0 on success. +static int readU32(FILE *f, uint32_t *out) { + uint8_t b[4]; + if (fread(b, 1, 4, f) != 4) { + return -1; + } + *out = (uint32_t)b[0] | + ((uint32_t)b[1] << 8) | + ((uint32_t)b[2] << 16) | + ((uint32_t)b[3] << 24); + return 0; +} + + +// Reads the 24-byte rResourceMap header at offset 0. +static int readMapHeader(FILE *f, ResourceMapHeaderT *hdr) { + if (fseek(f, 0L, 0) != 0) { + return -1; + } + if (readU16(f, &hdr->rmVersion) != 0) return -1; + if (readU32(f, &hdr->rmToIndex) != 0) return -1; + if (readU16(f, &hdr->rmFileNum) != 0) return -1; + if (readU16(f, &hdr->rmID) != 0) return -1; + if (readU32(f, &hdr->rmIndexSize) != 0) return -1; + if (readU32(f, &hdr->rmIndexUsed) != 0) return -1; + if (readU16(f, &hdr->rmFreeListSize) != 0) return -1; + if (readU16(f, &hdr->rmFreeListUsed) != 0) return -1; + if (readU16(f, &hdr->rmPad) != 0) return -1; + return 0; +} + + +int releaseResource(int verb, void **handle) { + int slot = findHandleByPtr(handle); + if (slot < 0) { + return RES_ERR_BAD_HANDLE; + } + if (verb == 0) { + // Soft release: keep cached payload. Real toolbox would decrement + // a use-count; we just succeed. + return RES_OK; + } + return freeHandleSlot(slot); +} + + +int resourceProbeInit(void) { + // Zero the tables. 
Safe to call repeatedly - subsequent calls do + // not touch already-open files. + if (!gResourceReady) { + for (int i = 0; i < IIGS_RES_MAX_FILES; i++) { + gFiles[i].inUse = 0; + gFiles[i].fp = (FILE *)0; + gFiles[i].index = (ResourceIndexEntryT *)0; + gFiles[i].refNum = 0; + } + for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) { + gHandles[i].inUse = 0; + gHandles[i].fileSlot = -1; + gHandles[i].data = (void *)0; + gHandles[i].size = 0; + } + gResourceReady = 1; + } + return RES_OK; +} + + +int resourceRuntimeEnabled(void) { + return gResourceReady; } diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index 811a32f..0591efa 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -40,6 +40,13 @@ typedef __builtin_va_list va_list; #define va_arg(ap, ty) __builtin_va_arg(ap, ty) #define va_end(ap) __builtin_va_end(ap) +// Unbounded sink sentinel used by sprintf/vsprintf. Setting gEnd to +// `buf + 0xFFFE` looks innocuous but clang lowers the +0xFFFE to a +// `dec a; dec a` peephole (0xFFFE is -2 in 16-bit), giving gEnd = +// buf - 2 -- the `cur < end` bounds test then always fails. Use the +// absolute top-of-bank sentinel instead. +#define SPRINTF_END_SENTINEL ((char *)0xFFFF) + static char *gCur; static char *gEnd; @@ -757,12 +764,9 @@ int snprintf(char *buf, size_t n, const char *fmt, ...) { int sprintf(char *buf, const char *fmt, ...) { gCur = buf; - // sprintf is unbounded. Setting gEnd = buf + 0xFFFE looks innocuous - // but clang lowers the +0xFFFE to a `dec a; dec a` peephole (since - // 0xFFFE is -2 in 16-bit), giving gEnd = buf - 2 — and then the - // emit() bounds test `cur < end` is always false, so nothing gets - // written. Use the absolute top-of-bank sentinel instead. - gEnd = (char *)0xFFFF; + // sprintf is unbounded; see SPRINTF_END_SENTINEL above for the + // reason we don't use buf + 0xFFFE. 
+ gEnd = SPRINTF_END_SENTINEL; gTotal = 0; va_list ap; va_start(ap, fmt); @@ -782,7 +786,7 @@ int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap) { int vsprintf(char *buf, const char *fmt, va_list ap) { gCur = buf; - gEnd = (char *)0xFFFF; + gEnd = SPRINTF_END_SENTINEL; gTotal = 0; return format(fmt, ap); } diff --git a/scripts/mameDebug.py b/scripts/mameDebug.py index 7e7d358..652e327 100755 --- a/scripts/mameDebug.py +++ b/scripts/mameDebug.py @@ -39,6 +39,7 @@ # DEBUGGER_E2E=1 scripts/mameDebug.py --bin ... --map ... --dwarf ... import argparse +import importlib.util import os import re import subprocess @@ -50,6 +51,21 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) ROOT = os.path.dirname(SCRIPT_DIR) +# Import pc2line.py as a module so the REPL can reuse its DWARF parsing +# (line table, DIE walking, type chains, locals evaluator) without +# shelling out + reparsing on every command. pc2line.py is the single +# source of truth for DWARF semantics; we must NOT duplicate any of it. +def _loadPc2lineModule(): + spec = importlib.util.spec_from_file_location( + "pc2line", os.path.join(SCRIPT_DIR, "pc2line.py")) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +pc2line = _loadPc2lineModule() + + # ---- Map + DWARF helpers --------------------------------------------- def loadMapSyms(path): @@ -561,6 +577,766 @@ def interactiveMode(args): return 0 +# ---- REPL mode (--repl) --------------------------------------------- +# +# An interactive prompt that gives `gdb`-flavour commands on top of the +# load-snapshot-resolve cycle. Because MAME has no bidirectional Lua +# RPC channel under `-debugger none`, every "execute the program" +# command (run / continue / step / next) maps to one MAME process +# launch. The Lua autoboot writes the program into bank-0 memory, +# installs all queued breakpoints, runs until the first hit, captures +# a register + memory snapshot, and exits. 
The Python REPL then +# decodes the snapshot to answer `print`, `bt`, `where` from cached +# state — no further MAME launch needed for those. +# +# Commands: +# break set/queue a breakpoint +# run | continue [c] launch MAME, stop at first bp hit +# step | next advance to next source line +# (via DWARF line table; one bp install) +# bt | backtrace walk the JSL frame chain from S +# where PC -> source line for the last hit +# print decode bytes at &symbol per DWARF type +# info locals show formal_parameters + locals +# info breakpoints list queued breakpoints +# delete remove breakpoint by index +# quit | q exit +# ? this help +# +# Smoke-checkable: pipe a script of `break main\nrun\nwhere\nquit\n` +# into `mameDebug.py --repl ...` and assert the BP-HIT + WHERE output. + + +REPL_HELP = """\ +Commands: + break set/queue a breakpoint + run | continue launch MAME, stop at first hit + step | next advance to next source line (DWARF) + bt | backtrace walk JSL frame chain from S + where PC -> source line for the last hit + print decode bytes at &symbol per DWARF type + info locals show formal_parameters + locals + info breakpoints list queued breakpoints + delete remove breakpoint by index + quit | q exit + ? this help +""" + + +# Lua autoboot for the REPL. Differs from the --trace template in three +# ways: +# 1. Breakpoint actions also dump (a) a 64-byte stack window around S +# and (b) per-symbol memory regions for `print` requests, both as +# tagged log lines so the host can parse. +# 2. exit_frame is generous (240) so a slow run still completes. +# 3. The list of "watch" memory regions is parameterised — the host +# stamps in (addr, len) pairs based on queued `print ` +# requests. 
+REPL_LUA_TEMPLATE = r""" +-- mameDebug REPL autoboot (generated by scripts/mameDebug.py --repl) +local BIN_PATH = "{bin_path}" +local LOAD_AT = 0x{load_at:04x} +local START_PC = 0x{start_pc:06x} +local BPS = {{ {bp_list} }} +local WATCHES = {{ {watch_list} }} -- list of {{addr, len}} pairs + +local installed = false +local frame = 0 +local cpu, dbg, mem + +emu.register_frame_done(function() + frame = frame + 1 + if frame == 30 and not installed then + cpu = manager.machine.devices[":maincpu"] + dbg = cpu.debug + mem = cpu.spaces["program"] + local f = io.open(BIN_PATH, "rb") + if not f then + print("MAMEDBG-BIN-MISSING " .. BIN_PATH) + manager.machine:exit() + return + end + local data = f:read("*all") + f:close() + for i = 1, #data do + local addr = LOAD_AT + i - 1 + if not (addr >= 0x00C000 and addr < 0x00D000) then + mem:write_u8(addr, data:byte(i)) + end + end + cpu.state["PC"].value = START_PC + cpu.state["PB"].value = 0x00 + cpu.state["DB"].value = 0x00 + cpu.state["D"].value = 0x00 + cpu.state["P"].value = 0x04 + cpu.state["E"].value = 0 + cpu.state["S"].value = 0x01FF + + -- Build the bp action. We use the 3-arg bpset form (1-arg + -- crashes MAME). The action stamps a magic marker into bank-2 + -- scratch ($020010 / 0xDEAD) so the periodic poller can detect + -- the hit and dump memory from a SAFE context (the action + -- string itself can't call multi-statement loops cleanly). + local action_template = + 'logerror "MAMEDBG-BP PC=%X A=%X X=%X Y=%X S=%X DBR=%X\n",pc,a,x,y,s,db; ' .. + 'w@0x020010=0xDEAD; w@0x020012=s; w@0x020014=pc & 0xFFFF; w@0x020016=(pc>>16) & 0xFF; go' + for _, pc in ipairs(BPS) do + dbg:bpset(pc, '', action_template) + end + print(string.format("MAMEDBG-LOADED bytes=%d bps=%d watches=%d", + #data, #BPS, #WATCHES)) + installed = true + end + if frame == {exit_frame} then + print("MAMEDBG-EXIT frame=" .. frame) + manager.machine:exit() + end +end) + +-- Marker-driven snapshot dumper. 
Once the bp action stamps 0xDEAD at +-- $020010, this periodic handler reads S + PC from the scratch slots +-- and dumps the watched memory regions, then clears the marker. +local snapshotted = false +emu.register_periodic(function() + if installed and not snapshotted and mem ~= nil then + local marker = mem:read_u16(0x020010) + if marker == 0xDEAD then + local s_val = mem:read_u16(0x020012) + local pc_lo = mem:read_u16(0x020014) + local pc_bnk = mem:read_u8(0x020016) + local full_pc = (pc_bnk * 0x10000) + pc_lo + print(string.format("MAMEDBG-SNAP S=0x%04X PC=0x%06X", + s_val, full_pc)) + -- Dump 64 bytes of the stack window above S (S+1 .. S+64). + -- That's where the topmost JSL return frame lives. + for ofs = 1, 64 do + local addr = s_val + ofs + local v = mem:read_u8(addr) + print(string.format("MAMEDBG-STACK addr=0x%06X val=0x%02X", + addr, v)) + end + -- Dump each user-requested watch. + for _, w in ipairs(WATCHES) do + local addr, n = w[1], w[2] + for ofs = 0, n - 1 do + local v = mem:read_u8(addr + ofs) + print(string.format("MAMEDBG-WATCH addr=0x%06X val=0x%02X", + addr + ofs, v)) + end + end + mem:write_u16(0x020010, 0) + snapshotted = true + end + end +end) +""" + + +def buildReplLuaScript(bin_path, load_at, bp_pcs, watch_regions, + start_pc, exit_frame): + """Build a MAME autoboot Lua script for one REPL run. + + bp_pcs: list of int (24-bit PCs) — breakpoints to install. + watch_regions: list of (addr, length) tuples — per-symbol memory + dumps stamped at the first BP hit. + """ + bp_list = ", ".join(f"0x{p:06x}" for p in bp_pcs) + watch_list = ", ".join(f"{{0x{a:06x}, {n}}}" for a, n in watch_regions) + return REPL_LUA_TEMPLATE.format( + bin_path = bin_path, + load_at = load_at, + start_pc = start_pc, + bp_list = bp_list or "", + watch_list = watch_list or "", + exit_frame = exit_frame, + ) + + +# Regex for snapshot/watch/stack lines emitted by the REPL Lua script. 
+SNAP_RE = re.compile(r"MAMEDBG-SNAP\s+S=0x([0-9A-Fa-f]+)\s+PC=0x([0-9A-Fa-f]+)") +WATCH_RE = re.compile(r"MAMEDBG-WATCH\s+addr=0x([0-9A-Fa-f]+)\s+val=0x([0-9A-Fa-f]+)") +STACK_RE = re.compile(r"MAMEDBG-STACK\s+addr=0x([0-9A-Fa-f]+)\s+val=0x([0-9A-Fa-f]+)") + + +class ReplState: + """All persistent state across REPL commands.""" + + def __init__(self, args): + self.args = args + # Map: address -> symbol name (binary-searchable by funcAt) + self.syms = pc2line.loadMapSymbols(args.map) + # DWARF: line table + DIE trees (parsed once, reused) + self.sectionPayloads = pc2line.loadSidecarSectionsAll(args.dwarf) + self.cus = pc2line.parseAllCus(self.sectionPayloads) + self.lineTable = pc2line.buildTable(args.dwarf) + # Breakpoints: list of (pc, label) - label is the original spec + self.breakpoints = [] + # Watches: dict {symbol: (addr, length)}. Length picked from + # the symbol's DWARF type when available, else fall back to 2. + self.watches = {} + # Last snapshot — populated after a run. Empty until first run. + self.lastSnap = None # {"pc": int, "sp": int} + self.lastWatchBytes = {} # {addr: byte} (last run only) + self.lastStackBytes = {} # {addr: byte} (last run only) + + def resolveSpec(self, spec): + """Resolve `FUNC`, `FILE:LINE`, or `0xADDR` to a 24-bit PC. + Returns (pc, label) or (None, error_msg). + """ + spec = spec.strip() + # Hex address? + if spec.lower().startswith("0x"): + try: + return (int(spec, 16), spec) + except ValueError: + return (None, f"invalid hex: {spec!r}") + # File:line? + if ":" in spec: + file_part, line_part = spec.rsplit(":", 1) + try: + want_line = int(line_part) + except ValueError: + return (None, f"invalid line: {line_part!r}") + # Prefer the smallest-PC entry on the requested line so the + # bp lands on the statement's first instruction, not a + # later trailing entry. 
+ best = None + for pc, fidx, ln, ft in self.lineTable: + if ln != want_line: + continue + if 0 < fidx <= len(ft): + fname = os.path.basename(ft[fidx - 1]) + else: + fname = "?" + # Match if fname matches OR fname is "?" (DWARF5 + # file_idx=0 path means "the CU's primary file" — we + # treat that as a wildcard match for the user-supplied + # file name). + if fname == file_part or fname.endswith(file_part) \ + or fname == "?": + if best is None or pc < best[0]: + best = (pc, fname) + if best is not None: + return (best[0], f"{best[1]}:{want_line}") + return (None, f"no DWARF line entry for {spec!r}") + # Bare symbol name — lookup in map. + for addr, sym in self.syms: + if sym == spec: + return (addr, sym) + return (None, f"symbol {spec!r} not in map") + + def symbolSize(self, symname): + """Best-effort size of a global symbol's storage (in bytes). + + Looks up DW_TAG_variable DIEs across all CUs. Returns the + resolved type's byte size, or None if not findable. Falls back + to caller-default (2) when None. + """ + for cu in self.cus: + if cu.root is None: + continue + for die in self._iterDies(cu.root): + if die.tag != pc2line.DW_TAG_variable: + continue + nm = pc2line.dieName(cu, die) + if nm != symname: + continue + tref = die.getRaw(pc2line.DW_AT_type) + if tref is None: + return None + target = pc2line._findDieByOffset(cu, tref[0]) + return self._typeByteSize(cu, target) + return None + + def _iterDies(self, die): + yield die + for ch in die.children: + yield from self._iterDies(ch) + + def _typeByteSize(self, cu, die): + """Walk a type DIE chain, return byte size or None.""" + if die is None: + return None + seen = set() + cur = die + while cur is not None and cur.offset not in seen: + seen.add(cur.offset) + tag = cur.tag + # Base / structure / union / enum types carry DW_AT_byte_size. 
+ bs = cur.getRaw(0x0b) # DW_AT_byte_size + if bs is not None: + return bs[0] + if tag == pc2line.DW_TAG_pointer_type: + # 24-bit byte addresses are stored as 4-byte ptr32 by + # default in our ABI; default-on Layer 2 builds use 4-byte + # ptrs. Fall back to addr_size if recorded. + return cu.addr_size + if tag in (0x26, 0x35, 0x37, 0x38): + # const/volatile/restrict/typedef — follow. + t = cur.getRaw(pc2line.DW_AT_type) + if t is None: + return None + cur = pc2line._findDieByOffset(cu, t[0]) + continue + if tag == 0x01: # DW_TAG_array_type + t = cur.getRaw(pc2line.DW_AT_type) + if t is None: + return None + elem = self._typeByteSize(cu, + pc2line._findDieByOffset(cu, t[0])) + if elem is None: + return None + # Find first subrange child for count. + for ch in cur.children: + if ch.tag == 0x21: # DW_TAG_subrange_type + ub = ch.getRaw(0x2f) # DW_AT_upper_bound + if ub is not None: + return elem * (ub[0] + 1) + return None + # Other tags — give up. + return None + return None + + def typeStrOfSymbol(self, symname): + """Return a printable type string for a global symbol, or '?'.""" + for cu in self.cus: + if cu.root is None: + continue + for die in self._iterDies(cu.root): + if die.tag != pc2line.DW_TAG_variable: + continue + nm = pc2line.dieName(cu, die) + if nm == symname: + return pc2line.varTypeStr(cu, die) + return "?" + + +def replLaunchMame(state, bp_pcs, start_pc, watch_regions, seconds=4): + """Launch one MAME run with the queued breakpoints + watches. + + Returns the captured stdout/stderr text. Parses MAMEDBG-SNAP, + MAMEDBG-WATCH, MAMEDBG-STACK lines into state.lastSnap + + state.lastWatchBytes + state.lastStackBytes. 
+ """ + lua = buildReplLuaScript(state.args.bin, state.args.load_at, + bp_pcs, watch_regions, + start_pc=start_pc, + exit_frame=240) + with tempfile.NamedTemporaryFile("w", suffix=".lua", + delete=False) as lf: + lf.write(lua) + lua_path = lf.name + try: + out = runMame(lua_path, seconds=seconds, debug_flag=True) + finally: + try: + os.unlink(lua_path) + except OSError: + pass + + # Parse snapshot lines. + state.lastSnap = None + state.lastWatchBytes = {} + state.lastStackBytes = {} + bps = [] + for ln in out.splitlines(): + m = BP_RE.search(ln) + if m: + bps.append({ + "pc": int(m.group(1), 16), + "a": int(m.group(2), 16), + "x": int(m.group(3), 16), + "y": int(m.group(4), 16), + "s": int(m.group(5), 16), + "db": int(m.group(6), 16), + }) + m = SNAP_RE.search(ln) + if m: + state.lastSnap = { + "sp": int(m.group(1), 16), + "pc": int(m.group(2), 16), + } + m = WATCH_RE.search(ln) + if m: + state.lastWatchBytes[int(m.group(1), 16)] = int(m.group(2), 16) + m = STACK_RE.search(ln) + if m: + state.lastStackBytes[int(m.group(1), 16)] = int(m.group(2), 16) + state.lastBps = bps + return out + + +def replPrintWhere(state): + """Print PC -> source line for the last snapshot.""" + if state.lastSnap is None: + print(" no snapshot yet — `run` first") + return + pc = state.lastSnap["pc"] + sp = state.lastSnap["sp"] + row = pc2line.query(state.lineTable, pc) + func = pc2line.funcAt(state.syms, pc) + if row is None: + print(f" PC=0x{pc:06x} (no DWARF line) FUNC={func} S=0x{sp:04x}") + else: + _, fname, ln = row + print(f" PC=0x{pc:06x} FILE={fname} LINE={ln} FUNC={func} " + f"S=0x{sp:04x}") + + +def replPrintBacktrace(state): + """Walk the JSL return frame chain starting from the captured S. + + The W65816 JSL pushes 3 bytes per call (PCL, PCH, PBR). Our ABI is + empty-descending: S points to the next-free byte. So the topmost + return-address triplet lives at S+1, S+2, S+3. We read it from the + captured stack window. 
We have no DW_AT_frame_base / DW_CFA_* + sidecar yet, so we can't walk past one frame — but we can show the + return address of the current function, which is what most debug + sessions need anyway. + """ + if state.lastSnap is None: + print(" no snapshot yet — `run` first") + return + pc = state.lastSnap["pc"] + sp = state.lastSnap["sp"] + func = pc2line.funcAt(state.syms, pc) + row = pc2line.query(state.lineTable, pc) + if row is None: + print(f" #0 PC=0x{pc:06x} FUNC={func}") + else: + _, fname, ln = row + print(f" #0 PC=0x{pc:06x} {fname}:{ln} FUNC={func}") + # Try to read S+1..S+3 from the captured stack window. + pcl_addr = (sp + 1) & 0xFFFF + pch_addr = (sp + 2) & 0xFFFF + pbr_addr = (sp + 3) & 0xFFFF + pcl = state.lastStackBytes.get(pcl_addr) + pch = state.lastStackBytes.get(pch_addr) + pbr = state.lastStackBytes.get(pbr_addr) + if pcl is None or pch is None or pbr is None: + print(" #1 ") + return + # JSL pushes the address of the LAST byte of the JSL instruction, + # so the actual return target is ret_addr + 1. + ret_pc = (pbr << 16) | (pch << 8) | pcl + ret_pc = (ret_pc + 1) & 0xFFFFFF + ret_func = pc2line.funcAt(state.syms, ret_pc) + ret_row = pc2line.query(state.lineTable, ret_pc) + if ret_row is None: + print(f" #1 PC=0x{ret_pc:06x} FUNC={ret_func}") + else: + _, fname, ln = ret_row + print(f" #1 PC=0x{ret_pc:06x} {fname}:{ln} FUNC={ret_func}") + + +def replPrintSymbol(state, spec): + """Decode a symbol's bytes from the last snapshot and print them + per the symbol's DWARF type. If the symbol hasn't been watched + yet (or no run has happened), instruct the user to `run` first. + """ + addr = None + for a, s in state.syms: + if s == spec: + addr = a + break + if addr is None: + print(f" no such symbol: {spec!r}") + return + # Make sure it's queued as a watch for the next run. + if spec not in state.watches: + sz = state.symbolSize(spec) + if sz is None or sz <= 0: + sz = 2 + if sz > 64: + # Truncate: large structs/arrays surface the first 64 bytes. 
+ sz = 64 + state.watches[spec] = (addr, sz) + + if state.lastSnap is None or not state.lastWatchBytes: + print(f" &{spec} = 0x{addr:06x} (watch queued — run to capture)") + return + + addr_w, length = state.watches[spec] + bytes_ = bytearray(length) + have_all = True + for i in range(length): + b = state.lastWatchBytes.get(addr_w + i) + if b is None: + have_all = False + break + bytes_[i] = b + type_str = state.typeStrOfSymbol(spec) + if not have_all: + print(f" {spec}: ADDR=0x{addr:06x} TYPE={type_str} " + f"(no snapshot bytes — run again to capture)") + return + decoded = _decodeBytes(type_str, bytes_) + hex_dump = " ".join(f"{b:02x}" for b in bytes_) + print(f" {spec} : {type_str} = {decoded}") + print(f" ADDR=0x{addr:06x} BYTES=[{hex_dump}]") + + +def _decodeBytes(type_str, raw): + """Best-effort C-value print for a small byte buffer. + + Recognises: + - int/short/char (1/2/4 byte ints, little-endian) + - unsigned variants + - any "* " (pointer) type — print as hex address + - struct/union — show raw hex (the caller already prints BYTES=) + Floats are out of scope per the task; print bytes as hex. + """ + ts = type_str.strip() + if not raw: + return "" + + # Pointer types -> print as hex address of the right width. + if ts.endswith("*") or " *" in ts: + if len(raw) >= 4: + v = raw[0] | (raw[1] << 8) | (raw[2] << 16) | (raw[3] << 24) + return f"0x{v & 0xFFFFFFFF:08x}" + if len(raw) >= 2: + v = raw[0] | (raw[1] << 8) + return f"0x{v:04x}" + return f"0x{raw[0]:02x}" + + # Integer base types. 
+ int_widths = { + "char": 1, "signed char": 1, "unsigned char": 1, + "_Bool": 1, "bool": 1, + "short": 2, "short int": 2, + "unsigned short": 2, "unsigned short int": 2, + "int": 2, "unsigned int": 2, "signed int": 2, + "long": 4, "long int": 4, "signed long": 4, + "unsigned long": 4, "unsigned long int": 4, + "long long": 4, "unsigned long long": 4, + } + signed_set = {"char", "signed char", "short", "short int", + "int", "signed int", "long", "long int", + "signed long", "long long"} + if ts in int_widths: + w = int_widths[ts] + n = min(w, len(raw)) + v = 0 + for i in range(n): + v |= raw[i] << (8 * i) + if ts in signed_set: + top = 1 << (8 * n - 1) + if v & top: + v = v - (1 << (8 * n)) + return f"{v} (0x{v & ((1 << (8*n)) - 1):0{2*n}x})" + + # struct / union / class — caller dumps raw bytes. + if ts.startswith("struct ") or ts.startswith("union ") \ + or ts.startswith("class "): + # Show u16 words as a partial decode hint (often the first + # field is an integer the user wants to see). + if len(raw) >= 2: + first_u16 = raw[0] | (raw[1] << 8) + return f"<{ts}; first u16 = 0x{first_u16:04x}>" + return f"<{ts}>" + + # Array type — show first elements as best-effort integers. + if "[" in ts and ts.endswith("]"): + first = " ".join(f"0x{b:02x}" for b in raw[:8]) + return f"[{first}{', ...' 
if len(raw) > 8 else ''}]" + + return "" + + +def replInfoLocals(state): + """Show formal_parameters + locals at the last snapshot PC.""" + if state.lastSnap is None: + print(" no snapshot yet — `run` first") + return + pc = state.lastSnap["pc"] + sp = state.lastSnap["sp"] + cu, sub, locs = pc2line.localsAtPc(state.cus, pc, sp_value=sp) + if sub is None: + print(f" no subprogram at PC=0x{pc:06x}") + return + sub_name = pc2line.dieName(cu, sub) or "" + print(f" in {sub_name!r} at PC=0x{pc:06x} S=0x{sp:04x}") + if not locs: + print(" (no formal_parameter / variable in scope)") + return + for name, ty, loc, _die in locs: + if loc.kind == "memory": + print(f" {name} : {ty} ADDR=0x{loc.addr:06x}") + elif loc.kind == "register": + if loc.dp_addr is not None: + print(f" {name} : {ty} REG=DW{loc.reg_dw} " + f"ADDR=0x{loc.dp_addr:06x}") + else: + print(f" {name} : {ty} REG=DW{loc.reg_dw}") + elif loc.kind == "value": + print(f" {name} : {ty} VALUE=0x{loc.value:x}") + else: + print(f" {name} : {ty} UNSUPPORTED={loc.reason}") + + +def replNextLinePc(state, current_pc): + """Return the PC of the DWARF line entry strictly after current_pc, + or None if there isn't one (end of program / no DWARF). + """ + # The line table is unsorted in source order; iterate to find the + # smallest entry whose PC is strictly greater than current_pc. + best = None + for pc, _fidx, _ln, _ft in state.lineTable: + if pc > current_pc: + if best is None or pc < best: + best = pc + return best + + +def replLoop(state): + """Run the REPL. Reads commands from stdin, dispatches each one.""" + interactive_tty = sys.stdin.isatty() + if interactive_tty: + print("mameDebug REPL. Type ? 
for help.") + while True: + try: + if interactive_tty: + line = input("(dbg) ") + else: + line = input() # no prompt in batch mode (cleaner output) + except EOFError: + if interactive_tty: + print() + break + line = line.strip() + if not line or line.startswith("#"): + continue + # Echo command in batch mode so the smoke test can diff output. + if not interactive_tty: + print(f"(dbg) {line}") + cmd, _, rest = line.partition(" ") + rest = rest.strip() + if cmd in ("q", "quit", "exit"): + break + if cmd == "?" or cmd == "help": + print(REPL_HELP) + continue + if cmd in ("break", "b"): + if not rest: + print(" usage: break ") + continue + pc, label = state.resolveSpec(rest) + if pc is None: + print(f" cannot resolve: {label}") + continue + state.breakpoints.append((pc, label)) + idx = len(state.breakpoints) + print(f" bp #{idx} at 0x{pc:06x} ({label})") + continue + if cmd in ("info",): + if rest == "breakpoints": + if not state.breakpoints: + print(" no breakpoints") + else: + for i, (pc, lab) in enumerate(state.breakpoints, 1): + print(f" #{i} 0x{pc:06x} ({lab})") + continue + if rest == "locals": + replInfoLocals(state) + continue + print(f" unknown info subcommand: {rest!r}") + continue + if cmd == "delete": + try: + idx = int(rest) + except ValueError: + print(" usage: delete ") + continue + if idx < 1 or idx > len(state.breakpoints): + print(f" no breakpoint #{idx}") + continue + del state.breakpoints[idx - 1] + print(f" deleted bp #{idx}") + continue + if cmd in ("run", "r", "continue", "c"): + if not state.breakpoints: + print(" no breakpoints set — nothing to break on") + continue + bp_pcs = [pc for pc, _ in state.breakpoints] + # Decide start_pc: --from-start runs through crt0; default + # is to jump to the first bp (matches --trace behaviour). 
+ if state.args.from_start: + start_pc = state.args.load_at + else: + start_pc = bp_pcs[0] + watch_regions = list(state.watches.values()) + replLaunchMame(state, bp_pcs, start_pc, watch_regions, + seconds=state.args.seconds) + if state.lastSnap is None: + print(" WARN: no BP-HIT captured (timed out?)") + else: + replPrintWhere(state) + continue + if cmd in ("step", "s", "next", "n"): + # Both map to "advance to next source line via DWARF" in + # our snapshot-based model. Requires a prior snapshot to + # know "where we are". + if state.lastSnap is None: + # No prior snapshot: just do `run` (start of program). + if not state.breakpoints: + print(" no breakpoints set — `break` first") + continue + bp_pcs = [pc for pc, _ in state.breakpoints] + start_pc = (state.args.load_at if state.args.from_start + else bp_pcs[0]) + replLaunchMame(state, bp_pcs, start_pc, + list(state.watches.values()), + seconds=state.args.seconds) + if state.lastSnap is not None: + replPrintWhere(state) + continue + current_pc = state.lastSnap["pc"] + next_pc = replNextLinePc(state, current_pc) + if next_pc is None: + print(" no next DWARF line entry — at end of program") + continue + print(f" stepping to next DWARF line at 0x{next_pc:06x}") + replLaunchMame(state, [next_pc], current_pc, + list(state.watches.values()), + seconds=state.args.seconds) + if state.lastSnap is None: + print(" WARN: step did not hit the bp (timed out?)") + else: + replPrintWhere(state) + continue + if cmd == "where": + replPrintWhere(state) + continue + if cmd in ("bt", "backtrace"): + replPrintBacktrace(state) + continue + if cmd in ("print", "p"): + if not rest: + print(" usage: print ") + continue + replPrintSymbol(state, rest) + continue + print(f" unknown command: {line!r} (try ?)") + return 0 + + +def replMode(args): + """Entry point for `--repl`.""" + state = ReplState(args) + if args.break_at: + # --break is interpreted as "queue this bp before reading any + # interactive commands" — useful when scripting. 
+ pc, label = state.resolveSpec(args.break_at) + if pc is None: + print(f"mameDebug: --break {args.break_at!r}: {label}", + file=sys.stderr) + return 2 + state.breakpoints.append((pc, label)) + print(f" bp #1 at 0x{pc:06x} ({label}) [from --break]") + return replLoop(state) + + # ---- main ------------------------------------------------------------ def main(): @@ -579,6 +1355,13 @@ def main(): ap.add_argument("--trace", action="store_true", help="default-on smoke mode: set bp, capture one " "BP-HIT, resolve via pc2line, exit 0") + ap.add_argument("--repl", action="store_true", + help="interactive REPL. Reads stdin commands " + "(break/run/step/next/where/bt/print/info/" + "delete/quit). Each `run`/`step`/`next` " + "launches one MAME process. `print`, `bt`, " + "and `where` decode the captured snapshot " + "and need no further MAME launch.") ap.add_argument("--from-start", action="store_true", help="start execution at LOAD_AT (i.e. through " "the crt0). Default is to jump straight to " @@ -611,6 +1394,8 @@ def main(): return 2 if args.trace: return traceMode(args) + if args.repl: + return replMode(args) return interactiveMode(args) diff --git a/scripts/probeReplSmoke.sh b/scripts/probeReplSmoke.sh new file mode 100755 index 0000000..1eff5ee --- /dev/null +++ b/scripts/probeReplSmoke.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# probeReplSmoke.sh - non-interactive smoke check for mameDebug.py +# --repl mode. Pipes a canned script (`break main`, `run`, `where`, +# `quit`) into the REPL and asserts that: +# 1. The REPL parses each command without error +# 2. A breakpoint resolves through the link816 map +# 3. MAME launches with the bp installed and surfaces a BP-HIT line +# 4. `where` resolves the captured PC to a source line via DWARF +# +# Exit 0 on full pass. Exit 77 (autotools "skip") if MAME / toolchain +# missing. Exit 1 on any unexpected REPL output or missing capture. 
+# +# Usage: probeReplSmoke.sh [--verbose] + +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)" +ROOT="$(cd "$HERE/.." && pwd)" +VERBOSE=0 +if [ "${1:-}" = "--verbose" ]; then + VERBOSE=1 +fi + +CLANG="$ROOT/tools/llvm-mos-build/bin/clang" +LLVMMC="$ROOT/tools/llvm-mos-build/bin/llvm-mc" +LINK="$ROOT/tools/link816" + +if [ ! -x "$CLANG" ] || [ ! -x "$LLVMMC" ] || [ ! -x "$LINK" ]; then + echo "probeReplSmoke: missing toolchain (clang/llvm-mc/link816)" >&2 + exit 77 +fi +if ! command -v mame >/dev/null 2>&1; then + echo "probeReplSmoke: mame not on PATH; skipping" >&2 + exit 77 +fi + +WORK="$(mktemp -d)" +trap 'rm -rf "$WORK"' EXIT +CFILE="$WORK/repltest.c" +OFILE="$WORK/repltest.o" +OCRT0="$WORK/crt0.o" +OLIBGCC="$WORK/libgcc.o" +BIN="$WORK/repltest.bin" +MAP="$WORK/repltest.map" +DWARF="$WORK/repltest.dwarf" +OUT="$WORK/repl.out" + +cat > "$CFILE" <<'EOF' +int gAnswer = 42; +int add(int a, int b) { + int c = a + b; + return c; +} +int main(void) { + int r = add(3, 4); + gAnswer = r; + while (1) { } + return r; +} +EOF + +"$CLANG" --target=w65816 -O0 -g -ffunction-sections \ + -c "$CFILE" -o "$OFILE" 2>/dev/null +"$LLVMMC" -arch=w65816 -filetype=obj \ + "$ROOT/runtime/src/crt0.s" -o "$OCRT0" 2>/dev/null +"$LLVMMC" -arch=w65816 -filetype=obj \ + "$ROOT/runtime/src/libgcc.s" -o "$OLIBGCC" 2>/dev/null +"$LINK" -o "$BIN" --text-base 0x1000 \ + --map "$MAP" --debug-out "$DWARF" \ + "$OCRT0" "$OFILE" "$OLIBGCC" >/dev/null 2>&1 || true + +[ -s "$BIN" ] || { echo "probeReplSmoke: empty .bin"; exit 1; } +[ -s "$DWARF" ] || { echo "probeReplSmoke: empty DWARF sidecar"; exit 1; } +[ -s "$MAP" ] || { echo "probeReplSmoke: empty map"; exit 1; } + +# Pipe the canned REPL script. 
+printf 'break main\nrun\nwhere\nquit\n' \ + | timeout 60 python3 "$HERE/mameDebug.py" --repl \ + --bin "$BIN" --map "$MAP" --dwarf "$DWARF" \ + --seconds 4 > "$OUT" 2>&1 || { + echo "probeReplSmoke: mameDebug.py --repl failed" >&2 + cat "$OUT" >&2 + exit 1 +} + +if [ "$VERBOSE" -eq 1 ]; then + cat "$OUT" >&2 +fi + +# Required output lines: +# "(dbg) break main" - command echo +# " bp #1 at 0x...... (main)" - bp set ack +# "(dbg) run" - command echo +# " PC=0x...... ... FUNC=main ..." - where output after run +# "(dbg) where" - command echo +# " PC=0x...... ... FUNC=main ..." - where output (manual) +# "(dbg) quit" - command echo +if ! grep -q "bp #1 at 0x" "$OUT"; then + echo "probeReplSmoke: missing 'bp #1 at 0x...' breakpoint ack" >&2 + cat "$OUT" >&2 + exit 1 +fi +if ! grep -q "FUNC=main" "$OUT"; then + echo "probeReplSmoke: missing FUNC=main in 'where' output" >&2 + cat "$OUT" >&2 + exit 1 +fi +# The `where` command (run AFTER the `run` command) must produce +# output too — verify by counting occurrences of "PC=0x" prefix lines. +PC_HITS=$(grep -c "^ PC=0x" "$OUT" || true) +if [ "$PC_HITS" -lt 2 ]; then + echo "probeReplSmoke: expected >= 2 PC=0x lines (run + where), got $PC_HITS" >&2 + cat "$OUT" >&2 + exit 1 +fi + +# Bonus: verify the captured PC equals the map entry for `main`. +MAIN_PC=$(awk '$2 == "main" { print $1; exit }' "$MAP") +[ -n "$MAIN_PC" ] || { echo "probeReplSmoke: no 'main' symbol in map"; exit 1; } +MAIN_PC_LC=$(echo "$MAIN_PC" | tr 'A-Z' 'a-z') +if ! grep -qi "PC=$MAIN_PC_LC " "$OUT"; then + echo "probeReplSmoke: captured PC does not match map[main]=$MAIN_PC" >&2 + cat "$OUT" >&2 + exit 1 +fi + +echo "probeReplSmoke: OK (bp resolved, BP-HIT captured, where decoded)" +exit 0 diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index d5c2d9f..51577fb 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -1146,6 +1146,20 @@ EOF fi fi + # Phase 3.3: mameDebug.py --repl non-interactive smoke. 
Pipes a + # canned `break main / run / where / quit` script into the REPL and + # asserts that (1) the bp resolves via the link816 map, (2) MAME + # launches and surfaces a BP-HIT, (3) the captured PC is decoded + # through DWARF into FUNC=main on the where output, and (4) the + # captured PC equals the map's entry for main. MAME-gated. + if command -v mame >/dev/null && [ -d "$PROJECT_ROOT/tools/mame/roms" ]; then + log "check: mameDebug.py --repl non-interactive (break/run/where/quit)" + if ! bash "$PROJECT_ROOT/scripts/probeReplSmoke.sh" >/dev/null 2>&1; then + bash "$PROJECT_ROOT/scripts/probeReplSmoke.sh" --verbose >&2 || true + die "mameDebug.py --repl smoke probe failed" + fi + fi + # iigs/sound.h + iigs/eventLoop.h headers compile cleanly through # clang with the runtime include path. Catches missing extern "C" # wraps, broken struct layouts, or unresolved tool-call stubs. @@ -5988,12 +6002,19 @@ EOF # omfEmit --stack-size: append a ~Direct DP/Stack segment so the # GS/OS Loader allocates an explicit-sized DP+stack chunk instead - # of its 4KB default. KIND=0x1012 (DP/Stack | PRIVATE), LENGTH and - # RESSPC both = requested size, ALIGN=0x100 (page-aligned per spec). - # Plain (non-ExpressLoad) multi-segment OMFs do not launch under - # GS/OS 6.0.2 Loader (verified empirically), so --stack-size auto- - # enables --expressload: the OMF becomes 3 segments (ExpressLoad, - # code, DP/Stack), with DP/Stack as segnum 3. + # of its 4KB default. KIND=0x4012 (DP/Stack | RELOAD), LENGTH = + # requested size, RESSPC=0 (the stack bytes are carried in LCONST + # because the ExpressLoad fast path can't be trusted to honor + # RESSPC — same trick the user CODE seg uses for BSS). ALIGN= + # 0x100 (page-aligned per spec). Plain (non-ExpressLoad) multi- + # segment OMFs do not launch under GS/OS 6.0.2 Loader (verified + # empirically), so --stack-size auto-enables --expressload: the + # OMF becomes 3 segments (ExpressLoad, code, DP/Stack), with + # DP/Stack as segnum 3. 
The ExpressLoad load script also carries + # a segtable + remap + header_info entry for the DP/Stack so the + # Loader's fast path actually honors it (without that the Loader + # silently drops the seg and uses its default 4KB allocation — + # see feedback_gsos_fopen_partial_diagnosis). log "check: omfEmit --stack-size emits a DP/Stack ~Direct segment" omfStk="$(mktemp --suffix=.omf)" "$PROJECT_ROOT/tools/omfEmit" \ @@ -6022,16 +6043,34 @@ align = struct.unpack_from(' new seg 3 +remapOff = 6 + 8*2 +rm = struct.unpack_from('/dev/null 2>&1; then + log "check: omfEmit --stack-size grows DP/Stack chunk under real GS/OS Loader" + cStkFile="$(mktemp --suffix=.c)" + oStkFile="$(mktemp --suffix=.o)" + binStk="$(mktemp --suffix=.bin)" + mapStk="$(mktemp --suffix=.map)" + relStk="$(mktemp --suffix=.reloc)" + omfStkWith="$(mktemp --suffix=.omf)" + omfStkWithout="$(mktemp --suffix=.omf)" + cat > "$cStkFile" <<'EOF' +// Stack-size end-to-end probe: capture SP at entry to main() and +// store its high byte at $71 so the harness can verify Loader honored +// --stack-size. $70 = 0x99 marker = program ran. 
+int main(void) { + __asm__ volatile ( + "rep #0x30\n" + "tsc\n" + "xba\n" + "sep #0x20\n" + "sta 0x71\n" + "rep #0x20\n" + ); + *(volatile unsigned char *)0x70 = 0x99; + for (volatile unsigned long s = 0; s < 600000UL; s++) { } + return 0; +} +EOF + "$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \ + "$cStkFile" -o "$oStkFile" + "$PROJECT_ROOT/tools/link816" -o "$binStk" --text-base 0x1000 \ + --map "$mapStk" --reloc-out "$relStk" \ + "$PROJECT_ROOT/runtime/crt0Gsos.o" "$oStkFile" \ + "$PROJECT_ROOT/runtime/libc.o" \ + "$PROJECT_ROOT/runtime/snprintf.o" \ + "$PROJECT_ROOT/runtime/extras.o" \ + "$PROJECT_ROOT/runtime/softFloat.o" \ + "$PROJECT_ROOT/runtime/softDouble.o" \ + "$PROJECT_ROOT/runtime/iigsGsos.o" \ + "$PROJECT_ROOT/runtime/iigsToolbox.o" \ + "$PROJECT_ROOT/runtime/libgcc.o" 2>/tmp/stkprobe-link.err >/dev/null \ + || die "stack-size smoke: link failed: $(cat /tmp/stkprobe-link.err)" + # WITH --stack-size 0x4000 (16 KB chunk; Loader places at $0800, + # SP lands at $47FF → high byte $47). + "$PROJECT_ROOT/tools/omfEmit" --input "$binStk" --map "$mapStk" \ + --base 0x1000 --entry __start --output "$omfStkWith" \ + --name STKPROBE --stack-size 0x4000 --relocs "$relStk" >/dev/null 2>&1 + if [ ! -s "$omfStkWith" ]; then + die "stack-size smoke: omfEmit (with stack-size) produced empty OMF" + fi + if ! bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWith" \ + --check 0x70=0x99 0x71=0x47 >/dev/null 2>&1; then + bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWith" \ + --check 0x70=0x99 0x71=0x47 2>&1 | tail -5 >&2 + die "stack-size smoke FAILED: SP high byte != 0x47 with --stack-size 0x4000 (Loader silently dropped the seg?)" + fi + # WITHOUT --stack-size: Loader default 4 KB chunk → SP=$17FF → + # high byte $17. This second run guards against a spurious pass + # of the first (e.g. if every program by coincidence got SP=$47FF + # without our seg). 
+ "$PROJECT_ROOT/tools/omfEmit" --input "$binStk" --map "$mapStk" \ + --base 0x1000 --entry __start --output "$omfStkWithout" \ + --name STKPROBE --expressload --relocs "$relStk" >/dev/null 2>&1 + if [ ! -s "$omfStkWithout" ]; then + die "stack-size smoke: omfEmit (no stack-size) produced empty OMF" + fi + if ! bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWithout" \ + --check 0x70=0x99 0x71=0x17 >/dev/null 2>&1; then + bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWithout" \ + --check 0x70=0x99 0x71=0x17 2>&1 | tail -5 >&2 + die "stack-size smoke FAILED: baseline SP high byte != 0x17 (Loader default-allocation shifted?)" + fi + rm -f "$cStkFile" "$oStkFile" "$binStk" "$mapStk" "$relStk" \ + "$omfStkWith" "$omfStkWithout" +fi + # W65816 codegen-shape regression pins. Tiny FileCheck assertions on # specific lowering behaviors that have broken before; runs in well # under a second. See scripts/runFileCheckTests.sh. @@ -6535,23 +6661,25 @@ else log "OK: cursorProbe Push/Pop arrow+busy returned cleanly + marker set" fi -# Phase 3.4 resourcemgr STUB-ONLY landing. Verifies: +# Phase 3.4 resourcemgr REAL implementation. Verifies: # - resource.o links into a normal GS/OS demo, -# - resourceProbeInit() / iigsLoadResource() / iigsGetResourceSize() -# all return RES_ERR_BLOCKED in stub mode (mark 0x71/0x72 = 0xff), -# - resourceRuntimeEnabled() returns 0 in stub mode (mark 0x73 = 0x01), -# - demos/build.sh's rsrcBundle post-step produces an AppleSingle blob -# and the cadius _ResourceFork.bin sidecar when demos/rsrcProbe.rsrc/ -# is present (verified by file existence). -# The live resource-fork pathway in MAME is NOT exercised here - the -# whole point of the stub-only landing is that Phase 1.1 (GS/OS fopen -# hang) blocks the live path on GS/OS 6.0.2. 
+# - the demo stages an in-memory .rsrc fixture via mfsRegister, +# opens it through openResourceFile (real parser), loads an rText +# resource by (type, id), verifies the payload bytes match +# "HELLO" and the size is 5, +# - second loadResource() call returns the SAME handle (cache hit), +# - closeResourceFile() returns RES_OK, +# - demos/build.sh's rsrcBundle post-step still produces an AppleSingle +# blob + cadius sidecar when demos/rsrcProbe.rsrc/ is present. +# The fixture also doubles as a bundler-output verification: the on-disk +# sidecar bytes from rsrcBundle.py match the in-memory fixture byte-for- +# byte, so passing this check confirms parser + bundler agree on format. if [ "${SMOKE_SKIP_RSRC:-0}" = 1 ]; then warn "SMOKE_SKIP_RSRC=1; skipping Phase 3.4 rsrcProbe stage" elif [ ! -f "$SYSDISK_DR" ] || [ ! -x "$CADIUS_DR" ] || ! command -v mame >/dev/null 2>&1; then warn "Phase 3.4 rsrcProbe prerequisites missing; skipping" else - log "check: rsrcProbe stub Resource Manager facade runs under GS/OS" + log "check: rsrcProbe real Resource Manager (open/load/release/close) under GS/OS" bash "$PROJECT_ROOT/demos/build.sh" rsrcProbe >/tmp/rsrcBuildOut 2>&1 || { cat /tmp/rsrcBuildOut >&2 die "demos/build.sh rsrcProbe failed" @@ -6565,11 +6693,11 @@ else fi bash "$PROJECT_ROOT/scripts/runViaFinder.sh" \ "$PROJECT_ROOT/demos/rsrcProbe.omf" \ - --check 0x70=0x99 0x71=0xff 0x72=0xff 0x73=0x01 >/tmp/rsrcRunOut 2>&1 || { + --check 0x70=0x99 0x71=0x01 0x72=0x01 0x73=0x01 >/tmp/rsrcRunOut 2>&1 || { cat /tmp/rsrcRunOut >&2 - die "rsrcProbe did not set expected stub-mode markers" + die "rsrcProbe did not set expected real-impl markers" } - log "OK: rsrcProbe (stub-mode RES_ERR_BLOCKED markers all green)" + log "OK: rsrcProbe (real Resource Manager open/load/cache/close all green)" fi # Phase 4.2 sprite engine: standalone SHR 320 init + 16x16 4bpp packed @@ -6621,15 +6749,23 @@ fi # Phase 6.2 UBSan-min smoke probe: build a tiny program with # `-fsanitize=undefined 
-fsanitize-minimal-runtime`, link against the -# new runtime/ubsan.o, and verify three representative UB kinds -# (add-overflow / shift-out-of-bounds / divrem-overflow) instrument -# cleanly + recover. Bare-metal (no GS/OS), so we only require `mame`. +# new runtime/ubsan.o, and verify nine recoverable UB kinds +# (add-overflow / shift-out-of-bounds / divrem-overflow / sub-overflow / +# mul-overflow / negate-overflow / pointer-overflow / load-invalid-value / +# out-of-bounds) instrument cleanly + recover. Bare-metal (no GS/OS), +# so we only require `mame`. # # What this probe pins: # $025000 = 0xC0DE add-overflow handler fired and recovered # $025002 = 0xC0DF shift-out-of-bounds handler fired and recovered # $025004 = 0xC0E0 divrem-overflow handler fired and recovered -# $025006 = 0xC0DA main reached its tail past all three UBs +# $025006 = 0xC0E1 sub-overflow handler fired and recovered +# $025008 = 0xC0E2 mul-overflow handler fired and recovered +# $02500A = 0xC0E3 negate-overflow handler fired and recovered +# $02500C = 0xC0E4 pointer-overflow handler fired and recovered +# $02500E = 0xC0E5 load-invalid-value handler fired and recovered +# $025010 = 0xC0E6 out-of-bounds handler fired and recovered +# $025012 = 0xC0DA main reached its tail past all nine UBs # # Gated on `mame`. Override with SMOKE_SKIP_UBSAN=1. if [ "${SMOKE_SKIP_UBSAN:-0}" = 1 ]; then @@ -6637,12 +6773,12 @@ if [ "${SMOKE_SKIP_UBSAN:-0}" = 1 ]; then elif ! command -v mame >/dev/null 2>&1 || [ ! 
-d "$PROJECT_ROOT/tools/mame/roms" ]; then warn "Phase 6.2 ubsan prerequisites missing (mame); skipping" else - log "check: ubsanProbe (UBSan-min: add-overflow + shift-OOB + div-by-zero) in MAME" + log "check: ubsanProbe (UBSan-min: 9 UB kinds) in MAME" bash "$PROJECT_ROOT/tests/ubsan/runUbsanProbe.sh" >/tmp/ubsanRunOut 2>&1 || { cat /tmp/ubsanRunOut >&2 die "ubsanProbe did not set expected handler-fired markers" } - log "OK: ubsanProbe (3 UB kinds instrumented + recovered + tail reached)" + log "OK: ubsanProbe (9 UB kinds instrumented + recovered + tail reached)" fi log "all smoke checks passed" diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 248e24f..3f0fbcf 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -73,12 +73,12 @@ struct Elf32Shdr { uint32_t sh_entsize; }; -static constexpr uint32_t SHT_NULL = 0; -static constexpr uint32_t SHT_PROGBITS = 1; +[[maybe_unused]] static constexpr uint32_t SHT_NULL = 0; +[[maybe_unused]] static constexpr uint32_t SHT_PROGBITS = 1; static constexpr uint32_t SHT_SYMTAB = 2; static constexpr uint32_t SHT_STRTAB = 3; static constexpr uint32_t SHT_RELA = 4; -static constexpr uint32_t SHT_NOBITS = 8; +[[maybe_unused]] static constexpr uint32_t SHT_NOBITS = 8; struct Elf32Sym { uint32_t st_name; @@ -104,12 +104,12 @@ static constexpr uint16_t EM_NONE = 0; inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; } inline uint8_t ELF32_ST_BIND(uint8_t i) { return (i >> 4) & 0x0F; } static constexpr uint8_t STB_LOCAL = 0; -static constexpr uint8_t STB_GLOBAL = 1; +[[maybe_unused]] static constexpr uint8_t STB_GLOBAL = 1; static constexpr uint8_t STB_WEAK = 2; -static constexpr uint8_t STT_NOTYPE = 0; -static constexpr uint8_t STT_OBJECT = 1; -static constexpr uint8_t STT_FUNC = 2; +[[maybe_unused]] static constexpr uint8_t STT_NOTYPE = 0; +[[maybe_unused]] static constexpr uint8_t STT_OBJECT = 1; +[[maybe_unused]] static constexpr uint8_t STT_FUNC = 2; static constexpr uint8_t STT_SECTION = 3; 
struct Elf32Rela { @@ -170,9 +170,10 @@ static std::string sectionKind(const std::string &name) { // .init_array entries are 16-bit function pointers; treat as // rodata so they end up in the read-only image and get a stable // address. The linker emits __init_array_start/_end so crt0 can - // walk them. Same for .fini_array (destructors). + // walk them. (.fini_array is not yet wired up; ELF input is + // accepted but the sections are dropped — runtime has no + // destructor-walk path today.) if (name == ".init_array" || name.rfind(".init_array.", 0) == 0) return "init_array"; - if (name == ".fini_array" || name.rfind(".fini_array.", 0) == 0) return "fini_array"; // DWARF debug sections that are *targets* of intra-debug relocs // (e.g. .debug_info -> .debug_str via R_W65816_DATA32, or // .debug_str_offsets -> .debug_str via R_W65816_DATA32). Treat @@ -384,6 +385,26 @@ static std::vector gImm24Sites; static uint32_t gTextBaseForSites = 0; static bool gRecordSites = false; + +// Record an intra-segment patch site for cRELOC emission. A target +// below the text base is never intra-segment (it is an undefined-weak +// resolving to 0, or an absolute address) and is skipped — see the +// commentary at the R_W65816_IMM16 callsite for why this matters. +static void recordCRelocSite(uint32_t patchAddr, uint32_t target, + uint8_t byteCnt, uint8_t bitShift) { + if (!gRecordSites) return; + uint32_t targetBank = target & 0xFF0000; + uint32_t baseBank = gTextBaseForSites & 0xFF0000; + if (targetBank != baseBank) return; + if (target < gTextBaseForSites) return; + Imm24Site s; + s.patchOff = patchAddr - gTextBaseForSites; + s.offsetRef = target - gTextBaseForSites; + s.byteCnt = byteCnt; + s.bitShift = bitShift; + gImm24Sites.push_back(s); +} + // Number of bytes patched by a given reloc type. Used by callers // that need to range-check a reloc offset against a buffer size // without re-deriving the width inline. 
Returns 0 for unknown @@ -411,7 +432,7 @@ static uint32_t relocWidth(uint8_t rtype) { static void applyReloc(std::vector &buf, uint32_t off, uint32_t patchAddr, uint32_t target, uint8_t rtype, const std::string &symName) { - int64_t Signed; + int64_t pcrelDisp; switch (rtype) { case R_W65816_IMM8: if (target > 0xFF) @@ -433,28 +454,16 @@ static void applyReloc(std::vector &buf, uint32_t off, // time. Without this, `lda absConst` reads from the wrong // address when the segment doesn't land at link-time-base // (e.g., link-time-base=0x1000 but Loader places at bank:0). - if (gRecordSites) { - uint32_t targetBank = target & 0xFF0000; - uint32_t baseBank = gTextBaseForSites & 0xFF0000; - // A target below the text base is never an intra-segment - // relocatable site: it is an undefined-weak symbol (resolveSym - // resolves those to 0) or an absolute address. Recording a - // cRELOC for it would (a) underflow offsetRef = target - textBase - // (omfEmit rejects it as out-of-range) and (b) make the Loader - // rewrite a genuine null to segPlacedBase, breaking the - // `if (weakFn) weakFn()` null test that the null is meant to fail. - if (targetBank == baseBank && target >= gTextBaseForSites) { - Imm24Site s; - s.patchOff = patchAddr - gTextBaseForSites; - s.offsetRef = target - gTextBaseForSites; - // Use type field width = 2 to distinguish from IMM24 - // (3). Imm24Site struct is reused — emitOmf will - // emit cRELOC ByteCnt=2 for this. - s.byteCnt = 2; - s.bitShift = 0; - gImm24Sites.push_back(s); - } - } + // A target below the text base is never an intra-segment + // relocatable site: it is an undefined-weak symbol (resolveSym + // resolves those to 0) or an absolute address. Recording a + // cRELOC for it would (a) underflow offsetRef = target - textBase + // (omfEmit rejects it as out-of-range) and (b) make the Loader + // rewrite a genuine null to segPlacedBase, breaking the + // `if (weakFn) weakFn()` null test that the null is meant to fail. 
+ // recordCRelocSite handles the gate; byteCnt=2 distinguishes + // from IMM24 (3) so omfEmit emits cRELOC ByteCnt=2 here. + recordCRelocSite(patchAddr, target, /*byteCnt=*/2, /*bitShift=*/0); break; case R_W65816_BANK16: // 2-byte patch: byte 0 = bank of target, byte 1 = 0 (pad). @@ -463,20 +472,9 @@ static void applyReloc(std::vector &buf, uint32_t off, // the value reflects the actually-placed bank. buf[off] = static_cast((target >> 16) & 0xFF); buf[off + 1] = 0; - if (gRecordSites) { - uint32_t targetBank = target & 0xFF0000; - uint32_t baseBank = gTextBaseForSites & 0xFF0000; - // See R_W65816_IMM16: skip undefined-weak/absolute targets - // below the text base (no valid intra-segment cRELOC). - if (targetBank == baseBank && target >= gTextBaseForSites) { - Imm24Site s; - s.patchOff = patchAddr - gTextBaseForSites; - s.offsetRef = target - gTextBaseForSites; - s.byteCnt = 2; - s.bitShift = 16; - gImm24Sites.push_back(s); - } - } + // bitShift=16: cRELOC Loader patches the bank byte from + // (segPlacedBase + offsetRef) >> 16 at load time. + recordCRelocSite(patchAddr, target, /*byteCnt=*/2, /*bitShift=*/16); break; case R_W65816_IMM24: if (target > 0xFFFFFF) @@ -485,46 +483,30 @@ static void applyReloc(std::vector &buf, uint32_t off, buf[off] = static_cast(target & 0xFF); buf[off + 1] = static_cast((target >> 8) & 0xFF); buf[off + 2] = static_cast((target >> 16) & 0xFF); - // Record the site for OMF cRELOC emission (only if recording is - // enabled — gRecordSites is set by the CLI when --reloc-out is - // requested). The patch offset is within the segment image; the - // reference offset is the in-segment offset of the target. - if (gRecordSites) { - // Only intra-segment refs need cRELOC; cross-bank refs (to - // GS/OS dispatcher etc.) target absolute fixed addresses - // and shouldn't be relocated by the Loader. 
- uint32_t targetBank = target & 0xFF0000; - uint32_t baseBank = gTextBaseForSites & 0xFF0000; - // See R_W65816_IMM16: skip undefined-weak/absolute targets - // below the text base (no valid intra-segment cRELOC). - if (targetBank == baseBank && target >= gTextBaseForSites) { - Imm24Site s; - s.patchOff = patchAddr - gTextBaseForSites; - s.offsetRef = target - gTextBaseForSites; - s.byteCnt = 3; - s.bitShift = 0; - gImm24Sites.push_back(s); - } - } + // Only intra-segment refs need cRELOC; cross-bank refs (to + // GS/OS dispatcher etc.) target absolute fixed addresses + // and shouldn't be relocated by the Loader. recordCRelocSite + // applies the same gates as R_W65816_IMM16. + recordCRelocSite(patchAddr, target, /*byteCnt=*/3, /*bitShift=*/0); break; case R_W65816_PCREL8: - Signed = static_cast(target) - (static_cast(patchAddr) + 1); - if (Signed < -128 || Signed > 127) { + pcrelDisp = static_cast(target) - (static_cast(patchAddr) + 1); + if (pcrelDisp < -128 || pcrelDisp > 127) { char msg[256]; std::snprintf(msg, sizeof(msg), "R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)", - symName.c_str(), (long long)Signed); + symName.c_str(), (long long)pcrelDisp); die(msg); } - buf[off] = static_cast(Signed & 0xFF); + buf[off] = static_cast(pcrelDisp & 0xFF); break; case R_W65816_PCREL16: - Signed = static_cast(target) - (static_cast(patchAddr) + 2); - if (Signed < -32768 || Signed > 32767) + pcrelDisp = static_cast(target) - (static_cast(patchAddr) + 2); + if (pcrelDisp < -32768 || pcrelDisp > 32767) die("R_W65816_PCREL16 to '" + symName + "' out of BRL range"); - buf[off] = static_cast(Signed & 0xFF); - buf[off + 1] = static_cast((Signed >> 8) & 0xFF); + buf[off] = static_cast(pcrelDisp & 0xFF); + buf[off + 1] = static_cast((pcrelDisp >> 8) & 0xFF); break; case R_W65816_DATA32: // 4-byte LE absolute. 
Used in DWARF .debug_* sections @@ -554,33 +536,22 @@ static void applyReloc(std::vector &buf, uint32_t off, // patches the low 3 bytes of the 4-byte slot at load time, // leaving the high (pad) byte at 0 (writes the resolved // 24-bit value bank:offset with bitShift=0 == no shift). - if (gRecordSites) { - uint32_t targetBank = target & 0xFF0000; - uint32_t baseBank = gTextBaseForSites & 0xFF0000; - if (targetBank == baseBank && target >= gTextBaseForSites) { - Imm24Site s; - s.patchOff = patchAddr - gTextBaseForSites; - s.offsetRef = target - gTextBaseForSites; - s.byteCnt = 3; - s.bitShift = 0; - gImm24Sites.push_back(s); - } - } + recordCRelocSite(patchAddr, target, /*byteCnt=*/3, /*bitShift=*/0); break; case R_W65816_PCREL32: // 4-byte signed PC-relative. PCREL displacements have the // PC pointing past the slot — the convention used by every // other PCREL reloc in this file (PCREL8 adds 1, PCREL16 // adds 2), so PCREL32 adds 4. - Signed = static_cast(target) - (static_cast(patchAddr) + 4); + pcrelDisp = static_cast(target) - (static_cast(patchAddr) + 4); // No range check: 32-bit signed displacement covers the // full address space. In practice this fires for DWARF // intra-section diffs where target and patchAddr live in - // the same section, so Signed is small. - buf[off] = static_cast(Signed & 0xFF); - buf[off + 1] = static_cast((Signed >> 8) & 0xFF); - buf[off + 2] = static_cast((Signed >> 16) & 0xFF); - buf[off + 3] = static_cast((Signed >> 24) & 0xFF); + // the same section, so pcrelDisp is small. 
+ buf[off] = static_cast(pcrelDisp & 0xFF); + buf[off + 1] = static_cast((pcrelDisp >> 8) & 0xFF); + buf[off + 2] = static_cast((pcrelDisp >> 16) & 0xFF); + buf[off + 3] = static_cast((pcrelDisp >> 24) & 0xFF); break; default: { char msg[128]; @@ -1106,11 +1077,6 @@ struct Linker { curRem -= seg; if (curRem == 0) { segIdx++; break; } curBase += seg; // advance within bank or to next - if ((curBase & 0xFFFFu) == 0) { - // Crossed bank boundary — already at start of next bank. - } else if ((curBase & 0xFF0000u) != ((curBase - 1) & 0xFF0000u)) { - // Just crossed into next bank. - } } // Zero out any unused segment slots so crt0 sees size=0. for (uint32_t i = segIdx; i < 4; i++) { @@ -1709,13 +1675,9 @@ int main(int argc, char **argv) { if (++i >= argc) usage(argv[0]); relocOutPath = argv[i++]; } else if (a == "--gc-sections") { - // Drop sections not reachable from __start / main / - // init_array. Requires `-ffunction-sections` (so each - // function is in its own section). Significantly shrinks - // text for programs that link the whole runtime but only - // use a fraction of it. ON by default; --no-gc-sections - // disables. - linker.gcSections = true; + // GC of unreachable sections is on by default; --gc-sections + // is accepted as a no-op alias for clarity. Use + // --no-gc-sections to disable. i++; } else if (a == "--no-gc-sections") { linker.gcSections = false; diff --git a/src/link816/omfEmit b/src/link816/omfEmit new file mode 100755 index 0000000..360dacd Binary files /dev/null and b/src/link816/omfEmit differ diff --git a/src/link816/omfEmit.cpp b/src/link816/omfEmit.cpp index c9f5e0a..59c8ea2 100644 --- a/src/link816/omfEmit.cpp +++ b/src/link816/omfEmit.cpp @@ -32,6 +32,24 @@ namespace { +// OMF v2.1 protocol constants -- single source of truth for the header +// layout and opcode set. See Apple IIgs Tech Note #17 and the FTN +// reference. Don't renumber; values are shared with the loader. 
+static constexpr uint8_t OMF_OP_LCONST = 0xF2; +static constexpr uint8_t OMF_OP_CRELOC = 0xF5; +static constexpr uint8_t OMF_OP_END = 0x00; +[[maybe_unused]] static constexpr uint8_t OMF_NUMLEN = 4; +[[maybe_unused]] static constexpr uint8_t OMF_VERSION_V21 = 0x02; +[[maybe_unused]] static constexpr uint32_t OMF_HDR_SIZE = 44; +[[maybe_unused]] static constexpr uint32_t OMF_LABLEN_FIXED = 10; +static constexpr uint16_t OMF_KIND_CODE_PRIV = 0x1000; +static constexpr uint16_t OMF_KIND_DPSTACK = 0x4012; // DP/Stack | RELOAD; matches real-world GNO/ME ~_STACK format +static constexpr uint16_t OMF_KIND_DATA_STATIC = 0x8001; +static constexpr uint16_t OMF_KIND_CODE_STATIC_ABSBANK = 0x8800; +// cRELOC opcode wire size: opcode + ByteCnt + BitShift + OffsetPatch + +// OffsetReference = 1 + 1 + 1 + 2 + 2 = 7 bytes per site. +static constexpr uint32_t OMF_CRELOC_BYTES_PER_SITE = 7; + [[noreturn]] static void die(const std::string &msg) { std::fprintf(stderr, "omfEmit: %s\n", msg.c_str()); std::exit(1); @@ -48,9 +66,7 @@ struct RelocSite { uint8_t byteCnt; uint8_t bitShift; // 0 for offset relocs, 16 for BANK16 }; -} // close namespace std::vector gReloc24Sites; -namespace { static std::vector readFile(const std::string &path) { std::ifstream f(path, std::ios::binary); @@ -135,7 +151,7 @@ static std::vector emitOneSeg(const std::vector &image, // literal bytes. With NUMLEN=4 (standard for v2.1), the count // field is 4 bytes. Verified empirically against real /SYSTEM/ // START on GS/OS 6.0.2: every segment uses 0xF2 + 4-byte count. - body.push_back(0xF2); // LCONST opcode + body.push_back(OMF_OP_LCONST); // LCONST opcode put32(body, static_cast(combined.size())); body.insert(body.end(), combined.begin(), combined.end()); } @@ -150,14 +166,14 @@ static std::vector emitOneSeg(const std::vector &image, // (segPlacedBase + OffsetReference) at load time. This is what // makes JSL/JML/STAlong/etc. with intra-segment targets work when // the Loader places us at non-zero bank. 
- for (const auto &s : ::gReloc24Sites) { - body.push_back(0xF5); + for (const auto &s : gReloc24Sites) { + body.push_back(OMF_OP_CRELOC); body.push_back(s.byteCnt); // ByteCnt (2 or 3) body.push_back(s.bitShift); // BitShift (0 or 16) put16(body, s.patchOff); // OffsetPatch put16(body, s.offsetRef); // OffsetReference } - body.push_back(0x00); // END opcode + body.push_back(OMF_OP_END); // END opcode // Real OMF format (Merlin32 convention, verified GS/OS Loader-launchable): // - LABLEN = 10: both LOAD_NAME and SEG_NAME are 10 bytes wide, @@ -247,13 +263,21 @@ static std::vector emitOneSeg(const std::vector &image, // allocate a page-aligned, locked memory block of that size in // bank $00." // -// The body is just an END opcode (no LCONST data — RESSPC alone tells -// the Loader how big to make the allocation, and the bytes don't need -// to come from the file). KIND = 0x1012 = DP/Stack | PRIVATE — the -// PRIVATE attribute matches Apple's `makedirect` reference utility -// (ksherlock/omfutils). +// The body is an LCONST opcode followed by `length` zero bytes plus an +// END opcode — matching the real-world format used by every GNO/ME +// command (e.g. /GNO.BOOT/bin/echo's ~_STACK seg). Empirically a body +// of just END (no LCONST, relying on RESSPC for allocation) makes the +// GS/OS Loader's ExpressLoad fast path silently drop the seg and fall +// back to its default 4 KB DP/Stack — hence this code emits real +// content so the Loader has something to copy. KIND = 0x4012 (RELOAD +// | DP/Stack) also matches the working GNO format; the earlier 0x1012 +// (PRIVATE | DP/Stack) is what `makedirect` ships but doesn't survive +// ExpressLoad fast-path processing. 
static std::vector emitDpStackSeg(uint32_t length, uint16_t segNum) { std::vector body; + body.push_back(0xF2); // LCONST opcode + put32(body, length); // 4-byte literal length + body.insert(body.end(), length, 0); // `length` zero bytes body.push_back(0x00); // END opcode constexpr uint8_t LABLEN_VAL = 10; const std::string segNameTxt = "~Direct"; @@ -267,10 +291,13 @@ static std::vector emitDpStackSeg(uint32_t length, uint16_t segNum) { DISPNAME + loadName.size() + segName.size()); const uint32_t LENGTH = length; // memory size requested const uint32_t BYTECNT = DISPDATA + static_cast(body.size()); - const uint32_t RESSPC = length; // bytes to zero-allocate + // RESSPC = 0 because the bytes are carried in LCONST (matches the + // bss-as-zeros approach used for the user CODE seg — the Loader's + // ExpressLoad fast path can't be trusted to honor RESSPC). + const uint32_t RESSPC = 0; const uint32_t BANKSIZE = 0; // DP/Stack lives in bank 0 const uint32_t ALIGN = 0x100; // page-aligned per spec - const uint16_t KIND = 0x1012; // DP/Stack | PRIVATE + const uint16_t KIND = OMF_KIND_DPSTACK; // DP/Stack | RELOAD std::vector hdr; put32(hdr, BYTECNT); @@ -324,7 +351,7 @@ static std::vector emitOMF(const std::vector &image, uint32_t bssGap = 0) { if (stackSize == 0) { return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1, - /*kind*/0x1000, name, bssSize, bssGap); + /*kind*/OMF_KIND_CODE_PRIV, name, bssSize, bssGap); } // DP/Stack segment ordering: Apple's `makedirect` reference utility // assigns the DP/Stack as SEGNUM 1 (its own object); when linked @@ -334,7 +361,7 @@ static std::vector emitOMF(const std::vector &image, // sets DP and SP appropriately when entering our code. 
auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/1); auto codeSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, - /*kind*/0x1000, name, bssSize, bssGap); + /*kind*/OMF_KIND_CODE_PRIV, name, bssSize, bssGap); std::vector out; out.insert(out.end(), dpSeg.begin(), dpSeg.end()); out.insert(out.end(), codeSeg.begin(), codeSeg.end()); @@ -391,6 +418,17 @@ static std::vector emitOmfExpressLoad( auto userSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, /*kind*/0x1000, userSegName, bssSize, bssGap); + // Optionally build the DP/Stack segment. If present it lives in the + // file AFTER the user seg and gets its own ExpressLoad segtable + + // remap + header_info entries — otherwise the Loader's ExpressLoad + // fast path never sees the KIND=0x4012 record and reverts to its + // default 4KB DP/Stack allocation (silent --stack-size no-op). + const bool haveDpStack = (stackSize != 0); + std::vector dpStackSeg; + if (haveDpStack) { + dpStackSeg = emitDpStackSeg(stackSize, /*segNum*/3); + } + // Step 2: figure out the file offsets we'll need to bake into the // load script. 
We don't know the ExpressLoad segment's total size // yet — but we can compute it because each component is a fixed @@ -399,11 +437,10 @@ static std::vector emitOmfExpressLoad( // ExpressLoad LCONST data layout (matches Merlin32 source — see // BuildExpressLoadSegment in Merlin32's a65816_OMF.c): // 6 bytes header (4-byte reserved DWORD + 2-byte count WORD) - // 8 bytes segment list (1 entry per non-ExpressLoad segment) - // 2 bytes remap list (1 entry per non-ExpressLoad segment) - // 16 bytes header info offsets (data_off, data_len, reloc_off, reloc_len) - // + header_xpress: bytes [12..43] of user header (32 bytes) + LOAD_NAME (10) + SEG_NAME (1+N) - // = 6 + 8 + 2 + 16 + 32 + 10 + 1 + N = 75 + N bytes + // 8 bytes/seg segment list (1 entry per non-ExpressLoad segment) + // 2 bytes/seg remap list (1 entry per non-ExpressLoad segment) + // 68 bytes/seg header_info (16B offsets + 32B hdr copy + 10B LOAD_NAME + 10B SEG_NAME) + // total: 6 + 78*N bytes for N non-ExpressLoad segs // // KEY FIX from earlier emitter version: header is 6 bytes, NOT 8. // I had written 8 bytes (file_ref WORD + reserved WORD + extra WORD + @@ -415,7 +452,10 @@ static std::vector emitOmfExpressLoad( constexpr uint32_t HDR_SIZE = 44; constexpr uint32_t LOAD_NAME_SIZE = 10; constexpr uint32_t SEG_NAME_SIZE = 10; // LABLEN=10 → fixed-width SEG_NAME - const uint32_t userNameLen = (uint32_t)userSegName.size(); + constexpr uint32_t SEGTAB_ENTRY = 8; + constexpr uint32_t REMAP_ENTRY = 2; + constexpr uint32_t HDR_INFO_ENTRY = 16 + 32 + LOAD_NAME_SIZE + SEG_NAME_SIZE; // 68 + constexpr uint32_t HEADER_BYTES = 6; const uint32_t userNameAreaSize = LOAD_NAME_SIZE + SEG_NAME_SIZE; // ExpressLoad's own segment metrics. The name "~ExpressLoad" is 12 @@ -423,12 +463,8 @@ static std::vector emitOmfExpressLoad( // uses LABLEN=0 (length-prefixed name): 1 length byte + 12 chars. 
const std::string elName = "~ExpressLoad"; const uint32_t elNameAreaSize = LOAD_NAME_SIZE + 1 + (uint32_t)elName.size(); - // header_xpress_length = (header bytes 12..43) + LOAD_NAME + SEG_NAME - // = 32 + 10 + 10 = 52 bytes - // Per-segment ExpressLoad data: 8 (table) + 2 (remap) + 16 (offsets) + 52 = 78 bytes - // Header (6 bytes) + per-segment data: 6 + 78 = 84 - const uint32_t elDataSize = 84; - (void)userNameLen; // truncated in user seg name; LABLEN=10 fixed + const uint32_t nSegs = haveDpStack ? 2 : 1; // non-ExpressLoad segs + const uint32_t elDataSize = HEADER_BYTES + (SEGTAB_ENTRY + REMAP_ENTRY + HDR_INFO_ENTRY) * nSegs; // Body size = 1 byte LCONST opcode + 4 byte length + data + 1 byte END const uint32_t elBodySize = 1 + 4 + elDataSize + 1; const uint32_t elSegSize = HDR_SIZE + elNameAreaSize + elBodySize; @@ -438,24 +474,47 @@ static std::vector emitOmfExpressLoad( const uint32_t userBodyOpOff = userSegStart + HDR_SIZE + userNameAreaSize; const uint32_t userDataOff = userBodyOpOff + 5; // 1 op + 4 length + // DP/Stack segment file offsets (after user seg). The DP/Stack body + // mirrors the real GNO/ME ~_STACK seg format: an LCONST opcode + 4 + // byte length + `stackSize` zero bytes + END. ExpressLoad's + // hdr_info entry has to point at the LCONST data so the Loader + // copies the right number of zeros into the allocated chunk — a + // body of just END (RESSPC-only) silently no-ops on the + // ExpressLoad fast path, which is the bug this whole section fixes. + const uint32_t dpStackSegStart = userSegStart + (uint32_t)userSeg.size(); + const uint32_t dpStackBodyOff = dpStackSegStart + HDR_SIZE + (LOAD_NAME_SIZE + SEG_NAME_SIZE); + const uint32_t dpStackDataOff = dpStackBodyOff + 5; // 1 op + 4 length + // Step 3: build the ExpressLoad LCONST data. std::vector elData; - // Header (6 bytes): reserved DWORD + count WORD + // Header (6 bytes): reserved DWORD + count WORD. count = N-2 where + // N = total segments in the file (including ExpressLoad). 
With a + // DP/Stack seg N=3 so count=1; without it N=2 so count=0. put32(elData, 0); // reserved - put16(elData, 0); // count = N-2 = 0 (for 2 segs) + put16(elData, (uint16_t)(haveDpStack ? 1 : 0)); // count = N-2 - // Segment list (1 × 8 bytes) - // Self-rel offset = (header info offset within elData) - (this entry pos) - // = 16 - 6 = 10 - constexpr uint32_t segListEntryOff = 6; - const uint32_t headerInfoOff = 6 + 8 + 2; // header + segtable + remap - put16(elData, (uint16_t)(headerInfoOff - segListEntryOff)); - put16(elData, 0); // flags - put32(elData, 0); // handle + // Segment list: one 8-byte entry per non-ExpressLoad segment. Each + // entry's first WORD is the SELF-RELATIVE offset (from this entry's + // own start) to the segment's header_info record. + const uint32_t segTableOff = HEADER_BYTES; + const uint32_t remapOff = segTableOff + SEGTAB_ENTRY * nSegs; + const uint32_t hdrInfoOff = remapOff + REMAP_ENTRY * nSegs; + for (uint32_t i = 0; i < nSegs; i++) { + const uint32_t thisEntryOff = segTableOff + SEGTAB_ENTRY * i; + const uint32_t thisHdrInfoOff = hdrInfoOff + HDR_INFO_ENTRY * i; + put16(elData, (uint16_t)(thisHdrInfoOff - thisEntryOff)); // self-rel + put16(elData, 0); // flags + put32(elData, 0); // handle + } - // Remap list: old seg 1 (which would be our user seg without - // ExpressLoad) maps to new seg 2 (since ExpressLoad takes seg 1). + // Remap list: 1 WORD per non-ExpressLoad seg, giving the new + // segment number for each old segment position. Old seg 1 (user + // code, would-be sole seg without ExpressLoad) → new seg 2. + // Old seg 2 (DP/Stack, only present when --stack-size) → new seg 3. put16(elData, 2); + if (haveDpStack) { + put16(elData, 3); + } // Header info entry for the user segment. // data length = LCONST data size in the file. 
emitOneSeg embeds @@ -473,11 +532,10 @@ static std::vector emitOmfExpressLoad( put32(elData, 0); // reloc offset put32(elData, 0); // reloc length } else { - const uint32_t crelocBytesPerSite = 7; // 0xF5 + 1+1+2+2 const uint32_t crelocOff = userDataOff + (uint32_t)image.size() + bssGap + bssSize; const uint32_t crelocLen = - crelocBytesPerSite * (uint32_t)gReloc24Sites.size(); + OMF_CRELOC_BYTES_PER_SITE * (uint32_t)gReloc24Sites.size(); put32(elData, crelocOff); put32(elData, crelocLen); } @@ -498,6 +556,34 @@ static std::vector emitOmfExpressLoad( elData.push_back(i < truncated.size() ? (uint8_t)truncated[i] : 0x20); } + // Header info entry for the DP/Stack segment (when present). + // data_off / data_len point at the LCONST zero bytes carried in the + // DP/Stack seg's body, mirroring the working real-world layout + // (GNO/ME ~_STACK). No cRELOC entries for a DP/Stack seg, so + // reloc fields are 0. + if (haveDpStack) { + if (dpStackSeg.size() < HDR_SIZE) die("internal: DP/Stack seg too small"); + put32(elData, dpStackDataOff); // data offset (LCONST data) + put32(elData, stackSize); // data length (= stack size) + put32(elData, 0); // reloc offset + put32(elData, 0); // reloc length + // Header copy: bytes [12..43] of DP/Stack segment header. + elData.insert(elData.end(), dpStackSeg.begin() + 12, dpStackSeg.begin() + HDR_SIZE); + elData[elData.size() - 32 + 30] = 0; // DISPDATA hi → 0 + elData[elData.size() - 32 + 31] = 0; + // LOAD_NAME (10 bytes, space-padded) + for (int i = 0; i < (int)LOAD_NAME_SIZE; i++) elData.push_back(0x20); + // SEG_NAME = "~Direct" padded to 10 bytes (must match the value + // stored by emitDpStackSeg, otherwise ExpressLoad's name match + // could fail; the seg-name area in the file uses 10 spaces base + // with "~Direct" overwriting the first 7). + const char *dpName = "~Direct"; + const size_t dpNameLen = 7; + for (size_t i = 0; i < SEG_NAME_SIZE; i++) { + elData.push_back(i < dpNameLen ? 
(uint8_t)dpName[i] : 0x20); + } + } + if (elData.size() != elDataSize) die("internal: ExpressLoad data size mismatch"); @@ -513,7 +599,7 @@ static std::vector emitOmfExpressLoad( elHdr.push_back(4); // NUMLEN elHdr.push_back(2); // VERSION (0x02 = v2.1) put32(elHdr, 0); // BANKSIZE = 0 for DATA seg - put16(elHdr, 0x8001); // KIND = DATA|STATIC + put16(elHdr, OMF_KIND_DATA_STATIC); // KIND = DATA|STATIC elHdr.push_back(0); elHdr.push_back(0); // undef put32(elHdr, 0); // ORG put32(elHdr, 0); // ALIGN @@ -542,16 +628,15 @@ static std::vector emitOmfExpressLoad( die("internal: ExpressLoad segment size mismatch"); // Step 6: concatenate ExpressLoad + user segment + optional DP/Stack. - // The DP/Stack seg sits AFTER the user seg; the Loader walks file- - // ordered segments after the ExpressLoad load step completes, and - // processes each segment by KIND. The ExpressLoad load script only - // tracks code/data segs; the DP/Stack seg is found by KIND walk. + // The DP/Stack seg's presence is now also recorded in the + // ExpressLoad load script (segtable + remap + header_info entries + // above) so the Loader's fast path honors KIND=0x4012 instead of + // silently dropping it to its default 4 KB DP/Stack allocation. std::vector result; result.insert(result.end(), elSeg.begin(), elSeg.end()); result.insert(result.end(), userSeg.begin(), userSeg.end()); - if (stackSize != 0) { - auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/3); - result.insert(result.end(), dpSeg.begin(), dpSeg.end()); + if (haveDpStack) { + result.insert(result.end(), dpStackSeg.begin(), dpStackSeg.end()); } return result; } @@ -674,7 +759,7 @@ static void usage(const char *argv0) { " sidecar; emit cRELOC (0xF5) opcodes after LCONST\n" " so the Loader patches intra-segment 24-bit refs\n" " (JSL/JML/STAlong/etc.) when placing the segment.\n" - " --stack-size N append a ~Direct DP/Stack segment (KIND=0x1012)\n" + " --stack-size N append a ~Direct DP/Stack segment (KIND=0x4012)\n" " of N bytes. 
The Loader allocates a page-aligned\n" " block of this size in bank 0 for combined DP +\n" " stack use. N must be page-multiple (>= 256).\n" @@ -782,7 +867,7 @@ int main(int argc, char **argv) { // intra-segment relocations at link time and have no // INTERSEG / RELOC opcodes); ABSBANK + ORG=base pins it // to a specific bank. CODE is the default (type 0). - uint16_t kind = (k == 0) ? 0x8800u : 0x8800u; + const uint16_t kind = OMF_KIND_CODE_STATIC_ABSBANK; uint32_t entryOff = (k == 0) ? s.entryOff : 0; auto seg = emitOneSeg(img, entryOff, s.base, static_cast(s.num), @@ -846,10 +931,15 @@ int main(int argc, char **argv) { if (!f) die("cannot open '" + output + "' for writing"); f.write(reinterpret_cast(blob.data()), blob.size()); + // Segment count: 1 user CODE seg; +1 for ExpressLoad wrapper; +1 + // when --stack-size adds a ~Direct DP/Stack seg. + int segCount = 1; + if (expressload) segCount++; + if (stackSize != 0) segCount++; std::fprintf(stderr, "OMF: %d segment%s%s, %zu bytes payload, entry='%s' at +0x%x -> %s " "(%zu bytes total)\n", - expressload ? 2 : 1, expressload ? "s" : "", + segCount, segCount == 1 ? "" : "s", expressload ? " (ExpressLoad)" : "", image.size(), entry.c_str(), entryOff, output.c_str(), blob.size()); diff --git a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp index 6edd2ca..7b4a89a 100644 --- a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp +++ b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816AsmBackend.cpp @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// // -// Skeleton assembler backend. Fixup resolution, relaxation and nop -// generation are left unimplemented; they will be filled in once the -// instruction encodings are defined. +// W65816 assembler backend. 
Implements applyFixup for the +// R_W65816_* relocation family, BRA -> BRL relaxation when the 8-bit +// signed displacement won't fit, and writeNopData using 65816 NOP +// ($EA) bytes. // //===----------------------------------------------------------------------===// @@ -29,6 +30,13 @@ // W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h // (which already includes the generated header). +// W65816 NOP machine encoding (single byte). +static constexpr unsigned char kOpcodeNOP = 0xEA; + +// Signed 8-bit branch displacement range for Bxx / BRA fixups. +static constexpr int kBranch8Min = -128; +static constexpr int kBranch8Max = 127; + using namespace llvm; namespace { @@ -110,7 +118,7 @@ public: // instead of silently truncating. if (Fixup.getKind() == W65816::fixup_8_pcrel) { int64_t Signed = static_cast(Value); - if (Signed < -128 || Signed > 127) { + if (Signed < kBranch8Min || Signed > kBranch8Max) { getContext().reportError( Fixup.getLoc(), "branch target out of range for 8-bit PC-relative branch " @@ -158,7 +166,7 @@ public: const MCSubtargetInfo *STI) const override { // The 65816 NOP is a single 0xEA byte. for (uint64_t I = 0; I < Count; ++I) - OS << char(0xEA); + OS << static_cast(kOpcodeNOP); return true; } @@ -192,7 +200,7 @@ public: if (Fixup.getKind() != W65816::fixup_8_pcrel) return false; int64_t Signed = static_cast(Value); - return Signed < -128 || Signed > 127; + return Signed < kBranch8Min || Signed > kBranch8Max; } void relaxInstruction(MCInst &Inst, diff --git a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816ELFObjectWriter.cpp b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816ELFObjectWriter.cpp index 2c86082..887ca82 100644 --- a/src/llvm/lib/Target/W65816/MCTargetDesc/W65816ELFObjectWriter.cpp +++ b/src/llvm/lib/Target/W65816/MCTargetDesc/W65816ELFObjectWriter.cpp @@ -24,6 +24,23 @@ using namespace llvm; +// R_W65816_* relocation numbers. 
These are protocol constants shared +// with link816 / omfEmit / llvm-objdump; do not renumber. If new types +// are added, mirror them in src/link816/link816.cpp's relocWidth() and +// the cRELOC pipeline. +namespace R_W65816 { +enum : unsigned { + R_IMM8 = 1, + R_IMM16 = 2, + R_IMM24 = 3, + R_PCREL8 = 4, + R_PCREL16 = 5, + R_BANK16 = 6, + R_DATA32 = 7, + R_PCREL32 = 8, +}; +} // namespace R_W65816 + namespace { class W65816ELFObjectWriter : public MCELFObjectTargetWriter { @@ -56,16 +73,16 @@ protected: // type — observed as type 249 — and broke link816.py. auto Kind = Fixup.getKind(); switch (Kind) { - case W65816::fixup_8: return 1; // R_W65816_IMM8 - case W65816::fixup_16: return 2; // R_W65816_IMM16 - case W65816::fixup_24: return 3; // R_W65816_IMM24 - case W65816::fixup_8_pcrel: return 4; // R_W65816_PCREL8 - case W65816::fixup_16_pcrel: return 5; // R_W65816_PCREL16 - case W65816::fixup_bank16: return 6; // R_W65816_BANK16 - case W65816::fixup_32: return 7; // R_W65816_DATA32 - case W65816::fixup_32_pcrel: return 8; // R_W65816_PCREL32 - case FK_Data_1: return IsPCRel ? 4 : 1; - case FK_Data_2: return IsPCRel ? 5 : 2; + case W65816::fixup_8: return R_W65816::R_IMM8; + case W65816::fixup_16: return R_W65816::R_IMM16; + case W65816::fixup_24: return R_W65816::R_IMM24; + case W65816::fixup_8_pcrel: return R_W65816::R_PCREL8; + case W65816::fixup_16_pcrel: return R_W65816::R_PCREL16; + case W65816::fixup_bank16: return R_W65816::R_BANK16; + case W65816::fixup_32: return R_W65816::R_DATA32; + case W65816::fixup_32_pcrel: return R_W65816::R_PCREL32; + case FK_Data_1: return IsPCRel ? R_W65816::R_PCREL8 : R_W65816::R_IMM8; + case FK_Data_2: return IsPCRel ? R_W65816::R_PCREL16 : R_W65816::R_IMM16; // FK_Data_4 is emitted by DWARF (.debug_info / .debug_line / // .debug_frame section-relative addresses), .eh_frame, // .debug_loclists, and user `.long` directives. 
Dispatch by @@ -78,7 +95,7 @@ protected: // .debug_line decoder because the 4th byte of the slot landed // on whatever followed it (most often the size byte of the // next line-program header → unit_length = 0). - case FK_Data_4: return IsPCRel ? 8 : 7; + case FK_Data_4: return IsPCRel ? R_W65816::R_PCREL32 : R_W65816::R_DATA32; default: llvm_unreachable("W65816: unknown fixup kind"); } diff --git a/src/llvm/lib/Target/W65816/W65816.h b/src/llvm/lib/Target/W65816/W65816.h index f133acf..1af1083 100644 --- a/src/llvm/lib/Target/W65816/W65816.h +++ b/src/llvm/lib/Target/W65816/W65816.h @@ -204,6 +204,7 @@ void initializeW65816SepRepCleanupPass(PassRegistry &); void initializeW65816BranchExpandPass(PassRegistry &); void initializeW65816TiedDefSpillPass(PassRegistry &); void initializeW65816ABridgeViaXPass(PassRegistry &); +void initializeW65816UnLSRPass(PassRegistry &); void initializeW65816WidenAcc16Pass(PassRegistry &); void initializeW65816SpillToXPass(PassRegistry &); void initializeW65816NegYIndYPass(PassRegistry &); diff --git a/src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp b/src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp index 64ab410..95d557e 100644 --- a/src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp +++ b/src/llvm/lib/Target/W65816/W65816ABridgeViaX.cpp @@ -8,23 +8,28 @@ // // Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill // preserves a multi-use Acc16 vreg by spilling it to a fresh stack -// slot around the tied-def consumer, this pass tries to do the same -// preservation via TAX/TXA: copy to an Idx16 vreg before the consumer -// (regalloc puts it in X or Y, expansion lowers the COPY to TAX/TAY), -// copy back to a fresh Acc16 vreg after. +// slot around the tied-def consumer, this pass bridges via an Img16 +// (DP-backed) vreg: park SrcReg in a fresh Img16 vreg before the +// consumer, restore to a fresh Acc16 vreg after. 
Regalloc places the +// Img16 in IMG0..IMG7 (DP $D0..$DE); copyPhysReg lowers the COPYs to +// STA dp / LDA dp (4 cyc each) and no system-stack slot is allocated. +// +// (The pass name dates from an earlier prototype that bridged via X +// using TAX/TXA. Cross-MBB X-liveness analysis was unimplemented and +// the X-bridge couldn't survive Idx16 clobbers between consumer and +// last use, so the bridge moved to Img16. The DP-backed form costs +// 4 cycles per access (8 per round trip, versus 4 for TAX+TXA) with +// none of the liveness restrictions.) // // Win per bridged pair: // stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot -// X bridge : TAX (2 cyc) + TXA (2 cyc) + no frame growth -// Net 6 cycles + 2 bytes saved per bridge — and we avoid one PHA per -// stack slot we didn't allocate. +// Img bridge : STA dp (4 cyc) + LDA dp (4 cyc) + no frame growth +// Net 2 cycles + (1 byte per access) saved per bridge -- and one PHA +// per avoided stack slot. // -// Bail conditions (fall back to TiedDefSpill's stack route): -// - any MI between consumer and SrcReg's last use clobbers Idx16 -// (LDX/LDY/INX/DEX/INY/DEY/TAX/TAY/TXY/TYX/PHX/PHY/PLX/PLY/etc.) -// - any call in the range (calls clobber X and Y per ABI) -// - SrcReg is used in a different MBB (cross-MBB liveness needs more -// analysis; deferred) +// Bail conditions (fall back to TiedDefSpill's stack route): any MI +// between consumer and SrcReg's last use that clobbers IMG slots, +// callees that clobber IMG0..IMG7, cross-MBB uses of SrcReg. // // Runs before TiedDefSpill so the latter doesn't double-process the // same candidates. 
diff --git a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp index a92bf26..dbb1679 100644 --- a/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp +++ b/src/llvm/lib/Target/W65816/W65816AsmPrinter.cpp @@ -6,8 +6,14 @@ // //===----------------------------------------------------------------------===// // -// Skeleton assembly printer. The MCInst lowering path is wired up but no -// target-specific operand formatting is implemented yet. +// W65816 assembly printer. Owns the late pseudo-expansion path +// (MCInst lowering for the IR-pseudo opcodes that we keep through PEI +// because their machine encoding depends on AsmPrinter-time peepholes +// or runtime ABI knowledge -- BRK_pseudo, LDAi16imm_bank, JSLpseudo, +// the SEP/REP-wrapped i8 forms, etc.), plus a small set of mode-aware +// peepholes (PEA / PEI substitution for LDA+PUSH16 chains, STZ +// folding, etc.) that prefer to run after the rest of codegen has +// stabilised the MIR. // //===----------------------------------------------------------------------===// @@ -31,6 +37,39 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" +// W65816 processor-status flag masks used by SEP/REP wrapping. +// (See W65816 datasheet 6.10.) M = accumulator width (1 = 8-bit, +// 0 = 16-bit); X = index width (same convention). The wraps in this +// file toggle M only; X never changes in normal codegen. +static constexpr unsigned kPStatusM = 0x20; +[[maybe_unused]] static constexpr unsigned kPStatusX = 0x10; + +// IIgs runtime DP slots referenced from emitted code. Both are part of +// the runtime ABI -- AsmPrinter / ISelLowering / libgcc must agree. +// kRuntimePbrStashDP -- crt0 stashes the runtime PBR here so +// LDAi16imm_bank can emit `lda $BE` (PBR-byte +// load) for &symbol values in non-bank-0 placements. +// kRuntimeIndirTargetDP -- __indirTarget vector used by the +// JMP (abs) indirect-call thunk. 
+static constexpr unsigned kRuntimePbrStashDP = 0xBE; +[[maybe_unused]] static constexpr unsigned kRuntimeIndirTargetDP = 0x00B8; + +// DP scratch byte used by ADJCALLSTACKUP / ALLOCAfi to save A across a +// TSC/TCS bracket. Lives at $E0, the base of the project-wide DP scratch +// range; coordinate with W65816ISelLowering / W65816RegisterInfo if +// the layout changes. +static constexpr unsigned kDpScratch0 = 0xE0; + +// IIgs bank-byte mask: a 24-bit address whose top 8 bits are non-zero +// is in a non-zero bank and must be encoded via the LONG form. +static constexpr uint64_t kBankByteMask = 0xFF0000; + +// ADJCALLSTACKUP fan-out limit: PLY (1 byte / 4 cyc per pair-pop) wins +// over the 8-byte / ~14-cyc TAY/TSC/CLC/ADC/TCS/TYA bracket up through +// N = 14 even bytes; beyond that the bracket is cheaper. See the +// dispatch in the ADJCALLSTACKUP expansion. +static constexpr int kAdjStackUpPlyMaxN = 14; + namespace { class W65816AsmPrinter : public AsmPrinter { @@ -267,7 +306,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { if (YLive) { // Route through DP $E0 to preserve both A and Y. 
MCInst Sta; Sta.setOpcode(W65816::STA_DP); - Sta.addOperand(MCOperand::createImm(0xE0)); + Sta.addOperand(MCOperand::createImm(kDpScratch0)); EmitToStreamer(*OutStreamer, Sta); MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc); MCInst Clc; Clc.setOpcode(W65816::CLC); EmitToStreamer(*OutStreamer, Clc); @@ -276,9 +315,13 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Adc); MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs); MCInst Lda; Lda.setOpcode(W65816::LDA_DP); - Lda.addOperand(MCOperand::createImm(0xE0)); + Lda.addOperand(MCOperand::createImm(kDpScratch0)); EmitToStreamer(*OutStreamer, Lda); - } else if (N <= 14 && (N % 2) == 0) { + } else if (N <= kAdjStackUpPlyMaxN && (N % 2) == 0) { + // Repeated PLY (1 byte / 4 cyc each) wins over the TAY/TSC/CLC/ + // ADC/TCS/TYA bracket (8 bytes / ~14 cyc fixed) for N <= 14; + // beyond that the bracket is cheaper. Must be even (PLY pops + // 16-bit pairs). for (int i = 0; i < N / 2; ++i) { MCInst Ply; Ply.setOpcode(W65816::PLY); EmitToStreamer(*OutStreamer, Ply); @@ -348,7 +391,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { Lda.addOperand(MCOperand::createImm(0)); } else { Lda.setOpcode(W65816::LDA_DP); - Lda.addOperand(MCOperand::createImm(0xBE)); + Lda.addOperand(MCOperand::createImm(kRuntimePbrStashDP)); } EmitToStreamer(*OutStreamer, Lda); return; @@ -380,7 +423,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // writes `*(uint16 *)0xE19E00UL = 0` we MUST keep the // LDA #0 + STA_Long pair so the bank-explicit form survives. 
bool AddrFitsIn16 = !It->getOperand(1).isImm() || - (It->getOperand(1).getImm() & 0xFF0000) == 0; + (It->getOperand(1).getImm() & kBankByteMask) == 0; if (AddrFitsIn16) { MCInst Stz; Stz.setOpcode(W65816::STZ_Abs); @@ -401,6 +444,10 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { if (It != MI->getParent()->end() && It->getOpcode() == W65816::PUSH16) { auto It2 = std::next(It); while (It2 != MI->getParent()->end() && It2->isDebugInstr()) ++It2; + // If PUSH16 is the last MI in the BB we leave the peephole as a + // no-op (conservative): the PUSH chain almost always feeds a JSL + // within the same BB, and proving A-dead at BB exit via successor + // live-in scan is not worth the bookkeeping. bool ADead = false; if (It2 != MI->getParent()->end()) { const TargetRegisterInfo *TRI = @@ -408,13 +455,6 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { if (It2->modifiesRegister(W65816::A, TRI) && !It2->readsRegister(W65816::A, TRI)) ADead = true; - } else { - // PUSH16 is the last instruction in the BB. A is dead at - // BB exit iff it's not live-out. Check the BB's live-out - // set via successors; if no successor lists A as live-in, - // it's safe. Conservative: treat as not-dead (skip peephole). - // This case is uncommon — the PUSH chain almost always feeds - // a JSL within the same BB. } if (ADead) { MCInst Pea; @@ -445,7 +485,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // hit. We mark the next-SEP-to-skip via a per-AsmPrinter flag // so the SEP visit drops it. 
MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Lda; Lda.setOpcode(W65816::LDA_Imm8); @@ -487,9 +527,9 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { if (It != MI->getParent()->end() && It->getOpcode() == W65816::SEP && It->getNumOperands() >= 1 && It->getOperand(0).isImm() && - It->getOperand(0).getImm() == 0x20) { + It->getOperand(0).getImm() == kPStatusM) { SkipRep = true; - SkipNextSepImm = 0x20; + SkipNextSepImm = static_cast(kPStatusM); } // STA8abs / STA8long don't expose their SEP at MIR — the wrap is // emitted at MC layer. Detect them here so we can elide the @@ -505,7 +545,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { } if (!SkipRep) { MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); } return; @@ -533,7 +573,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Lda); return; } - if ((A & 0xFF0000) != 0) { + if ((A & kBankByteMask) != 0) { MCInst Lda; Lda.setOpcode(W65816::LDA_Long); Lda.addOperand(lowerOperand(AddrOp, MCInstLowering)); @@ -564,7 +604,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Sta); return; } - if ((A & 0xFF0000) != 0) { + if ((A & kBankByteMask) != 0) { MCInst Sta; Sta.setOpcode(W65816::STA_Long); Sta.addOperand(lowerOperand(AddrOp, MCInstLowering)); @@ -649,7 +689,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { bool IsSub = MI->getOpcode() == W65816::SBCi8imm; // SEP/REP wrap (see LDAi8imm comment). MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Carry; Carry.setOpcode(IsSub ? 
W65816::SEC : W65816::CLC); @@ -660,7 +700,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { Op.addOperand(MCOperand::createImm(Val)); EmitToStreamer(*OutStreamer, Op); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -682,11 +722,11 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { Op.addOperand(MCOperand::createImm(Val)); // SEP/REP wrap (see LDAi8imm comment). MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Op); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -696,7 +736,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // LDA_Long (0xAF, bank-explicit) for const-int MMIO addresses. bool IsLong = MI->getOpcode() == W65816::LDA8long; MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Lda; Lda.setOpcode(IsLong ? W65816::LDA_Long : W65816::LDA_Abs); @@ -709,7 +749,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { Lda.addOperand(Addr); EmitToStreamer(*OutStreamer, Lda); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -717,14 +757,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // i8 indexed-global load: SEP #0x20 ; LDA , X ; REP #0x20 // X holds the index (set up by CopyToReg before this MI). 
MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Lda; Lda.setOpcode(W65816::LDA_AbsX); Lda.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering)); EmitToStreamer(*OutStreamer, Lda); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -732,14 +772,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // i8 indexed-global store: SEP #0x20 ; STA , X ; REP #0x20 // A holds the value, X holds the index. MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Sta; Sta.setOpcode(W65816::STA_AbsX); Sta.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering)); EmitToStreamer(*OutStreamer, Sta); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -764,7 +804,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { SkipNextSta8Wrap = false; if (!UsesAcc8 && !SkipOpenSep) { MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); } MCInst Sta; @@ -784,7 +824,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, Sta); if (!UsesAcc8) { MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); } return; @@ -825,7 +865,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // i8 immediate compare — needs M=1 so the CPU only reads 1 byte // for the 
immediate. See LDAi8imm comment for the wrap rationale. MCInst Sep; Sep.setOpcode(W65816::SEP); - Sep.addOperand(MCOperand::createImm(0x20)); + Sep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Sep); MCInst Cmp; Cmp.setOpcode(W65816::CMP_Imm8); @@ -833,7 +873,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { Cmp.addOperand(MCOperand::createImm(Val)); EmitToStreamer(*OutStreamer, Cmp); MCInst Rep; Rep.setOpcode(W65816::REP); - Rep.addOperand(MCOperand::createImm(0x20)); + Rep.addOperand(MCOperand::createImm(kPStatusM)); EmitToStreamer(*OutStreamer, Rep); return; } @@ -965,12 +1005,12 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) { // Size is in A on entry — but we need A=SP after TSC, so first // stash the size to DP scratch. MCInst Sta1; Sta1.setOpcode(W65816::STA_DP); - Sta1.addOperand(MCOperand::createImm(0xE0)); + Sta1.addOperand(MCOperand::createImm(kDpScratch0)); EmitToStreamer(*OutStreamer, Sta1); MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc); MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec); MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP); - Sbc.addOperand(MCOperand::createImm(0xE0)); + Sbc.addOperand(MCOperand::createImm(kDpScratch0)); EmitToStreamer(*OutStreamer, Sbc); MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs); MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina); diff --git a/src/llvm/lib/Target/W65816/W65816BranchExpand.cpp b/src/llvm/lib/Target/W65816/W65816BranchExpand.cpp index fd6b5c0..e23591e 100644 --- a/src/llvm/lib/Target/W65816/W65816BranchExpand.cpp +++ b/src/llvm/lib/Target/W65816/W65816BranchExpand.cpp @@ -162,8 +162,7 @@ static unsigned estimateDistance(MachineFunction &MF, // sliced after each non-final conditional, so every MBB ends up with // at most one conditional terminator. Returns true if any MBB was // split. 
-static bool splitMultiBranchMBBs(MachineFunction &MF, - const TargetInstrInfo *TII) { +static bool splitMultiBranchMBBs(MachineFunction &MF) { bool Changed = false; // Snapshot MBBs first (we mutate the list during iteration). SmallVector MBBs; @@ -233,7 +232,6 @@ static bool splitMultiBranchMBBs(MachineFunction &MF, // see if another split is needed (multi-multi-branch case). Changed = true; Sliced = true; - (void)TII; // unused for now } } return Changed; @@ -354,7 +352,7 @@ bool W65816BranchExpand::runOnMachineFunction(MachineFunction &MF) { AnyChanged |= dropDeadConditionalsToBRATarget(MF); // Step 1: split multi-conditional-terminator MBBs. - AnyChanged |= splitMultiBranchMBBs(MF, TII); + AnyChanged |= splitMultiBranchMBBs(MF); // Step 2: iterate to fixed-point. Each expansion adds 3 bytes // (bridge BRA), which may push another previously-OK branch over diff --git a/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp index b6bab79..3b7dfdb 100644 --- a/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp +++ b/src/llvm/lib/Target/W65816/W65816I32IncFold.cpp @@ -68,10 +68,6 @@ char W65816I32IncFold::ID = 0; INITIALIZE_PASS(W65816I32IncFold, DEBUG_TYPE, "W65816 i32 += 1 fold", false, false) -namespace llvm { -void initializeW65816I32IncFoldPass(PassRegistry &); -} - // Match the 6-instruction sequence; returns the post-pattern iterator // and fills in the lo/hi stack-rel offsets if the pattern matches. 
// Tolerates intervening TAX/TXA pairs (which regalloc inserts as diff --git a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp index 121ceeb..4560c1a 100644 --- a/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp +++ b/src/llvm/lib/Target/W65816/W65816ImgCalleeSave.cpp @@ -87,10 +87,6 @@ char W65816ImgCalleeSave::ID = 0; INITIALIZE_PASS(W65816ImgCalleeSave, DEBUG_TYPE, "W65816 IMG8..IMG15 callee save/restore", false, false) -namespace llvm { -void initializeW65816ImgCalleeSavePass(PassRegistry &); -} - FunctionPass *llvm::createW65816ImgCalleeSave() { return new W65816ImgCalleeSave(); } @@ -188,7 +184,7 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) { // // copyPhysReg lowers `COPY $imgN = $a` to `STA_DP imm:0xCx`, so we // check both the physreg-DEF form AND the DP-imm-store form. - bool WrittenSlot[8] = {false}; + bool UsedSlot[8] = {false}; bool AnyWritten = false; for (auto &MBB : MF) { for (auto &MI : MBB) { @@ -197,7 +193,7 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) { if (!MO.isReg() || MO.getReg() == 0 || !MO.isDef()) continue; int idx = classifyImgReg(MO.getReg()); if (idx >= 0) { - WrittenSlot[idx] = true; + UsedSlot[idx] = true; AnyWritten = true; } } @@ -205,15 +201,12 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) { auto [idx, mode] = classifyDpImmAsImg(MI); if (idx >= 0 && (mode == DpAccess::Write || mode == DpAccess::ReadWrite)) { - WrittenSlot[idx] = true; + UsedSlot[idx] = true; AnyWritten = true; } } } if (!AnyWritten) return false; - // Rename for downstream Step 2/3/4 readability — they use UsedSlot. - bool (&UsedSlot)[8] = WrittenSlot; - (void)AnyWritten; // Step 2: allocate one frame slot per used IMG. Size = 2 bytes (each // Img16 holds a 16-bit value). 
Mark as a spill slot so PEI accounts diff --git a/src/llvm/lib/Target/W65816/W65816Layer2Gate.cpp b/src/llvm/lib/Target/W65816/W65816Layer2Gate.cpp index 37da391..975ec85 100644 --- a/src/llvm/lib/Target/W65816/W65816Layer2Gate.cpp +++ b/src/llvm/lib/Target/W65816/W65816Layer2Gate.cpp @@ -215,14 +215,10 @@ namespace llvm { class W65816Layer2StampPass : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) { - bool Changed = false; for (Function &F : M) { - Changed |= stampFunction(F); + stampFunction(F); } - if (!Changed) { - return PreservedAnalyses::all(); - } - // We only added a function attribute, no IR-level effects. Preserve + // We only add a function attribute, no IR-level effects. Preserve // everything; the inliner et al. will copy the attribute on inline. return PreservedAnalyses::all(); } diff --git a/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp index f2e1292..490a4bc 100644 --- a/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp +++ b/src/llvm/lib/Target/W65816/W65816NarrowI32Mul.cpp @@ -189,7 +189,6 @@ bool W65816NarrowI32Mul::runOnFunction(Function &F) { // low-16 bits as the original i32 add at every observable point // (the back-edge value can wrap on the exit iteration but is // never observed — exit takes the trip-end branch first). 
- bool NarrowedAny = false; SmallVector PhiWorklist; for (BasicBlock &BB : F) { for (PHINode &PN : BB.phis()) { @@ -282,7 +281,6 @@ bool W65816NarrowI32Mul::runOnFunction(Function &F) { Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); Incr->eraseFromParent(); PN->eraseFromParent(); - NarrowedAny = true; } return true; } diff --git a/src/llvm/lib/Target/W65816/W65816NegYIndY.cpp b/src/llvm/lib/Target/W65816/W65816NegYIndY.cpp index dd7fc82..1dfcf48 100644 --- a/src/llvm/lib/Target/W65816/W65816NegYIndY.cpp +++ b/src/llvm/lib/Target/W65816/W65816NegYIndY.cpp @@ -41,6 +41,13 @@ using namespace llvm; +// DP scratch byte used to park X when the negative-Y inserter needs to +// route through TAX/TXA. Lives in the project-wide $E0..$DF scratch +// range; $E0 is reserved for ADJCALLSTACKUP's A-preserve so we use +// $E2 here. Coordinate with W65816AsmPrinter / W65816ISelLowering / +// W65816RegisterInfo if the layout changes. +static constexpr unsigned kDpScratchX = 0xE2; + #define DEBUG_TYPE "w65816-neg-y-indy" namespace { @@ -110,9 +117,9 @@ bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) { if (XLive || xDef) break; } if (XLive) { - // Save X to DP $E2 (don't use $E0 — that's the A-preserve - // slot in call-frame teardown and may be live). - BuildMI(MBB, MI, DL, TII->get(W65816::STX_DP)).addImm(0xE2); + // Save X to DP kDpScratchX ($E2) -- $E0 is reserved as the + // A-preserve slot in call-frame teardown and may be live. + BuildMI(MBB, MI, DL, TII->get(W65816::STX_DP)).addImm(kDpScratchX); } if (IsLDA) { // LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X @@ -154,7 +161,7 @@ bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) { } if (XLive) { // Restore X from DP $E2. - BuildMI(MBB, MI, DL, TII->get(W65816::LDX_DP)).addImm(0xE2); + BuildMI(MBB, MI, DL, TII->get(W65816::LDX_DP)).addImm(kDpScratchX); } // Erase original LDY and the (sr,s),Y op. 
if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; } diff --git a/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp b/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp index dd6c65f..a75b41a 100644 --- a/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp +++ b/src/llvm/lib/Target/W65816/W65816PromoteFiToImg.cpp @@ -99,64 +99,29 @@ FunctionPass *llvm::createW65816PromoteFiToImg() { } -// Returns the operand index of the FrameIndex for the given FI pseudo -// opcode, or -1 if this opcode isn't a promotable FI carrier. -static int getFiOperandIdx(unsigned Opc) { - switch (Opc) { - case W65816::LDAfi: return 1; - case W65816::STAfi: return 1; - case W65816::CMPfi: return 1; - case W65816::ADCfi: - case W65816::SBCfi: - case W65816::ANDfi: - case W65816::ORAfi: - case W65816::EORfi: return 2; - default: return -1; - } -} - - -// Map a promotable FI pseudo to the corresponding DP MC opcode. -static unsigned getDpOpcode(unsigned Opc) { - switch (Opc) { - case W65816::LDAfi: return W65816::LDA_DP; - case W65816::STAfi: return W65816::STA_DP; - case W65816::CMPfi: return W65816::CMP_DP; - case W65816::ADCfi: return W65816::ADC_DP; - case W65816::SBCfi: return W65816::SBC_DP; - case W65816::ANDfi: return W65816::AND_DP; - case W65816::ORAfi: return W65816::ORA_DP; - case W65816::EORfi: return W65816::EOR_DP; - default: return 0; - } -} - - -// IMG8..IMG15 sit at DP addresses 0xC0, 0xC2, ..., 0xCE. IMG0..IMG7 -// are at 0xD0..0xDE. Returns the DP byte for IMGn. -static uint8_t dpAddrForImg(unsigned ImgIdx) { - assert(ImgIdx < 16 && "IMG index out of range"); - if (ImgIdx < 8) return 0xD0 + 2 * ImgIdx; - return 0xC0 + 2 * (ImgIdx - 8); -} - - bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { - // DISABLED again 2026-05-13 (3rd-attempt write-up). Two new findings: + // DISABLED 2026-05-13 (3rd-attempt write-up). Two findings blocked + // re-enable: // 1. 
With kMaxPromote=2 and IMG0..7 (caller-save, skip ImgCalleeSave), - // sumSquares regressed 56 → 72 inst because the FIs picked by - // access-count (fi#2, fi#3) are intermediate spill temps, not - // the i32-accumulator's halves (which are different FIs). The - // loop body ends up using BOTH IMG and stack slots for related - // values. - // 2. To pick the RIGHT FIs (those corresponding to PHI-cycled - // values like the i32 accumulator), we need either: - // (a) IR-level analysis BEFORE FI assignment, or - // (b) Post-RA dataflow analysis to identify "long-lived" FIs - // (active across the loop back-edge with no def/use boundary). - // This is the next blocker. Disabled until either (a) or (b) is - // implemented. + // sumSquares regressed 56 -> 72 inst because the FIs picked by + // access-count are intermediate spill temps, not the i32-accumulator + // halves (which are different FIs). Loop body ends up using BOTH + // IMG and stack slots for related values. + // 2. To pick the RIGHT FIs (those corresponding to PHI-cycled values + // like the i32 accumulator), we need either IR-level analysis + // BEFORE FI assignment, or post-RA dataflow analysis to identify + // long-lived FIs (active across the loop back-edge with no def/use + // boundary). + // The pass framework is retained so the pipeline slot stays documented; + // see git history for the disabled prototype body. + (void)MF; return false; +} + + +#if 0 +// Disabled prototype body retained for reference; see comment above. 
+bool W65816PromoteFiToImg::runOnMachineFunctionDisabled(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; const W65816Subtarget &STI = MF.getSubtarget(); const W65816InstrInfo *TII = STI.getInstrInfo(); @@ -396,3 +361,4 @@ bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { } return Changed; } +#endif diff --git a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp index 9b64457..72f55ec 100644 --- a/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp +++ b/src/llvm/lib/Target/W65816/W65816SepRepCleanup.cpp @@ -50,6 +50,9 @@ using namespace llvm; #define DEBUG_TYPE "w65816-sep-rep-cleanup" +// W65816 processor status M-bit mask (set/clear via SEP/REP #$20). +static constexpr int kMBit = 0x20; + namespace { class W65816SepRepCleanup : public MachineFunctionPass { @@ -276,7 +279,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { for (auto It = MBB.begin(); It != MBB.end(); ++It) { if (It->getOpcode() != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; - if (It->getOperand(0).getImm() != 0x20) continue; + if (It->getOperand(0).getImm() != kMBit) continue; // Walk forward looking for LDAi8imm before any STAfi_indY // or REP at this nesting level. 
auto Walker = std::next(It); @@ -312,7 +315,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { if (Back->getOpcode() == W65816::SEP && Back->getNumOperands() >= 1 && Back->getOperand(0).isImm() && - Back->getOperand(0).getImm() == 0x20) { + Back->getOperand(0).getImm() == kMBit) { OuterSep = &*Back; break; } @@ -409,7 +412,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { if (Op1 != W65816::REP && Op1 != W65816::SEP) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; int Imm1 = It->getOperand(0).getImm(); - if (Imm1 != 0x20) continue; // M-bit only + if (Imm1 != kMBit) continue; // M-bit only // Walk forward across mode-neutral ops looking for the matching // opposite toggle. Bail at calls, asm, ALU ops on A, etc. unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP; @@ -1119,361 +1122,12 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) { } } - // Store forwarding (disabled — CRC32 regressed and I couldn't - // nail down the safety hole in time). Even with PHP-wrap guards - // and SP-modifier bails, the first fire (in memmove) silently - // miscompiles something that CRC32 later depends on. Pattern - // is sound; safety analysis isn't complete. See - // feedback_close_gap_attempts_round2.md for details. - #if 0 - // Store forwarding for PHI memory copies. Pattern (sumSquares - // loop body): - // - // STA X,s ; A → slot X (some intermediate result) - // [code that modifies A but doesn't touch slot X or slot Y] - // LDA X,s ; reload A from slot X - // STA Y,s ; A → slot Y (the PHI copy) - // - // Transform: insert `STA Y,s` right after the first `STA X,s` (A - // still holds the same value at that point), then drop the LDA- - // STA pair. Net: -1 inst per pattern occurrence. 
- // - // Safety constraints (all between STA X and the LDA-STA pair, in - // the same MBB, in straight-line code): - // - No instruction writes slot X (else the LDA would see a - // different value than the original STA). - // - No instruction reads OR writes slot Y (else our early STA Y - // would be observed mid-flight with a different value than - // before, or our inserted store would be overwritten and the - // intervening read of Y in the original would have seen the - // overwrite). - // - No call / inline asm / branch (conservatively: those can - // touch memory we don't model). - { - auto isStackRelMC2 = [](unsigned Op) { - return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel || - Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel || - Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel || - Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel; - }; - auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) -> bool { - if (!isStackRelMC2(MI.getOpcode())) return false; - if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; - Off = MI.getOperand(0).getImm(); - return true; - }; - auto isStaSr = [](const MachineInstr &MI) { - return MI.getOpcode() == W65816::STA_StackRel; - }; - auto isLdaSr = [](const MachineInstr &MI) { - return MI.getOpcode() == W65816::LDA_StackRel; - }; - SmallVector ToErase; - SmallVector, 4> ToInsert; - static int g_fireLimit = -1; - static int g_fireCount = 0; - static bool initd = false; - if (!initd) { - if (const char *e = getenv("STORE_FWD_LIMIT")) g_fireLimit = atoi(e); - initd = true; - } - for (MachineBasicBlock &MBB : MF) { - for (auto It = MBB.begin(); It != MBB.end(); ++It) { - if (!isStaSr(*It)) continue; - int64_t X; - if (!srAccess2(*It, X)) continue; - MachineInstr *StaX = &*It; - // Check if StaX is INSIDE an open PHP/PLP wrap. 
In that case - // its operand offset has been pre-bumped by +1, and inserting - // a sibling STA Y immediately after writes at the WRONG slot - // (the un-bumped Y). Walk backward: if we find a PHP without - // a matching PLP first, bail. - { - bool insideWrap = false; - int depth = 0; - auto B = It; - while (B != MBB.begin()) { - --B; - if (B->getOpcode() == W65816::PLP) depth++; - else if (B->getOpcode() == W65816::PHP) { - if (depth > 0) depth--; - else { insideWrap = true; break; } - } - } - if (insideWrap) continue; - } - // Walk forward looking for LDA X ; STA Y. Conservative bail - // on any non-tracked memory op (indirect pointer access, - // DP/abs ops, etc.) which could alias slot Y via memory. - bool ok = true; - int64_t Y = -1; - MachineInstr *LdaX = nullptr; - MachineInstr *StaY = nullptr; - for (auto Walker = std::next(It); Walker != MBB.end(); ++Walker) { - if (Walker->isDebugInstr()) continue; - if (Walker->isCall() || Walker->isInlineAsm() || - Walker->isBranch() || Walker->isReturn()) { - ok = false; break; - } - // Found LDA X? - int64_t Off; - if (isLdaSr(*Walker) && srAccess2(*Walker, Off) && Off == X) { - LdaX = &*Walker; - auto Next = std::next(Walker); - while (Next != MBB.end() && Next->isDebugInstr()) ++Next; - if (Next == MBB.end() || !isStaSr(*Next) || - !srAccess2(*Next, Y) || Y == X) { - ok = false; - } else { - StaY = &*Next; - } - break; - } - // Stack-rel access to X (write or read): bail. - if (srAccess2(*Walker, Off) && Off == X) { - ok = false; break; - } - // Any memory-touching op that's NOT a tracked stack-rel - // access — bail. Indirect pointer stores/loads (DPIndY / - // DPIndLong / abs / etc.) could alias slot Y via a pointer - // we can't trace, and the safety check below would miss it. 
- if ((Walker->mayLoad() || Walker->mayStore()) && - !isStackRelMC2(Walker->getOpcode())) { - ok = false; break; - } - // SP-modifying ops shift the stack-rel addressing window — - // a later `lda X, s` reads a DIFFERENT byte than the earlier - // `sta X, s` (or worse, the new stack pointer points into - // saved P/retaddr). Bail on TCS (direct SP write) and on - // any stack push/pop (PHx/PLx/PEA/PEI/COP/BRK). Also bail - // on PHP/PLP because the wrap pass already bumped in-wrap - // stack-rel ops by +1 — our inserted STA after STA X writes - // at the un-bumped offset which gets the WRONG slot. - { - unsigned WO = Walker->getOpcode(); - if (WO == W65816::TCS || WO == W65816::PHA || - WO == W65816::PLA || WO == W65816::PHX || - WO == W65816::PLX || WO == W65816::PHY || - WO == W65816::PLY || WO == W65816::PHP || - WO == W65816::PLP || WO == W65816::PHB || - WO == W65816::PLB || WO == W65816::PHD || - WO == W65816::PLD || WO == W65816::PHK || - WO == W65816::PEA || WO == W65816::PEI_DP) { - ok = false; break; - } - } - } - if (!ok || !LdaX || !StaY) continue; - if (g_fireLimit >= 0 && g_fireCount >= g_fireLimit) continue; - g_fireCount++; - errs() << "SF FIRE " << g_fireCount << " in " << MF.getName() - << " MBB " << MBB.getNumber() - << " X=" << X << " Y=" << StaY->getOperand(0).getImm() - << "\n"; - // Now re-walk from std::next(It) up to LdaX and verify no - // access to slot Y in that gap. - ok = true; - for (auto W2 = std::next(It); W2 != LdaX->getIterator(); ++W2) { - if (W2->isDebugInstr()) continue; - int64_t Off; - if (srAccess2(*W2, Off) && Off == Y) { ok = false; break; } - } - if (!ok) continue; - // Safe to apply: schedule the StaY-after-StaX insert, and - // erase LdaX and StaY. - ToInsert.push_back({StaX, Y}); - ToErase.push_back(LdaX); - ToErase.push_back(StaY); - Changed = true; - } - } - // Apply (insertions first; iterators stay valid through erase). 
- for (auto &P : ToInsert) { - MachineInstr *StaX = std::get<0>(P); - int64_t Y = std::get<1>(P); - MachineBasicBlock *MBB = StaX->getParent(); - DebugLoc DL = StaX->getDebugLoc(); - auto NextIt = std::next(StaX->getIterator()); - BuildMI(*MBB, NextIt, DL, TII.get(W65816::STA_StackRel)) - .addImm(Y); - } - for (MachineInstr *MI : ToErase) MI->eraseFromParent(); - } - #endif - // (Redundant CMP #0 elimination — disabled, hit VLA sum_n - // regression. Carry-flag bookkeeping across the CMP turned out to - // have more cases than my forward-walk modeled. See - // feedback_cmp_zero_elim.md.) - #if 0 - { - auto isNZSetOnA = [](unsigned Op) { - switch (Op) { - case W65816::DEA_PSEUDO: case W65816::INA_PSEUDO: - case W65816::ADC_StackRel: case W65816::ADC_DP: case W65816::ADC_Imm16: - case W65816::SBC_StackRel: case W65816::SBC_DP: case W65816::SBC_Imm16: - case W65816::AND_StackRel: case W65816::AND_DP: case W65816::AND_Imm16: - case W65816::ORA_StackRel: case W65816::ORA_DP: case W65816::ORA_Imm16: - case W65816::EOR_StackRel: case W65816::EOR_DP: case W65816::EOR_Imm16: - case W65816::LDA_StackRel: case W65816::LDA_DP: - case W65816::LDAi16imm: case W65816::LDA_Imm16: - case W65816::TXA: case W65816::TYA: - case W65816::ADCi16imm: case W65816::ADCEi16imm: - case W65816::SBCi16imm: case W65816::SBCEi16imm: - return true; - default: - return false; - } - }; - auto isCmpZero = [](const MachineInstr &MI) { - if (MI.getOpcode() != W65816::CMPi16imm) return false; - // Operand layout: lhs (Acc16), imm. Find the imm. 
- for (const MachineOperand &MO : MI.operands()) { - if (MO.isImm()) return MO.getImm() == 0; - } - return false; - }; - auto modifiesA = [](const MachineInstr &MI) { - for (const MachineOperand &MO : MI.operands()) { - if (MO.isReg() && MO.getReg() == W65816::A && MO.isDef()) - return true; - } - return false; - }; - auto readsC = [](const MachineInstr &MI) { - // We don't model individual flag bits; approximate by checking - // if the MI reads $p AND is one of the carry-consuming ops. - unsigned Op = MI.getOpcode(); - switch (Op) { - case W65816::ADC_StackRel: case W65816::ADC_DP: case W65816::ADC_Imm16: - case W65816::SBC_StackRel: case W65816::SBC_DP: case W65816::SBC_Imm16: - case W65816::ADCEi16imm: case W65816::SBCEi16imm: - case W65816::BCC: case W65816::BCS: - case W65816::ROL_A: case W65816::ROR_A: - return true; - default: - return false; - } - }; - SmallVector CmpsToErase; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (!isCmpZero(MI)) continue; - // Walk backward, skipping flag-preserving instructions. - bool foundProducer = false; - auto Back = MI.getIterator(); - while (Back != MBB.begin()) { - --Back; - if (Back->isDebugInstr()) continue; - if (Back->isCall() || Back->isInlineAsm()) break; - if (modifiesA(*Back)) { - foundProducer = isNZSetOnA(Back->getOpcode()); - break; - } - bool defsP = false; - for (const MachineOperand &MO : Back->operands()) { - if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) { - defsP = true; break; - } - } - if (defsP) break; - } - if (!foundProducer) continue; - // Walk FORWARD from CMP: until the next C-defining MI, no MI - // reads C. - bool cConsumed = false; - for (auto Fwd = std::next(MI.getIterator()); Fwd != MBB.end(); ++Fwd) { - if (Fwd->isDebugInstr()) continue; - if (readsC(*Fwd)) { cConsumed = true; break; } - // Next def of $p: subsequent reads aren't ours. 
- bool defsP = false; - for (const MachineOperand &MO : Fwd->operands()) { - if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) { - defsP = true; break; - } - } - if (defsP) break; - } - if (cConsumed) continue; - CmpsToErase.push_back(&MI); - } - } - for (MachineInstr *MI : CmpsToErase) MI->eraseFromParent(); - if (!CmpsToErase.empty()) Changed = true; - } - #endif - // (Narrow PHI-copy slot collapse — disabled, qsort regression.) - #if 0 - { - auto isStackRelMC2 = [](unsigned Op) { - return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel || - Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel || - Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel || - Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel; - }; - auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) { - if (!isStackRelMC2(MI.getOpcode())) return false; - if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false; - Off = MI.getOperand(0).getImm(); - return true; - }; - DenseMap Refs; - DenseMap StaInst, LdaInst; - DenseMap NSta, NLda; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - int64_t Off; - if (!srAccess2(MI, Off)) continue; - Refs[Off]++; - if (MI.getOpcode() == W65816::STA_StackRel) { - NSta[Off]++; StaInst[Off] = &MI; - } else if (MI.getOpcode() == W65816::LDA_StackRel) { - NLda[Off]++; LdaInst[Off] = &MI; - } - } - } - SmallVector ToErase; - for (auto &P : Refs) { - int64_t X = P.first; - if (P.second != 2) continue; // exactly 2 references - if (NSta[X] != 1 || NLda[X] != 1) continue; - MachineInstr *Sta = StaInst[X]; - MachineInstr *Lda = LdaInst[X]; - if (Sta->getParent() != Lda->getParent()) continue; - MachineBasicBlock *MBB = Sta->getParent(); - // Sta must be before Lda. - bool staBefore = false; - for (auto It = MBB->begin(); It != MBB->end(); ++It) { - if (&*It == Sta) { staBefore = true; break; } - if (&*It == Lda) break; - } - if (!staBefore) continue; - // Next after Lda must be STA Y where Y != X. 
- auto NextIt = std::next(Lda->getIterator()); - while (NextIt != MBB->end() && NextIt->isDebugInstr()) ++NextIt; - if (NextIt == MBB->end()) continue; - int64_t Y; - if (NextIt->getOpcode() != W65816::STA_StackRel || - !srAccess2(*NextIt, Y) || Y == X) continue; - // Between Sta and Lda, no read/write of slot Y, no call, no - // anything that would re-set slot Y's value mid-flight. - bool ok = true; - for (auto It = std::next(Sta->getIterator()); It != Lda->getIterator(); - ++It) { - if (It->isDebugInstr()) continue; - if (It->isCall() || It->isInlineAsm()) { ok = false; break; } - int64_t Off; - if (srAccess2(*It, Off) && Off == Y) { ok = false; break; } - } - if (!ok) continue; - // Redirect the original STA to write to Y; delete the LDA-STA pair. - Sta->getOperand(0).setImm(Y); - ToErase.push_back(Lda); - ToErase.push_back(&*NextIt); - Changed = true; - } - for (MachineInstr *MI : ToErase) MI->eraseFromParent(); - } - #endif + // Three prototype peepholes were tried here and removed once shown + // to regress benchmarks; design notes in + // feedback_close_gap_attempts_round2.md / feedback_cmp_zero_elim.md: + // - PHI store-forwarding (CRC32 regression / memmove safety hole). + // - Redundant CMP #0 elimination (VLA sum_n carry-flag bookkeeping). + // - Narrow PHI-copy slot collapse (qsort regression). return Changed; } diff --git a/src/llvm/lib/Target/W65816/W65816SpillToX.cpp b/src/llvm/lib/Target/W65816/W65816SpillToX.cpp index 765976a..7f14eb4 100644 --- a/src/llvm/lib/Target/W65816/W65816SpillToX.cpp +++ b/src/llvm/lib/Target/W65816/W65816SpillToX.cpp @@ -127,7 +127,7 @@ static bool touchesX(const MachineInstr &MI, const TargetRegisterInfo *TRI) { return xEffect(MI, TRI) != XNone; } -// Returns true if MI is `STAfi $a, slot, 0`. +// Returns FI if MI is `STAfi $a, slot, 0`, else -1. 
static int matchSTAfi(const MachineInstr &MI) { if (MI.getOpcode() != W65816::STAfi) return -1; if (MI.getNumOperands() < 3) return -1; diff --git a/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp b/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp index 073905a..e2a9524 100644 --- a/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackRelToImg.cpp @@ -800,33 +800,6 @@ bool W65816StackRelToImg::runOnMachineFunction(MachineFunction &MF) { // unhandled — they can shift SP arbitrarily. Caller must bail. return 0; }; - auto miBailsAnalysis = [](const MachineInstr &MI) -> bool { - // We don't bail on TCS or ADJCALLSTACK*. TCS in prologue/epilogue - // resets SP to a known value (the "canonical" SP for that region); - // since stack-rel accesses don't span TCS in well-formed code (the - // prologue allocates, body uses, epilogue deallocates), treating - // SP as continuing across TCS gives correct relative offsets for - // accesses inside each region. ADJCALLSTACK* aren't usually - // present at pre-emit time (PEI eliminates them or AsmPrinter - // handles). If they're still present, treat as 0 SP-shift — - // the actual PUSH16 ops carry the real shift. - return false; - }; - auto miSpDeltaWithAdj = [&](const MachineInstr &MI) -> int { - if (MI.getOpcode() == W65816::ADJCALLSTACKDOWN || - MI.getOpcode() == W65816::ADJCALLSTACKUP) { - // Skip — the actual PUSH16/PEA/PHA ops inside the call seq - // carry the SP delta. - return 0; - } - if (MI.getOpcode() == W65816::TCS) { - // TCS sets SP; we treat it as a "reset to canonical SP" point. - // Return 0 here; the calling code can do the reset. 
- return 0; - } - return 0; - }; - (void)miSpDeltaWithAdj; while (!Worklist.empty() && SpAdjValid) { MachineBasicBlock *MBB = Worklist.pop_back_val(); if (!Visited.insert(MBB).second) continue; diff --git a/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp b/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp index a84f13c..66c09e3 100644 --- a/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp +++ b/src/llvm/lib/Target/W65816/W65816StackSlotMerge.cpp @@ -166,20 +166,26 @@ static bool semanticallyDefsA(const MachineInstr &MI) { // Walk backward from MI in its MBB looking for the most recent A-define. // Returns the MI that defines A, or nullptr if none in the same MBB. -// Skips debug instructions. Stops at MBB boundary, calls, branches, -// inline asm. -static MachineInstr *findPriorADef(MachineInstr *MI) { +// Skips debug instructions. When BailOnCall is true, also stops at +// calls / inline asm (used by the Case (3) twin check where call effects +// invalidate the value-equivalence reasoning). +static MachineInstr *findADefBackward(MachineInstr *MI, bool BailOnCall) { MachineBasicBlock *MBB = MI->getParent(); auto It = MI->getIterator(); while (It != MBB->begin()) { --It; if (It->isDebugInstr()) continue; - if (It->isCall() || It->isInlineAsm()) return nullptr; + if (BailOnCall && (It->isCall() || It->isInlineAsm())) return nullptr; if (semanticallyDefsA(*It)) return &*It; } return nullptr; } +// Convenience: Case (3) twin matcher (bails on calls/inline asm). +static MachineInstr *findPriorADef(MachineInstr *MI) { + return findADefBackward(MI, /*BailOnCall=*/true); +} + // Walk forward from `Start` (exclusive) up to (but not including) `End` // in the same MBB, tracking whether slot `WatchSlot` is written. @@ -252,17 +258,9 @@ static bool usesFlagsP(const MachineInstr &MI) { } -// Returns the MOST RECENT A-defining MI strictly before MI in its MBB, -// skipping debug instructions. Returns nullptr if none in the same MBB. 
+// Convenience: Case (2) twin matcher (does NOT bail on calls/inline asm). static MachineInstr *findMostRecentADef(MachineInstr *MI) { - MachineBasicBlock *MBB = MI->getParent(); - auto It = MI->getIterator(); - while (It != MBB->begin()) { - --It; - if (It->isDebugInstr()) continue; - if (semanticallyDefsA(*It)) return &*It; - } - return nullptr; + return findADefBackward(MI, /*BailOnCall=*/false); } @@ -283,7 +281,6 @@ static MachineInstr *findMostRecentADef(MachineInstr *MI) { static MachineInstr *findTwin(MachineInstr *StaX, ArrayRef StasY) { MachineBasicBlock *MBBStaX = StaX->getParent(); - int64_t XOff = StaX->getOperand(0).getImm(); // Cases (1) + (2): same MBB. for (MachineInstr *StaY : StasY) { if (StaY->getParent() != MBBStaX) continue; @@ -342,7 +339,6 @@ static MachineInstr *findTwin(MachineInstr *StaX, } if (XConst == YConst) return StaY; } - (void)XOff; return nullptr; } diff --git a/src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp b/src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp index 3942ad9..a1a0c80 100644 --- a/src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp +++ b/src/llvm/lib/Target/W65816/W65816TiedDefSpill.cpp @@ -8,10 +8,10 @@ // // Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi, // ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm, -// EORi16imm, ADCabs, SBCabs, ANDabs, ORAabs, EORabs, INA_PSEUDO, -// DEA_PSEUDO, ASLA16, LSRA16, NEGA16, SHL8A, SRL8A, SRA15A, etc.) has -// a source vreg whose value is *also* needed past the consumer, fast -// regalloc fails to insert the necessary save/restore on its own. +// EORi16imm, ADCabs, SBCabs -- see isTiedAcc16Consumer below for the +// authoritative list) has a source vreg whose value is *also* needed +// past the consumer, fast regalloc fails to insert the necessary +// save/restore on its own. 
// (Acc16 has exactly one physical register, so the consumer's // tied-def overwrites the source; with multiple consumers/post-uses // the source must be spilled and reloaded.) diff --git a/src/llvm/lib/Target/W65816/W65816UnLSR.cpp b/src/llvm/lib/Target/W65816/W65816UnLSR.cpp index 425c334..5143cb1 100644 --- a/src/llvm/lib/Target/W65816/W65816UnLSR.cpp +++ b/src/llvm/lib/Target/W65816/W65816UnLSR.cpp @@ -59,10 +59,6 @@ using namespace llvm; #define DEBUG_TYPE "w65816-un-lsr" -namespace llvm { -void initializeW65816UnLSRPass(PassRegistry &); -} - namespace { class W65816UnLSR : public FunctionPass { @@ -84,7 +80,6 @@ public: private: bool processLoop(Loop *L); bool processCounterToPtrPHIs(Loop *L); - bool processReturnedCounter(Loop *L); }; } // namespace @@ -107,7 +102,6 @@ bool W65816UnLSR::runOnFunction(Function &F) { for (Loop *L : LI) { Changed |= processLoop(L); Changed |= processCounterToPtrPHIs(L); - // processReturnedCounter remains disabled — see note above. SmallVector Worklist(L->begin(), L->end()); while (!Worklist.empty()) { Loop *Sub = Worklist.pop_back_val(); @@ -120,241 +114,6 @@ bool W65816UnLSR::runOnFunction(Function &F) { } -// strLen-style undo: LSR converts `return p - s` into a counter PHI -// `%lsr.iv` that increments per iter and is returned directly: -// %lsr.iv = phi i16 [-1, %entry], [%lsr.iv.next, %latch] -// %p.0 = phi ptr [%s, %entry], [%incdec.ptr, %latch] -// %incdec.ptr = getelementptr i8, %p.0, i32 1 -// %lsr.iv.next = add i16 %lsr.iv, 1 -// br ..., %exit, %loop -// %exit: -// ret i16 %lsr.iv.next -// -// LSR's reasoning: cheaper to maintain a counter than compute (p - s) -// at exit. On W65816 the opposite is true: counter inc per iter costs -// 5 cyc/iter * N iters; one-time sub at exit costs ~10 cyc total. 
-// -// This undo finds the counter PHI, verifies its only out-of-loop use -// is via LCSSA → return, finds the sibling pointer PHI with the same -// stride, and replaces the return value with -// `(i16)(p_lcssa - base) + (K_init + 1)`. Erases the counter PHI. -// -// Saves ~5 cyc/iter on strLen-shape loops with a returned counter. -bool W65816UnLSR::processReturnedCounter(Loop *L) { - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Preheader = L->getLoopPreheader(); - if (!Latch || !Preheader) return false; - - // Single-exit loop. - SmallVector ExitBlocks; - L->getExitBlocks(ExitBlocks); - if (ExitBlocks.size() != 1) return false; - BasicBlock *Exit = ExitBlocks[0]; - - // Find a candidate counter PHI: integer, init=ConstantInt, step=+1. - PHINode *CounterPHI = nullptr; - ConstantInt *KInit = nullptr; - BinaryOperator *CounterStep = nullptr; - for (PHINode &PN : Header->phis()) { - if (!PN.getType()->isIntegerTy()) continue; - if (PN.getNumIncomingValues() != 2) continue; - Value *Init = nullptr, *Step = nullptr; - for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) { - BasicBlock *Pred = PN.getIncomingBlock(i); - if (L->contains(Pred)) Step = PN.getIncomingValue(i); - else Init = PN.getIncomingValue(i); - } - if (!Init || !Step) continue; - auto *InitC = dyn_cast(Init); - if (!InitC) continue; - auto *StepBO = dyn_cast(Step); - if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue; - Value *Other = nullptr; - if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1); - else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0); - if (!Other) continue; - auto *StepCI = dyn_cast(Other); - if (!StepCI || !StepCI->isOne()) continue; - CounterPHI = &PN; - KInit = InitC; - CounterStep = StepBO; - break; - } - if (!CounterPHI) return false; - - // The counter PHI must be used INSIDE the loop only by its increment - // and OUTSIDE the loop only via an LCSSA PHI in the exit block that - // feeds 
a return. Same for the increment. - auto isOnlyInLoopUseTheStep = [&](Value *V) { - for (User *U : V->users()) { - auto *UI = dyn_cast(U); - if (!UI) return false; - if (!L->contains(UI)) continue; // out-of-loop is handled separately - if (UI == CounterStep) continue; - // The PHI itself is allowed (V might be CounterStep, used by - // CounterPHI's back-edge incoming). - if (UI == CounterPHI) continue; - return false; - } - return true; - }; - if (!isOnlyInLoopUseTheStep(CounterPHI)) return false; - if (!isOnlyInLoopUseTheStep(CounterStep)) return false; - - // Find a use of CounterPHI or CounterStep that's a ReturnInst. - // The use might be DIRECT (no LCSSA — common after LCSSA cleanup) - // or via an LCSSA PHI in the exit block. - ReturnInst *Ret = nullptr; - Value *RetSource = nullptr; // the value the ret reads - PHINode *ExitLCSSA = nullptr; // optional LCSSA PHI to erase - bool fromNext = false; // true if return source is CounterStep - auto findRet = [&](Value *V, bool isNext) -> bool { - for (User *U : V->users()) { - auto *UI = dyn_cast(U); - if (!UI) continue; - // Skip in-loop uses (those are the counter increment chain). - if (L->contains(UI->getParent())) continue; - if (auto *R = dyn_cast(UI)) { - if (R->getReturnValue() != V) continue; - Ret = R; RetSource = V; fromNext = isNext; return true; - } - // LCSSA PHI in the exit block? - if (auto *PN = dyn_cast(UI)) { - if (PN->getParent() != Exit) continue; - if (PN->getNumIncomingValues() != 1) continue; - if (PN->getIncomingValue(0) != V) continue; - if (!PN->hasOneUse()) continue; - auto *R = dyn_cast(PN->user_back()); - if (!R || R->getReturnValue() != PN) continue; - Ret = R; RetSource = V; fromNext = isNext; ExitLCSSA = PN; - return true; - } - } - return false; - }; - if (!findRet(CounterStep, true) && !findRet(CounterPHI, false)) - return false; - - // Find a sibling pointer PHI: init=Base, latch incoming is a - // `getelementptr i8, %ptr, 1` of itself. 
- PHINode *PtrPHI = nullptr; - Value *Base = nullptr; - GetElementPtrInst *PtrStep = nullptr; - for (PHINode &PN : Header->phis()) { - if (!PN.getType()->isPointerTy()) continue; - if (PN.getNumIncomingValues() != 2) continue; - Value *Init = nullptr, *Step = nullptr; - for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) { - BasicBlock *Pred = PN.getIncomingBlock(i); - if (L->contains(Pred)) Step = PN.getIncomingValue(i); - else Init = PN.getIncomingValue(i); - } - if (!Init || !Step) continue; - auto *StepGEP = dyn_cast(Step); - if (!StepGEP) continue; - if (StepGEP->getPointerOperand() != &PN) continue; - if (StepGEP->getNumIndices() != 1) continue; - if (!StepGEP->getSourceElementType()->isIntegerTy(8)) continue; - auto *StrideCI = dyn_cast(StepGEP->getOperand(1)); - if (!StrideCI || !StrideCI->isOne()) continue; - PtrPHI = &PN; - Base = Init; - PtrStep = StepGEP; - break; - } - if (!PtrPHI) return false; - - // The pointer-PHI must have an LCSSA in the exit (so we can compute - // p_lcssa - base). Find it or create one. - PHINode *PtrLCSSA = nullptr; - for (PHINode &EPN : Exit->phis()) { - if (EPN.getNumIncomingValues() != 1) continue; - if (EPN.getIncomingValue(0) == PtrPHI) { - PtrLCSSA = &EPN; break; - } - } - if (!PtrLCSSA) { - // Create LCSSA for PtrPHI. - IRBuilder<> B(&Exit->front()); - PtrLCSSA = B.CreatePHI(PtrPHI->getType(), 1, "unlsr.p.lcssa"); - PtrLCSSA->addIncoming(PtrPHI, Latch); - } - - // Build replacement value: (i16)(p_lcssa - base) + (K_init + (fromNext ? 1 : 0)) - // For fromNext=true (returning %counter.next): value = K_init + iters - // p_lcssa - base = iters (in bytes, stride 1) → value = K_init + (p_lcssa - base) - // But we want: counter.next at exit = K_init + iters; and p_lcssa - base = iters. - // So replacement = (i16)(p_lcssa - base) + K_init. - // For strLen: K_init = -1; iters at exit = K (where ret = K - 1 + 1 = K) - // Wait let me re-derive. counter init = -1. iter 1 entry: counter = -1. 
- // iter 1 exit: counter.next = 0. Suppose exit-iter is iter K. Then at - // iter K's icmp-true, counter.next = -1 + K. - // And p_lcssa = base + (K - 1) (since iter K had p.0 = base + K-1). - // So p_lcssa - base = K - 1. - // We want counter.next = K - 1 (because exit-iter is iter K, but counter.next - // was computed before icmp tested 0 - so it's K - 1 (with K iters = K decisions)) - // Hmm, off-by-one is tricky. Let me just test empirically. - - // The "return value type" we'll cast to. - Type *RetTy = Ret->getReturnValue()->getType(); - if (!RetTy->isIntegerTy()) return false; - Instruction *InsertPt = ExitLCSSA ? ExitLCSSA->getNextNode() : Ret; - IRBuilder<> B(InsertPt); - // (p_lcssa - base) as integer. - Value *PLcssaInt = B.CreatePtrToInt(PtrLCSSA, Type::getInt32Ty(Header->getContext()), "unlsr.plcssa.i"); - Value *BaseInt = B.CreatePtrToInt(Base, Type::getInt32Ty(Header->getContext()), "unlsr.base.i"); - Value *Diff = B.CreateSub(PLcssaInt, BaseInt, "unlsr.diff"); - // Truncate to counter type. - Value *DiffI = B.CreateTrunc(Diff, CounterPHI->getType(), "unlsr.diff.trunc"); - // For fromNext (returning %counter.next): replacement = diff + (K_init + 1). - // At exit, counter.next = K_init + iters. - // p_lcssa - base = iters (in bytes; stride 1). Wait but iters is the iter count. - // Let me re-check with concrete example. - // strLen("a\0"): iter 1: p.0 = s, *p='a'!=0, p++, counter=-1, counter.next=0. - // iter 2: p.0 = s+1, *p=0, exit. counter=0, counter.next=1. - // At exit: counter.next = 1. iters before exit-iter's icmp-true = 2. - // p_lcssa = s+1 (the iter-2 entry value). p_lcssa - base = 1. - // counter.next = 1 = K_init + 2 = -1 + 2 = 1. ✓ - // p_lcssa - base = 1. So counter.next = p_lcssa - base + 0. - // (K_init + iters - (iters - (p_lcssa - base))) = K_init + (p_lcssa - base) = K_init + 1. - // Wait: counter.next = K_init + iters; p_lcssa - base = iters - 1. - // So counter.next = K_init + (p_lcssa - base) + 1. 
- // For K_init = -1: counter.next = -1 + 1 + 1 = 1 if iters=2. ✓ - // So replacement = diff + (K_init + 1). - int64_t Adjust = KInit->getSExtValue() + (fromNext ? 1 : 0); - Value *Result = DiffI; - if (Adjust != 0) { - Result = B.CreateAdd(DiffI, - ConstantInt::get(CounterPHI->getType(), Adjust), - "unlsr.result"); - } - // Cast to return type if different. - if (Result->getType() != RetTy) { - if (CounterPHI->getType()->getIntegerBitWidth() < - RetTy->getIntegerBitWidth()) - Result = B.CreateZExt(Result, RetTy); - else - Result = B.CreateTrunc(Result, RetTy); - } - // Replace the return. If there's an LCSSA PHI, replace it. Otherwise - // replace the direct use in `ret`. - if (ExitLCSSA) { - ExitLCSSA->replaceAllUsesWith(Result); - ExitLCSSA->eraseFromParent(); - } else { - Ret->setOperand(0, Result); - } - - // Erase the counter PHI and its increment. - CounterStep->replaceAllUsesWith(UndefValue::get(CounterPHI->getType())); - CounterPHI->replaceAllUsesWith(UndefValue::get(CounterPHI->getType())); - CounterStep->eraseFromParent(); - CounterPHI->eraseFromParent(); - return true; -} - - // strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and // `d.0` each stepping by 1) into a single counter PHI (`lsr.iv`) plus // GEPs `(base, counter)` per iter. On 65816 the counter+GEP form diff --git a/src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp b/src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp index 226e1dc..b3279d6 100644 --- a/src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp +++ b/src/llvm/lib/Target/W65816/W65816WidenAcc16.cpp @@ -84,27 +84,6 @@ static bool flowsToIncompatiblePhysReg(Register VReg, return false; } -// Returns true if VReg's def is a COPY from a physreg whose class is not -// Wide16-compatible. copyPhysReg only handles a fixed set of source/dest -// pairs; an incompatible source physreg (e.g., DPF0, the i64-return -// high-half carrier) lowered to an IMG dest would crash with an -// "unhandled copyPhysReg" assertion at AsmPrinter time. 
(Currently -// only the Phase-2 PHI widening uses this; that's disabled, so mark -// unused.) -[[maybe_unused]] static bool comesFromIncompatiblePhysReg(Register VReg, - const MachineRegisterInfo &MRI) { - for (auto &D : MRI.def_instructions(VReg)) { - if (!D.isCopy()) continue; - const MachineOperand &Src = D.getOperand(1); - if (!Src.isReg() || !Src.getReg().isPhysical()) continue; - Register P = Src.getReg(); - if (P == W65816::A) continue; - if (P >= W65816::IMG0 && P <= W65816::IMG15) continue; - return true; - } - return false; -} - // Returns true if the vreg is used by any PHI. PHI input/result must // share the same register class (verifier requirement). Rather than // also widen the PHI's result and recursively all of its uses, we skip @@ -212,196 +191,9 @@ bool W65816WidenAcc16::runOnMachineFunction(MachineFunction &MF) { Changed = true; } - // Phase 2: PHI cycle widening. EXPERIMENTAL, currently disabled — - // see end of pass for explanation. - #if 0 - // PHIs whose def class is Acc16 keep - // the value pinned to $a across iterations, forcing stack spills - // when the PHI is live across calls or other A-clobbering ops. - // For sumSquares-style loops with an i32 accumulator, this manifests - // as per-iter `LDA slot ; ADC ; STA slot ; LDA slot ; STA slot` (the - // last LDA/STA pair is the PHI-back-edge copy). If we widen the - // PHI's def to Wide16, regalloc can keep it in an IMG slot and the - // back-edge PHI copy collapses to a register coalesce. - // - // To widen a PHI: - // 1. Compute the SCC of Acc16 vregs connected by PHI edges (PHI - // def ↔ PHI incoming vreg). This catches mutually-recursive - // PHIs in nested loops. - // 2. For every member: verify all non-PHI uses accept Wide16, no - // flow to a physreg, single def. - // 3. For each PHI in the SCC, walk its incoming list. 
Each - // incoming vreg is either ALREADY in the SCC (another PHI, no - // bridge needed) or an external Acc16 vreg whose value flows - // into the SCC — bridge it by inserting `WWide = COPY W` at - // the end of the predecessor block and pointing the PHI's - // incoming at WWide. - // 4. Change every SCC member's register class to Wide16. - auto worklistInsertIfAcc16 = [&MRI](Register V, - DenseSet &Seen, - SmallVectorImpl &WL) { - if (!V.isVirtual()) return; - if (MRI.getRegClass(V) != &W65816::Acc16RegClass) return; - if (!Seen.insert(V).second) return; - WL.push_back(V); - }; - - SmallVector AcctPhis; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB.phis()) { - Register DefV = MI.getOperand(0).getReg(); - if (MRI.getRegClass(DefV) == &W65816::Acc16RegClass) { - AcctPhis.push_back(&MI); - } - } - } - DenseSet ProcessedPhiVregs; - for (MachineInstr *Seed : AcctPhis) { - Register SeedDef = Seed->getOperand(0).getReg(); - if (ProcessedPhiVregs.count(SeedDef)) continue; - // Build SCC by following PHI edges in both directions. - DenseSet Comp; - SmallVector Stack; - worklistInsertIfAcc16(SeedDef, Comp, Stack); - while (!Stack.empty()) { - Register V = Stack.pop_back_val(); - // Forward: V flows into other PHIs as an incoming → include those PHI defs. - for (auto &U : MRI.use_nodbg_instructions(V)) { - if (!U.isPHI()) continue; - Register PhiDef = U.getOperand(0).getReg(); - worklistInsertIfAcc16(PhiDef, Comp, Stack); - } - // Backward: if V is itself a PHI def, include the incoming vregs. - MachineInstr *DM = &*MRI.def_instructions(V).begin(); - if (!DM || !DM->isPHI()) continue; - for (unsigned i = 1, e = DM->getNumOperands(); i < e; i += 2) { - MachineOperand &MO = DM->getOperand(i); - if (!MO.isReg() || !MO.getReg().isVirtual()) continue; - worklistInsertIfAcc16(MO.getReg(), Comp, Stack); - } - } - for (Register V : Comp) ProcessedPhiVregs.insert(V); - - // Validate every member. 
PHI uses are ACCEPTED when the consumer - // PHI is itself in the SCC (those PHIs are being widened in - // lock-step). Narrow-class uses (e.g., INA_PSEUDO's tied-def - // input requires Acc16) are ALSO accepted — we'll insert a - // Wide16→Acc16 COPY at the use site after widening. The only - // unrecoverable cases are: PHI uses where the consumer PHI is - // outside the SCC (forcing cross-SCC class merging), and physreg - // flow to $x/$y/etc. (handled separately above). - auto usesAcceptInSCC = [&](Register V, - SmallVectorImpl *NarrowSites) - -> bool { - for (auto &MO : MRI.use_nodbg_operands(V)) { - MachineInstr *UMI = MO.getParent(); - if (UMI->isCopy()) continue; - if (UMI->isPHI()) { - Register PhiDef = UMI->getOperand(0).getReg(); - if (Comp.count(PhiDef)) continue; // co-widened - return false; - } - unsigned OpIdx = UMI->getOperandNo(&MO); - const TargetRegisterClass *Expected = - TII->getRegClass(UMI->getDesc(), OpIdx); - if (!Expected) continue; - if (Expected == &W65816::Wide16RegClass) continue; - if (Expected->hasSubClassEq(&W65816::Wide16RegClass)) continue; - // Expected is narrower than Wide16 (e.g., Acc16-only tied - // input). Mark for runtime narrowing — we'll insert a COPY - // at apply time. - if (NarrowSites) NarrowSites->push_back(&MO); - } - return true; - }; - bool ok = true; - SmallVector NarrowSites; - for (Register V : Comp) { - if (!MRI.hasOneDef(V)) { ok = false; break; } - if (flowsToIncompatiblePhysReg(V, MRI)) { ok = false; break; } - if (comesFromIncompatiblePhysReg(V, MRI)) { ok = false; break; } - if (!usesAcceptInSCC(V, &NarrowSites)) { ok = false; break; } - } - if (!ok) continue; - - // Apply widening. First insert bridge COPYs at predecessor edges - // for external (non-Comp) Acc16 incomings to each PHI in Comp. 
- SmallVector, 16> BridgeSites; - for (Register V : Comp) { - MachineInstr *DM = &*MRI.def_instructions(V).begin(); - if (!DM->isPHI()) continue; - for (unsigned i = 1, e = DM->getNumOperands(); i < e; i += 2) { - MachineOperand &MO = DM->getOperand(i); - if (!MO.isReg() || !MO.getReg().isVirtual()) continue; - Register Inc = MO.getReg(); - if (Comp.count(Inc)) continue; // in-SCC, no bridge needed - // External incoming: ensure it's currently Acc16; if so, we'll - // insert a COPY at the predecessor block's end. - if (MRI.getRegClass(Inc) != &W65816::Acc16RegClass && - MRI.getRegClass(Inc) != &W65816::Wide16RegClass) { - ok = false; - break; - } - BridgeSites.push_back({DM, i}); - } - if (!ok) break; - } - if (!ok) continue; - - // Insert bridges. - for (auto &Site : BridgeSites) { - MachineInstr *PhiMI = Site.first; - unsigned OpIdx = Site.second; - Register Inc = PhiMI->getOperand(OpIdx).getReg(); - MachineBasicBlock *PredMBB = PhiMI->getOperand(OpIdx + 1).getMBB(); - // If already Wide16 (e.g., another candidate widened it already), - // no bridge needed — but we still need the PHI incoming to use - // a Wide16 vreg. Use Inc directly. - if (MRI.getRegClass(Inc) == &W65816::Wide16RegClass) { - continue; - } - // Insert COPY before the predecessor's terminator(s). - auto InsertPos = PredMBB->getFirstTerminator(); - DebugLoc DL = (InsertPos == PredMBB->end()) - ? PredMBB->findBranchDebugLoc() - : InsertPos->getDebugLoc(); - Register WideInc = MRI.createVirtualRegister(&W65816::Wide16RegClass); - BuildMI(*PredMBB, InsertPos, DL, TII->get(TargetOpcode::COPY), - WideInc) - .addReg(Inc); - PhiMI->getOperand(OpIdx).setReg(WideInc); - PhiMI->getOperand(OpIdx).setIsKill(false); - } - - // Force every SCC member to Img16 (IMG-only, no A). Using Wide16 - // (A + IMG) doesn't work here: the Register Coalescer joins our - // Wide16 vregs with adjacent Acc16 vregs (intersection = Acc16) - // and narrows them back to A-only, defeating the widening. 
Img16 - // intersects Acc16 to ∅, so the coalescer can't merge — the PHI - // stays in IMG. This is correct anyway for the common case (PHI - // live across a call): A is JSL-clobbered, so it can't carry the - // value through, and IMG8..15 is the right home. - for (Register V : Comp) { - MRI.setRegClass(V, &W65816::Img16RegClass); - } - // Insert narrowing COPYs at each narrow-class use site. Each site - // is `... = OP V, ...` where the operand requires Acc16 but V is - // now Wide16. Replace with `%Vacc = COPY V (Acc16); ... = OP %Vacc, ...`. - for (MachineOperand *MO : NarrowSites) { - MachineInstr *UMI = MO->getParent(); - Register OldReg = MO->getReg(); - Register NarrowReg = - MRI.createVirtualRegister(&W65816::Acc16RegClass); - DebugLoc DL = UMI->getDebugLoc(); - BuildMI(*UMI->getParent(), UMI, DL, TII->get(TargetOpcode::COPY), - NarrowReg) - .addReg(OldReg); - MO->setReg(NarrowReg); - MO->setIsKill(false); - } - Changed = true; - } - #endif + // Phase 2: PHI cycle widening was prototyped here but never landed. + // The prototype body lived in an #if 0 block that was removed once + // we settled on Phase 1 as the only effective half of the pass. 
// Why disabled (2026-05-13 attempt): // - Widening PHI cycles to Wide16 (= A + IMG0..15) is undone by the // Register Coalescer: it joins our Wide16 vregs with adjacent diff --git a/tests/ubsan/README.md b/tests/ubsan/README.md index 28d4e79..c64f8c0 100644 --- a/tests/ubsan/README.md +++ b/tests/ubsan/README.md @@ -1,20 +1,26 @@ # tests/ubsan — UBSan-min smoke probe (Phase 6.2) -Three-case probe that exercises the `-fsanitize=undefined +Nine-case probe that exercises the `-fsanitize=undefined -fsanitize-minimal-runtime` instrumentation end-to-end on the W65816 target: -| Kind | UB | Sentinel | -|-----------------------|----------------------------------|--------------| -| `add-overflow` | i16 `INT_MAX + 1` | `$025000=0xC0DE` | -| `shift-out-of-bounds` | u16 `1 << 17` | `$025002=0xC0DF` | -| `divrem-overflow` | i16 `n / 0` | `$025004=0xC0E0` | -| (liveness) | tail of `main` reached | `$025006=0xC0DA` | +| Kind | UB | Sentinel | +|------------------------|----------------------------------|----------------------| +| `add-overflow` | i16 `INT_MAX + 1` | `$025000=0xC0DE` | +| `shift-out-of-bounds` | u16 `1 << 17` | `$025002=0xC0DF` | +| `divrem-overflow` | i16 `n / 0` | `$025004=0xC0E0` | +| `sub-overflow` | i16 `INT_MIN - 1` | `$025006=0xC0E1` | +| `mul-overflow` | i16 `INT_MAX * 2` | `$025008=0xC0E2` | +| `negate-overflow` | i16 `-INT_MIN` | `$02500A=0xC0E3` | +| `pointer-overflow` | `(char*)0xFFFFFFF0 + 0x40` | `$02500C=0xC0E4` | +| `load-invalid-value` | `_Bool` loaded from byte = 2 | `$02500E=0xC0E5` | +| `out-of-bounds` | `arr[idx>=N]` on static array | `$025010=0xC0E6` | +| (liveness) | tail of `main` reached | `$025012=0xC0DA` | -The probe ships strong override defs for the three `__ubsan_handle_*_minimal` -recovering handlers it exercises; the remaining 22 are pulled in from -`runtime/ubsan.o` so any extra UB site clang emits (e.g. constant-fold -overflow at `-O2`) still resolves cleanly. 
+The probe ships strong override defs for the nine `__ubsan_handle_*_minimal` +recovering handlers it exercises; the remaining handlers are pulled in +from `runtime/ubsan.o` so any extra UB site clang emits (e.g. constant- +fold overflow at `-O2`) still resolves cleanly. ## Build + run @@ -27,8 +33,14 @@ Expected output: MAME-READ addr=0x025000 val=0xc0de MAME-READ addr=0x025002 val=0xc0df MAME-READ addr=0x025004 val=0xc0e0 -MAME-READ addr=0x025006 val=0xc0da -MAME OK: 4 reads matched +MAME-READ addr=0x025006 val=0xc0e1 +MAME-READ addr=0x025008 val=0xc0e2 +MAME-READ addr=0x02500a val=0xc0e3 +MAME-READ addr=0x02500c val=0xc0e4 +MAME-READ addr=0x02500e val=0xc0e5 +MAME-READ addr=0x025010 val=0xc0e6 +MAME-READ addr=0x025012 val=0xc0da +MAME OK: 10 reads matched ``` ## What this probe is NOT @@ -39,9 +51,14 @@ MAME OK: 4 reads matched overrides the handlers so it can verify the *call edge* without pulling in console code. A separate diagnostic-format probe would link `libc.o` + `libcGno.o` + GNO crt0 and assert on stderr. -- It is **not** a sweep of all 25 handler kinds. The user-spec scope - is "3 representative kinds". The other 22 are link-tested - implicitly by `runtime/ubsan.o`'s symbol set being available. +- It is **not** a sweep of all 25 handler kinds. The kinds covered + are all the cheap-to-trigger recoverable handlers that clang emits + at `-O2` for the W65816 target. Aborting-only kinds (e.g. + `builtin_unreachable_minimal`, `missing_return_minimal`) cannot be + exercised here because returning from the handler after the IR + `unreachable` is itself UB. Float-cast-overflow / VLA-not-positive + / type-mismatch / CFI / Objective-C kinds are linked but not + triggered. 
## Files diff --git a/tests/ubsan/runUbsanProbe.sh b/tests/ubsan/runUbsanProbe.sh index 0530f8e..fc107b5 100755 --- a/tests/ubsan/runUbsanProbe.sh +++ b/tests/ubsan/runUbsanProbe.sh @@ -7,12 +7,14 @@ # What this verifies: # - clang accepts -fsanitize=undefined -fsanitize-minimal-runtime on # the w65816 target. -# - The three exercised UB kinds (add-overflow / shift-out-of-bounds / -# divrem-overflow) instrument as expected — the handler-fired byte -# flips inside the per-kind handler override. +# - Nine exercised UB kinds (add-overflow / shift-out-of-bounds / +# divrem-overflow / sub-overflow / mul-overflow / negate-overflow / +# pointer-overflow / load-invalid-value / out-of-bounds) instrument +# as expected -- the handler-fired byte flips inside the per-kind +# handler override. # - The recovering minimal runtime returns to the caller cleanly, so # the probe continues writing sentinels past each UB site. -# - runtime/ubsan.o links + resolves the other 22 handler kinds without +# - runtime/ubsan.o links + resolves the other handler kinds without # pulling in console code that the probe doesn't need. set -eu @@ -27,7 +29,7 @@ bash "$SCRIPT_DIR/build.sh" # Link. crt0.o + the probe + ubsan.o + libgcc.o (for the i16 div+rem # helpers triggerDivByZero needs). We deliberately do NOT link libc.o -# — the probe sets memory sentinels directly, doesn't call printf, and +# -- the probe sets memory sentinels directly, doesn't call printf, and # pulling libc.o in would also pull snprintf.o (~9 KB) for no benefit. "$PROJECT_ROOT/tools/link816" -o ubsanProbe.bin \ --text-base 0x1000 --bss-base 0xA000 --map ubsanProbe.map \ @@ -39,11 +41,22 @@ bash "$SCRIPT_DIR/build.sh" ls -la ubsanProbe.bin echo "" -# Sentinels: +# Sentinels (one per recoverable handler exercised, plus a tail +# liveness sentinel). Each is a 16-bit write at $025000+kind*2. 
# $025000 = 0xC0DE add-overflow handler fired # $025002 = 0xC0DF shift-out-of-bounds handler fired # $025004 = 0xC0E0 divrem-overflow handler fired -# $025006 = 0xC0DA all three recovered and main reached its tail +# $025006 = 0xC0E1 sub-overflow handler fired +# $025008 = 0xC0E2 mul-overflow handler fired +# $02500A = 0xC0E3 negate-overflow handler fired +# $02500C = 0xC0E4 pointer-overflow handler fired +# $02500E = 0xC0E5 load-invalid-value handler fired +# $025010 = 0xC0E6 out-of-bounds handler fired +# $025012 = 0xC0DA all nine recovered and main reached its tail bash "$PROJECT_ROOT/scripts/runInMame.sh" \ "$SCRIPT_DIR/ubsanProbe.bin" \ - --check 0x025000=C0DE 0x025002=C0DF 0x025004=C0E0 0x025006=C0DA + --check \ + 0x025000=C0DE 0x025002=C0DF 0x025004=C0E0 \ + 0x025006=C0E1 0x025008=C0E2 0x02500A=C0E3 \ + 0x02500C=C0E4 0x02500E=C0E5 0x025010=C0E6 \ + 0x025012=C0DA diff --git a/tests/ubsan/ubsanProbe.c b/tests/ubsan/ubsanProbe.c index 01db8fc..9e1fbc8 100644 --- a/tests/ubsan/ubsanProbe.c +++ b/tests/ubsan/ubsanProbe.c @@ -1,44 +1,62 @@ // Phase 6.2 UBSan-min smoke probe. 
// -// Three UB cases (one each from the spec): -// kind 0 (sentinel 0xC0DE): signed-overflow add (i16 INT_MAX + 1) -// kind 1 (sentinel 0xC0DF): shift-out-of-bounds (1 << 17 on a u16) -// kind 2 (sentinel 0xC0E0): divide-by-zero (n / 0) +// Nine UB cases -- one per recoverable handler kind we exercise: +// kind 0 (sentinel 0xC0DE): add-overflow (i16 INT_MAX + 1) +// kind 1 (sentinel 0xC0DF): shift-out-of-bounds (1 << 17 on a u16) +// kind 2 (sentinel 0xC0E0): divrem-overflow (n / 0) +// kind 3 (sentinel 0xC0E1): sub-overflow (INT_MIN - 1) +// kind 4 (sentinel 0xC0E2): mul-overflow (INT_MAX * 2) +// kind 5 (sentinel 0xC0E3): negate-overflow (-INT_MIN) +// kind 6 (sentinel 0xC0E4): pointer-overflow (high ptr + offset wraps) +// kind 7 (sentinel 0xC0E5): load-invalid-value (_Bool from byte=2) +// kind 8 (sentinel 0xC0E6): out-of-bounds (arr[idx>=N]) // -// The probe overrides the three relevant `__ubsan_handle_*_minimal` -// recovering handlers with strong definitions that record their -// firing in a static state byte. After each UB, the probe writes -// 0xC0DE + kind to $025000 to prove (a) the instrumentation fired and -// (b) execution recovered cleanly past the UB. The recover handler +// The probe overrides each relevant `__ubsan_handle_*_minimal` recovering +// handler with a strong definition that records its firing in a static +// state byte. After each UB, the probe writes 0xC0DE+kind to a per-kind +// 16-bit slot at 0x025000+kind*2 to prove (a) the instrumentation fired +// and (b) execution recovered cleanly past the UB. The recover handler // returning normally is the whole point of -fsanitize-minimal-runtime // + -fsanitize-recover; this probe is what proves the round-trip. // -// To verify all three at once we cascade the sentinel writes through a -// staircase of $025000 / $025002 / $025004 word stores so the smoke -// harness can read three independent 16-bit values back from MAME.
+// To verify all nine at once we cascade the sentinel writes through a +// staircase of word stores so the smoke harness can read independent +// 16-bit values back from MAME. // // Compile with -fsanitize=undefined -fsanitize-minimal-runtime. #include <stdint.h> -// Bank-2 BSS at $025000-$025006 — outside the SHR shadow and outside +// Bank-2 BSS at $025000-$025013 -- outside the SHR shadow and outside // $C000-$CFFF IO window. link816 places .bss at the user-specified // --bss-base (we pass 0xA000) so these constant addresses are // independent of BSS layout. -#define MARK_ADD_OVF ((volatile uint16_t *)0x025000UL) -#define MARK_SHIFT_OOB ((volatile uint16_t *)0x025002UL) -#define MARK_DIV_ZERO ((volatile uint16_t *)0x025004UL) -#define DONE_SENTINEL ((volatile uint16_t *)0x025006UL) +#define MARK_ADD_OVF ((volatile uint16_t *)0x025000UL) +#define MARK_SHIFT_OOB ((volatile uint16_t *)0x025002UL) +#define MARK_DIV_ZERO ((volatile uint16_t *)0x025004UL) +#define MARK_SUB_OVF ((volatile uint16_t *)0x025006UL) +#define MARK_MUL_OVF ((volatile uint16_t *)0x025008UL) +#define MARK_NEG_OVF ((volatile uint16_t *)0x02500AUL) +#define MARK_PTR_OVF ((volatile uint16_t *)0x02500CUL) +#define MARK_LOAD_INVAL ((volatile uint16_t *)0x02500EUL) +#define MARK_OUT_OF_BNDS ((volatile uint16_t *)0x025010UL) +#define DONE_SENTINEL ((volatile uint16_t *)0x025012UL) // Strong overrides win over runtime/ubsan.o's weak-by-link defaults. // Each fires once per kind and records that the corresponding UB // instrumentation reached us. Recovering handlers MUST return so the // probe continues executing past the UB site.
-static volatile uint8_t handlerFiredAdd = 0; -static volatile uint8_t handlerFiredShift = 0; -static volatile uint8_t handlerFiredDiv = 0; +static volatile uint8_t handlerFiredAdd = 0; +static volatile uint8_t handlerFiredShift = 0; +static volatile uint8_t handlerFiredDiv = 0; +static volatile uint8_t handlerFiredSub = 0; +static volatile uint8_t handlerFiredMul = 0; +static volatile uint8_t handlerFiredNeg = 0; +static volatile uint8_t handlerFiredPtr = 0; +static volatile uint8_t handlerFiredLoadInv = 0; +static volatile uint8_t handlerFiredOob = 0; void __ubsan_handle_add_overflow_minimal(void) { @@ -56,6 +74,36 @@ void __ubsan_handle_divrem_overflow_minimal(void) { } +void __ubsan_handle_sub_overflow_minimal(void) { + handlerFiredSub = 1; +} + + +void __ubsan_handle_mul_overflow_minimal(void) { + handlerFiredMul = 1; +} + + +void __ubsan_handle_negate_overflow_minimal(void) { + handlerFiredNeg = 1; +} + + +void __ubsan_handle_pointer_overflow_minimal(void) { + handlerFiredPtr = 1; +} + + +void __ubsan_handle_load_invalid_value_minimal(void) { + handlerFiredLoadInv = 1; +} + + +void __ubsan_handle_out_of_bounds_minimal(void) { + handlerFiredOob = 1; +} + + // Each UB site goes through a noinline wrapper so the optimizer // cannot constant-fold the operation away. 
__attribute__((noinline)) // + volatile inputs blocks the obvious folding paths; we also wrap @@ -79,6 +127,47 @@ static int16_t triggerDivByZero(int16_t a, int16_t b) { } +__attribute__((noinline)) +static int16_t triggerSubOverflow(int16_t a, int16_t b) { + return a - b; +} + + +__attribute__((noinline)) +static int16_t triggerMulOverflow(int16_t a, int16_t b) { + return a * b; +} + + +__attribute__((noinline)) +static int16_t triggerNegateOverflow(int16_t a) { + return -a; +} + + +__attribute__((noinline)) +static char *triggerPointerOverflow(char *p, int32_t o) { + return p + o; +} + + +__attribute__((noinline)) +static int triggerLoadInvalidValue(volatile uint8_t *p) { + _Bool v = *(_Bool *)p; + // Use the value so the load isn't dead-stripped. We don't trust + // the post-instrumentation cast to a 0/1 narrow value -- the + // important thing is the load itself fired the handler. + return v ? 1 : 0; +} + + +__attribute__((noinline)) +static int16_t triggerOutOfBounds(int16_t idx) { + static int16_t arr[4] = { 10, 20, 30, 40 }; + return arr[idx]; +} + + int main(void) { // --- case 0: signed-overflow add (INT16_MAX + 1) --- volatile int16_t aMax = 0x7FFF; @@ -104,12 +193,58 @@ int main(void) { *MARK_DIV_ZERO = 0xC0E0; } - // Final liveness sentinel — only written if we got past all three + // --- case 3: sub-overflow (INT16_MIN - 1) --- + volatile int16_t aMin = (int16_t)0x8000; + (void)triggerSubOverflow(aMin, aOne); + if (handlerFiredSub) { + *MARK_SUB_OVF = 0xC0E1; + } + + // --- case 4: mul-overflow (INT16_MAX * 2 wraps) --- + volatile int16_t aTwo = 2; + (void)triggerMulOverflow(aMax, aTwo); + if (handlerFiredMul) { + *MARK_MUL_OVF = 0xC0E2; + } + + // --- case 5: negate-overflow (-INT16_MIN) --- + (void)triggerNegateOverflow(aMin); + if (handlerFiredNeg) { + *MARK_NEG_OVF = 0xC0E3; + } + + // --- case 6: pointer-overflow (signed-wrap on i16 addr) --- + // Cast a high address to char* and add a positive offset that + // overflows the address calculation. 
-fsanitize=pointer-overflow + // fires on signed-overflow of the offset add. + volatile uint32_t hiAddr = 0xFFFFFFF0UL; + volatile int32_t big = 0x40; + char *p = (char *)(uintptr_t)hiAddr; + (void)triggerPointerOverflow(p, big); + if (handlerFiredPtr) { + *MARK_PTR_OVF = 0xC0E4; + } + + // --- case 7: load-invalid-value (_Bool from byte=2) --- + volatile uint8_t boolByte = 2; + (void)triggerLoadInvalidValue(&boolByte); + if (handlerFiredLoadInv) { + *MARK_LOAD_INVAL = 0xC0E5; + } + + // --- case 8: out-of-bounds (static arr[idx>=N]) --- + volatile int16_t badIdx = 7; + (void)triggerOutOfBounds(badIdx); + if (handlerFiredOob) { + *MARK_OUT_OF_BNDS = 0xC0E6; + } + + // Final liveness sentinel -- only written if we got past all nine // UB sites without the runtime aborting (which would have spun on // a BRK_pseudo at $70 instead of reaching here). *DONE_SENTINEL = 0xC0DA; - // Halt — crt0's return-from-main path hits a BRK that headless + // Halt -- crt0's return-from-main path hits a BRK that headless // MAME wild-jumps from, so spin-wait instead. 
while (1) { } diff --git a/tests/ubsan/ubsanProbe.manifest.json b/tests/ubsan/ubsanProbe.manifest.json index abbcef3..cfe69df 100644 --- a/tests/ubsan/ubsanProbe.manifest.json +++ b/tests/ubsan/ubsanProbe.manifest.json @@ -13,7 +13,7 @@ "num": 1, "name": "SEG1", "base": "0x001000", - "size": 3432, + "size": 5084, "image": "ubsanProbe.bin", "entry_offset": "0x0000" } @@ -22,6 +22,12 @@ {"addr": "0x025000", "expect": "0xC0DE", "label": "add-overflow handler fired"}, {"addr": "0x025002", "expect": "0xC0DF", "label": "shift-out-of-bounds handler fired"}, {"addr": "0x025004", "expect": "0xC0E0", "label": "divrem-overflow handler fired"}, - {"addr": "0x025006", "expect": "0xC0DA", "label": "main reached tail after all three recoveries"} + {"addr": "0x025006", "expect": "0xC0E1", "label": "sub-overflow handler fired"}, + {"addr": "0x025008", "expect": "0xC0E2", "label": "mul-overflow handler fired"}, + {"addr": "0x02500A", "expect": "0xC0E3", "label": "negate-overflow handler fired"}, + {"addr": "0x02500C", "expect": "0xC0E4", "label": "pointer-overflow handler fired"}, + {"addr": "0x02500E", "expect": "0xC0E5", "label": "load-invalid-value handler fired"}, + {"addr": "0x025010", "expect": "0xC0E6", "label": "out-of-bounds handler fired"}, + {"addr": "0x025012", "expect": "0xC0DA", "label": "main reached tail after all nine recoveries"} ] }