This commit is contained in:
Scott Duensing 2026-06-03 16:08:42 -05:00
parent da095402ec
commit 09f7405362
38 changed files with 2454 additions and 1599 deletions

7
.gitignore vendored
View file

@ -19,6 +19,13 @@ tools/
*.map *.map
*.reloc *.reloc
# Exception: demo .rsrc/ fixture directories ship TYPECODE_ID.bin files
# as source. Each rsrcBundle test reads them at build time and emits
# the AppleSingle + sidecar in the same dir; the .apl + sidecar are
# build artifacts (caught by *.bin above for the sidecar; .apl is
# tracked by name). We carve out the source fixtures here.
!demos/*.rsrc/*.bin
# Per-target build directories. # Per-target build directories.
tests/coremark/build/ tests/coremark/build/
tests/lua/build/ tests/lua/build/

Binary file not shown.

View file

@ -1,59 +1,134 @@
// rsrcProbe.c - Phase 3.4 stub-only Resource Manager smoke probe. // rsrcProbe.c - Phase 3.4 real Resource Manager smoke probe.
// //
// What this verifies right now: // Replaces the stub-only probe. Builds a tiny in-memory .rsrc fixture,
// - resourceProbeInit() returns RES_ERR_BLOCKED (the stub-only path), // registers it with mfsRegister, opens it via openResourceFile, loads
// - iigsLoadResource() returns NULL with err = RES_ERR_BLOCKED, // a known rText resource, and verifies the bytes match the expected
// - iigsGetResourceSize() returns 0 with err = RES_ERR_BLOCKED, // payload. This exercises the real parser path top-to-bottom without
// - the runtime resource.o links cleanly under -O2, // needing a ProDOS resource fork.
// - the demo's OMF can be bundled with rsrcBundle.py (post-step in
// demos/build.sh when demos/rsrcProbe.rsrc/ is present).
//
// Marker discipline. Page-1 ($70..$73) per the cursorProbe.c
// convention - runViaFinder.sh samples direct-page bytes reliably
// across MAME timings, and full-24-bit BSS-style markers (0x025000)
// don't survive the Loader/Finder relocation games on GS/OS 6.0.2.
// //
// Markers (page-1 direct page, per cursorProbe convention):
// $70 := 0x99 end-of-main success sentinel // $70 := 0x99 end-of-main success sentinel
// $71 := initRc as int8 (expected 0xff = (uint8_t)RES_ERR_BLOCKED) // $71 := 0x01 if openResourceFile succeeded (refnum != 0)
// $72 := loadErr (expected 0xff) // $72 := 0x01 if loadResource returned a non-NULL handle whose
// $73 := 0x01 if resourceRuntimeEnabled()==0 (today's stub answer) // bytes match "HELLO" and size is 5
// $73 := 0x01 if loadResource second call returned the SAME handle
// (cache hit) and closeResourceFile returned RES_OK
// //
// Build: bash demos/build.sh rsrcProbe // Build: bash demos/build.sh rsrcProbe
// Run: bash scripts/runViaFinder.sh demos/rsrcProbe.omf \ // Run: bash scripts/runViaFinder.sh demos/rsrcProbe.omf \
// --check 0x70=0x99 // --check 0x70=0x99 0x71=0x01 0x72=0x01 0x73=0x01
// runViaFinder LAUNCHES the OMF and samples at frame 6000; no keypress
// is required because we drop into while(1) immediately after writing
// the markers.
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "iigs/resource.h" #include "iigs/resource.h"
// rResourceMap fixture: header + 5-byte rText payload + one rIndex entry.
//
// Header (24 bytes, little-endian):
// rmVersion = 0x0000
// rmToIndex = 0x0000001D (29)
// rmFileNum = 0
// rmID = 0
// rmIndexSize = 0x00000014 (20 bytes = 1 entry)
// rmIndexUsed = 0x00000001
// rmFreeListSize = 0
// rmFreeListUsed = 0
// rmPad = 0
// Payload (5 bytes) at offset 24: "HELLO"
// rIndex entry (20 bytes) at offset 29:
// rType = 0x8014 (rText)
// rID = 0x00000001
// rOffset = 0x00000018 (24)
// rAttr = 0
// rSize = 0x00000005
// rHandle = 0
// In-memory .rsrc image used by the probe. Three sections, every
// multi-byte field stored little-endian:
//   bytes  0..23  resource map header
//   bytes 24..28  the rText payload "HELLO"
//   bytes 29..48  one rIndex entry describing that payload
// The array designators pin each section to its documented offset, so a
// miscounted byte shows up as an overlapping initializer rather than a
// silently shifted field.
static const uint8_t kFixture[49] = {
    // ---- map header ----
    [0]  = 0x00, 0x00,              // rmVersion
           0x1D, 0x00, 0x00, 0x00,  // rmToIndex = 29
           0x00, 0x00,              // rmFileNum
           0x00, 0x00,              // rmID
           0x14, 0x00, 0x00, 0x00,  // rmIndexSize = 20
           0x01, 0x00, 0x00, 0x00,  // rmIndexUsed = 1
           0x00, 0x00,              // rmFreeListSize
           0x00, 0x00,              // rmFreeListUsed
           0x00, 0x00,              // rmPad

    // ---- payload: "HELLO" ----
    [24] = 0x48, 0x45, 0x4C, 0x4C, 0x4F,

    // ---- rIndex entry ----
    [29] = 0x14, 0x80,              // rType = 0x8014
           0x01, 0x00, 0x00, 0x00,  // rID = 1
           0x18, 0x00, 0x00, 0x00,  // rOffset = 24
           0x00, 0x00,              // rAttr
           0x05, 0x00, 0x00, 0x00,  // rSize = 5
           0x00, 0x00, 0x00, 0x00   // rHandle
};
// Payload the probe expects to get back from the rText resource.
static const char kExpectedText[] = "HELLO";
// Payload length in bytes, derived from the text (terminator excluded).
static const uint32_t kExpectedSize = (uint32_t)(sizeof(kExpectedText) - 1);
// Name the fixture is registered under and later opened by.
static const char kFixturePath[] = "rsrc.fixture";
int main(void) { int main(void) {
volatile uint8_t *mark0 = (volatile uint8_t *)0x70; volatile uint8_t *mark0 = (volatile uint8_t *)0x70;
volatile uint8_t *mark1 = (volatile uint8_t *)0x71; volatile uint8_t *mark1 = (volatile uint8_t *)0x71;
volatile uint8_t *mark2 = (volatile uint8_t *)0x72; volatile uint8_t *mark2 = (volatile uint8_t *)0x72;
volatile uint8_t *mark3 = (volatile uint8_t *)0x73; volatile uint8_t *mark3 = (volatile uint8_t *)0x73;
*mark0 = 0x10; // entry sentinel: we did reach main() *mark0 = 0x10;
int initRc = resourceProbeInit(); *mark1 = 0x00;
*mark1 = (uint8_t)initRc; *mark2 = 0x00;
*mark3 = 0x00;
int loadErr = 0; // Stage the fixture as a read-only memory-backed file. Cast away
void **h = iigsLoadResource(RES_TYPE_RTEXT, 1, &loadErr); // const for the mfsRegister buffer pointer; the resource manager
(void)h; // only ever reads.
*mark2 = (uint8_t)loadErr; if (mfsRegister(kFixturePath, (void *)kFixture, sizeof(kFixture), sizeof(kFixture), 0) != 0) {
while (1) {
}
}
int sizeErr = 0; resourceProbeInit();
uint32_t sz = iigsGetResourceSize(RES_TYPE_RTEXT, 1, &sizeErr);
(void)sz;
*mark3 = (uint8_t)(resourceRuntimeEnabled() == 0 ? 0x01 : 0x00); int rcOpen = 0;
ResourceRefNumT ref = openResourceFile(kFixturePath, 0, 0, &rcOpen);
if (ref != 0 && rcOpen == RES_OK) {
*mark1 = 0x01;
}
int rcLoad = 0;
void **h = loadResource(RES_TYPE_RTEXT, 1, &rcLoad);
if (h && rcLoad == RES_OK) {
const uint8_t *bytes = (const uint8_t *)*h;
uint32_t sz = getResourceSize(h);
int match = (sz == kExpectedSize);
if (match) {
for (uint32_t i = 0; i < kExpectedSize; i++) {
if (bytes[i] != (uint8_t)kExpectedText[i]) {
match = 0;
break;
}
}
}
if (match) {
*mark2 = 0x01;
}
}
// Second load - cache hit must return the SAME handle. Then
// close the file, which must report RES_OK.
int rcLoad2 = 0;
void **h2 = loadResource(RES_TYPE_RTEXT, 1, &rcLoad2);
int sameHandle = (h2 == h && h2 != 0);
int rcClose = closeResourceFile(ref);
if (sameHandle && rcClose == RES_OK) {
*mark3 = 0x01;
}
// Success marker last - if any of the calls above trapped (which
// they shouldn't in stub-only mode), the harness will see $70
// != 0x99 and report failure.
*mark0 = 0x99; *mark0 = 0x99;
while (1) { while (1) {

View file

@ -0,0 +1 @@
iconBytesPlaceholder

View file

@ -0,0 +1 @@
HELLO

View file

@ -1,34 +1,37 @@
// iigs/resource.h - typed-C facade over the IIgs Resource Manager. // iigs/resource.h - typed-C facade over the IIgs Resource Manager.
// //
// Phase 3.4 STUB-ONLY landing. The bundler + linker integration ship // Phase 3.4 REAL implementation: parses .rsrc resource forks via the
// fully (see tools/rsrcBundle/), but the *runtime* path is blocked on // stdio surface (fopen/fread/fseek/fclose) and serves resources from a
// Phase 1.1 (the GS/OS fopen hang). GS/OS 6.0.2 + ResourceStartUp + // per-file cache. Read-only. No AddResource, no DetachResource, no
// OpenResourceFile reaches the same path that hangs in fopen today, so // partial-load, no encryption - those are features we do not yet need.
// the LoadResource()/GetResourceSize() entry points below return error
// codes instead of calling the toolbox. When Phase 1.1 lands, flip
// IIGS_RESOURCE_RUNTIME_ENABLED to 1 (or define it at the compiler
// level) and rebuild the runtime - the same C surface stays.
// //
// What you GET today: // What you GET today:
// - resourceProbeInit() reports whether the runtime path is enabled. // - openResourceFile(path, accessByte, fileType) -> refNum (>0) or
// - LoadResource() / GetResourceSize() return RES_ERR_BLOCKED unless // 0 on failure (errno-style code lands in *err if provided).
// IIGS_RESOURCE_RUNTIME_ENABLED is set at compile time. // - loadResource(type, id) -> Handle (void **) on success; cached so
// repeated calls return the same handle. *handle points at the
// resource bytes (already read from the file).
// - releaseResource(verb, handle) -> 0 on success. verb 0 just
// releases the current load; verb 1 also evicts the cache entry
// and frees the data.
// - closeResourceFile(refNum) -> 0 on success. Frees all cached
// handles owned by that file.
// //
// HLock semantics (IMPORTANT for future Phase 1.1 unblock): // On-disk format (Apple IIgs Toolbox Reference Vol 3, ch.42):
// The toolbox LoadResource() returns a HANDLE (void **) to a master // File offset 0: rResourceMap header (24 bytes, little-endian fields
// pointer in MM-relocatable storage. The application MUST call // because the 65816 is LE). Field rmToIndex is the file offset of
// HLock() before dereferencing if it intends to call ANY toolbox // the rIndex table; rmIndexUsed is the number of valid entries; the
// routine that could trigger a heap compaction (most do). Without // remaining header fields are bookkeeping/zero at build time.
// the HLock, the master pointer can be rewritten under you between // Body bytes: resource payloads at the offsets recorded in rIndex.
// the LoadResource and the deref. The typed wrappers below DO NOT // At rmToIndex: array of 20-byte rIndex entries, each:
// call HLock for you - that is a deliberate choice because over- // uint16 rType, uint32 rID, uint32 rOffset, uint16 rAttr,
// locking is a memory-fragmentation footgun and the right scope is // uint32 rSize, uint32 rHandle (zero on disk).
// workload-specific. Callers should: //
// void **h = LoadResourceTyped(0x8014, 1); // HLock semantics:
// HLock(h); // The handles we return are NOT relocatable - they point straight at
// const RTextT *t = (const RTextT *)*h; // a malloc'd payload buffer. That means HLock/HUnlock are no-ops
// ... use t ... // here. The void ** indirection is preserved so that real Memory
// HUnlock(h); // Manager handles can swap in later without changing callers.
#ifndef IIGS_RESOURCE_H #ifndef IIGS_RESOURCE_H
#define IIGS_RESOURCE_H #define IIGS_RESOURCE_H
@ -40,36 +43,39 @@ extern "C" {
#include <stdint.h> #include <stdint.h>
// Flip to 1 (or pass -DIIGS_RESOURCE_RUNTIME_ENABLED=1 on the build line)
// once Phase 1.1 unblocks GS/OS fopen on 6.0.2. At that point the typed
// wrappers below dispatch into the live toolbox; until then they stub.
#ifndef IIGS_RESOURCE_RUNTIME_ENABLED
#define IIGS_RESOURCE_RUNTIME_ENABLED 0
#endif
// Status codes returned by the typed wrappers. Mirror the runtime's // Status codes returned by the typed wrappers. Mirror the runtime's
// existing errno-style convention (negative = error). // existing errno-style convention (negative = error).
enum { enum {
RES_OK = 0, RES_OK = 0,
RES_ERR_BLOCKED = -1, // Phase 1.1 runtime path still blocked RES_ERR_BLOCKED = -1, // legacy stub marker - kept for
RES_ERR_NOT_STARTED = -2, // resourceProbeInit() not called yet // backwards compat with old probes
RES_ERR_NOT_FOUND = -3, // OpenResourceFile / LoadResource failed RES_ERR_NOT_STARTED = -2, // openResourceFile not called yet
RES_ERR_TOOLBOX = -4 // Resource Manager returned non-zero RES_ERR_NOT_FOUND = -3, // file open / resource lookup failed
RES_ERR_TOOLBOX = -4, // map header corrupt / IO failure
RES_ERR_NO_MEM = -5, // malloc failed
RES_ERR_BAD_HANDLE = -6 // release/close given an unknown ref
}; };
// Resource type codes we expect to bundle. See Apple IIgs Toolbox // Resource type codes we expect to bundle. See Apple IIgs Toolbox
// Reference Vol 3 chapter 42 for the canonical list. Defined here as // Reference Vol 3 chapter 42 for the canonical list.
// constants so callers don't have to use raw hex.
#define RES_TYPE_RICON 0x8005 #define RES_TYPE_RICON 0x8005
#define RES_TYPE_RTEXT 0x8014 #define RES_TYPE_RTEXT 0x8014
#define RES_TYPE_RPSTRING 0x8015 #define RES_TYPE_RPSTRING 0x8015
#define RES_TYPE_RCSTRING 0x8016 #define RES_TYPE_RCSTRING 0x8016
// Resource ID type matching the toolbox (32-bit on disk and in the // Build-time tunables. These cap the per-process resource footprint.
// rIndex; the public API uses uint32_t). #ifndef IIGS_RES_MAX_FILES
#define IIGS_RES_MAX_FILES 2
#endif
#ifndef IIGS_RES_MAX_HANDLES
#define IIGS_RES_MAX_HANDLES 16
#endif
// Resource ID (32-bit on disk and in the rIndex).
typedef uint32_t IigsResIdT; typedef uint32_t IigsResIdT;
@ -78,37 +84,87 @@ typedef uint32_t IigsResIdT;
typedef uint16_t IigsResTypeT; typedef uint16_t IigsResTypeT;
// One-shot Resource Manager bring-up. Calls MMStartUp + TLStartUp + // 24-byte resource map header at the start of every .rsrc file.
// ResourceStartUp + OpenResourceFile (on our own pathname) when the typedef struct {
// runtime path is enabled. Always callable; safe to call more than uint16_t rmVersion;
// once (subsequent calls are no-ops). uint32_t rmToIndex;
// uint16_t rmFileNum;
// Returns: uint16_t rmID;
// RES_OK if the resource fork was opened (or the stub uint32_t rmIndexSize;
// path "succeeded" with no-op behavior), uint32_t rmIndexUsed;
// RES_ERR_BLOCKED if compiled with IIGS_RESOURCE_RUNTIME_ENABLED=0 uint16_t rmFreeListSize;
// (the default until Phase 1.1 lands), uint16_t rmFreeListUsed;
// RES_ERR_TOOLBOX if any of the StartUp calls returned non-zero. uint16_t rmPad;
} ResourceMapHeaderT;
// 20-byte rIndex entry.
// One entry per resource in the file's index table; it keys the resource
// by (rType, rID) and records where its payload bytes live. On disk the
// fields are little-endian and laid out back to back in the order below.
// NOTE(review): 20 bytes is the on-disk size. sizeof(ResourceIndexEntryT)
// matches it only if the compiler adds no padding after the uint16_t
// members - confirm for the target before reading an entry in one fread.
typedef struct {
// Resource type code, e.g. RES_TYPE_RTEXT (0x8014).
uint16_t rType;
// Resource ID within that type.
uint32_t rID;
// File offset of the payload bytes.
uint32_t rOffset;
// Attribute word; zero in files built by this project's fixtures.
uint16_t rAttr;
// Payload length in bytes.
uint32_t rSize;
// Handle slot; zero on disk.
uint32_t rHandle;
} ResourceIndexEntryT;
// Refnum returned by openResourceFile. Zero means "no file"; valid
// refnums start at 1.
typedef uint16_t ResourceRefNumT;
// One-shot init. Returns RES_OK; safe to call more than once.
int resourceProbeInit(void); int resourceProbeInit(void);
// Read whether the runtime path is live. Cheap; returns 1 iff a // Reports whether the Resource Manager is alive. Always 1 after
// successful resourceProbeInit() has run AND the build enabled the // resourceProbeInit() has run.
// runtime path. Returns 0 in the stub-only landing.
int resourceRuntimeEnabled(void); int resourceRuntimeEnabled(void);
// LoadResource typed wrapper. Returns a HANDLE (void **) on success, // Opens a resource fork at `path`. `accessByte` and `fileType` are
// or NULL on failure (and sets *err if non-NULL). // accepted for API parity with the toolbox but ignored on read-only
// in-memory backends. Returns refnum (>0) on success, 0 on failure.
// If `err` is non-NULL it receives RES_OK or one of RES_ERR_*.
ResourceRefNumT openResourceFile(const char *path, uint8_t accessByte,
uint16_t fileType, int *err);
// Closes a resource fork and frees any handles cached for that file.
// Returns RES_OK or RES_ERR_BAD_HANDLE.
int closeResourceFile(ResourceRefNumT refNum);
// Loads a resource by (type, id). Searches all open resource files
// in open order and returns a cached handle if the same (type, id)
// was previously loaded from any open file. Returns NULL on failure.
// //
// Caller is responsible for HLock/HUnlock pairing around any usage that // The returned handle is `void **`; `*handle` is the resource bytes.
// crosses a toolbox call; see HLock semantics block at the top of this void **loadResource(IigsResTypeT type, IigsResIdT id, int *err);
// file.
// Releases a previously-loaded resource.
// verb 0: keep the cached payload (cheap; the handle may be reused).
// verb 1: evict the cache entry and free the payload.
// Returns RES_OK on success.
int releaseResource(int verb, void **handle);
// Convenience: byte size of the resource pointed to by `handle`.
// Returns 0 if `handle` is not in the cache.
uint32_t getResourceSize(void **handle);
// ---- Legacy stub API kept for backwards compatibility ----
// The pre-Phase-3.4 stub exposed iigsLoadResource / iigsGetResourceSize
// for the rsrcProbe markers. Those now dispatch to the real
// implementation when at least one resource file is open. They report
// RES_ERR_NOT_STARTED when no file is open (instead of the old
// RES_ERR_BLOCKED), preserving the "did Phase 3.4 land?" signal.
void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err); void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err);
// GetResourceSize typed wrapper. Returns the byte size of the resource
// or 0 on failure (and sets *err if non-NULL).
uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId, uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId,
int *err); int *err);

View file

@ -384,102 +384,11 @@ typedef __builtin_va_list va_list;
#define va_arg(ap, ty) __builtin_va_arg(ap, ty) #define va_arg(ap, ty) __builtin_va_arg(ap, ty)
#define va_end(ap) __builtin_va_end(ap) #define va_end(ap) __builtin_va_end(ap)
static void writeUDec(unsigned int n) { // vprintf / printf used to dispatch through their own small format
char buf[6]; // 16-bit: max 5 digits + null // helpers (writeUDec/writeDec/writeULong/writeHex/writeStr/writeSignedLong/
int i = 0; // writeDouble). Once vprintf was rewritten to route through vsnprintf
if (n == 0) { putchar('0'); return; } // (so printf and snprintf share one format engine in snprintf.c), the
while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; } // helpers became dead weight and were removed.
while (i > 0) putchar(buf[--i]);
}
static void writeDec(int n) {
// For INT_MIN, `-n` overflows signed int (UB). Negate as unsigned
// — well-defined (two's-complement wrap), and the magnitude is
// identical for the print path.
if (n < 0) { putchar('-'); writeUDec((unsigned int)(0u - (unsigned int)n)); }
else writeUDec((unsigned int)n);
}
static void writeULong(unsigned long n) {
char buf[11]; // 32-bit: max 10 digits + null
int i = 0;
if (n == 0) { putchar('0'); return; }
while (n > 0) { buf[i++] = '0' + (n % 10); n /= 10; }
while (i > 0) putchar(buf[--i]);
}
static void writeHex(unsigned int n, int width) {
static const char digits[] = "0123456789abcdef";
// unsigned int is 16-bit on this target -> at most 4 hex digits.
// Cap width to that; without it `printf("%08x", ...)` blew past
// the buf[] tail and corrupted the stack.
char buf[4];
if (width > 4) width = 4;
int i = 0;
if (n == 0) { buf[i++] = '0'; }
while (n > 0 && i < 4) { buf[i++] = digits[n & 0xF]; n >>= 4; }
while (i < width) buf[i++] = '0';
while (i > 0) putchar(buf[--i]);
}
static void writeStr(const char *s) {
if (!s) s = "(null)";
while (*s) { putchar(*s); s++; }
}
// Format-spec handlers used to be marked noinline to keep vprintf's
// main loop small for the long-branch limitation; now W65816BranchExpand
// reliably promotes Bxx to BRL when needed, so the inliner is free to
// merge them when it wants.
static void writeSignedLong(long n) {
// See writeDec: avoid the signed-overflow UB on LONG_MIN.
if (n < 0) { putchar('-'); writeULong(0ul - (unsigned long)n); }
else writeULong((unsigned long)n);
}
// Minimal %f / %g support. Uses double soft-float; precision capped
// at 6 fractional digits (the C default). Doesn't handle Inf/NaN
// specially — prints the integer extraction, which will be 0 for
// non-finite values. Not IEEE-precise (intermediate truncation in
// the soft-double mul/div), but good enough for typical formatted
// numeric output.
static void writeDouble(double v, int prec) {
if (prec < 0) prec = 6;
if (prec > 9) prec = 9;
// Test the IEEE-754 sign bit (so -0.0 prints with the sign per
// C99) and avoid the soft-float __ltdf2 comparison, which has
// historically miscompiled for negative inputs (see snprintf.c
// banner for the same workaround).
unsigned long long vbits;
__builtin_memcpy(&vbits, &v, 8);
if (vbits & ((unsigned long long)1 << 63)) {
putchar('-');
vbits &= ~((unsigned long long)1 << 63);
__builtin_memcpy(&v, &vbits, 8);
}
long ipart = (long)v;
writeULong((unsigned long)ipart);
if (prec == 0) return;
putchar('.');
double frac = v - (double)ipart;
// Multiply fraction by 10^prec, then print as integer with leading zeros.
long mul = 1;
for (int i = 0; i < prec; i++) mul *= 10;
long fdigits = (long)(frac * (double)mul);
if (fdigits < 0) fdigits = -fdigits;
char buf[10];
int n = 0;
long scale = mul / 10;
while (n < prec) {
if (scale == 0) scale = 1;
long d = fdigits / scale;
buf[n++] = '0' + (char)(d % 10);
scale /= 10;
if (scale == 0) break;
}
while (n < prec) buf[n++] = '0';
for (int i = 0; i < n; i++) putchar(buf[i]);
}
extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap); extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap);
@ -724,10 +633,11 @@ void free(void *p) {
} }
void *calloc(size_t nmemb, size_t size) { void *calloc(size_t nmemb, size_t size) {
// size_t is 16-bit on this target; nmemb*size can overflow and // size_t is 32-bit, so the multiply itself won't overflow for any
// wrap to a small value (e.g. calloc(65536, 1) -> 0 -> 2-byte // realistic input. The 0xFFFF cap is a "fits in one 64KB bank"
// alloc), then the caller writes way past the returned region. // sanity check: the heap lives in bank 0 below the IO window, so
// Bail when the multiplication would overflow. // any single allocation must fit there. calloc(65536, 1) returns
// null rather than silently truncating into the IO range.
if (size != 0 && nmemb > (size_t)0xFFFF / size) return (void *)0; if (size != 0 && nmemb > (size_t)0xFFFF / size) return (void *)0;
size_t total = nmemb * size; size_t total = nmemb * size;
void *p = malloc(total); void *p = malloc(total);
@ -757,6 +667,15 @@ void *realloc(void *ptr, size_t n) {
typedef void (*AtexitFn)(void); typedef void (*AtexitFn)(void);
static AtexitFn __atexitFn = (AtexitFn)0; static AtexitFn __atexitFn = (AtexitFn)0;
// Terminal stop shared by exit(), _Exit() and quick_exit(). Emits the
// two bytes of a BRK $00 so the 65816 traps where MAME's debugger can
// catch it, then parks in an empty loop in case execution ever resumes
// past the BRK. Never returns.
__attribute__((noreturn))
static void __halt(void) {
    __asm__ volatile (".byte 0x00, 0x00");
    for (;;) {
    }
}
void exit(int code) { void exit(int code) {
(void)code; (void)code;
// C99 7.20.4.3: exit() must invoke registered atexit handlers in // C99 7.20.4.3: exit() must invoke registered atexit handlers in
@ -766,9 +685,7 @@ void exit(int code) {
__atexitFn = (AtexitFn)0; // prevent re-entry if fn calls exit __atexitFn = (AtexitFn)0; // prevent re-entry if fn calls exit
fn(); fn();
} }
// BRK $00 — halts a 65816 in BRK, MAME's debugger catches. __halt();
__asm__ volatile (".byte 0x00, 0x00");
while (1) {} // unreachable
} }
// ---- errno ---- // ---- errno ----
@ -1128,9 +1045,9 @@ typedef struct __sFILE {
static char __tmpNames[MFS_MAX_FILES][LIBC_L_TMPNAM]; static char __tmpNames[MFS_MAX_FILES][LIBC_L_TMPNAM];
static FILE __mfs[MFS_MAX_FILES] = { static FILE __mfs[MFS_MAX_FILES] = {
{ FILE_KIND_STDIN, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, { .kind = FILE_KIND_STDIN, .unget = -1 },
{ FILE_KIND_STDOUT, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, { .kind = FILE_KIND_STDOUT, .writable = 1, .unget = -1 },
{ FILE_KIND_STDERR, 1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0 }, { .kind = FILE_KIND_STDERR, .writable = 1, .unget = -1 },
}; };
FILE *stdin = &__mfs[0]; FILE *stdin = &__mfs[0];
@ -1278,9 +1195,6 @@ int fclose(FILE *stream) {
return 0; return 0;
} }
// Forward decls for routines that live in snprintf.c.
extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap);
// Forward decl for vfprintf so fprintf can call it. // Forward decl for vfprintf so fprintf can call it.
int vfprintf(FILE *stream, const char *fmt, va_list ap); int vfprintf(FILE *stream, const char *fmt, va_list ap);
size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream);
@ -1377,8 +1291,7 @@ static AtexitFn __quickFn = (AtexitFn)0;
void _Exit(int code) { void _Exit(int code) {
(void)code; (void)code;
__asm__ volatile (".byte 0x00, 0x00"); __halt();
while (1) {} // unreachable
} }
void quick_exit(int code) { void quick_exit(int code) {
@ -1388,8 +1301,7 @@ void quick_exit(int code) {
__quickFn = (AtexitFn)0; __quickFn = (AtexitFn)0;
fn(); fn();
} }
__asm__ volatile (".byte 0x00, 0x00"); __halt();
while (1) {} // unreachable
} }
int at_quick_exit(AtexitFn fn) { int at_quick_exit(AtexitFn fn) {
@ -1438,20 +1350,26 @@ static void initFileMem(FILE *f, const MfsEntry *reg, int wantWrite) {
// LIBC_PATH_MAX (kept in sync with limits.h's PATH_MAX) so user code // LIBC_PATH_MAX (kept in sync with limits.h's PATH_MAX) so user code
// that bounds-checks against PATH_MAX stays consistent with what fopen // that bounds-checks against PATH_MAX stays consistent with what fopen
// will accept. // will accept.
static struct { typedef struct __GsosPathBufT {
u16 length; u16 length;
char text[LIBC_PATH_MAX]; char text[LIBC_PATH_MAX];
} __gsosPathBuf; } __GsosPathBufT;
static int __buildGSString(const char *path) { static __GsosPathBufT __gsosPathBuf;
static int __fillGSString(__GsosPathBufT *buf, const char *path) {
size_t n = 0; size_t n = 0;
while (path[n] && n < LIBC_PATH_MAX) n++; while (path[n] && n < LIBC_PATH_MAX) n++;
if (path[n]) return -1; // path > PATH_MAX chars if (path[n]) return -1; // path > PATH_MAX chars
__gsosPathBuf.length = (u16)n; buf->length = (u16)n;
for (size_t i = 0; i < n; i++) __gsosPathBuf.text[i] = path[i]; for (size_t i = 0; i < n; i++) buf->text[i] = path[i];
return 0; return 0;
} }
// Stages `path` in the primary shared GS/OS path buffer, __gsosPathBuf.
// Passes through __fillGSString's result: 0 on success, -1 when the
// path is longer than LIBC_PATH_MAX characters.
static int __buildGSString(const char *path) {
    int status = __fillGSString(&__gsosPathBuf, path);
    return status;
}
FILE *fopen(const char *path, const char *mode) { FILE *fopen(const char *path, const char *mode) {
if (!path || !mode) return (FILE *)0; if (!path || !mode) return (FILE *)0;
int wantWrite = 0; int wantWrite = 0;
@ -1486,7 +1404,6 @@ FILE *fopen(const char *path, const char *mode) {
if (reg) { if (reg) {
initFileMem(f, reg, wantWrite); initFileMem(f, reg, wantWrite);
(void)wantRead;
if (truncate) f->size = 0; if (truncate) f->size = 0;
if (append) f->pos = f->size; if (append) f->pos = f->size;
return f; return f;
@ -1547,15 +1464,16 @@ FILE *fopen(const char *path, const char *mode) {
gsosSetMark(&m); gsosSetMark(&m);
} }
} }
(void)wantRead;
return f; return f;
} }
size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
if (!stream) return 0; if (!stream) return 0;
if (size == 0 || nmemb == 0) return 0; if (size == 0 || nmemb == 0) return 0;
// Avoid 32-bit overflow on size * nmemb: cap nmemb so each item // size_t is u32 here, so the multiply itself can't overflow. The
// (size bytes) fits in remaining 16-bit address space. // 0xFFFE cap is a "single 64KB bank" limit -- the underlying
// mem/GSOS backends address by 16-bit offset, so any single fread
// must fit in one bank.
if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size; if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size;
if (stream->kind == FILE_KIND_GSOS) { if (stream->kind == FILE_KIND_GSOS) {
// Drain unget byte first if present. // Drain unget byte first if present.
@ -1605,8 +1523,10 @@ size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) {
if (!stream) return 0; if (!stream) return 0;
if (size == 0 || nmemb == 0) return 0; if (size == 0 || nmemb == 0) return 0;
// Cap nmemb so each item (size bytes) fits in the address space // size_t is u32 here, so the multiply itself can't overflow. The
// — avoids 32-bit `size * nmemb` that the i32 multiply path triggers. // 0xFFFE cap is a "single 64KB bank" limit -- the underlying
// mem/GSOS backends address by 16-bit offset, so any single fwrite
// must fit in one bank.
if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size; if (nmemb > (size_t)0xFFFE / size) nmemb = (size_t)0xFFFE / size;
const char *in = (const char *)ptr; const char *in = (const char *)ptr;
if (stream->kind == FILE_KIND_STDOUT || stream->kind == FILE_KIND_STDERR) { if (stream->kind == FILE_KIND_STDOUT || stream->kind == FILE_KIND_STDERR) {
@ -1814,7 +1734,6 @@ void setbuf(FILE *stream, char *buf) {
// GS/OS. This matches both ProDOS `/VOL/FILE` and HFS `:Vol:File:` // GS/OS. This matches both ProDOS `/VOL/FILE` and HFS `:Vol:File:`
// conventions without forcing the caller to declare which. // conventions without forcing the caller to declare which.
int mfsUnregister(const char *path);
extern int rand(void); extern int rand(void);
// True when `path` looks like a GS/OS volume path (contains `/` or // True when `path` looks like a GS/OS volume path (contains `/` or
@ -1863,18 +1782,10 @@ static int __sameParentDir(const char *a, const char *b) {
// simultaneously (old+new for ChangePath), and Destroy of the source // simultaneously (old+new for ChangePath), and Destroy of the source
// at the end of the cross-dir fallback can reuse __gsosPathBuf for the // at the end of the cross-dir fallback can reuse __gsosPathBuf for the
// source name. Keeps the destination name alive across all calls. // source name. Keeps the destination name alive across all calls.
static struct { static __GsosPathBufT __gsosPathBuf2;
u16 length;
char text[LIBC_PATH_MAX];
} __gsosPathBuf2;
static int __buildGSString2(const char *path) { static int __buildGSString2(const char *path) {
size_t n = 0; return __fillGSString(&__gsosPathBuf2, path);
while (path[n] && n < LIBC_PATH_MAX) n++;
if (path[n]) return -1;
__gsosPathBuf2.length = (u16)n;
for (size_t i = 0; i < n; i++) __gsosPathBuf2.text[i] = path[i];
return 0;
} }
int remove(const char *path) { int remove(const char *path) {

View file

@ -1,149 +1,479 @@
// resource.c - iigs/resource.h implementation. Phase 3.4 STUB-ONLY // resource.c - Apple IIgs Resource Manager - real implementation.
// landing.
// //
// Phase 1.1 (GS/OS fopen hang on 6.0.2) blocks the live runtime path. // Replaces the Phase 3.4 stub. Reads .rsrc resource forks via the
// ResourceStartUp + OpenResourceFile reaches the same blocking code, // stdio surface (fopen/fread/fseek/fclose) and caches loaded payloads
// so all three entry points (init, load, size) return RES_ERR_BLOCKED // by (type, id) so repeated loadResource() calls return the same
// unless the build defines IIGS_RESOURCE_RUNTIME_ENABLED=1. When that // handle. Read-only - no AddResource / DetachResource / partial-load.
// flips on (Phase 1.1 lands), the toolbox calls below activate and the
// typed wrappers route through the real Resource Manager.
// //
// HLock semantics: // File format (Apple IIgs Toolbox Reference Vol 3, ch.42):
// LoadResource (toolbox 0x0E1E) returns a HANDLE - a pointer to a // bytes 0..23 : ResourceMapHeaderT (little-endian fields)
// master pointer in Memory-Manager-relocatable storage. Until you // bytes ... : payload blobs at offsets recorded in the index
// call HLock(handle), any subsequent toolbox call can compact the // bytes at rmToIndex : rmIndexUsed * ResourceIndexEntryT entries
// heap and move the underlying bytes. The typed wrappers DO NOT
// call HLock for the caller; that is the caller's responsibility
// per the contract in iigs/resource.h.
// //
// Why we stub instead of returning best-effort answers: // Handle convention: we return a `void **` whose dereference yields the
// A real LoadResource that silently returned NULL would be ambiguous // resource bytes. The handle storage lives in this file's static
// with "resource not found". RES_ERR_BLOCKED lets the demo + smoke // table; the bytes themselves are malloc'd at first load and freed at
// harness distinguish "Phase 1.1 hasn't landed" from "your TYPECODE_ID // releaseResource(verb=1) or closeResourceFile().
// bundle was missing a resource". Once Phase 1.1 lands, callers see
// the real error codes (RES_ERR_NOT_FOUND, RES_ERR_TOOLBOX) instead.
#include "iigs/resource.h" #include "iigs/resource.h"
#include "iigs/toolbox.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Set to non-zero by a successful resourceProbeInit() call. Read by // --- Prototypes ---
// resourceRuntimeEnabled() to report status without re-running init. static int freeHandleSlot(int slot);
// In the stub-only landing this never reaches 1 because the runtime static int findHandleByPtr(void **handle);
// path is compiled out. static int findHandleByTypeId(IigsResTypeT type, IigsResIdT id);
static int gResourceReady = 0; static int findHandleSlot(void);
static int findOpenFileSlot(void);
static int loadIndex(int fileSlot);
static void *readPayload(int fileSlot, uint32_t offset, uint32_t size);
static int readU16(FILE *f, uint16_t *out);
static int readU32(FILE *f, uint32_t *out);
static int readMapHeader(FILE *f, ResourceMapHeaderT *hdr);
// Cached refNum from OpenResourceFile. Populated only when the // --- Internal types ---
// runtime path is enabled. unsigned short to match the toolbox typedef struct {
// signature (refNum is a 16-bit GS/OS fileID). int inUse;
static unsigned short gResourceRefNum = 0; FILE *fp;
ResourceMapHeaderT hdr;
ResourceIndexEntryT *index; // malloc'd; rmIndexUsed entries
uint16_t refNum; // 1..N, matches slot+1
} ResourceFileT;
// Stub flag to keep the unused-static-warning quiet when the runtime typedef struct {
// path is compiled out. The compiler folds the function bodies below int inUse;
// to constant returns under -O2 anyway; this just keeps -Wunused happy int fileSlot; // which ResourceFileT owns it
// across both build modes. IigsResTypeT type;
static void touchUnused(void) { IigsResIdT id;
(void)gResourceRefNum; void *data; // payload bytes
uint32_t size;
void *masterPtr; // master ptr cell -> &data
} HandleSlotT;
// --- State ---
// Declared volatile to defeat the GlobalOpt i1-narrowing pass that
// otherwise produces an `i1, zext` load the W65816 backend can't select.
// (See MEMORY.md: feedback_i1_load_custom.md.)
static volatile int gResourceReady = 0;
static ResourceFileT gFiles[IIGS_RES_MAX_FILES];
static HandleSlotT gHandles[IIGS_RES_MAX_HANDLES];
int closeResourceFile(ResourceRefNumT refNum) {
if (refNum == 0 || refNum > IIGS_RES_MAX_FILES) {
return RES_ERR_BAD_HANDLE;
}
int slot = (int)refNum - 1;
if (!gFiles[slot].inUse) {
return RES_ERR_BAD_HANDLE;
}
// Free every cached handle owned by this file.
for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) {
if (gHandles[i].inUse && gHandles[i].fileSlot == slot) {
freeHandleSlot(i);
}
}
if (gFiles[slot].index) {
free(gFiles[slot].index);
gFiles[slot].index = (ResourceIndexEntryT *)0;
}
if (gFiles[slot].fp) {
fclose(gFiles[slot].fp);
gFiles[slot].fp = (FILE *)0;
}
gFiles[slot].inUse = 0;
return RES_OK;
} }
#if IIGS_RESOURCE_RUNTIME_ENABLED static int findHandleByPtr(void **handle) {
// Path passed to OpenResourceFile. When the runtime path is live the if (!handle) {
// expectation is that this is the application's own pathname (the OMF return -1;
// the Loader launched), so OpenResourceFile attaches to the file's }
// resource fork. GS/OS holds the boot pathname in a known low-memory for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) {
// vector; we resolve it at init time and cache here. if (gHandles[i].inUse && (void **)&gHandles[i].data == handle) {
// return i;
// The exact pathname-resolution sequence is intentionally NOT implemented }
// in this stub-only landing - it is part of the Phase 1.1 unblock work }
// (the same code that fixes fopen will plumb the pathname through). return -1;
static char gOwnPathName[256] = { 0 }; }
#endif
// Returns the index of the first in-use handle slot whose (type, id)
// pair matches the arguments, or -1 when no cached handle matches.
static int findHandleByTypeId(IigsResTypeT type, IigsResIdT id) {
    int slot = 0;
    while (slot < IIGS_RES_MAX_HANDLES) {
        const HandleSlotT *entry = &gHandles[slot];
        if (entry->inUse && entry->type == type && entry->id == id) {
            return slot;
        }
        slot++;
    }
    return -1;
}
// Returns the index of the first handle slot that is not in use, or -1
// when every slot in gHandles is occupied.
static int findHandleSlot(void) {
    int found = -1;
    for (int slot = 0; found < 0 && slot < IIGS_RES_MAX_HANDLES; slot++) {
        if (gHandles[slot].inUse == 0) {
            found = slot;
        }
    }
    return found;
}
// Returns the index of the first gFiles entry that is not in use, or -1
// when the file table is full. (Despite the name, this finds a slot that
// is available for opening, not one that is already open.)
static int findOpenFileSlot(void) {
    int found = -1;
    for (int slot = 0; found < 0 && slot < IIGS_RES_MAX_FILES; slot++) {
        if (gFiles[slot].inUse == 0) {
            found = slot;
        }
    }
    return found;
}
static int freeHandleSlot(int slot) {
if (slot < 0 || slot >= IIGS_RES_MAX_HANDLES) {
return RES_ERR_BAD_HANDLE;
}
if (!gHandles[slot].inUse) {
return RES_ERR_BAD_HANDLE;
}
if (gHandles[slot].data) {
free(gHandles[slot].data);
gHandles[slot].data = (void *)0;
}
gHandles[slot].inUse = 0;
gHandles[slot].fileSlot = -1;
gHandles[slot].type = 0;
gHandles[slot].id = 0;
gHandles[slot].size = 0;
return RES_OK;
}
// Reports the byte size recorded for a loaded resource handle. Returns 0
// when `handle` does not correspond to an in-use handle slot.
uint32_t getResourceSize(void **handle) {
    int slot = findHandleByPtr(handle);
    return (slot >= 0) ? gHandles[slot].size : 0;
}
// Convenience wrapper kept for backwards compat with the old probe.
// Looks up the size of resource (resType, resId): first in the handle
// cache, then in the index of every open file. Returns the size and
// stores RES_OK in *err on a hit; returns 0 with RES_ERR_NOT_STARTED when
// the runtime is not ready, or 0 with RES_ERR_NOT_FOUND when no entry
// matches. `err` may be NULL.
uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId, int *err) {
    int      status = RES_ERR_NOT_FOUND;
    uint32_t size   = 0;
    int      found  = 0;

    if (!gResourceReady) {
        status = RES_ERR_NOT_STARTED;
    } else {
        int cached = findHandleByTypeId(resType, resId);
        if (cached >= 0) {
            size  = gHandles[cached].size;
            found = 1;
        }
        // Not cached - walk each open file's index until the first match.
        for (int f = 0; !found && f < IIGS_RES_MAX_FILES; f++) {
            if (!gFiles[f].inUse || !gFiles[f].index) {
                continue;
            }
            uint32_t count = gFiles[f].hdr.rmIndexUsed;
            for (uint32_t i = 0; i < count; i++) {
                ResourceIndexEntryT *entry = &gFiles[f].index[i];
                if (entry->rType == resType && entry->rID == resId) {
                    size  = entry->rSize;
                    found = 1;
                    break;
                }
            }
        }
        if (found) {
            status = RES_OK;
        }
    }

    if (err) {
        *err = status;
    }
    return found ? size : 0;
}
// Convenience wrapper kept for backwards compat with the old probe.
// Forwards unchanged to loadResource(); the handle and *err value are
// whatever loadResource() produces.
void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err) {
    void **handle = loadResource(resType, resId, err);
    return handle;
}
// Reads the 20-byte rIndex table for a freshly-opened file. Returns
// RES_OK or an RES_ERR_* code. Caller has populated gFiles[slot].hdr.
static int loadIndex(int fileSlot) {
ResourceFileT *rf = &gFiles[fileSlot];
uint32_t n = rf->hdr.rmIndexUsed;
if (n == 0) {
rf->index = (ResourceIndexEntryT *)0;
return RES_OK;
}
// Sanity-check against malloc'ing absurd amounts.
if (n > 1024) {
return RES_ERR_TOOLBOX;
}
ResourceIndexEntryT *idx = (ResourceIndexEntryT *)malloc(sizeof(ResourceIndexEntryT) * n);
if (!idx) {
return RES_ERR_NO_MEM;
}
if (fseek(rf->fp, (long)rf->hdr.rmToIndex, 0) != 0) {
free(idx);
return RES_ERR_TOOLBOX;
}
for (uint32_t i = 0; i < n; i++) {
uint16_t t;
uint32_t id;
uint32_t off;
uint16_t attr;
uint32_t sz;
uint32_t h;
if (readU16(rf->fp, &t) != 0 ||
readU32(rf->fp, &id) != 0 ||
readU32(rf->fp, &off) != 0 ||
readU16(rf->fp, &attr) != 0 ||
readU32(rf->fp, &sz) != 0 ||
readU32(rf->fp, &h) != 0) {
free(idx);
return RES_ERR_TOOLBOX;
}
idx[i].rType = t;
idx[i].rID = id;
idx[i].rOffset = off;
idx[i].rAttr = attr;
idx[i].rSize = sz;
idx[i].rHandle = h;
}
rf->index = idx;
return RES_OK;
}
// Returns a handle (pointer to the slot's data pointer) for resource
// (type, id). A cached handle is returned as-is; otherwise the first
// matching index entry across the open files is read from disk and
// cached in a free handle slot.
// *err (when non-NULL) receives RES_OK on success, RES_ERR_NOT_STARTED
// when the runtime is not ready, RES_ERR_NOT_FOUND when no entry matches,
// RES_ERR_NO_MEM when no handle slot is free, or RES_ERR_TOOLBOX when the
// payload could not be read. Returns NULL on every failure.
void **loadResource(IigsResTypeT type, IigsResIdT id, int *err) {
    int    status = RES_ERR_NOT_FOUND;
    void **handle = (void **)0;

    if (!gResourceReady) {
        status = RES_ERR_NOT_STARTED;
    } else {
        int cached = findHandleByTypeId(type, id);
        if (cached >= 0) {
            // Cache hit - hand back the existing handle.
            handle = (void **)&gHandles[cached].data;
            status = RES_OK;
        } else {
            // Cache miss - locate the first matching index entry.
            ResourceIndexEntryT *hit   = (ResourceIndexEntryT *)0;
            int                  owner = -1;
            for (int f = 0; !hit && f < IIGS_RES_MAX_FILES; f++) {
                if (!gFiles[f].inUse || !gFiles[f].index) {
                    continue;
                }
                uint32_t count = gFiles[f].hdr.rmIndexUsed;
                for (uint32_t i = 0; i < count; i++) {
                    ResourceIndexEntryT *entry = &gFiles[f].index[i];
                    if (entry->rType == type && entry->rID == id) {
                        hit   = entry;
                        owner = f;
                        break;
                    }
                }
            }

            if (hit) {
                // Reserve the slot before touching the disk.
                int slot = findHandleSlot();
                if (slot < 0) {
                    status = RES_ERR_NO_MEM;
                } else {
                    void *bytes = readPayload(owner, hit->rOffset, hit->rSize);
                    if (!bytes) {
                        status = RES_ERR_TOOLBOX;
                    } else {
                        gHandles[slot].inUse    = 1;
                        gHandles[slot].fileSlot = owner;
                        gHandles[slot].type     = type;
                        gHandles[slot].id       = id;
                        gHandles[slot].data     = bytes;
                        gHandles[slot].size     = hit->rSize;
                        handle = (void **)&gHandles[slot].data;
                        status = RES_OK;
                    }
                }
            }
        }
    }

    if (err) {
        *err = status;
    }
    return handle;
}
// Opens the resource file at `path`, parses its map header and index,
// and registers it in a free gFiles slot. `accessByte` and `fileType`
// are accepted but ignored.
// Returns the new refNum (slot index + 1) on success, 0 on failure.
// *err (when non-NULL) receives RES_OK, RES_ERR_NOT_FOUND (NULL path or
// fopen failure), RES_ERR_NO_MEM (file table full), RES_ERR_TOOLBOX
// (header unreadable), or the code returned by loadIndex().
ResourceRefNumT openResourceFile(const char *path, uint8_t accessByte, uint16_t fileType, int *err) {
    int             status = RES_OK;
    ResourceRefNumT opened = 0;
    int             slot   = -1;
    FILE           *stream = (FILE *)0;

    (void)accessByte;
    (void)fileType;

    if (!path) {
        status = RES_ERR_NOT_FOUND;
    } else if ((slot = findOpenFileSlot()) < 0) {
        status = RES_ERR_NO_MEM;
    } else if ((stream = fopen(path, "rb")) == (FILE *)0) {
        status = RES_ERR_NOT_FOUND;
    } else if (readMapHeader(stream, &gFiles[slot].hdr) != 0) {
        fclose(stream);
        status = RES_ERR_TOOLBOX;
    } else {
        ResourceFileT *file = &gFiles[slot];
        file->fp     = stream;
        file->inUse  = 1;
        file->refNum = (uint16_t)(slot + 1);
        file->index  = (ResourceIndexEntryT *)0;

        status = loadIndex(slot);
        if (status != RES_OK) {
            // Roll the slot back so it can be reused.
            fclose(stream);
            file->fp    = (FILE *)0;
            file->inUse = 0;
        } else {
            gResourceReady = 1;
            opened = file->refNum;
        }
    }

    if (err) {
        *err = status;
    }
    return opened;
}
// Allocates and reads `size` bytes at `offset` from the file at
// `fileSlot`. Returns a malloc'd buffer the caller owns, or NULL on any
// error: zero size, no open stream in the slot, a size or offset that
// cannot be represented in size_t / long on this target, allocation
// failure, seek failure, or a short read.
static void *readPayload(int fileSlot, uint32_t offset, uint32_t size) {
    if (size == 0) {
        return (void *)0;
    }

    // The on-disk fields are 32-bit, but size_t and long may be narrower
    // (or signed) on the target. Reject values that would be silently
    // truncated rather than allocating/seeking with a mangled number.
    size_t want  = (size_t)size;
    long   where = (long)offset;
    if ((uint32_t)want != size) {
        return (void *)0;
    }
    if (where < 0 || (unsigned long)where != offset) {
        return (void *)0;
    }

    FILE *fp = gFiles[fileSlot].fp;
    if (!fp) {
        return (void *)0;
    }

    void *buf = malloc(want);
    if (!buf) {
        return (void *)0;
    }
    if (fseek(fp, where, 0) != 0) {
        free(buf);
        return (void *)0;
    }
    size_t got = fread(buf, 1, want, fp);
    if (got != want) {
        free(buf);
        return (void *)0;
    }
    return buf;
}
// Reads a little-endian uint16 from `f` into *out. Returns 0 on success,
// or -1 (leaving *out untouched) when fewer than two bytes are available.
static int readU16(FILE *f, uint16_t *out) {
    uint8_t raw[2];
    size_t  got = fread(raw, 1, sizeof raw, f);
    if (got != sizeof raw) {
        return -1;
    }
    uint16_t low  = raw[0];
    uint16_t high = (uint16_t)((uint16_t)raw[1] << 8);
    *out = (uint16_t)(high | low);
    return 0;
}
// Reads a little-endian uint32 from `f` into *out. Returns 0 on success,
// or -1 (leaving *out untouched) when fewer than four bytes are available.
static int readU32(FILE *f, uint32_t *out) {
    uint8_t raw[4];
    size_t  got = fread(raw, 1, sizeof raw, f);
    if (got != sizeof raw) {
        return -1;
    }
    // Fold from the most significant byte down so raw[0] ends up lowest.
    uint32_t value = 0;
    for (int i = 3; i >= 0; i--) {
        value = (value << 8) | (uint32_t)raw[i];
    }
    *out = value;
    return 0;
}
// Reads the 24-byte rResourceMap header at offset 0 of `f` into *hdr,
// one little-endian field at a time in on-disk order. Returns 0 on
// success or -1 when the seek or any field read fails; in that case the
// fields read so far have been written and the rest are untouched.
static int readMapHeader(FILE *f, ResourceMapHeaderT *hdr) {
    if (fseek(f, 0L, 0) != 0) {
        return -1;
    }
    // || short-circuits, so reading stops at the first failing field.
    int failed = readU16(f, &hdr->rmVersion) != 0 ||
                 readU32(f, &hdr->rmToIndex) != 0 ||
                 readU16(f, &hdr->rmFileNum) != 0 ||
                 readU16(f, &hdr->rmID) != 0 ||
                 readU32(f, &hdr->rmIndexSize) != 0 ||
                 readU32(f, &hdr->rmIndexUsed) != 0 ||
                 readU16(f, &hdr->rmFreeListSize) != 0 ||
                 readU16(f, &hdr->rmFreeListUsed) != 0 ||
                 readU16(f, &hdr->rmPad) != 0;
    return failed ? -1 : 0;
}
// Releases a handle previously returned by loadResource().
//   verb == 0 : soft release - the cached payload is kept (a real toolbox
//               would decrement a use-count; this implementation just
//               reports success).
//   otherwise : the payload is freed and the handle slot recycled.
// Returns RES_ERR_BAD_HANDLE when `handle` is not a live handle,
// otherwise RES_OK or the result of freeHandleSlot().
int releaseResource(int verb, void **handle) {
    int slot = findHandleByPtr(handle);
    if (slot < 0) {
        return RES_ERR_BAD_HANDLE;
    }
    return (verb == 0) ? RES_OK : freeHandleSlot(slot);
}
int resourceProbeInit(void) { int resourceProbeInit(void) {
touchUnused(); // Zero the tables. Safe to call repeatedly - subsequent calls do
#if IIGS_RESOURCE_RUNTIME_ENABLED // not touch already-open files.
// Live path - placeholder until Phase 1.1 lands. We deliberately if (!gResourceReady) {
// do not call ResourceStartUp here in the stub-only landing because for (int i = 0; i < IIGS_RES_MAX_FILES; i++) {
// (a) it requires MMStartUp to have run already and (b) calling gFiles[i].inUse = 0;
// ResourceStartUp on a userId we don't own would corrupt the gFiles[i].fp = (FILE *)0;
// toolbox's per-app state. Phase 1.1's actual implementation will gFiles[i].index = (ResourceIndexEntryT *)0;
// look like: gFiles[i].refNum = 0;
// MMStartUp(); }
// TLStartUp(); for (int i = 0; i < IIGS_RES_MAX_HANDLES; i++) {
// ResourceStartUp(myUserId); gHandles[i].inUse = 0;
// gResourceRefNum = OpenResourceFile(0x0001, NULL, gOwnPathName); gHandles[i].fileSlot = -1;
// gResourceReady = (gResourceRefNum != 0) ? 1 : 0; gHandles[i].data = (void *)0;
return RES_ERR_BLOCKED; gHandles[i].size = 0;
#else }
return RES_ERR_BLOCKED; gResourceReady = 1;
#endif }
return RES_OK;
} }
int resourceRuntimeEnabled(void) { int resourceRuntimeEnabled(void) {
return gResourceReady; return gResourceReady;
} }
// Toolbox-backed load of resource (resType, resId).
//
// With IIGS_RESOURCE_RUNTIME_ENABLED defined non-zero, this calls the
// toolbox LoadResource() and returns its handle; a NULL handle is
// reported as RES_ERR_NOT_FOUND, and an unstarted runtime as
// RES_ERR_NOT_STARTED. Otherwise the function is a stub that always
// returns NULL with RES_ERR_BLOCKED. `err` may be NULL in every path.
void **iigsLoadResource(IigsResTypeT resType, IigsResIdT resId, int *err) {
    // Silences unused-parameter warnings in the stub build; the enabled
    // build still uses both parameters below.
    (void)resType;
    (void)resId;
#if IIGS_RESOURCE_RUNTIME_ENABLED
    if (!gResourceReady) {
        if (err) {
            *err = RES_ERR_NOT_STARTED;
        }
        return (void **)0;
    }
    // Phase 1.1 will plug LoadResource(resType, resId) here. Toolbox
    // pushes 4-byte ID as a long, returns handle in PHA slot. Caller
    // must HLock() before dereferencing (see header notes).
    void **h = (void **)LoadResource((unsigned short)resType, (long)resId);
    if (!h) {
        if (err) {
            *err = RES_ERR_NOT_FOUND;
        }
        return (void **)0;
    }
    if (err) {
        *err = RES_OK;
    }
    return h;
#else
    // Runtime path compiled out: report "blocked" rather than "not found".
    if (err) {
        *err = RES_ERR_BLOCKED;
    }
    return (void **)0;
#endif
}
// Toolbox-backed size query for resource (resType, resId).
//
// With IIGS_RESOURCE_RUNTIME_ENABLED defined non-zero, this returns the
// value of the toolbox GetResourceSize() call; a result of 0 is reported
// as RES_ERR_NOT_FOUND (so a genuinely empty resource is indistinguishable
// from a missing one), and an unstarted runtime as RES_ERR_NOT_STARTED.
// Otherwise the function is a stub that always returns 0 with
// RES_ERR_BLOCKED. `err` may be NULL in every path.
uint32_t iigsGetResourceSize(IigsResTypeT resType, IigsResIdT resId,
                             int *err) {
    // Silences unused-parameter warnings in the stub build; the enabled
    // build still uses both parameters below.
    (void)resType;
    (void)resId;
#if IIGS_RESOURCE_RUNTIME_ENABLED
    if (!gResourceReady) {
        if (err) {
            *err = RES_ERR_NOT_STARTED;
        }
        return 0;
    }
    // GetResourceSize returns a 32-bit byte count via the toolbox.
    uint32_t sz = (uint32_t)GetResourceSize((unsigned short)resType,
                                            (long)resId);
    if (err) {
        *err = (sz == 0) ? RES_ERR_NOT_FOUND : RES_OK;
    }
    return sz;
#else
    // Runtime path compiled out: report "blocked" rather than "not found".
    if (err) {
        *err = RES_ERR_BLOCKED;
    }
    return 0;
#endif
}

View file

@ -40,6 +40,13 @@ typedef __builtin_va_list va_list;
#define va_arg(ap, ty) __builtin_va_arg(ap, ty) #define va_arg(ap, ty) __builtin_va_arg(ap, ty)
#define va_end(ap) __builtin_va_end(ap) #define va_end(ap) __builtin_va_end(ap)
// Unbounded sink sentinel used by sprintf/vsprintf. Setting gEnd to
// `buf + 0xFFFE` looks innocuous but clang lowers the +0xFFFE to a
// `dec a; dec a` peephole (0xFFFE is -2 in 16-bit), giving gEnd =
// buf - 2 -- the `cur < end` bounds test then always fails. Use the
// absolute top-of-bank sentinel instead.
#define SPRINTF_END_SENTINEL ((char *)0xFFFF)
static char *gCur; static char *gCur;
static char *gEnd; static char *gEnd;
@ -757,12 +764,9 @@ int snprintf(char *buf, size_t n, const char *fmt, ...) {
int sprintf(char *buf, const char *fmt, ...) { int sprintf(char *buf, const char *fmt, ...) {
gCur = buf; gCur = buf;
// sprintf is unbounded. Setting gEnd = buf + 0xFFFE looks innocuous // sprintf is unbounded; see SPRINTF_END_SENTINEL above for the
// but clang lowers the +0xFFFE to a `dec a; dec a` peephole (since // reason we don't use buf + 0xFFFE.
// 0xFFFE is -2 in 16-bit), giving gEnd = buf - 2 — and then the gEnd = SPRINTF_END_SENTINEL;
// emit() bounds test `cur < end` is always false, so nothing gets
// written. Use the absolute top-of-bank sentinel instead.
gEnd = (char *)0xFFFF;
gTotal = 0; gTotal = 0;
va_list ap; va_list ap;
va_start(ap, fmt); va_start(ap, fmt);
@ -782,7 +786,7 @@ int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap) {
int vsprintf(char *buf, const char *fmt, va_list ap) { int vsprintf(char *buf, const char *fmt, va_list ap) {
gCur = buf; gCur = buf;
gEnd = (char *)0xFFFF; gEnd = SPRINTF_END_SENTINEL;
gTotal = 0; gTotal = 0;
return format(fmt, ap); return format(fmt, ap);
} }

View file

@ -39,6 +39,7 @@
# DEBUGGER_E2E=1 scripts/mameDebug.py --bin ... --map ... --dwarf ... # DEBUGGER_E2E=1 scripts/mameDebug.py --bin ... --map ... --dwarf ...
import argparse import argparse
import importlib.util
import os import os
import re import re
import subprocess import subprocess
@ -50,6 +51,21 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT = os.path.dirname(SCRIPT_DIR) ROOT = os.path.dirname(SCRIPT_DIR)
# Import pc2line.py as a module so the REPL can reuse its DWARF parsing
# (line table, DIE walking, type chains, locals evaluator) without
# shelling out + reparsing on every command. pc2line.py is the single
# source of truth for DWARF semantics; we must NOT duplicate any of it.
def _loadPc2lineModule():
    """Load the pc2line.py that sits in SCRIPT_DIR and return it as a module.

    The file is loaded by path under the module name "pc2line" and executed
    once; the resulting module object is returned to the caller.
    """
    source_path = os.path.join(SCRIPT_DIR, "pc2line.py")
    module_spec = importlib.util.spec_from_file_location("pc2line",
                                                         source_path)
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    return module
pc2line = _loadPc2lineModule()
# ---- Map + DWARF helpers --------------------------------------------- # ---- Map + DWARF helpers ---------------------------------------------
def loadMapSyms(path): def loadMapSyms(path):
@ -561,6 +577,766 @@ def interactiveMode(args):
return 0 return 0
# ---- REPL mode (--repl) ---------------------------------------------
#
# An interactive prompt that gives `gdb`-flavour commands on top of the
# load-snapshot-resolve cycle. Because MAME has no bidirectional Lua
# RPC channel under `-debugger none`, every "execute the program"
# command (run / continue / step / next) maps to one MAME process
# launch. The Lua autoboot writes the program into bank-0 memory,
# installs all queued breakpoints, runs until the first hit, captures
# a register + memory snapshot, and exits. The Python REPL then
# decodes the snapshot to answer `print`, `bt`, `where` from cached
# state — no further MAME launch needed for those.
#
# Commands:
# break <sym|file:line|0xADDR> set/queue a breakpoint
# run | continue [c] launch MAME, stop at first bp hit
# step | next advance to next source line
# (via DWARF line table; one bp install)
# bt | backtrace walk the JSL frame chain from S
# where PC -> source line for the last hit
# print <symbol> decode bytes at &symbol per DWARF type
# info locals show formal_parameters + locals
# info breakpoints list queued breakpoints
# delete <N> remove breakpoint by index
# quit | q exit
# ? this help
#
# Smoke-checkable: pipe a script of `break main\nrun\nwhere\nquit\n`
# into `mameDebug.py --repl ...` and assert the BP-HIT + WHERE output.
REPL_HELP = """\
Commands:
break <sym|file:line|0xADDR> set/queue a breakpoint
run | continue launch MAME, stop at first hit
step | next advance to next source line (DWARF)
bt | backtrace walk JSL frame chain from S
where PC -> source line for the last hit
print <symbol> decode bytes at &symbol per DWARF type
info locals show formal_parameters + locals
info breakpoints list queued breakpoints
delete <N> remove breakpoint by index
quit | q exit
? this help
"""
# Lua autoboot for the REPL. Differs from the --trace template in three
# ways:
#   1. A breakpoint hit also triggers a dump of (a) the 64-byte stack
#      window above S (S+1 .. S+64) and (b) per-symbol memory regions for
#      `print` requests, both as tagged log lines so the host can parse.
# 2. exit_frame is generous (240) so a slow run still completes.
# 3. The list of "watch" memory regions is parameterised — the host
# stamps in (addr, len) pairs based on queued `print <symbol>`
# requests.
REPL_LUA_TEMPLATE = r"""
-- mameDebug REPL autoboot (generated by scripts/mameDebug.py --repl)
local BIN_PATH = "{bin_path}"
local LOAD_AT = 0x{load_at:04x}
local START_PC = 0x{start_pc:06x}
local BPS = {{ {bp_list} }}
local WATCHES = {{ {watch_list} }} -- list of {{addr, len}} pairs
local installed = false
local frame = 0
local cpu, dbg, mem
emu.register_frame_done(function()
frame = frame + 1
if frame == 30 and not installed then
cpu = manager.machine.devices[":maincpu"]
dbg = cpu.debug
mem = cpu.spaces["program"]
local f = io.open(BIN_PATH, "rb")
if not f then
print("MAMEDBG-BIN-MISSING " .. BIN_PATH)
manager.machine:exit()
return
end
local data = f:read("*all")
f:close()
for i = 1, #data do
local addr = LOAD_AT + i - 1
if not (addr >= 0x00C000 and addr < 0x00D000) then
mem:write_u8(addr, data:byte(i))
end
end
cpu.state["PC"].value = START_PC
cpu.state["PB"].value = 0x00
cpu.state["DB"].value = 0x00
cpu.state["D"].value = 0x00
cpu.state["P"].value = 0x04
cpu.state["E"].value = 0
cpu.state["S"].value = 0x01FF
-- Build the bp action. We use the 3-arg bpset form (1-arg
-- crashes MAME). The action stamps a magic marker into bank-2
-- scratch ($020010 / 0xDEAD) so the periodic poller can detect
-- the hit and dump memory from a SAFE context (the action
-- string itself can't call multi-statement loops cleanly).
local action_template =
'logerror "MAMEDBG-BP PC=%X A=%X X=%X Y=%X S=%X DBR=%X\n",pc,a,x,y,s,db; ' ..
'w@0x020010=0xDEAD; w@0x020012=s; w@0x020014=pc & 0xFFFF; w@0x020016=(pc>>16) & 0xFF; go'
for _, pc in ipairs(BPS) do
dbg:bpset(pc, '', action_template)
end
print(string.format("MAMEDBG-LOADED bytes=%d bps=%d watches=%d",
#data, #BPS, #WATCHES))
installed = true
end
if frame == {exit_frame} then
print("MAMEDBG-EXIT frame=" .. frame)
manager.machine:exit()
end
end)
-- Marker-driven snapshot dumper. Once the bp action stamps 0xDEAD at
-- $020010, this periodic handler reads S + PC from the scratch slots
-- and dumps the watched memory regions, then clears the marker.
local snapshotted = false
emu.register_periodic(function()
if installed and not snapshotted and mem ~= nil then
local marker = mem:read_u16(0x020010)
if marker == 0xDEAD then
local s_val = mem:read_u16(0x020012)
local pc_lo = mem:read_u16(0x020014)
local pc_bnk = mem:read_u8(0x020016)
local full_pc = (pc_bnk * 0x10000) + pc_lo
print(string.format("MAMEDBG-SNAP S=0x%04X PC=0x%06X",
s_val, full_pc))
-- Dump 64 bytes of the stack window above S (S+1 .. S+64).
-- That's where the topmost JSL return frame lives.
for ofs = 1, 64 do
local addr = s_val + ofs
local v = mem:read_u8(addr)
print(string.format("MAMEDBG-STACK addr=0x%06X val=0x%02X",
addr, v))
end
-- Dump each user-requested watch.
for _, w in ipairs(WATCHES) do
local addr, n = w[1], w[2]
for ofs = 0, n - 1 do
local v = mem:read_u8(addr + ofs)
print(string.format("MAMEDBG-WATCH addr=0x%06X val=0x%02X",
addr + ofs, v))
end
end
mem:write_u16(0x020010, 0)
snapshotted = true
end
end
end)
"""
def buildReplLuaScript(bin_path, load_at, bp_pcs, watch_regions,
                       start_pc, exit_frame):
    """Build a MAME autoboot Lua script for one REPL run.

    bin_path:      path of the binary the Lua script loads; it is embedded
                   in a double-quoted Lua string literal.
    load_at:       int, address the binary's first byte is written to.
    bp_pcs:        list of int (24-bit PCs) breakpoints to install.
    watch_regions: list of (addr, length) tuples  per-symbol memory
                   dumps stamped at the first BP hit.
    start_pc:      int, initial program counter.
    exit_frame:    int, frame number at which the script exits MAME.

    Returns the script text (REPL_LUA_TEMPLATE with the fields filled in).
    """
    # Backslashes and double quotes are special inside a Lua "..." literal
    # (e.g. "C:\bin\foo" would turn \b and \f into control characters), so
    # escape them before substitution. Backslash must be handled first.
    lua_bin_path = str(bin_path).replace("\\", "\\\\").replace('"', '\\"')
    bp_list = ", ".join(f"0x{p:06x}" for p in bp_pcs)
    watch_list = ", ".join(f"{{0x{a:06x}, {n}}}" for a, n in watch_regions)
    return REPL_LUA_TEMPLATE.format(
        bin_path=lua_bin_path,
        load_at=load_at,
        start_pc=start_pc,
        bp_list=bp_list,
        watch_list=watch_list,
        exit_frame=exit_frame,
    )
# Regex for snapshot/watch/stack lines emitted by the REPL Lua script.
SNAP_RE = re.compile(r"MAMEDBG-SNAP\s+S=0x([0-9A-Fa-f]+)\s+PC=0x([0-9A-Fa-f]+)")
WATCH_RE = re.compile(r"MAMEDBG-WATCH\s+addr=0x([0-9A-Fa-f]+)\s+val=0x([0-9A-Fa-f]+)")
STACK_RE = re.compile(r"MAMEDBG-STACK\s+addr=0x([0-9A-Fa-f]+)\s+val=0x([0-9A-Fa-f]+)")
class ReplState:
    """All persistent state across REPL commands.

    Holds the parsed map symbols and DWARF data (loaded once through the
    pc2line module), the queued breakpoints and watches, and the results
    of the most recent MAME run.
    """

    # DWARF encodings used by the type-size walker (DWARF standard values).
    _DW_AT_BYTE_SIZE = 0x0b
    _DW_AT_UPPER_BOUND = 0x2f
    _DW_AT_COUNT = 0x37
    _DW_TAG_ARRAY_TYPE = 0x01
    _DW_TAG_SUBRANGE_TYPE = 0x21
    # Tags that merely wrap another type and carry no size of their own:
    # typedef, const, volatile, restrict.
    _DW_PASSTHROUGH_TAGS = (0x16, 0x26, 0x35, 0x37)

    def __init__(self, args):
        self.args = args
        # Map: address -> symbol name (binary-searchable by funcAt)
        self.syms = pc2line.loadMapSymbols(args.map)
        # DWARF: line table + DIE trees (parsed once, reused)
        self.sectionPayloads = pc2line.loadSidecarSectionsAll(args.dwarf)
        self.cus = pc2line.parseAllCus(self.sectionPayloads)
        self.lineTable = pc2line.buildTable(args.dwarf)
        # Breakpoints: list of (pc, label) - label is the original spec
        self.breakpoints = []
        # Watches: dict {symbol: (addr, length)}. Length picked from
        # the symbol's DWARF type when available, else fall back to 2.
        self.watches = {}
        # Last snapshot — populated after a run. Empty until first run.
        self.lastSnap = None        # {"pc": int, "sp": int}
        self.lastWatchBytes = {}    # {addr: byte}  (last run only)
        self.lastStackBytes = {}    # {addr: byte}  (last run only)
        # Register dicts for each breakpoint hit of the last run. Defined
        # here so readers never hit AttributeError before the first run.
        self.lastBps = []

    def resolveSpec(self, spec):
        """Resolve `FUNC`, `FILE:LINE`, or `0xADDR` to a 24-bit PC.

        Returns (pc, label) or (None, error_msg).
        """
        spec = spec.strip()
        # Hex address?
        if spec.lower().startswith("0x"):
            try:
                return (int(spec, 16), spec)
            except ValueError:
                return (None, f"invalid hex: {spec!r}")
        # File:line?
        if ":" in spec:
            file_part, line_part = spec.rsplit(":", 1)
            try:
                want_line = int(line_part)
            except ValueError:
                return (None, f"invalid line: {line_part!r}")
            # Prefer the smallest-PC entry on the requested line so the
            # bp lands on the statement's first instruction, not a
            # later trailing entry.
            best = None
            for pc, fidx, ln, ft in self.lineTable:
                if ln != want_line:
                    continue
                if 0 < fidx <= len(ft):
                    fname = os.path.basename(ft[fidx - 1])
                else:
                    fname = "?"
                # Match if fname matches OR fname is "?" (DWARF5
                # file_idx=0 path means "the CU's primary file" — we
                # treat that as a wildcard match for the user-supplied
                # file name).
                if fname == file_part or fname.endswith(file_part) \
                        or fname == "?":
                    if best is None or pc < best[0]:
                        best = (pc, fname)
            if best is not None:
                return (best[0], f"{best[1]}:{want_line}")
            return (None, f"no DWARF line entry for {spec!r}")
        # Bare symbol name — lookup in map.
        for addr, sym in self.syms:
            if sym == spec:
                return (addr, sym)
        return (None, f"symbol {spec!r} not in map")

    def _findVariableDie(self, symname):
        """Return (cu, die) for the first DW_TAG_variable DIE named
        `symname`, searching the CUs in order, or None if there is none.
        """
        for cu in self.cus:
            if cu.root is None:
                continue
            for die in self._iterDies(cu.root):
                if die.tag != pc2line.DW_TAG_variable:
                    continue
                if pc2line.dieName(cu, die) == symname:
                    return (cu, die)
        return None

    def symbolSize(self, symname):
        """Best-effort size of a global symbol's storage (in bytes).

        Looks up DW_TAG_variable DIEs across all CUs. Returns the
        resolved type's byte size, or None if not findable; callers
        supply their own default when None comes back.
        """
        found = self._findVariableDie(symname)
        if found is None:
            return None
        cu, die = found
        tref = die.getRaw(pc2line.DW_AT_type)
        if tref is None:
            return None
        target = pc2line._findDieByOffset(cu, tref[0])
        return self._typeByteSize(cu, target)

    def _iterDies(self, die):
        """Yield `die` and then every descendant, depth-first."""
        yield die
        for ch in die.children:
            yield from self._iterDies(ch)

    def _subrangeCount(self, subrange):
        """Element count of one DW_TAG_subrange_type DIE, or None.

        Uses DW_AT_count when present, else DW_AT_upper_bound + 1 (the
        lower bound is taken to be 0, as for C arrays).
        """
        count = subrange.getRaw(self._DW_AT_COUNT)
        if count is not None:
            return count[0] if count[0] >= 0 else None
        upper = subrange.getRaw(self._DW_AT_UPPER_BOUND)
        if upper is not None:
            return upper[0] + 1
        return None

    def _typeByteSize(self, cu, die):
        """Walk a type DIE chain, return byte size or None."""
        if die is None:
            return None
        seen = set()    # offsets already visited; guards against cycles
        cur = die
        while cur is not None and cur.offset not in seen:
            seen.add(cur.offset)
            tag = cur.tag
            # Base / structure / union / enum types carry DW_AT_byte_size.
            bs = cur.getRaw(self._DW_AT_BYTE_SIZE)
            if bs is not None:
                return bs[0]
            if tag == pc2line.DW_TAG_pointer_type:
                # No explicit size on the pointer DIE: fall back to the
                # CU's recorded address size.
                return cu.addr_size
            if tag in self._DW_PASSTHROUGH_TAGS:
                # typedef/const/volatile/restrict — follow.
                t = cur.getRaw(pc2line.DW_AT_type)
                if t is None:
                    return None
                cur = pc2line._findDieByOffset(cu, t[0])
                continue
            if tag == self._DW_TAG_ARRAY_TYPE:
                t = cur.getRaw(pc2line.DW_AT_type)
                if t is None:
                    return None
                elem = self._typeByteSize(
                    cu, pc2line._findDieByOffset(cu, t[0]))
                if elem is None:
                    return None
                # One subrange child per dimension; the array size is
                # the element size times every dimension's count.
                total = None
                for ch in cur.children:
                    if ch.tag != self._DW_TAG_SUBRANGE_TYPE:
                        continue
                    count = self._subrangeCount(ch)
                    if count is None:
                        # A dimension of unknown extent makes the whole
                        # size unknown.
                        return None
                    total = count if total is None else total * count
                if total is None:
                    return None
                return elem * total
            # Other tags — give up.
            return None
        return None

    def typeStrOfSymbol(self, symname):
        """Return a printable type string for a global symbol, or '?'."""
        found = self._findVariableDie(symname)
        if found is None:
            return "?"
        cu, die = found
        return pc2line.varTypeStr(cu, die)
def replLaunchMame(state, bp_pcs, start_pc, watch_regions, seconds=4):
    """Run MAME once with the given breakpoints and watch regions.

    Writes the generated Lua script to a temporary file, runs MAME on it
    via runMame(), deletes the file, and parses the captured output.
    MAMEDBG-SNAP, MAMEDBG-WATCH and MAMEDBG-STACK lines refresh
    state.lastSnap, state.lastWatchBytes and state.lastStackBytes;
    breakpoint-hit lines (BP_RE) are collected into state.lastBps.

    Returns the raw captured output text.
    """
    script_text = buildReplLuaScript(state.args.bin, state.args.load_at,
                                     bp_pcs, watch_regions,
                                     start_pc=start_pc,
                                     exit_frame=240)
    with tempfile.NamedTemporaryFile("w", suffix=".lua",
                                     delete=False) as script_file:
        script_file.write(script_text)
        script_path = script_file.name
    try:
        output = runMame(script_path, seconds=seconds, debug_flag=True)
    finally:
        # Best-effort cleanup; a missing temp file is not an error.
        try:
            os.unlink(script_path)
        except OSError:
            pass

    def hexGroup(match, index):
        return int(match.group(index), 16)

    # Previous results are discarded only once the run has completed.
    state.lastSnap = None
    state.lastWatchBytes = {}
    state.lastStackBytes = {}
    hits = []
    register_names = ("pc", "a", "x", "y", "s", "db")
    for line in output.splitlines():
        bp_match = BP_RE.search(line)
        if bp_match:
            hits.append({name: hexGroup(bp_match, pos)
                         for pos, name in enumerate(register_names, 1)})
        snap_match = SNAP_RE.search(line)
        if snap_match:
            state.lastSnap = {
                "sp": hexGroup(snap_match, 1),
                "pc": hexGroup(snap_match, 2),
            }
        watch_match = WATCH_RE.search(line)
        if watch_match:
            state.lastWatchBytes[hexGroup(watch_match, 1)] = \
                hexGroup(watch_match, 2)
        stack_match = STACK_RE.search(line)
        if stack_match:
            state.lastStackBytes[hexGroup(stack_match, 1)] = \
                hexGroup(stack_match, 2)
    state.lastBps = hits
    return output
def replPrintWhere(state):
    """Print the source location of the last snapshot's PC.

    Prints a hint instead when no run has produced a snapshot yet.
    """
    snap = state.lastSnap
    if snap is None:
        print(" no snapshot yet — `run` first")
        return
    pc, sp = snap["pc"], snap["sp"]
    located = pc2line.query(state.lineTable, pc)
    func = pc2line.funcAt(state.syms, pc)
    if located is not None:
        _, fname, ln = located
        print(f" PC=0x{pc:06x} FILE={fname} LINE={ln} FUNC={func} "
              f"S=0x{sp:04x}")
        return
    print(f" PC=0x{pc:06x} (no DWARF line) FUNC={func} S=0x{sp:04x}")
def replPrintBacktrace(state):
    """Print a two-frame backtrace from the last snapshot.

    Frame #0 is the captured PC. Frame #1 is the return address taken
    from the three stack bytes at S+1 (PCL), S+2 (PCH) and S+3 (PBR) of
    the captured stack window, plus one, because JSL pushes the address
    of its own last byte. Nothing past frame #1 is walked.
    """
    snap = state.lastSnap
    if snap is None:
        print(" no snapshot yet — `run` first")
        return

    def frameLine(index, addr):
        # One backtrace row: function name always, file:line when known.
        owner = pc2line.funcAt(state.syms, addr)
        located = pc2line.query(state.lineTable, addr)
        if located is None:
            return f" #{index} PC=0x{addr:06x} FUNC={owner}"
        _, fname, ln = located
        return f" #{index} PC=0x{addr:06x} {fname}:{ln} FUNC={owner}"

    pc = snap["pc"]
    sp = snap["sp"]
    print(frameLine(0, pc))

    # Stack addresses wrap within 16 bits.
    triplet = [state.lastStackBytes.get((sp + step) & 0xFFFF)
               for step in (1, 2, 3)]
    if any(byte is None for byte in triplet):
        print(" #1 <return address not in captured stack window>")
        return
    pcl, pch, pbr = triplet
    pushed = (pbr << 16) | (pch << 8) | pcl
    print(frameLine(1, (pushed + 1) & 0xFFFFFF))
def replPrintSymbol(state, spec):
    """Print a map symbol's value as captured by the last run.

    The symbol is queued as a watch on first use (size from DWARF,
    defaulting to 2 bytes and capped at 64). If no captured bytes are
    available yet, prints the address and asks the user to run.
    """
    addr = next((a for a, s in state.syms if s == spec), None)
    if addr is None:
        print(f" no such symbol: {spec!r}")
        return

    # Make sure it's queued as a watch for the next run.
    if spec not in state.watches:
        size = state.symbolSize(spec)
        if size is None or size <= 0:
            size = 2
        # Large structs/arrays surface only their first 64 bytes.
        size = min(size, 64)
        state.watches[spec] = (addr, size)

    if state.lastSnap is None or not state.lastWatchBytes:
        print(f" &{spec} = 0x{addr:06x} (watch queued — run to capture)")
        return

    watch_addr, length = state.watches[spec]
    captured = [state.lastWatchBytes.get(watch_addr + i)
                for i in range(length)]
    type_str = state.typeStrOfSymbol(spec)
    if any(byte is None for byte in captured):
        print(f" {spec}: ADDR=0x{addr:06x} TYPE={type_str} "
              f"(no snapshot bytes — run again to capture)")
        return

    raw = bytearray(captured)
    decoded = _decodeBytes(type_str, raw)
    hex_dump = " ".join(f"{b:02x}" for b in raw)
    print(f" {spec} : {type_str} = {decoded}")
    print(f" ADDR=0x{addr:06x} BYTES=[{hex_dump}]")
def _decodeBytes(type_str, raw):
"""Best-effort C-value print for a small byte buffer.
Recognises:
- int/short/char (1/2/4 byte ints, little-endian)
- unsigned variants
- any "* " (pointer) type print as hex address
- struct/union show raw hex (the caller already prints BYTES=)
Floats are out of scope per the task; print bytes as hex.
"""
ts = type_str.strip()
if not raw:
return "<empty>"
# Pointer types -> print as hex address of the right width.
if ts.endswith("*") or " *" in ts:
if len(raw) >= 4:
v = raw[0] | (raw[1] << 8) | (raw[2] << 16) | (raw[3] << 24)
return f"0x{v & 0xFFFFFFFF:08x}"
if len(raw) >= 2:
v = raw[0] | (raw[1] << 8)
return f"0x{v:04x}"
return f"0x{raw[0]:02x}"
# Integer base types.
int_widths = {
"char": 1, "signed char": 1, "unsigned char": 1,
"_Bool": 1, "bool": 1,
"short": 2, "short int": 2,
"unsigned short": 2, "unsigned short int": 2,
"int": 2, "unsigned int": 2, "signed int": 2,
"long": 4, "long int": 4, "signed long": 4,
"unsigned long": 4, "unsigned long int": 4,
"long long": 4, "unsigned long long": 4,
}
signed_set = {"char", "signed char", "short", "short int",
"int", "signed int", "long", "long int",
"signed long", "long long"}
if ts in int_widths:
w = int_widths[ts]
n = min(w, len(raw))
v = 0
for i in range(n):
v |= raw[i] << (8 * i)
if ts in signed_set:
top = 1 << (8 * n - 1)
if v & top:
v = v - (1 << (8 * n))
return f"{v} (0x{v & ((1 << (8*n)) - 1):0{2*n}x})"
# struct / union / class — caller dumps raw bytes.
if ts.startswith("struct ") or ts.startswith("union ") \
or ts.startswith("class "):
# Show u16 words as a partial decode hint (often the first
# field is an integer the user wants to see).
if len(raw) >= 2:
first_u16 = raw[0] | (raw[1] << 8)
return f"<{ts}; first u16 = 0x{first_u16:04x}>"
return f"<{ts}>"
# Array type — show first elements as best-effort integers.
if "[" in ts and ts.endswith("]"):
first = " ".join(f"0x{b:02x}" for b in raw[:8])
return f"[{first}{', ...' if len(raw) > 8 else ''}]"
return "<no decoder>"
def replInfoLocals(state):
    """List the parameters and local variables in scope at the last snapshot.

    Uses the PC and stack pointer recorded in `state.lastSnap` to ask
    `pc2line.localsAtPc` for the enclosing subprogram and its variables,
    then prints one line per variable describing where it lives.
    """
    snap = state.lastSnap
    if snap is None:
        print(" no snapshot yet — `run` first")
        return

    pc, sp = snap["pc"], snap["sp"]
    cu, sub, locs = pc2line.localsAtPc(state.cus, pc, sp_value=sp)
    if sub is None:
        print(f" no subprogram at PC=0x{pc:06x}")
        return

    sub_name = pc2line.dieName(cu, sub) or "<unnamed>"
    print(f" in {sub_name!r} at PC=0x{pc:06x} S=0x{sp:04x}")
    if not locs:
        print(" (no formal_parameter / variable in scope)")
        return

    for name, ty, loc, _die in locs:
        # Build the location description first, then print one line.
        if loc.kind == "memory":
            where = f"ADDR=0x{loc.addr:06x}"
        elif loc.kind == "register":
            where = f"REG=DW{loc.reg_dw}"
            if loc.dp_addr is not None:
                where += f" ADDR=0x{loc.dp_addr:06x}"
        elif loc.kind == "value":
            where = f"VALUE=0x{loc.value:x}"
        else:
            where = f"UNSUPPORTED={loc.reason}"
        print(f" {name} : {ty} {where}")
def replNextLinePc(state, current_pc):
    """Find the lowest line-table PC that lies strictly above `current_pc`.

    Returns None when no entry in `state.lineTable` has a greater PC
    (end of program, or no DWARF line info at all).
    """
    # The table is not ordered by PC, so every entry is considered and
    # the minimum of the candidates is taken.
    later_pcs = (pc for pc, _fidx, _ln, _ft in state.lineTable
                 if pc > current_pc)
    return min(later_pcs, default=None)
def _replRunToBreakpoint(state):
    """Launch MAME with every queued breakpoint and report where it stopped.

    The caller must have checked that `state.breakpoints` is non-empty.
    Start PC: with --from-start execution begins at LOAD_AT (through
    crt0); otherwise it jumps straight to the first breakpoint, matching
    --trace behaviour. A launch that captures no snapshot is reported
    with a warning rather than passing silently.
    """
    bp_pcs = [pc for pc, _ in state.breakpoints]
    start_pc = state.args.load_at if state.args.from_start else bp_pcs[0]
    replLaunchMame(state, bp_pcs, start_pc, list(state.watches.values()),
                   seconds=state.args.seconds)
    if state.lastSnap is None:
        print(" WARN: no BP-HIT captured (timed out?)")
    else:
        replPrintWhere(state)


def replLoop(state):
    """Run the REPL: read commands from stdin and dispatch each one.

    On a TTY a "(dbg) " prompt is shown; in batch mode (piped stdin)
    there is no prompt, but each command is echoed so scripted runs
    produce a diffable transcript. Blank lines and lines starting with
    "#" are ignored. The loop ends on quit/exit or EOF.

    Returns:
        0, always (used as the process exit status by the caller).
    """
    interactive_tty = sys.stdin.isatty()
    if interactive_tty:
        print("mameDebug REPL. Type ? for help.")
    while True:
        try:
            if interactive_tty:
                line = input("(dbg) ")
            else:
                line = input()  # no prompt in batch mode (cleaner output)
        except EOFError:
            if interactive_tty:
                print()
            break
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # Echo command in batch mode so the smoke test can diff output.
        if not interactive_tty:
            print(f"(dbg) {line}")
        cmd, _, rest = line.partition(" ")
        rest = rest.strip()
        if cmd in ("q", "quit", "exit"):
            break
        if cmd == "?" or cmd == "help":
            print(REPL_HELP)
            continue
        if cmd in ("break", "b"):
            if not rest:
                print(" usage: break <sym|file:line|0xADDR>")
                continue
            pc, label = state.resolveSpec(rest)
            if pc is None:
                # On failure resolveSpec returns the reason as `label`.
                print(f" cannot resolve: {label}")
                continue
            state.breakpoints.append((pc, label))
            idx = len(state.breakpoints)
            print(f" bp #{idx} at 0x{pc:06x} ({label})")
            continue
        if cmd in ("info",):
            if rest == "breakpoints":
                if not state.breakpoints:
                    print(" no breakpoints")
                else:
                    for i, (pc, lab) in enumerate(state.breakpoints, 1):
                        print(f" #{i} 0x{pc:06x} ({lab})")
                continue
            if rest == "locals":
                replInfoLocals(state)
                continue
            print(f" unknown info subcommand: {rest!r}")
            continue
        if cmd == "delete":
            try:
                idx = int(rest)
            except ValueError:
                print(" usage: delete <N>")
                continue
            if idx < 1 or idx > len(state.breakpoints):
                print(f" no breakpoint #{idx}")
                continue
            # Breakpoint numbers are 1-based positions in the list.
            del state.breakpoints[idx - 1]
            print(f" deleted bp #{idx}")
            continue
        if cmd in ("run", "r", "continue", "c"):
            if not state.breakpoints:
                print(" no breakpoints set — nothing to break on")
                continue
            _replRunToBreakpoint(state)
            continue
        if cmd in ("step", "s", "next", "n"):
            # Both map to "advance to next source line via DWARF" in
            # our snapshot-based model. Requires a prior snapshot to
            # know "where we are".
            if state.lastSnap is None:
                # No prior snapshot: behave exactly like `run`,
                # including the timeout warning.
                if not state.breakpoints:
                    print(" no breakpoints set — `break` first")
                    continue
                _replRunToBreakpoint(state)
                continue
            current_pc = state.lastSnap["pc"]
            next_pc = replNextLinePc(state, current_pc)
            if next_pc is None:
                print(" no next DWARF line entry — at end of program")
                continue
            print(f" stepping to next DWARF line at 0x{next_pc:06x}")
            # Relaunch from the current PC with a single temporary
            # breakpoint on the next line entry.
            replLaunchMame(state, [next_pc], current_pc,
                           list(state.watches.values()),
                           seconds=state.args.seconds)
            if state.lastSnap is None:
                print(" WARN: step did not hit the bp (timed out?)")
            else:
                replPrintWhere(state)
            continue
        if cmd == "where":
            replPrintWhere(state)
            continue
        if cmd in ("bt", "backtrace"):
            replPrintBacktrace(state)
            continue
        if cmd in ("print", "p"):
            if not rest:
                print(" usage: print <symbol>")
                continue
            replPrintSymbol(state, rest)
            continue
        print(f" unknown command: {line!r} (try ?)")
    return 0
def replMode(args):
    """Entry point for `--repl`: build the REPL state and run the loop.

    When `--break` was given, that breakpoint is queued before any
    command is read (useful when scripting). Returns 2 if the spec
    cannot be resolved, otherwise whatever `replLoop` returns.
    """
    state = ReplState(args)
    spec = args.break_at
    if not spec:
        return replLoop(state)

    pc, label = state.resolveSpec(spec)
    if pc is None:
        # On failure the second element carries the reason text.
        print(f"mameDebug: --break {args.break_at!r}: {label}",
              file=sys.stderr)
        return 2

    state.breakpoints.append((pc, label))
    print(f" bp #1 at 0x{pc:06x} ({label}) [from --break]")
    return replLoop(state)
# ---- main ------------------------------------------------------------ # ---- main ------------------------------------------------------------
def main(): def main():
@ -579,6 +1355,13 @@ def main():
ap.add_argument("--trace", action="store_true", ap.add_argument("--trace", action="store_true",
help="default-on smoke mode: set bp, capture one " help="default-on smoke mode: set bp, capture one "
"BP-HIT, resolve via pc2line, exit 0") "BP-HIT, resolve via pc2line, exit 0")
ap.add_argument("--repl", action="store_true",
help="interactive REPL. Reads stdin commands "
"(break/run/step/next/where/bt/print/info/"
"delete/quit). Each `run`/`step`/`next` "
"launches one MAME process. `print`, `bt`, "
"and `where` decode the captured snapshot "
"and need no further MAME launch.")
ap.add_argument("--from-start", action="store_true", ap.add_argument("--from-start", action="store_true",
help="start execution at LOAD_AT (i.e. through " help="start execution at LOAD_AT (i.e. through "
"the crt0). Default is to jump straight to " "the crt0). Default is to jump straight to "
@ -611,6 +1394,8 @@ def main():
return 2 return 2
if args.trace: if args.trace:
return traceMode(args) return traceMode(args)
if args.repl:
return replMode(args)
return interactiveMode(args) return interactiveMode(args)

127
scripts/probeReplSmoke.sh Executable file
View file

@ -0,0 +1,127 @@
#!/usr/bin/env bash
# probeReplSmoke.sh - non-interactive smoke check for mameDebug.py
# --repl mode. Pipes a canned script (`break main`, `run`, `where`,
# `quit`) into the REPL and asserts that:
#   1. The REPL parses each command without error
#   2. A breakpoint resolves through the link816 map
#   3. MAME launches with the bp installed and surfaces a BP-HIT line
#   4. `where` resolves the captured PC to a source line via DWARF
#
# Exit 0 on full pass. Exit 77 (autotools "skip") if MAME / toolchain
# missing. Exit 1 on a build failure, any unexpected REPL output, or a
# missing capture. All diagnostics go to stderr.
#
# Usage: probeReplSmoke.sh [--verbose]
set -euo pipefail
HERE="$(cd "$(dirname "$0")" && pwd)"
ROOT="$(cd "$HERE/.." && pwd)"
VERBOSE=0
if [ "${1:-}" = "--verbose" ]; then
    VERBOSE=1
fi
CLANG="$ROOT/tools/llvm-mos-build/bin/clang"
LLVMMC="$ROOT/tools/llvm-mos-build/bin/llvm-mc"
LINK="$ROOT/tools/link816"
if [ ! -x "$CLANG" ] || [ ! -x "$LLVMMC" ] || [ ! -x "$LINK" ]; then
    echo "probeReplSmoke: missing toolchain (clang/llvm-mc/link816)" >&2
    exit 77
fi
if ! command -v mame >/dev/null 2>&1; then
    echo "probeReplSmoke: mame not on PATH; skipping" >&2
    exit 77
fi
WORK="$(mktemp -d)"
trap 'rm -rf "$WORK"' EXIT
CFILE="$WORK/repltest.c"
OFILE="$WORK/repltest.o"
OCRT0="$WORK/crt0.o"
OLIBGCC="$WORK/libgcc.o"
BIN="$WORK/repltest.bin"
MAP="$WORK/repltest.map"
DWARF="$WORK/repltest.dwarf"
OUT="$WORK/repl.out"
BUILDLOG="$WORK/build.log"

# Run one build step quietly. On failure, name the step, replay its
# captured output, and exit 1 (instead of dying silently under set -e
# with the tool's own exit status and its stderr thrown away).
buildStep() {
    local what="$1"
    shift
    if ! "$@" >"$BUILDLOG" 2>&1; then
        echo "probeReplSmoke: $what failed" >&2
        cat "$BUILDLOG" >&2
        exit 1
    fi
}

cat > "$CFILE" <<'EOF'
int gAnswer = 42;
int add(int a, int b) {
    int c = a + b;
    return c;
}
int main(void) {
    int r = add(3, 4);
    gAnswer = r;
    while (1) { }
    return r;
}
EOF
buildStep "compile repltest.c" \
    "$CLANG" --target=w65816 -O0 -g -ffunction-sections \
    -c "$CFILE" -o "$OFILE"
buildStep "assemble crt0.s" \
    "$LLVMMC" -arch=w65816 -filetype=obj \
    "$ROOT/runtime/src/crt0.s" -o "$OCRT0"
buildStep "assemble libgcc.s" \
    "$LLVMMC" -arch=w65816 -filetype=obj \
    "$ROOT/runtime/src/libgcc.s" -o "$OLIBGCC"
# The linker's exit status is deliberately ignored; success is judged
# by the three non-empty outputs checked just below.
"$LINK" -o "$BIN" --text-base 0x1000 \
    --map "$MAP" --debug-out "$DWARF" \
    "$OCRT0" "$OFILE" "$OLIBGCC" >/dev/null 2>&1 || true
[ -s "$BIN" ] || { echo "probeReplSmoke: empty .bin" >&2; exit 1; }
[ -s "$DWARF" ] || { echo "probeReplSmoke: empty DWARF sidecar" >&2; exit 1; }
[ -s "$MAP" ] || { echo "probeReplSmoke: empty map" >&2; exit 1; }
# Pipe the canned REPL script.
printf 'break main\nrun\nwhere\nquit\n' \
    | timeout 60 python3 "$HERE/mameDebug.py" --repl \
        --bin "$BIN" --map "$MAP" --dwarf "$DWARF" \
        --seconds 4 > "$OUT" 2>&1 || {
    echo "probeReplSmoke: mameDebug.py --repl failed" >&2
    cat "$OUT" >&2
    exit 1
}
if [ "$VERBOSE" -eq 1 ]; then
    cat "$OUT" >&2
fi
# Required output lines:
#   "(dbg) break main"                   - command echo
#   " bp #1 at 0x...... (main)"          - bp set ack
#   "(dbg) run"                          - command echo
#   " PC=0x...... ... FUNC=main ..."     - where output after run
#   "(dbg) where"                        - command echo
#   " PC=0x...... ... FUNC=main ..."     - where output (manual)
#   "(dbg) quit"                         - command echo
if ! grep -q "bp #1 at 0x" "$OUT"; then
    echo "probeReplSmoke: missing 'bp #1 at 0x...' breakpoint ack" >&2
    cat "$OUT" >&2
    exit 1
fi
if ! grep -q "FUNC=main" "$OUT"; then
    echo "probeReplSmoke: missing FUNC=main in 'where' output" >&2
    cat "$OUT" >&2
    exit 1
fi
# The `where` command (run AFTER the `run` command) must produce
# output too — verify by counting occurrences of "PC=0x" prefix lines.
# `|| true` keeps set -e from firing when grep -c finds zero matches.
PC_HITS=$(grep -c "^ PC=0x" "$OUT" || true)
if [ "$PC_HITS" -lt 2 ]; then
    echo "probeReplSmoke: expected >= 2 PC=0x lines (run + where), got $PC_HITS" >&2
    cat "$OUT" >&2
    exit 1
fi
# Bonus: verify the captured PC equals the map entry for `main`.
MAIN_PC=$(awk '$2 == "main" { print $1; exit }' "$MAP")
[ -n "$MAIN_PC" ] || { echo "probeReplSmoke: no 'main' symbol in map" >&2; exit 1; }
MAIN_PC_LC=$(echo "$MAIN_PC" | tr 'A-Z' 'a-z')
if ! grep -qi "PC=$MAIN_PC_LC " "$OUT"; then
    echo "probeReplSmoke: captured PC does not match map[main]=$MAIN_PC" >&2
    cat "$OUT" >&2
    exit 1
fi
echo "probeReplSmoke: OK (bp resolved, BP-HIT captured, where decoded)"
exit 0

View file

@ -1146,6 +1146,20 @@ EOF
fi fi
fi fi
# Phase 3.3: mameDebug.py --repl non-interactive smoke. Pipes a
# canned `break main / run / where / quit` script into the REPL and
# asserts that (1) the bp resolves via the link816 map, (2) MAME
# launches and surfaces a BP-HIT, (3) the captured PC is decoded
# through DWARF into FUNC=main on the where output, and (4) the
# captured PC equals the map's entry for main. MAME-gated.
if command -v mame >/dev/null && [ -d "$PROJECT_ROOT/tools/mame/roms" ]; then
log "check: mameDebug.py --repl non-interactive (break/run/where/quit)"
if ! bash "$PROJECT_ROOT/scripts/probeReplSmoke.sh" >/dev/null 2>&1; then
bash "$PROJECT_ROOT/scripts/probeReplSmoke.sh" --verbose >&2 || true
die "mameDebug.py --repl smoke probe failed"
fi
fi
# iigs/sound.h + iigs/eventLoop.h headers compile cleanly through # iigs/sound.h + iigs/eventLoop.h headers compile cleanly through
# clang with the runtime include path. Catches missing extern "C" # clang with the runtime include path. Catches missing extern "C"
# wraps, broken struct layouts, or unresolved tool-call stubs. # wraps, broken struct layouts, or unresolved tool-call stubs.
@ -5988,12 +6002,19 @@ EOF
# omfEmit --stack-size: append a ~Direct DP/Stack segment so the # omfEmit --stack-size: append a ~Direct DP/Stack segment so the
# GS/OS Loader allocates an explicit-sized DP+stack chunk instead # GS/OS Loader allocates an explicit-sized DP+stack chunk instead
# of its 4KB default. KIND=0x1012 (DP/Stack | PRIVATE), LENGTH and # of its 4KB default. KIND=0x4012 (DP/Stack | RELOAD), LENGTH =
# RESSPC both = requested size, ALIGN=0x100 (page-aligned per spec). # requested size, RESSPC=0 (the stack bytes are carried in LCONST
# Plain (non-ExpressLoad) multi-segment OMFs do not launch under # because the ExpressLoad fast path can't be trusted to honor
# GS/OS 6.0.2 Loader (verified empirically), so --stack-size auto- # RESSPC — same trick the user CODE seg uses for BSS). ALIGN=
# enables --expressload: the OMF becomes 3 segments (ExpressLoad, # 0x100 (page-aligned per spec). Plain (non-ExpressLoad) multi-
# code, DP/Stack), with DP/Stack as segnum 3. # segment OMFs do not launch under GS/OS 6.0.2 Loader (verified
# empirically), so --stack-size auto-enables --expressload: the
# OMF becomes 3 segments (ExpressLoad, code, DP/Stack), with
# DP/Stack as segnum 3. The ExpressLoad load script also carries
# a segtable + remap + header_info entry for the DP/Stack so the
# Loader's fast path actually honors it (without that the Loader
# silently drops the seg and uses its default 4KB allocation —
# see feedback_gsos_fopen_partial_diagnosis).
log "check: omfEmit --stack-size emits a DP/Stack ~Direct segment" log "check: omfEmit --stack-size emits a DP/Stack ~Direct segment"
omfStk="$(mktemp --suffix=.omf)" omfStk="$(mktemp --suffix=.omf)"
"$PROJECT_ROOT/tools/omfEmit" \ "$PROJECT_ROOT/tools/omfEmit" \
@ -6022,16 +6043,34 @@ align = struct.unpack_from('<I', data, sp+28)[0]
segnum = struct.unpack_from('<H', data, sp+34)[0] segnum = struct.unpack_from('<H', data, sp+34)[0]
dispnm = struct.unpack_from('<H', data, sp+40)[0] dispnm = struct.unpack_from('<H', data, sp+40)[0]
name = data[sp+dispnm+10:sp+dispnm+20].decode('ascii', errors='replace').rstrip() name = data[sp+dispnm+10:sp+dispnm+20].decode('ascii', errors='replace').rstrip()
if kind != 0x1012: if kind != 0x4012:
sys.exit(f"DP/Stack KIND=0x{kind:04x} (expected 0x1012)") sys.exit(f"DP/Stack KIND=0x{kind:04x} (expected 0x4012 = DP/Stack|RELOAD)")
if length != 4096 or resspc != 4096: if length != 4096:
sys.exit(f"DP/Stack LENGTH={length} RESSPC={resspc} (expected 4096)") sys.exit(f"DP/Stack LENGTH={length} (expected 4096)")
if resspc != 0:
sys.exit(f"DP/Stack RESSPC={resspc} (expected 0; stack carried as LCONST zeros)")
if align != 0x100: if align != 0x100:
sys.exit(f"DP/Stack ALIGN=0x{align:x} (expected 0x100 = page-aligned)") sys.exit(f"DP/Stack ALIGN=0x{align:x} (expected 0x100 = page-aligned)")
if segnum != 3: if segnum != 3:
sys.exit(f"DP/Stack SEGNUM={segnum} (expected 3)") sys.exit(f"DP/Stack SEGNUM={segnum} (expected 3)")
if name != "~Direct": if name != "~Direct":
sys.exit(f"DP/Stack name='{name}' (expected ~Direct)") sys.exit(f"DP/Stack name='{name}' (expected ~Direct)")
# ExpressLoad seg must carry a header_info record for the DP/Stack so
# the Loader's fast path honors it. count=N-2 where N=total segs (3),
# so count=1. Walk the script and confirm.
elBody = 0 + 44 + 23 # ExpressLoad hdr + LABLEN=0 name area (10 + 1 + 12)
if data[elBody] != 0xF2:
sys.exit(f"ExpressLoad body op 0x{data[elBody]:02x} (expected 0xF2 LCONST)")
elLen = struct.unpack_from('<I', data, elBody+1)[0]
elData = data[elBody+5:elBody+5+elLen]
count = struct.unpack_from('<H', elData, 4)[0]
if count != 1:
sys.exit(f"ExpressLoad count={count} (expected 1 = N-2 for 3 segs)")
# Second remap entry (DP/Stack) should map old seg 2 -> new seg 3
remapOff = 6 + 8*2
rm = struct.unpack_from('<H', elData, remapOff+2)[0]
if rm != 3:
sys.exit(f"DP/Stack ExpressLoad remap={rm} (expected 3)")
PY PY
rm -f "$omfStk" rm -f "$omfStk"
@ -6355,6 +6394,93 @@ EOF
"$omfGsf" "$testFileGsf" "$omfGsf" "$testFileGsf"
fi fi
# Stack-size end-to-end: omfEmit --stack-size must actually propagate a
# larger DP/Stack chunk to the GS/OS Loader. Background: prior to the
# 2026-06-02 fix, the ~Direct DP/Stack segment was appended to the OMF
# but the ExpressLoad fast path silently ignored it (no segtable +
# remap + header_info entries for the seg), so --stack-size was a no-op
# under ExpressLoad mode. Default Loader allocation = 4KB at $0800 →
# SP=$17FF; --stack-size 0x4000 should yield a 16KB chunk → SP=$47FF.
# This check captures SP at entry to main() and confirms the high byte.
# Gated on the same prereqs as the GSOS_FILE_SMOKE check above; toggled
# off with SMOKE_SKIP_STACKSIZE=1.
if [ "${SMOKE_SKIP_STACKSIZE:-0}" != "1" ] \
&& [ -x "$CLANG" ] && [ -x "$CADIUS" ] && [ -f "$SYSDISK" ] \
&& command -v mame >/dev/null 2>&1; then
log "check: omfEmit --stack-size grows DP/Stack chunk under real GS/OS Loader"
cStkFile="$(mktemp --suffix=.c)"
oStkFile="$(mktemp --suffix=.o)"
binStk="$(mktemp --suffix=.bin)"
mapStk="$(mktemp --suffix=.map)"
relStk="$(mktemp --suffix=.reloc)"
omfStkWith="$(mktemp --suffix=.omf)"
omfStkWithout="$(mktemp --suffix=.omf)"
cat > "$cStkFile" <<'EOF'
// Stack-size end-to-end probe: capture SP at entry to main() and
// store its high byte at $71 so the harness can verify Loader honored
// --stack-size. $70 = 0x99 marker = program ran.
int main(void) {
__asm__ volatile (
"rep #0x30\n"
"tsc\n"
"xba\n"
"sep #0x20\n"
"sta 0x71\n"
"rep #0x20\n"
);
*(volatile unsigned char *)0x70 = 0x99;
for (volatile unsigned long s = 0; s < 600000UL; s++) { }
return 0;
}
EOF
"$CLANG" --target=w65816 -I"$PROJECT_ROOT/runtime/include" -O2 -ffunction-sections -c \
"$cStkFile" -o "$oStkFile"
"$PROJECT_ROOT/tools/link816" -o "$binStk" --text-base 0x1000 \
--map "$mapStk" --reloc-out "$relStk" \
"$PROJECT_ROOT/runtime/crt0Gsos.o" "$oStkFile" \
"$PROJECT_ROOT/runtime/libc.o" \
"$PROJECT_ROOT/runtime/snprintf.o" \
"$PROJECT_ROOT/runtime/extras.o" \
"$PROJECT_ROOT/runtime/softFloat.o" \
"$PROJECT_ROOT/runtime/softDouble.o" \
"$PROJECT_ROOT/runtime/iigsGsos.o" \
"$PROJECT_ROOT/runtime/iigsToolbox.o" \
"$PROJECT_ROOT/runtime/libgcc.o" 2>/tmp/stkprobe-link.err >/dev/null \
|| die "stack-size smoke: link failed: $(cat /tmp/stkprobe-link.err)"
# WITH --stack-size 0x4000 (16 KB chunk; Loader places at $0800,
# SP lands at $47FF → high byte $47).
"$PROJECT_ROOT/tools/omfEmit" --input "$binStk" --map "$mapStk" \
--base 0x1000 --entry __start --output "$omfStkWith" \
--name STKPROBE --stack-size 0x4000 --relocs "$relStk" >/dev/null 2>&1
if [ ! -s "$omfStkWith" ]; then
die "stack-size smoke: omfEmit (with stack-size) produced empty OMF"
fi
if ! bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWith" \
--check 0x70=0x99 0x71=0x47 >/dev/null 2>&1; then
bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWith" \
--check 0x70=0x99 0x71=0x47 2>&1 | tail -5 >&2
die "stack-size smoke FAILED: SP high byte != 0x47 with --stack-size 0x4000 (Loader silently dropped the seg?)"
fi
# WITHOUT --stack-size: Loader default 4 KB chunk → SP=$17FF →
# high byte $17. This second run guards against a spurious pass
# of the first (e.g. if every program by coincidence got SP=$47FF
# without our seg).
"$PROJECT_ROOT/tools/omfEmit" --input "$binStk" --map "$mapStk" \
--base 0x1000 --entry __start --output "$omfStkWithout" \
--name STKPROBE --expressload --relocs "$relStk" >/dev/null 2>&1
if [ ! -s "$omfStkWithout" ]; then
die "stack-size smoke: omfEmit (no stack-size) produced empty OMF"
fi
if ! bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWithout" \
--check 0x70=0x99 0x71=0x17 >/dev/null 2>&1; then
bash "$PROJECT_ROOT/scripts/runViaFinder.sh" "$omfStkWithout" \
--check 0x70=0x99 0x71=0x17 2>&1 | tail -5 >&2
die "stack-size smoke FAILED: baseline SP high byte != 0x17 (Loader default-allocation shifted?)"
fi
rm -f "$cStkFile" "$oStkFile" "$binStk" "$mapStk" "$relStk" \
"$omfStkWith" "$omfStkWithout"
fi
# W65816 codegen-shape regression pins. Tiny FileCheck assertions on # W65816 codegen-shape regression pins. Tiny FileCheck assertions on
# specific lowering behaviors that have broken before; runs in well # specific lowering behaviors that have broken before; runs in well
# under a second. See scripts/runFileCheckTests.sh. # under a second. See scripts/runFileCheckTests.sh.
@ -6535,23 +6661,25 @@ else
log "OK: cursorProbe Push/Pop arrow+busy returned cleanly + marker set" log "OK: cursorProbe Push/Pop arrow+busy returned cleanly + marker set"
fi fi
# Phase 3.4 resourcemgr STUB-ONLY landing. Verifies: # Phase 3.4 resourcemgr REAL implementation. Verifies:
# - resource.o links into a normal GS/OS demo, # - resource.o links into a normal GS/OS demo,
# - resourceProbeInit() / iigsLoadResource() / iigsGetResourceSize() # - the demo stages an in-memory .rsrc fixture via mfsRegister,
# all return RES_ERR_BLOCKED in stub mode (mark 0x71/0x72 = 0xff), # opens it through openResourceFile (real parser), loads an rText
# - resourceRuntimeEnabled() returns 0 in stub mode (mark 0x73 = 0x01), # resource by (type, id), verifies the payload bytes match
# - demos/build.sh's rsrcBundle post-step produces an AppleSingle blob # "HELLO" and the size is 5,
# and the cadius _ResourceFork.bin sidecar when demos/rsrcProbe.rsrc/ # - second loadResource() call returns the SAME handle (cache hit),
# is present (verified by file existence). # - closeResourceFile() returns RES_OK,
# The live resource-fork pathway in MAME is NOT exercised here - the # - demos/build.sh's rsrcBundle post-step still produces an AppleSingle
# whole point of the stub-only landing is that Phase 1.1 (GS/OS fopen # blob + cadius sidecar when demos/rsrcProbe.rsrc/ is present.
# hang) blocks the live path on GS/OS 6.0.2. # The fixture also doubles as a bundler-output verification: the on-disk
# sidecar bytes from rsrcBundle.py match the in-memory fixture byte-for-
# byte, so passing this check confirms parser + bundler agree on format.
if [ "${SMOKE_SKIP_RSRC:-0}" = 1 ]; then if [ "${SMOKE_SKIP_RSRC:-0}" = 1 ]; then
warn "SMOKE_SKIP_RSRC=1; skipping Phase 3.4 rsrcProbe stage" warn "SMOKE_SKIP_RSRC=1; skipping Phase 3.4 rsrcProbe stage"
elif [ ! -f "$SYSDISK_DR" ] || [ ! -x "$CADIUS_DR" ] || ! command -v mame >/dev/null 2>&1; then elif [ ! -f "$SYSDISK_DR" ] || [ ! -x "$CADIUS_DR" ] || ! command -v mame >/dev/null 2>&1; then
warn "Phase 3.4 rsrcProbe prerequisites missing; skipping" warn "Phase 3.4 rsrcProbe prerequisites missing; skipping"
else else
log "check: rsrcProbe stub Resource Manager facade runs under GS/OS" log "check: rsrcProbe real Resource Manager (open/load/release/close) under GS/OS"
bash "$PROJECT_ROOT/demos/build.sh" rsrcProbe >/tmp/rsrcBuildOut 2>&1 || { bash "$PROJECT_ROOT/demos/build.sh" rsrcProbe >/tmp/rsrcBuildOut 2>&1 || {
cat /tmp/rsrcBuildOut >&2 cat /tmp/rsrcBuildOut >&2
die "demos/build.sh rsrcProbe failed" die "demos/build.sh rsrcProbe failed"
@ -6565,11 +6693,11 @@ else
fi fi
bash "$PROJECT_ROOT/scripts/runViaFinder.sh" \ bash "$PROJECT_ROOT/scripts/runViaFinder.sh" \
"$PROJECT_ROOT/demos/rsrcProbe.omf" \ "$PROJECT_ROOT/demos/rsrcProbe.omf" \
--check 0x70=0x99 0x71=0xff 0x72=0xff 0x73=0x01 >/tmp/rsrcRunOut 2>&1 || { --check 0x70=0x99 0x71=0x01 0x72=0x01 0x73=0x01 >/tmp/rsrcRunOut 2>&1 || {
cat /tmp/rsrcRunOut >&2 cat /tmp/rsrcRunOut >&2
die "rsrcProbe did not set expected stub-mode markers" die "rsrcProbe did not set expected real-impl markers"
} }
log "OK: rsrcProbe (stub-mode RES_ERR_BLOCKED markers all green)" log "OK: rsrcProbe (real Resource Manager open/load/cache/close all green)"
fi fi
# Phase 4.2 sprite engine: standalone SHR 320 init + 16x16 4bpp packed # Phase 4.2 sprite engine: standalone SHR 320 init + 16x16 4bpp packed
@ -6621,15 +6749,23 @@ fi
# Phase 6.2 UBSan-min smoke probe: build a tiny program with # Phase 6.2 UBSan-min smoke probe: build a tiny program with
# `-fsanitize=undefined -fsanitize-minimal-runtime`, link against the # `-fsanitize=undefined -fsanitize-minimal-runtime`, link against the
# new runtime/ubsan.o, and verify three representative UB kinds # new runtime/ubsan.o, and verify nine recoverable UB kinds
# (add-overflow / shift-out-of-bounds / divrem-overflow) instrument # (add-overflow / shift-out-of-bounds / divrem-overflow / sub-overflow /
# cleanly + recover. Bare-metal (no GS/OS), so we only require `mame`. # mul-overflow / negate-overflow / pointer-overflow / load-invalid-value /
# out-of-bounds) instrument cleanly + recover. Bare-metal (no GS/OS),
# so we only require `mame`.
# #
# What this probe pins: # What this probe pins:
# $025000 = 0xC0DE add-overflow handler fired and recovered # $025000 = 0xC0DE add-overflow handler fired and recovered
# $025002 = 0xC0DF shift-out-of-bounds handler fired and recovered # $025002 = 0xC0DF shift-out-of-bounds handler fired and recovered
# $025004 = 0xC0E0 divrem-overflow handler fired and recovered # $025004 = 0xC0E0 divrem-overflow handler fired and recovered
# $025006 = 0xC0DA main reached its tail past all three UBs # $025006 = 0xC0E1 sub-overflow handler fired and recovered
# $025008 = 0xC0E2 mul-overflow handler fired and recovered
# $02500A = 0xC0E3 negate-overflow handler fired and recovered
# $02500C = 0xC0E4 pointer-overflow handler fired and recovered
# $02500E = 0xC0E5 load-invalid-value handler fired and recovered
# $025010 = 0xC0E6 out-of-bounds handler fired and recovered
# $025012 = 0xC0DA main reached its tail past all nine UBs
# #
# Gated on `mame`. Override with SMOKE_SKIP_UBSAN=1. # Gated on `mame`. Override with SMOKE_SKIP_UBSAN=1.
if [ "${SMOKE_SKIP_UBSAN:-0}" = 1 ]; then if [ "${SMOKE_SKIP_UBSAN:-0}" = 1 ]; then
@ -6637,12 +6773,12 @@ if [ "${SMOKE_SKIP_UBSAN:-0}" = 1 ]; then
elif ! command -v mame >/dev/null 2>&1 || [ ! -d "$PROJECT_ROOT/tools/mame/roms" ]; then elif ! command -v mame >/dev/null 2>&1 || [ ! -d "$PROJECT_ROOT/tools/mame/roms" ]; then
warn "Phase 6.2 ubsan prerequisites missing (mame); skipping" warn "Phase 6.2 ubsan prerequisites missing (mame); skipping"
else else
log "check: ubsanProbe (UBSan-min: add-overflow + shift-OOB + div-by-zero) in MAME" log "check: ubsanProbe (UBSan-min: 9 UB kinds) in MAME"
bash "$PROJECT_ROOT/tests/ubsan/runUbsanProbe.sh" >/tmp/ubsanRunOut 2>&1 || { bash "$PROJECT_ROOT/tests/ubsan/runUbsanProbe.sh" >/tmp/ubsanRunOut 2>&1 || {
cat /tmp/ubsanRunOut >&2 cat /tmp/ubsanRunOut >&2
die "ubsanProbe did not set expected handler-fired markers" die "ubsanProbe did not set expected handler-fired markers"
} }
log "OK: ubsanProbe (3 UB kinds instrumented + recovered + tail reached)" log "OK: ubsanProbe (9 UB kinds instrumented + recovered + tail reached)"
fi fi
log "all smoke checks passed" log "all smoke checks passed"

View file

@ -73,12 +73,12 @@ struct Elf32Shdr {
uint32_t sh_entsize; uint32_t sh_entsize;
}; };
static constexpr uint32_t SHT_NULL = 0; [[maybe_unused]] static constexpr uint32_t SHT_NULL = 0;
static constexpr uint32_t SHT_PROGBITS = 1; [[maybe_unused]] static constexpr uint32_t SHT_PROGBITS = 1;
static constexpr uint32_t SHT_SYMTAB = 2; static constexpr uint32_t SHT_SYMTAB = 2;
static constexpr uint32_t SHT_STRTAB = 3; static constexpr uint32_t SHT_STRTAB = 3;
static constexpr uint32_t SHT_RELA = 4; static constexpr uint32_t SHT_RELA = 4;
static constexpr uint32_t SHT_NOBITS = 8; [[maybe_unused]] static constexpr uint32_t SHT_NOBITS = 8;
struct Elf32Sym { struct Elf32Sym {
uint32_t st_name; uint32_t st_name;
@ -104,12 +104,12 @@ static constexpr uint16_t EM_NONE = 0;
inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; } inline uint8_t ELF32_ST_TYPE(uint8_t i) { return i & 0x0F; }
inline uint8_t ELF32_ST_BIND(uint8_t i) { return (i >> 4) & 0x0F; } inline uint8_t ELF32_ST_BIND(uint8_t i) { return (i >> 4) & 0x0F; }
static constexpr uint8_t STB_LOCAL = 0; static constexpr uint8_t STB_LOCAL = 0;
static constexpr uint8_t STB_GLOBAL = 1; [[maybe_unused]] static constexpr uint8_t STB_GLOBAL = 1;
static constexpr uint8_t STB_WEAK = 2; static constexpr uint8_t STB_WEAK = 2;
static constexpr uint8_t STT_NOTYPE = 0; [[maybe_unused]] static constexpr uint8_t STT_NOTYPE = 0;
static constexpr uint8_t STT_OBJECT = 1; [[maybe_unused]] static constexpr uint8_t STT_OBJECT = 1;
static constexpr uint8_t STT_FUNC = 2; [[maybe_unused]] static constexpr uint8_t STT_FUNC = 2;
static constexpr uint8_t STT_SECTION = 3; static constexpr uint8_t STT_SECTION = 3;
struct Elf32Rela { struct Elf32Rela {
@ -170,9 +170,10 @@ static std::string sectionKind(const std::string &name) {
// .init_array entries are 16-bit function pointers; treat as // .init_array entries are 16-bit function pointers; treat as
// rodata so they end up in the read-only image and get a stable // rodata so they end up in the read-only image and get a stable
// address. The linker emits __init_array_start/_end so crt0 can // address. The linker emits __init_array_start/_end so crt0 can
// walk them. Same for .fini_array (destructors). // walk them. (.fini_array is not yet wired up; ELF input is
// accepted but the sections are dropped — runtime has no
// destructor-walk path today.)
if (name == ".init_array" || name.rfind(".init_array.", 0) == 0) return "init_array"; if (name == ".init_array" || name.rfind(".init_array.", 0) == 0) return "init_array";
if (name == ".fini_array" || name.rfind(".fini_array.", 0) == 0) return "fini_array";
// DWARF debug sections that are *targets* of intra-debug relocs // DWARF debug sections that are *targets* of intra-debug relocs
// (e.g. .debug_info -> .debug_str via R_W65816_DATA32, or // (e.g. .debug_info -> .debug_str via R_W65816_DATA32, or
// .debug_str_offsets -> .debug_str via R_W65816_DATA32). Treat // .debug_str_offsets -> .debug_str via R_W65816_DATA32). Treat
@ -384,6 +385,26 @@ static std::vector<Imm24Site> gImm24Sites;
static uint32_t gTextBaseForSites = 0; static uint32_t gTextBaseForSites = 0;
static bool gRecordSites = false; static bool gRecordSites = false;
// Record an intra-segment patch site for cRELOC emission. A target
// below the text base is never intra-segment (it is an undefined-weak
// resolving to 0, or an absolute address) and is skipped — see the
// commentary at the R_W65816_IMM16 callsite for why this matters.
static void recordCRelocSite(uint32_t patchAddr, uint32_t target,
uint8_t byteCnt, uint8_t bitShift) {
if (!gRecordSites) return;
uint32_t targetBank = target & 0xFF0000;
uint32_t baseBank = gTextBaseForSites & 0xFF0000;
if (targetBank != baseBank) return;
if (target < gTextBaseForSites) return;
Imm24Site s;
s.patchOff = patchAddr - gTextBaseForSites;
s.offsetRef = target - gTextBaseForSites;
s.byteCnt = byteCnt;
s.bitShift = bitShift;
gImm24Sites.push_back(s);
}
// Number of bytes patched by a given reloc type. Used by callers // Number of bytes patched by a given reloc type. Used by callers
// that need to range-check a reloc offset against a buffer size // that need to range-check a reloc offset against a buffer size
// without re-deriving the width inline. Returns 0 for unknown // without re-deriving the width inline. Returns 0 for unknown
@ -411,7 +432,7 @@ static uint32_t relocWidth(uint8_t rtype) {
static void applyReloc(std::vector<uint8_t> &buf, uint32_t off, static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
uint32_t patchAddr, uint32_t target, uint32_t patchAddr, uint32_t target,
uint8_t rtype, const std::string &symName) { uint8_t rtype, const std::string &symName) {
int64_t Signed; int64_t pcrelDisp;
switch (rtype) { switch (rtype) {
case R_W65816_IMM8: case R_W65816_IMM8:
if (target > 0xFF) if (target > 0xFF)
@ -433,9 +454,6 @@ static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
// time. Without this, `lda absConst` reads from the wrong // time. Without this, `lda absConst` reads from the wrong
// address when the segment doesn't land at link-time-base // address when the segment doesn't land at link-time-base
// (e.g., link-time-base=0x1000 but Loader places at bank:0). // (e.g., link-time-base=0x1000 but Loader places at bank:0).
if (gRecordSites) {
uint32_t targetBank = target & 0xFF0000;
uint32_t baseBank = gTextBaseForSites & 0xFF0000;
// A target below the text base is never an intra-segment // A target below the text base is never an intra-segment
// relocatable site: it is an undefined-weak symbol (resolveSym // relocatable site: it is an undefined-weak symbol (resolveSym
// resolves those to 0) or an absolute address. Recording a // resolves those to 0) or an absolute address. Recording a
@ -443,18 +461,9 @@ static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
// (omfEmit rejects it as out-of-range) and (b) make the Loader // (omfEmit rejects it as out-of-range) and (b) make the Loader
// rewrite a genuine null to segPlacedBase, breaking the // rewrite a genuine null to segPlacedBase, breaking the
// `if (weakFn) weakFn()` null test that the null is meant to fail. // `if (weakFn) weakFn()` null test that the null is meant to fail.
if (targetBank == baseBank && target >= gTextBaseForSites) { // recordCRelocSite handles the gate; byteCnt=2 distinguishes
Imm24Site s; // from IMM24 (3) so omfEmit emits cRELOC ByteCnt=2 here.
s.patchOff = patchAddr - gTextBaseForSites; recordCRelocSite(patchAddr, target, /*byteCnt=*/2, /*bitShift=*/0);
s.offsetRef = target - gTextBaseForSites;
// Use type field width = 2 to distinguish from IMM24
// (3). Imm24Site struct is reused — emitOmf will
// emit cRELOC ByteCnt=2 for this.
s.byteCnt = 2;
s.bitShift = 0;
gImm24Sites.push_back(s);
}
}
break; break;
case R_W65816_BANK16: case R_W65816_BANK16:
// 2-byte patch: byte 0 = bank of target, byte 1 = 0 (pad). // 2-byte patch: byte 0 = bank of target, byte 1 = 0 (pad).
@ -463,20 +472,9 @@ static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
// the value reflects the actually-placed bank. // the value reflects the actually-placed bank.
buf[off] = static_cast<uint8_t>((target >> 16) & 0xFF); buf[off] = static_cast<uint8_t>((target >> 16) & 0xFF);
buf[off + 1] = 0; buf[off + 1] = 0;
if (gRecordSites) { // bitShift=16: cRELOC Loader patches the bank byte from
uint32_t targetBank = target & 0xFF0000; // (segPlacedBase + offsetRef) >> 16 at load time.
uint32_t baseBank = gTextBaseForSites & 0xFF0000; recordCRelocSite(patchAddr, target, /*byteCnt=*/2, /*bitShift=*/16);
// See R_W65816_IMM16: skip undefined-weak/absolute targets
// below the text base (no valid intra-segment cRELOC).
if (targetBank == baseBank && target >= gTextBaseForSites) {
Imm24Site s;
s.patchOff = patchAddr - gTextBaseForSites;
s.offsetRef = target - gTextBaseForSites;
s.byteCnt = 2;
s.bitShift = 16;
gImm24Sites.push_back(s);
}
}
break; break;
case R_W65816_IMM24: case R_W65816_IMM24:
if (target > 0xFFFFFF) if (target > 0xFFFFFF)
@ -485,46 +483,30 @@ static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
buf[off] = static_cast<uint8_t>(target & 0xFF); buf[off] = static_cast<uint8_t>(target & 0xFF);
buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF); buf[off + 1] = static_cast<uint8_t>((target >> 8) & 0xFF);
buf[off + 2] = static_cast<uint8_t>((target >> 16) & 0xFF); buf[off + 2] = static_cast<uint8_t>((target >> 16) & 0xFF);
// Record the site for OMF cRELOC emission (only if recording is
// enabled — gRecordSites is set by the CLI when --reloc-out is
// requested). The patch offset is within the segment image; the
// reference offset is the in-segment offset of the target.
if (gRecordSites) {
// Only intra-segment refs need cRELOC; cross-bank refs (to // Only intra-segment refs need cRELOC; cross-bank refs (to
// GS/OS dispatcher etc.) target absolute fixed addresses // GS/OS dispatcher etc.) target absolute fixed addresses
// and shouldn't be relocated by the Loader. // and shouldn't be relocated by the Loader. recordCRelocSite
uint32_t targetBank = target & 0xFF0000; // applies the same gates as R_W65816_IMM16.
uint32_t baseBank = gTextBaseForSites & 0xFF0000; recordCRelocSite(patchAddr, target, /*byteCnt=*/3, /*bitShift=*/0);
// See R_W65816_IMM16: skip undefined-weak/absolute targets
// below the text base (no valid intra-segment cRELOC).
if (targetBank == baseBank && target >= gTextBaseForSites) {
Imm24Site s;
s.patchOff = patchAddr - gTextBaseForSites;
s.offsetRef = target - gTextBaseForSites;
s.byteCnt = 3;
s.bitShift = 0;
gImm24Sites.push_back(s);
}
}
break; break;
case R_W65816_PCREL8: case R_W65816_PCREL8:
Signed = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 1); pcrelDisp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 1);
if (Signed < -128 || Signed > 127) { if (pcrelDisp < -128 || pcrelDisp > 127) {
char msg[256]; char msg[256];
std::snprintf(msg, sizeof(msg), std::snprintf(msg, sizeof(msg),
"R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)", "R_W65816_PCREL8 to '%s' out of branch range (%lld bytes)",
symName.c_str(), (long long)Signed); symName.c_str(), (long long)pcrelDisp);
die(msg); die(msg);
} }
buf[off] = static_cast<uint8_t>(Signed & 0xFF); buf[off] = static_cast<uint8_t>(pcrelDisp & 0xFF);
break; break;
case R_W65816_PCREL16: case R_W65816_PCREL16:
Signed = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 2); pcrelDisp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 2);
if (Signed < -32768 || Signed > 32767) if (pcrelDisp < -32768 || pcrelDisp > 32767)
die("R_W65816_PCREL16 to '" + symName + die("R_W65816_PCREL16 to '" + symName +
"' out of BRL range"); "' out of BRL range");
buf[off] = static_cast<uint8_t>(Signed & 0xFF); buf[off] = static_cast<uint8_t>(pcrelDisp & 0xFF);
buf[off + 1] = static_cast<uint8_t>((Signed >> 8) & 0xFF); buf[off + 1] = static_cast<uint8_t>((pcrelDisp >> 8) & 0xFF);
break; break;
case R_W65816_DATA32: case R_W65816_DATA32:
// 4-byte LE absolute. Used in DWARF .debug_* sections // 4-byte LE absolute. Used in DWARF .debug_* sections
@ -554,33 +536,22 @@ static void applyReloc(std::vector<uint8_t> &buf, uint32_t off,
// patches the low 3 bytes of the 4-byte slot at load time, // patches the low 3 bytes of the 4-byte slot at load time,
// leaving the high (pad) byte at 0 (writes the resolved // leaving the high (pad) byte at 0 (writes the resolved
// 24-bit value bank:offset with bitShift=0 == no shift). // 24-bit value bank:offset with bitShift=0 == no shift).
if (gRecordSites) { recordCRelocSite(patchAddr, target, /*byteCnt=*/3, /*bitShift=*/0);
uint32_t targetBank = target & 0xFF0000;
uint32_t baseBank = gTextBaseForSites & 0xFF0000;
if (targetBank == baseBank && target >= gTextBaseForSites) {
Imm24Site s;
s.patchOff = patchAddr - gTextBaseForSites;
s.offsetRef = target - gTextBaseForSites;
s.byteCnt = 3;
s.bitShift = 0;
gImm24Sites.push_back(s);
}
}
break; break;
case R_W65816_PCREL32: case R_W65816_PCREL32:
// 4-byte signed PC-relative. PCREL displacements have the // 4-byte signed PC-relative. PCREL displacements have the
// PC pointing past the slot — the convention used by every // PC pointing past the slot — the convention used by every
// other PCREL reloc in this file (PCREL8 adds 1, PCREL16 // other PCREL reloc in this file (PCREL8 adds 1, PCREL16
// adds 2), so PCREL32 adds 4. // adds 2), so PCREL32 adds 4.
Signed = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 4); pcrelDisp = static_cast<int64_t>(target) - (static_cast<int64_t>(patchAddr) + 4);
// No range check: 32-bit signed displacement covers the // No range check: 32-bit signed displacement covers the
// full address space. In practice this fires for DWARF // full address space. In practice this fires for DWARF
// intra-section diffs where target and patchAddr live in // intra-section diffs where target and patchAddr live in
// the same section, so Signed is small. // the same section, so pcrelDisp is small.
buf[off] = static_cast<uint8_t>(Signed & 0xFF); buf[off] = static_cast<uint8_t>(pcrelDisp & 0xFF);
buf[off + 1] = static_cast<uint8_t>((Signed >> 8) & 0xFF); buf[off + 1] = static_cast<uint8_t>((pcrelDisp >> 8) & 0xFF);
buf[off + 2] = static_cast<uint8_t>((Signed >> 16) & 0xFF); buf[off + 2] = static_cast<uint8_t>((pcrelDisp >> 16) & 0xFF);
buf[off + 3] = static_cast<uint8_t>((Signed >> 24) & 0xFF); buf[off + 3] = static_cast<uint8_t>((pcrelDisp >> 24) & 0xFF);
break; break;
default: { default: {
char msg[128]; char msg[128];
@ -1106,11 +1077,6 @@ struct Linker {
curRem -= seg; curRem -= seg;
if (curRem == 0) { segIdx++; break; } if (curRem == 0) { segIdx++; break; }
curBase += seg; // advance within bank or to next curBase += seg; // advance within bank or to next
if ((curBase & 0xFFFFu) == 0) {
// Crossed bank boundary — already at start of next bank.
} else if ((curBase & 0xFF0000u) != ((curBase - 1) & 0xFF0000u)) {
// Just crossed into next bank.
}
} }
// Zero out any unused segment slots so crt0 sees size=0. // Zero out any unused segment slots so crt0 sees size=0.
for (uint32_t i = segIdx; i < 4; i++) { for (uint32_t i = segIdx; i < 4; i++) {
@ -1709,13 +1675,9 @@ int main(int argc, char **argv) {
if (++i >= argc) usage(argv[0]); if (++i >= argc) usage(argv[0]);
relocOutPath = argv[i++]; relocOutPath = argv[i++];
} else if (a == "--gc-sections") { } else if (a == "--gc-sections") {
// Drop sections not reachable from __start / main / // GC of unreachable sections is on by default; --gc-sections
// init_array. Requires `-ffunction-sections` (so each // is accepted as a no-op alias for clarity. Use
// function is in its own section). Significantly shrinks // --no-gc-sections to disable.
// text for programs that link the whole runtime but only
// use a fraction of it. ON by default; --no-gc-sections
// disables.
linker.gcSections = true;
i++; i++;
} else if (a == "--no-gc-sections") { } else if (a == "--no-gc-sections") {
linker.gcSections = false; linker.gcSections = false;

BIN
src/link816/omfEmit Executable file

Binary file not shown.

View file

@ -32,6 +32,24 @@
namespace { namespace {
// OMF v2.1 protocol constants -- single source of truth for the header
// layout and opcode set. See Apple IIgs Tech Note #17 and the FTN
// reference. Don't renumber; values are shared with the loader.
//
// Body-record opcodes. LCONST is followed by a count field (4 bytes with
// NUMLEN=4) and then that many literal bytes; cRELOC asks the Loader to
// patch an intra-segment reference when it places the segment; END is
// the last byte pushed onto every segment body.
static constexpr uint8_t OMF_OP_LCONST = 0xF2;
static constexpr uint8_t OMF_OP_CRELOC = 0xF5;
static constexpr uint8_t OMF_OP_END = 0x00;
// Header-layout values. Marked [[maybe_unused]] so that declaring them
// here does not warn while some emitters still spell the same numbers
// out locally (e.g. NUMLEN 4 / VERSION 2 in the ExpressLoad header).
[[maybe_unused]] static constexpr uint8_t OMF_NUMLEN = 4;
[[maybe_unused]] static constexpr uint8_t OMF_VERSION_V21 = 0x02;
[[maybe_unused]] static constexpr uint32_t OMF_HDR_SIZE = 44;
[[maybe_unused]] static constexpr uint32_t OMF_LABLEN_FIXED = 10;
// Segment KIND words written into segment headers.
// CODE_PRIV: passed as the KIND of the user code segment in emitOMF.
static constexpr uint16_t OMF_KIND_CODE_PRIV = 0x1000;
static constexpr uint16_t OMF_KIND_DPSTACK = 0x4012; // DP/Stack | RELOAD; matches real-world GNO/ME ~_STACK format
// DATA_STATIC: KIND of the ~ExpressLoad segment (DATA|STATIC).
static constexpr uint16_t OMF_KIND_DATA_STATIC = 0x8001;
// CODE_STATIC_ABSBANK: KIND for the multi-segment path, where ABSBANK
// together with ORG=base pins each segment to a specific bank.
static constexpr uint16_t OMF_KIND_CODE_STATIC_ABSBANK = 0x8800;
// cRELOC opcode wire size: opcode + ByteCnt + BitShift + OffsetPatch +
// OffsetReference = 1 + 1 + 1 + 2 + 2 = 7 bytes per site.
static constexpr uint32_t OMF_CRELOC_BYTES_PER_SITE = 7;
[[noreturn]] static void die(const std::string &msg) { [[noreturn]] static void die(const std::string &msg) {
std::fprintf(stderr, "omfEmit: %s\n", msg.c_str()); std::fprintf(stderr, "omfEmit: %s\n", msg.c_str());
std::exit(1); std::exit(1);
@ -48,9 +66,7 @@ struct RelocSite {
uint8_t byteCnt; uint8_t byteCnt;
uint8_t bitShift; // 0 for offset relocs, 16 for BANK16 uint8_t bitShift; // 0 for offset relocs, 16 for BANK16
}; };
} // close namespace
std::vector<RelocSite> gReloc24Sites; std::vector<RelocSite> gReloc24Sites;
namespace {
static std::vector<uint8_t> readFile(const std::string &path) { static std::vector<uint8_t> readFile(const std::string &path) {
std::ifstream f(path, std::ios::binary); std::ifstream f(path, std::ios::binary);
@ -135,7 +151,7 @@ static std::vector<uint8_t> emitOneSeg(const std::vector<uint8_t> &image,
// literal bytes. With NUMLEN=4 (standard for v2.1), the count // literal bytes. With NUMLEN=4 (standard for v2.1), the count
// field is 4 bytes. Verified empirically against real /SYSTEM/ // field is 4 bytes. Verified empirically against real /SYSTEM/
// START on GS/OS 6.0.2: every segment uses 0xF2 + 4-byte count. // START on GS/OS 6.0.2: every segment uses 0xF2 + 4-byte count.
body.push_back(0xF2); // LCONST opcode body.push_back(OMF_OP_LCONST); // LCONST opcode
put32(body, static_cast<uint32_t>(combined.size())); put32(body, static_cast<uint32_t>(combined.size()));
body.insert(body.end(), combined.begin(), combined.end()); body.insert(body.end(), combined.begin(), combined.end());
} }
@ -150,14 +166,14 @@ static std::vector<uint8_t> emitOneSeg(const std::vector<uint8_t> &image,
// (segPlacedBase + OffsetReference) at load time. This is what // (segPlacedBase + OffsetReference) at load time. This is what
// makes JSL/JML/STAlong/etc. with intra-segment targets work when // makes JSL/JML/STAlong/etc. with intra-segment targets work when
// the Loader places us at non-zero bank. // the Loader places us at non-zero bank.
for (const auto &s : ::gReloc24Sites) { for (const auto &s : gReloc24Sites) {
body.push_back(0xF5); body.push_back(OMF_OP_CRELOC);
body.push_back(s.byteCnt); // ByteCnt (2 or 3) body.push_back(s.byteCnt); // ByteCnt (2 or 3)
body.push_back(s.bitShift); // BitShift (0 or 16) body.push_back(s.bitShift); // BitShift (0 or 16)
put16(body, s.patchOff); // OffsetPatch put16(body, s.patchOff); // OffsetPatch
put16(body, s.offsetRef); // OffsetReference put16(body, s.offsetRef); // OffsetReference
} }
body.push_back(0x00); // END opcode body.push_back(OMF_OP_END); // END opcode
// Real OMF format (Merlin32 convention, verified GS/OS Loader-launchable): // Real OMF format (Merlin32 convention, verified GS/OS Loader-launchable):
// - LABLEN = 10: both LOAD_NAME and SEG_NAME are 10 bytes wide, // - LABLEN = 10: both LOAD_NAME and SEG_NAME are 10 bytes wide,
@ -247,13 +263,21 @@ static std::vector<uint8_t> emitOneSeg(const std::vector<uint8_t> &image,
// allocate a page-aligned, locked memory block of that size in // allocate a page-aligned, locked memory block of that size in
// bank $00." // bank $00."
// //
// The body is just an END opcode (no LCONST data — RESSPC alone tells // The body is an LCONST opcode followed by `length` zero bytes plus an
// the Loader how big to make the allocation, and the bytes don't need // END opcode — matching the real-world format used by every GNO/ME
// to come from the file). KIND = 0x1012 = DP/Stack | PRIVATE — the // command (e.g. /GNO.BOOT/bin/echo's ~_STACK seg). Empirically a body
// PRIVATE attribute matches Apple's `makedirect` reference utility // of just END (no LCONST, relying on RESSPC for allocation) makes the
// (ksherlock/omfutils). // GS/OS Loader's ExpressLoad fast path silently drop the seg and fall
// back to its default 4 KB DP/Stack — hence this code emits real
// content so the Loader has something to copy. KIND = 0x4012 (RELOAD
// | DP/Stack) also matches the working GNO format; the earlier 0x1012
// (PRIVATE | DP/Stack) is what `makedirect` ships but doesn't survive
// ExpressLoad fast-path processing.
static std::vector<uint8_t> emitDpStackSeg(uint32_t length, uint16_t segNum) { static std::vector<uint8_t> emitDpStackSeg(uint32_t length, uint16_t segNum) {
std::vector<uint8_t> body; std::vector<uint8_t> body;
body.push_back(0xF2); // LCONST opcode
put32(body, length); // 4-byte literal length
body.insert(body.end(), length, 0); // `length` zero bytes
body.push_back(0x00); // END opcode body.push_back(0x00); // END opcode
constexpr uint8_t LABLEN_VAL = 10; constexpr uint8_t LABLEN_VAL = 10;
const std::string segNameTxt = "~Direct"; const std::string segNameTxt = "~Direct";
@ -267,10 +291,13 @@ static std::vector<uint8_t> emitDpStackSeg(uint32_t length, uint16_t segNum) {
DISPNAME + loadName.size() + segName.size()); DISPNAME + loadName.size() + segName.size());
const uint32_t LENGTH = length; // memory size requested const uint32_t LENGTH = length; // memory size requested
const uint32_t BYTECNT = DISPDATA + static_cast<uint32_t>(body.size()); const uint32_t BYTECNT = DISPDATA + static_cast<uint32_t>(body.size());
const uint32_t RESSPC = length; // bytes to zero-allocate // RESSPC = 0 because the bytes are carried in LCONST (matches the
// bss-as-zeros approach used for the user CODE seg — the Loader's
// ExpressLoad fast path can't be trusted to honor RESSPC).
const uint32_t RESSPC = 0;
const uint32_t BANKSIZE = 0; // DP/Stack lives in bank 0 const uint32_t BANKSIZE = 0; // DP/Stack lives in bank 0
const uint32_t ALIGN = 0x100; // page-aligned per spec const uint32_t ALIGN = 0x100; // page-aligned per spec
const uint16_t KIND = 0x1012; // DP/Stack | PRIVATE const uint16_t KIND = OMF_KIND_DPSTACK; // DP/Stack | RELOAD
std::vector<uint8_t> hdr; std::vector<uint8_t> hdr;
put32(hdr, BYTECNT); put32(hdr, BYTECNT);
@ -324,7 +351,7 @@ static std::vector<uint8_t> emitOMF(const std::vector<uint8_t> &image,
uint32_t bssGap = 0) { uint32_t bssGap = 0) {
if (stackSize == 0) { if (stackSize == 0) {
return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1, return emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/1,
/*kind*/0x1000, name, bssSize, bssGap); /*kind*/OMF_KIND_CODE_PRIV, name, bssSize, bssGap);
} }
// DP/Stack segment ordering: Apple's `makedirect` reference utility // DP/Stack segment ordering: Apple's `makedirect` reference utility
// assigns the DP/Stack as SEGNUM 1 (its own object); when linked // assigns the DP/Stack as SEGNUM 1 (its own object); when linked
@ -334,7 +361,7 @@ static std::vector<uint8_t> emitOMF(const std::vector<uint8_t> &image,
// sets DP and SP appropriately when entering our code. // sets DP and SP appropriately when entering our code.
auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/1); auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/1);
auto codeSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, auto codeSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2,
/*kind*/0x1000, name, bssSize, bssGap); /*kind*/OMF_KIND_CODE_PRIV, name, bssSize, bssGap);
std::vector<uint8_t> out; std::vector<uint8_t> out;
out.insert(out.end(), dpSeg.begin(), dpSeg.end()); out.insert(out.end(), dpSeg.begin(), dpSeg.end());
out.insert(out.end(), codeSeg.begin(), codeSeg.end()); out.insert(out.end(), codeSeg.begin(), codeSeg.end());
@ -391,6 +418,17 @@ static std::vector<uint8_t> emitOmfExpressLoad(
auto userSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2, auto userSeg = emitOneSeg(image, entryOffset, /*org*/0, /*segNum*/2,
/*kind*/0x1000, userSegName, bssSize, bssGap); /*kind*/0x1000, userSegName, bssSize, bssGap);
// Optionally build the DP/Stack segment. If present it lives in the
// file AFTER the user seg and gets its own ExpressLoad segtable +
// remap + header_info entries — otherwise the Loader's ExpressLoad
// fast path never sees the KIND=0x4012 record and reverts to its
// default 4KB DP/Stack allocation (silent --stack-size no-op).
const bool haveDpStack = (stackSize != 0);
std::vector<uint8_t> dpStackSeg;
if (haveDpStack) {
dpStackSeg = emitDpStackSeg(stackSize, /*segNum*/3);
}
// Step 2: figure out the file offsets we'll need to bake into the // Step 2: figure out the file offsets we'll need to bake into the
// load script. We don't know the ExpressLoad segment's total size // load script. We don't know the ExpressLoad segment's total size
// yet — but we can compute it because each component is a fixed // yet — but we can compute it because each component is a fixed
@ -399,11 +437,10 @@ static std::vector<uint8_t> emitOmfExpressLoad(
// ExpressLoad LCONST data layout (matches Merlin32 source — see // ExpressLoad LCONST data layout (matches Merlin32 source — see
// BuildExpressLoadSegment in Merlin32's a65816_OMF.c): // BuildExpressLoadSegment in Merlin32's a65816_OMF.c):
// 6 bytes header (4-byte reserved DWORD + 2-byte count WORD) // 6 bytes header (4-byte reserved DWORD + 2-byte count WORD)
// 8 bytes segment list (1 entry per non-ExpressLoad segment) // 8 bytes/seg segment list (1 entry per non-ExpressLoad segment)
// 2 bytes remap list (1 entry per non-ExpressLoad segment) // 2 bytes/seg remap list (1 entry per non-ExpressLoad segment)
// 16 bytes header info offsets (data_off, data_len, reloc_off, reloc_len) // 68 bytes/seg header_info (16B offsets + 32B hdr copy + 10B LOAD_NAME + 10B SEG_NAME)
// + header_xpress: bytes [12..43] of user header (32 bytes) + LOAD_NAME (10) + SEG_NAME (1+N) // total: 6 + 78*N bytes for N non-ExpressLoad segs
// = 6 + 8 + 2 + 16 + 32 + 10 + 1 + N = 75 + N bytes
// //
// KEY FIX from earlier emitter version: header is 6 bytes, NOT 8. // KEY FIX from earlier emitter version: header is 6 bytes, NOT 8.
// I had written 8 bytes (file_ref WORD + reserved WORD + extra WORD + // I had written 8 bytes (file_ref WORD + reserved WORD + extra WORD +
@ -415,7 +452,10 @@ static std::vector<uint8_t> emitOmfExpressLoad(
constexpr uint32_t HDR_SIZE = 44; constexpr uint32_t HDR_SIZE = 44;
constexpr uint32_t LOAD_NAME_SIZE = 10; constexpr uint32_t LOAD_NAME_SIZE = 10;
constexpr uint32_t SEG_NAME_SIZE = 10; // LABLEN=10 → fixed-width SEG_NAME constexpr uint32_t SEG_NAME_SIZE = 10; // LABLEN=10 → fixed-width SEG_NAME
const uint32_t userNameLen = (uint32_t)userSegName.size(); constexpr uint32_t SEGTAB_ENTRY = 8;
constexpr uint32_t REMAP_ENTRY = 2;
constexpr uint32_t HDR_INFO_ENTRY = 16 + 32 + LOAD_NAME_SIZE + SEG_NAME_SIZE; // 68
constexpr uint32_t HEADER_BYTES = 6;
const uint32_t userNameAreaSize = LOAD_NAME_SIZE + SEG_NAME_SIZE; const uint32_t userNameAreaSize = LOAD_NAME_SIZE + SEG_NAME_SIZE;
// ExpressLoad's own segment metrics. The name "~ExpressLoad" is 12 // ExpressLoad's own segment metrics. The name "~ExpressLoad" is 12
@ -423,12 +463,8 @@ static std::vector<uint8_t> emitOmfExpressLoad(
// uses LABLEN=0 (length-prefixed name): 1 length byte + 12 chars. // uses LABLEN=0 (length-prefixed name): 1 length byte + 12 chars.
const std::string elName = "~ExpressLoad"; const std::string elName = "~ExpressLoad";
const uint32_t elNameAreaSize = LOAD_NAME_SIZE + 1 + (uint32_t)elName.size(); const uint32_t elNameAreaSize = LOAD_NAME_SIZE + 1 + (uint32_t)elName.size();
// header_xpress_length = (header bytes 12..43) + LOAD_NAME + SEG_NAME const uint32_t nSegs = haveDpStack ? 2 : 1; // non-ExpressLoad segs
// = 32 + 10 + 10 = 52 bytes const uint32_t elDataSize = HEADER_BYTES + (SEGTAB_ENTRY + REMAP_ENTRY + HDR_INFO_ENTRY) * nSegs;
// Per-segment ExpressLoad data: 8 (table) + 2 (remap) + 16 (offsets) + 52 = 78 bytes
// Header (6 bytes) + per-segment data: 6 + 78 = 84
const uint32_t elDataSize = 84;
(void)userNameLen; // truncated in user seg name; LABLEN=10 fixed
// Body size = 1 byte LCONST opcode + 4 byte length + data + 1 byte END // Body size = 1 byte LCONST opcode + 4 byte length + data + 1 byte END
const uint32_t elBodySize = 1 + 4 + elDataSize + 1; const uint32_t elBodySize = 1 + 4 + elDataSize + 1;
const uint32_t elSegSize = HDR_SIZE + elNameAreaSize + elBodySize; const uint32_t elSegSize = HDR_SIZE + elNameAreaSize + elBodySize;
@ -438,24 +474,47 @@ static std::vector<uint8_t> emitOmfExpressLoad(
const uint32_t userBodyOpOff = userSegStart + HDR_SIZE + userNameAreaSize; const uint32_t userBodyOpOff = userSegStart + HDR_SIZE + userNameAreaSize;
const uint32_t userDataOff = userBodyOpOff + 5; // 1 op + 4 length const uint32_t userDataOff = userBodyOpOff + 5; // 1 op + 4 length
// DP/Stack segment file offsets (after user seg). The DP/Stack body
// mirrors the real GNO/ME ~_STACK seg format: an LCONST opcode + 4
// byte length + `stackSize` zero bytes + END. ExpressLoad's
// hdr_info entry has to point at the LCONST data so the Loader
// copies the right number of zeros into the allocated chunk — a
// body of just END (RESSPC-only) silently no-ops on the
// ExpressLoad fast path, which is the bug this whole section fixes.
const uint32_t dpStackSegStart = userSegStart + (uint32_t)userSeg.size();
const uint32_t dpStackBodyOff = dpStackSegStart + HDR_SIZE + (LOAD_NAME_SIZE + SEG_NAME_SIZE);
const uint32_t dpStackDataOff = dpStackBodyOff + 5; // 1 op + 4 length
// Step 3: build the ExpressLoad LCONST data. // Step 3: build the ExpressLoad LCONST data.
std::vector<uint8_t> elData; std::vector<uint8_t> elData;
// Header (6 bytes): reserved DWORD + count WORD // Header (6 bytes): reserved DWORD + count WORD. count = N-2 where
// N = total segments in the file (including ExpressLoad). With a
// DP/Stack seg N=3 so count=1; without it N=2 so count=0.
put32(elData, 0); // reserved put32(elData, 0); // reserved
put16(elData, 0); // count = N-2 = 0 (for 2 segs) put16(elData, (uint16_t)(haveDpStack ? 1 : 0)); // count = N-2
// Segment list (1 × 8 bytes) // Segment list: one 8-byte entry per non-ExpressLoad segment. Each
// Self-rel offset = (header info offset within elData) - (this entry pos) // entry's first WORD is the SELF-RELATIVE offset (from this entry's
// = 16 - 6 = 10 // own start) to the segment's header_info record.
constexpr uint32_t segListEntryOff = 6; const uint32_t segTableOff = HEADER_BYTES;
const uint32_t headerInfoOff = 6 + 8 + 2; // header + segtable + remap const uint32_t remapOff = segTableOff + SEGTAB_ENTRY * nSegs;
put16(elData, (uint16_t)(headerInfoOff - segListEntryOff)); const uint32_t hdrInfoOff = remapOff + REMAP_ENTRY * nSegs;
for (uint32_t i = 0; i < nSegs; i++) {
const uint32_t thisEntryOff = segTableOff + SEGTAB_ENTRY * i;
const uint32_t thisHdrInfoOff = hdrInfoOff + HDR_INFO_ENTRY * i;
put16(elData, (uint16_t)(thisHdrInfoOff - thisEntryOff)); // self-rel
put16(elData, 0); // flags put16(elData, 0); // flags
put32(elData, 0); // handle put32(elData, 0); // handle
}
// Remap list: old seg 1 (which would be our user seg without // Remap list: 1 WORD per non-ExpressLoad seg, giving the new
// ExpressLoad) maps to new seg 2 (since ExpressLoad takes seg 1). // segment number for each old segment position. Old seg 1 (user
// code, would-be sole seg without ExpressLoad) → new seg 2.
// Old seg 2 (DP/Stack, only present when --stack-size) → new seg 3.
put16(elData, 2); put16(elData, 2);
if (haveDpStack) {
put16(elData, 3);
}
// Header info entry for the user segment. // Header info entry for the user segment.
// data length = LCONST data size in the file. emitOneSeg embeds // data length = LCONST data size in the file. emitOneSeg embeds
@ -473,11 +532,10 @@ static std::vector<uint8_t> emitOmfExpressLoad(
put32(elData, 0); // reloc offset put32(elData, 0); // reloc offset
put32(elData, 0); // reloc length put32(elData, 0); // reloc length
} else { } else {
const uint32_t crelocBytesPerSite = 7; // 0xF5 + 1+1+2+2
const uint32_t crelocOff = const uint32_t crelocOff =
userDataOff + (uint32_t)image.size() + bssGap + bssSize; userDataOff + (uint32_t)image.size() + bssGap + bssSize;
const uint32_t crelocLen = const uint32_t crelocLen =
crelocBytesPerSite * (uint32_t)gReloc24Sites.size(); OMF_CRELOC_BYTES_PER_SITE * (uint32_t)gReloc24Sites.size();
put32(elData, crelocOff); put32(elData, crelocOff);
put32(elData, crelocLen); put32(elData, crelocLen);
} }
@ -498,6 +556,34 @@ static std::vector<uint8_t> emitOmfExpressLoad(
elData.push_back(i < truncated.size() ? (uint8_t)truncated[i] : 0x20); elData.push_back(i < truncated.size() ? (uint8_t)truncated[i] : 0x20);
} }
// Header info entry for the DP/Stack segment (when present).
// data_off / data_len point at the LCONST zero bytes carried in the
// DP/Stack seg's body, mirroring the working real-world layout
// (GNO/ME ~_STACK). No cRELOC entries for a DP/Stack seg, so
// reloc fields are 0.
if (haveDpStack) {
if (dpStackSeg.size() < HDR_SIZE) die("internal: DP/Stack seg too small");
put32(elData, dpStackDataOff); // data offset (LCONST data)
put32(elData, stackSize); // data length (= stack size)
put32(elData, 0); // reloc offset
put32(elData, 0); // reloc length
// Header copy: bytes [12..43] of DP/Stack segment header.
elData.insert(elData.end(), dpStackSeg.begin() + 12, dpStackSeg.begin() + HDR_SIZE);
elData[elData.size() - 32 + 30] = 0; // DISPDATA hi → 0
elData[elData.size() - 32 + 31] = 0;
// LOAD_NAME (10 bytes, space-padded)
for (int i = 0; i < (int)LOAD_NAME_SIZE; i++) elData.push_back(0x20);
// SEG_NAME = "~Direct" padded to 10 bytes (must match the value
// stored by emitDpStackSeg, otherwise ExpressLoad's name match
// could fail; the seg-name area in the file uses 10 spaces base
// with "~Direct" overwriting the first 7).
const char *dpName = "~Direct";
const size_t dpNameLen = 7;
for (size_t i = 0; i < SEG_NAME_SIZE; i++) {
elData.push_back(i < dpNameLen ? (uint8_t)dpName[i] : 0x20);
}
}
if (elData.size() != elDataSize) if (elData.size() != elDataSize)
die("internal: ExpressLoad data size mismatch"); die("internal: ExpressLoad data size mismatch");
@ -513,7 +599,7 @@ static std::vector<uint8_t> emitOmfExpressLoad(
elHdr.push_back(4); // NUMLEN elHdr.push_back(4); // NUMLEN
elHdr.push_back(2); // VERSION (0x02 = v2.1) elHdr.push_back(2); // VERSION (0x02 = v2.1)
put32(elHdr, 0); // BANKSIZE = 0 for DATA seg put32(elHdr, 0); // BANKSIZE = 0 for DATA seg
put16(elHdr, 0x8001); // KIND = DATA|STATIC put16(elHdr, OMF_KIND_DATA_STATIC); // KIND = DATA|STATIC
elHdr.push_back(0); elHdr.push_back(0); // undef elHdr.push_back(0); elHdr.push_back(0); // undef
put32(elHdr, 0); // ORG put32(elHdr, 0); // ORG
put32(elHdr, 0); // ALIGN put32(elHdr, 0); // ALIGN
@ -542,16 +628,15 @@ static std::vector<uint8_t> emitOmfExpressLoad(
die("internal: ExpressLoad segment size mismatch"); die("internal: ExpressLoad segment size mismatch");
// Step 6: concatenate ExpressLoad + user segment + optional DP/Stack. // Step 6: concatenate ExpressLoad + user segment + optional DP/Stack.
// The DP/Stack seg sits AFTER the user seg; the Loader walks file- // The DP/Stack seg's presence is now also recorded in the
// ordered segments after the ExpressLoad load step completes, and // ExpressLoad load script (segtable + remap + header_info entries
// processes each segment by KIND. The ExpressLoad load script only // above) so the Loader's fast path honors KIND=0x4012 instead of
// tracks code/data segs; the DP/Stack seg is found by KIND walk. // silently dropping it to its default 4 KB DP/Stack allocation.
std::vector<uint8_t> result; std::vector<uint8_t> result;
result.insert(result.end(), elSeg.begin(), elSeg.end()); result.insert(result.end(), elSeg.begin(), elSeg.end());
result.insert(result.end(), userSeg.begin(), userSeg.end()); result.insert(result.end(), userSeg.begin(), userSeg.end());
if (stackSize != 0) { if (haveDpStack) {
auto dpSeg = emitDpStackSeg(stackSize, /*segNum*/3); result.insert(result.end(), dpStackSeg.begin(), dpStackSeg.end());
result.insert(result.end(), dpSeg.begin(), dpSeg.end());
} }
return result; return result;
} }
@ -674,7 +759,7 @@ static void usage(const char *argv0) {
" sidecar; emit cRELOC (0xF5) opcodes after LCONST\n" " sidecar; emit cRELOC (0xF5) opcodes after LCONST\n"
" so the Loader patches intra-segment 24-bit refs\n" " so the Loader patches intra-segment 24-bit refs\n"
" (JSL/JML/STAlong/etc.) when placing the segment.\n" " (JSL/JML/STAlong/etc.) when placing the segment.\n"
" --stack-size N append a ~Direct DP/Stack segment (KIND=0x1012)\n" " --stack-size N append a ~Direct DP/Stack segment (KIND=0x4012)\n"
" of N bytes. The Loader allocates a page-aligned\n" " of N bytes. The Loader allocates a page-aligned\n"
" block of this size in bank 0 for combined DP +\n" " block of this size in bank 0 for combined DP +\n"
" stack use. N must be page-multiple (>= 256).\n" " stack use. N must be page-multiple (>= 256).\n"
@ -782,7 +867,7 @@ int main(int argc, char **argv) {
// intra-segment relocations at link time and have no // intra-segment relocations at link time and have no
// INTERSEG / RELOC opcodes); ABSBANK + ORG=base pins it // INTERSEG / RELOC opcodes); ABSBANK + ORG=base pins it
// to a specific bank. CODE is the default (type 0). // to a specific bank. CODE is the default (type 0).
uint16_t kind = (k == 0) ? 0x8800u : 0x8800u; const uint16_t kind = OMF_KIND_CODE_STATIC_ABSBANK;
uint32_t entryOff = (k == 0) ? s.entryOff : 0; uint32_t entryOff = (k == 0) ? s.entryOff : 0;
auto seg = emitOneSeg(img, entryOff, s.base, auto seg = emitOneSeg(img, entryOff, s.base,
static_cast<uint16_t>(s.num), static_cast<uint16_t>(s.num),
@ -846,10 +931,15 @@ int main(int argc, char **argv) {
if (!f) die("cannot open '" + output + "' for writing"); if (!f) die("cannot open '" + output + "' for writing");
f.write(reinterpret_cast<const char *>(blob.data()), blob.size()); f.write(reinterpret_cast<const char *>(blob.data()), blob.size());
// Segment count: 1 user CODE seg; +1 for ExpressLoad wrapper; +1
// when --stack-size adds a ~Direct DP/Stack seg.
int segCount = 1;
if (expressload) segCount++;
if (stackSize != 0) segCount++;
std::fprintf(stderr, std::fprintf(stderr,
"OMF: %d segment%s%s, %zu bytes payload, entry='%s' at +0x%x -> %s " "OMF: %d segment%s%s, %zu bytes payload, entry='%s' at +0x%x -> %s "
"(%zu bytes total)\n", "(%zu bytes total)\n",
expressload ? 2 : 1, expressload ? "s" : "", segCount, segCount == 1 ? "" : "s",
expressload ? " (ExpressLoad)" : "", expressload ? " (ExpressLoad)" : "",
image.size(), entry.c_str(), entryOff, image.size(), entry.c_str(), entryOff,
output.c_str(), blob.size()); output.c_str(), blob.size());

View file

@ -6,9 +6,10 @@
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// //
// Skeleton assembler backend. Fixup resolution, relaxation and nop // W65816 assembler backend. Implements applyFixup for the
// generation are left unimplemented; they will be filled in once the // R_W65816_* relocation family, BRA -> BRL relaxation when the 8-bit
// instruction encodings are defined. // signed displacement won't fit, and writeNopData using 65816 NOP
// ($EA) bytes.
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
@ -29,6 +30,13 @@
// W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h // W65816::BRA / W65816::BRL opcodes are exported by W65816MCTargetDesc.h
// (which already includes the generated header). // (which already includes the generated header).
// W65816 NOP machine encoding (single byte). The NOP-padding writer
// emits this value once per requested padding byte.
static constexpr unsigned char kOpcodeNOP = 0xEA;
// Signed 8-bit branch displacement range for Bxx / BRA fixups.
// Inclusive bounds: a fixup_8_pcrel value outside [kBranch8Min,
// kBranch8Max] is reported as an out-of-range branch when the fixup is
// applied, and the same test decides whether a fixup_8_pcrel branch
// needs relaxation.
static constexpr int kBranch8Min = -128;
static constexpr int kBranch8Max = 127;
using namespace llvm; using namespace llvm;
namespace { namespace {
@ -110,7 +118,7 @@ public:
// instead of silently truncating. // instead of silently truncating.
if (Fixup.getKind() == W65816::fixup_8_pcrel) { if (Fixup.getKind() == W65816::fixup_8_pcrel) {
int64_t Signed = static_cast<int64_t>(Value); int64_t Signed = static_cast<int64_t>(Value);
if (Signed < -128 || Signed > 127) { if (Signed < kBranch8Min || Signed > kBranch8Max) {
getContext().reportError( getContext().reportError(
Fixup.getLoc(), Fixup.getLoc(),
"branch target out of range for 8-bit PC-relative branch " "branch target out of range for 8-bit PC-relative branch "
@ -158,7 +166,7 @@ public:
const MCSubtargetInfo *STI) const override { const MCSubtargetInfo *STI) const override {
// The 65816 NOP is a single 0xEA byte. // The 65816 NOP is a single 0xEA byte.
for (uint64_t I = 0; I < Count; ++I) for (uint64_t I = 0; I < Count; ++I)
OS << char(0xEA); OS << static_cast<char>(kOpcodeNOP);
return true; return true;
} }
@ -192,7 +200,7 @@ public:
if (Fixup.getKind() != W65816::fixup_8_pcrel) if (Fixup.getKind() != W65816::fixup_8_pcrel)
return false; return false;
int64_t Signed = static_cast<int64_t>(Value); int64_t Signed = static_cast<int64_t>(Value);
return Signed < -128 || Signed > 127; return Signed < kBranch8Min || Signed > kBranch8Max;
} }
void relaxInstruction(MCInst &Inst, void relaxInstruction(MCInst &Inst,

View file

@ -24,6 +24,23 @@
using namespace llvm; using namespace llvm;
// R_W65816_* relocation numbers. These are protocol constants shared
// with link816 / omfEmit / llvm-objdump; do not renumber. If new types
// are added, mirror them in src/link816/link816.cpp's relocWidth() and
// the cRELOC pipeline.
//
// The trailing comment on each enumerator lists the fixup kinds that
// W65816ELFObjectWriter's fixup-kind switch maps onto it. The generic
// FK_Data_1 / FK_Data_2 / FK_Data_4 kinds share these numbers and pick
// the PCREL variant when the fixup is PC-relative.
namespace R_W65816 {
enum : unsigned {
R_IMM8 = 1, // fixup_8; FK_Data_1 when not PC-relative
R_IMM16 = 2, // fixup_16; FK_Data_2 when not PC-relative
R_IMM24 = 3, // fixup_24
R_PCREL8 = 4, // fixup_8_pcrel; FK_Data_1 when PC-relative
R_PCREL16 = 5, // fixup_16_pcrel; FK_Data_2 when PC-relative
R_BANK16 = 6, // fixup_bank16
R_DATA32 = 7, // fixup_32; FK_Data_4 when not PC-relative
R_PCREL32 = 8, // fixup_32_pcrel; FK_Data_4 when PC-relative
};
} // namespace R_W65816
namespace { namespace {
class W65816ELFObjectWriter : public MCELFObjectTargetWriter { class W65816ELFObjectWriter : public MCELFObjectTargetWriter {
@ -56,16 +73,16 @@ protected:
// type — observed as type 249 — and broke link816.py. // type — observed as type 249 — and broke link816.py.
auto Kind = Fixup.getKind(); auto Kind = Fixup.getKind();
switch (Kind) { switch (Kind) {
case W65816::fixup_8: return 1; // R_W65816_IMM8 case W65816::fixup_8: return R_W65816::R_IMM8;
case W65816::fixup_16: return 2; // R_W65816_IMM16 case W65816::fixup_16: return R_W65816::R_IMM16;
case W65816::fixup_24: return 3; // R_W65816_IMM24 case W65816::fixup_24: return R_W65816::R_IMM24;
case W65816::fixup_8_pcrel: return 4; // R_W65816_PCREL8 case W65816::fixup_8_pcrel: return R_W65816::R_PCREL8;
case W65816::fixup_16_pcrel: return 5; // R_W65816_PCREL16 case W65816::fixup_16_pcrel: return R_W65816::R_PCREL16;
case W65816::fixup_bank16: return 6; // R_W65816_BANK16 case W65816::fixup_bank16: return R_W65816::R_BANK16;
case W65816::fixup_32: return 7; // R_W65816_DATA32 case W65816::fixup_32: return R_W65816::R_DATA32;
case W65816::fixup_32_pcrel: return 8; // R_W65816_PCREL32 case W65816::fixup_32_pcrel: return R_W65816::R_PCREL32;
case FK_Data_1: return IsPCRel ? 4 : 1; case FK_Data_1: return IsPCRel ? R_W65816::R_PCREL8 : R_W65816::R_IMM8;
case FK_Data_2: return IsPCRel ? 5 : 2; case FK_Data_2: return IsPCRel ? R_W65816::R_PCREL16 : R_W65816::R_IMM16;
// FK_Data_4 is emitted by DWARF (.debug_info / .debug_line / // FK_Data_4 is emitted by DWARF (.debug_info / .debug_line /
// .debug_frame section-relative addresses), .eh_frame, // .debug_frame section-relative addresses), .eh_frame,
// .debug_loclists, and user `.long` directives. Dispatch by // .debug_loclists, and user `.long` directives. Dispatch by
@ -78,7 +95,7 @@ protected:
// .debug_line decoder because the 4th byte of the slot landed // .debug_line decoder because the 4th byte of the slot landed
// on whatever followed it (most often the size byte of the // on whatever followed it (most often the size byte of the
// next line-program header → unit_length = 0). // next line-program header → unit_length = 0).
case FK_Data_4: return IsPCRel ? 8 : 7; case FK_Data_4: return IsPCRel ? R_W65816::R_PCREL32 : R_W65816::R_DATA32;
default: default:
llvm_unreachable("W65816: unknown fixup kind"); llvm_unreachable("W65816: unknown fixup kind");
} }

View file

@ -204,6 +204,7 @@ void initializeW65816SepRepCleanupPass(PassRegistry &);
void initializeW65816BranchExpandPass(PassRegistry &); void initializeW65816BranchExpandPass(PassRegistry &);
void initializeW65816TiedDefSpillPass(PassRegistry &); void initializeW65816TiedDefSpillPass(PassRegistry &);
void initializeW65816ABridgeViaXPass(PassRegistry &); void initializeW65816ABridgeViaXPass(PassRegistry &);
void initializeW65816UnLSRPass(PassRegistry &);
void initializeW65816WidenAcc16Pass(PassRegistry &); void initializeW65816WidenAcc16Pass(PassRegistry &);
void initializeW65816SpillToXPass(PassRegistry &); void initializeW65816SpillToXPass(PassRegistry &);
void initializeW65816NegYIndYPass(PassRegistry &); void initializeW65816NegYIndYPass(PassRegistry &);

View file

@ -8,23 +8,28 @@
// //
// Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill // Pre-regalloc complement to W65816TiedDefSpill. Where TiedDefSpill
// preserves a multi-use Acc16 vreg by spilling it to a fresh stack // preserves a multi-use Acc16 vreg by spilling it to a fresh stack
// slot around the tied-def consumer, this pass tries to do the same // slot around the tied-def consumer, this pass bridges via an Img16
// preservation via TAX/TXA: copy to an Idx16 vreg before the consumer // (DP-backed) vreg: park SrcReg in a fresh Img16 vreg before the
// (regalloc puts it in X or Y, expansion lowers the COPY to TAX/TAY), // consumer, restore to a fresh Acc16 vreg after. Regalloc places the
// copy back to a fresh Acc16 vreg after. // Img16 in IMG0..IMG7 (DP $D0..$DE); copyPhysReg lowers the COPYs to
// STA dp / LDA dp (4 cyc each) and no system-stack slot is allocated.
//
// (The pass name dates from an earlier prototype that bridged via X
// using TAX/TXA. Cross-MBB X-liveness analysis was unimplemented and
// the X-bridge couldn't survive Idx16 clobbers between consumer and
// last use, so the bridge moved to Img16. The DP-backed form costs
// 4 cycles per access (8 per round trip, versus 4 for a TAX/TXA pair)
// but has none of the X/Y liveness restrictions.)
// //
// Win per bridged pair: // Win per bridged pair:
// stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot // stack spill: STA dp,S (5 cyc) + LDA dp,S (5 cyc) + 1 frame slot
// X bridge : TAX (2 cyc) + TXA (2 cyc) + no frame growth // Img bridge : STA dp (4 cyc) + LDA dp (4 cyc) + no frame growth
// Net 6 cycles + 2 bytes saved per bridge — and we avoid one PHA per // Net 2 cycles + (1 byte per access) saved per bridge -- and one PHA
// stack slot we didn't allocate. // per avoided stack slot.
// //
// Bail conditions (fall back to TiedDefSpill's stack route): // Bail conditions (fall back to TiedDefSpill's stack route): any MI
// - any MI between consumer and SrcReg's last use clobbers Idx16 // between consumer and SrcReg's last use that clobbers IMG slots,
// (LDX/LDY/INX/DEX/INY/DEY/TAX/TAY/TXY/TYX/PHX/PHY/PLX/PLY/etc.) // callees that clobber IMG0..IMG7, cross-MBB uses of SrcReg.
// - any call in the range (calls clobber X and Y per ABI)
// - SrcReg is used in a different MBB (cross-MBB liveness needs more
// analysis; deferred)
// //
// Runs before TiedDefSpill so the latter doesn't double-process the // Runs before TiedDefSpill so the latter doesn't double-process the
// same candidates. // same candidates.

View file

@ -6,8 +6,14 @@
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// //
// Skeleton assembly printer. The MCInst lowering path is wired up but no // W65816 assembly printer. Owns the late pseudo-expansion path
// target-specific operand formatting is implemented yet. // (MCInst lowering for the IR-pseudo opcodes that we keep through PEI
// because their machine encoding depends on AsmPrinter-time peepholes
// or runtime ABI knowledge -- BRK_pseudo, LDAi16imm_bank, JSLpseudo,
// the SEP/REP-wrapped i8 forms, etc.), plus a small set of mode-aware
// peepholes (PEA / PEI substitution for LDA+PUSH16 chains, STZ
// folding, etc.) that prefer to run after the rest of codegen has
// stabilised the MIR.
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
@ -31,6 +37,39 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer" #define DEBUG_TYPE "asm-printer"
// W65816 processor-status flag masks used by SEP/REP wrapping.
// (See W65816 datasheet 6.10.) M = accumulator width (1 = 8-bit,
// 0 = 16-bit); X = index width (same convention). The wraps in this
// file toggle M only; X never changes in normal codegen.
// kPStatusM is the immediate operand of every SEP/REP this file emits
// and the value matched when eliding a REP that is immediately followed
// by a SEP.
static constexpr unsigned kPStatusM = 0x20;
// NOTE(review): kPStatusX is presumably kept for documentation /
// future use, hence [[maybe_unused]] -- confirm it has no users.
[[maybe_unused]] static constexpr unsigned kPStatusX = 0x10;
// IIgs runtime DP slots referenced from emitted code. Both are part of
// the runtime ABI -- AsmPrinter / ISelLowering / libgcc must agree.
// kRuntimePbrStashDP -- crt0 stashes the runtime PBR here so
// LDAi16imm_bank can emit `lda $BE` (PBR-byte
// load) for &symbol values in non-bank-0 placements.
// kRuntimeIndirTargetDP -- __indirTarget vector used by the
// JMP (abs) indirect-call thunk.
static constexpr unsigned kRuntimePbrStashDP = 0xBE;
[[maybe_unused]] static constexpr unsigned kRuntimeIndirTargetDP = 0x00B8;
// DP scratch slot used by ADJCALLSTACKUP / ALLOCAfi to save A across a
// TSC/TCS bracket (STA dp before the bracket, LDA dp / SBC dp after).
// Lives in the project-wide DP scratch range that starts at $E0;
// coordinate with W65816ISelLowering / W65816RegisterInfo if the
// layout changes.
// NOTE(review): this comment originally gave the range as "$E0..$DF",
// which is inverted -- confirm the intended upper bound.
static constexpr unsigned kDpScratch0 = 0xE0;
// IIgs bank-byte mask: a 24-bit address whose top 8 bits are non-zero
// is in a non-zero bank and must be encoded via the LONG form.
// Used to choose LDA_Long / STA_Long for constant addresses and to
// block the STZ_Abs fold when a constant store address has a non-zero
// bank byte.
static constexpr uint64_t kBankByteMask = 0xFF0000;
// ADJCALLSTACKUP fan-out limit: PLY (1 byte / 4 cyc per pair-pop) wins
// over the 8-byte / ~14-cyc TAY/TSC/CLC/ADC/TCS/TYA bracket up through
// N = 14 even bytes; beyond that the bracket is cheaper. See the
// dispatch in the ADJCALLSTACKUP expansion, which also requires N to
// be even before taking the PLY route.
static constexpr int kAdjStackUpPlyMaxN = 14;
namespace { namespace {
class W65816AsmPrinter : public AsmPrinter { class W65816AsmPrinter : public AsmPrinter {
@ -267,7 +306,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (YLive) { if (YLive) {
// Route through DP $E0 to preserve both A and Y. // Route through DP $E0 to preserve both A and Y.
MCInst Sta; Sta.setOpcode(W65816::STA_DP); MCInst Sta; Sta.setOpcode(W65816::STA_DP);
Sta.addOperand(MCOperand::createImm(0xE0)); Sta.addOperand(MCOperand::createImm(kDpScratch0));
EmitToStreamer(*OutStreamer, Sta); EmitToStreamer(*OutStreamer, Sta);
MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc); MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc);
MCInst Clc; Clc.setOpcode(W65816::CLC); EmitToStreamer(*OutStreamer, Clc); MCInst Clc; Clc.setOpcode(W65816::CLC); EmitToStreamer(*OutStreamer, Clc);
@ -276,9 +315,13 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Adc); EmitToStreamer(*OutStreamer, Adc);
MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs); MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs);
MCInst Lda; Lda.setOpcode(W65816::LDA_DP); MCInst Lda; Lda.setOpcode(W65816::LDA_DP);
Lda.addOperand(MCOperand::createImm(0xE0)); Lda.addOperand(MCOperand::createImm(kDpScratch0));
EmitToStreamer(*OutStreamer, Lda); EmitToStreamer(*OutStreamer, Lda);
} else if (N <= 14 && (N % 2) == 0) { } else if (N <= kAdjStackUpPlyMaxN && (N % 2) == 0) {
// Repeated PLY (1 byte / 4 cyc each) wins over the TAY/TSC/CLC/
// ADC/TCS/TYA bracket (8 bytes / ~14 cyc fixed) for N <= 14;
// beyond that the bracket is cheaper. Must be even (PLY pops
// 16-bit pairs).
for (int i = 0; i < N / 2; ++i) { for (int i = 0; i < N / 2; ++i) {
MCInst Ply; Ply.setOpcode(W65816::PLY); MCInst Ply; Ply.setOpcode(W65816::PLY);
EmitToStreamer(*OutStreamer, Ply); EmitToStreamer(*OutStreamer, Ply);
@ -348,7 +391,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
Lda.addOperand(MCOperand::createImm(0)); Lda.addOperand(MCOperand::createImm(0));
} else { } else {
Lda.setOpcode(W65816::LDA_DP); Lda.setOpcode(W65816::LDA_DP);
Lda.addOperand(MCOperand::createImm(0xBE)); Lda.addOperand(MCOperand::createImm(kRuntimePbrStashDP));
} }
EmitToStreamer(*OutStreamer, Lda); EmitToStreamer(*OutStreamer, Lda);
return; return;
@ -380,7 +423,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// writes `*(uint16 *)0xE19E00UL = 0` we MUST keep the // writes `*(uint16 *)0xE19E00UL = 0` we MUST keep the
// LDA #0 + STA_Long pair so the bank-explicit form survives. // LDA #0 + STA_Long pair so the bank-explicit form survives.
bool AddrFitsIn16 = !It->getOperand(1).isImm() || bool AddrFitsIn16 = !It->getOperand(1).isImm() ||
(It->getOperand(1).getImm() & 0xFF0000) == 0; (It->getOperand(1).getImm() & kBankByteMask) == 0;
if (AddrFitsIn16) { if (AddrFitsIn16) {
MCInst Stz; MCInst Stz;
Stz.setOpcode(W65816::STZ_Abs); Stz.setOpcode(W65816::STZ_Abs);
@ -401,6 +444,10 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (It != MI->getParent()->end() && It->getOpcode() == W65816::PUSH16) { if (It != MI->getParent()->end() && It->getOpcode() == W65816::PUSH16) {
auto It2 = std::next(It); auto It2 = std::next(It);
while (It2 != MI->getParent()->end() && It2->isDebugInstr()) ++It2; while (It2 != MI->getParent()->end() && It2->isDebugInstr()) ++It2;
// If PUSH16 is the last MI in the BB we leave the peephole as a
// no-op (conservative): the PUSH chain almost always feeds a JSL
// within the same BB, and proving A-dead at BB exit via successor
// live-in scan is not worth the bookkeeping.
bool ADead = false; bool ADead = false;
if (It2 != MI->getParent()->end()) { if (It2 != MI->getParent()->end()) {
const TargetRegisterInfo *TRI = const TargetRegisterInfo *TRI =
@ -408,13 +455,6 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (It2->modifiesRegister(W65816::A, TRI) && if (It2->modifiesRegister(W65816::A, TRI) &&
!It2->readsRegister(W65816::A, TRI)) !It2->readsRegister(W65816::A, TRI))
ADead = true; ADead = true;
} else {
// PUSH16 is the last instruction in the BB. A is dead at
// BB exit iff it's not live-out. Check the BB's live-out
// set via successors; if no successor lists A as live-in,
// it's safe. Conservative: treat as not-dead (skip peephole).
// This case is uncommon — the PUSH chain almost always feeds
// a JSL within the same BB.
} }
if (ADead) { if (ADead) {
MCInst Pea; MCInst Pea;
@ -445,7 +485,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// hit. We mark the next-SEP-to-skip via a per-AsmPrinter flag // hit. We mark the next-SEP-to-skip via a per-AsmPrinter flag
// so the SEP visit drops it. // so the SEP visit drops it.
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Lda; MCInst Lda;
Lda.setOpcode(W65816::LDA_Imm8); Lda.setOpcode(W65816::LDA_Imm8);
@ -487,9 +527,9 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
if (It != MI->getParent()->end() && if (It != MI->getParent()->end() &&
It->getOpcode() == W65816::SEP && It->getOpcode() == W65816::SEP &&
It->getNumOperands() >= 1 && It->getOperand(0).isImm() && It->getNumOperands() >= 1 && It->getOperand(0).isImm() &&
It->getOperand(0).getImm() == 0x20) { It->getOperand(0).getImm() == kPStatusM) {
SkipRep = true; SkipRep = true;
SkipNextSepImm = 0x20; SkipNextSepImm = static_cast<int>(kPStatusM);
} }
// STA8abs / STA8long don't expose their SEP at MIR — the wrap is // STA8abs / STA8long don't expose their SEP at MIR — the wrap is
// emitted at MC layer. Detect them here so we can elide the // emitted at MC layer. Detect them here so we can elide the
@ -505,7 +545,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
} }
if (!SkipRep) { if (!SkipRep) {
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
} }
return; return;
@ -533,7 +573,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Lda); EmitToStreamer(*OutStreamer, Lda);
return; return;
} }
if ((A & 0xFF0000) != 0) { if ((A & kBankByteMask) != 0) {
MCInst Lda; MCInst Lda;
Lda.setOpcode(W65816::LDA_Long); Lda.setOpcode(W65816::LDA_Long);
Lda.addOperand(lowerOperand(AddrOp, MCInstLowering)); Lda.addOperand(lowerOperand(AddrOp, MCInstLowering));
@ -564,7 +604,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Sta); EmitToStreamer(*OutStreamer, Sta);
return; return;
} }
if ((A & 0xFF0000) != 0) { if ((A & kBankByteMask) != 0) {
MCInst Sta; MCInst Sta;
Sta.setOpcode(W65816::STA_Long); Sta.setOpcode(W65816::STA_Long);
Sta.addOperand(lowerOperand(AddrOp, MCInstLowering)); Sta.addOperand(lowerOperand(AddrOp, MCInstLowering));
@ -649,7 +689,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
bool IsSub = MI->getOpcode() == W65816::SBCi8imm; bool IsSub = MI->getOpcode() == W65816::SBCi8imm;
// SEP/REP wrap (see LDAi8imm comment). // SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Carry; MCInst Carry;
Carry.setOpcode(IsSub ? W65816::SEC : W65816::CLC); Carry.setOpcode(IsSub ? W65816::SEC : W65816::CLC);
@ -660,7 +700,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
Op.addOperand(MCOperand::createImm(Val)); Op.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Op); EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -682,11 +722,11 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
Op.addOperand(MCOperand::createImm(Val)); Op.addOperand(MCOperand::createImm(Val));
// SEP/REP wrap (see LDAi8imm comment). // SEP/REP wrap (see LDAi8imm comment).
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
EmitToStreamer(*OutStreamer, Op); EmitToStreamer(*OutStreamer, Op);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -696,7 +736,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// LDA_Long (0xAF, bank-explicit) for const-int MMIO addresses. // LDA_Long (0xAF, bank-explicit) for const-int MMIO addresses.
bool IsLong = MI->getOpcode() == W65816::LDA8long; bool IsLong = MI->getOpcode() == W65816::LDA8long;
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Lda; MCInst Lda;
Lda.setOpcode(IsLong ? W65816::LDA_Long : W65816::LDA_Abs); Lda.setOpcode(IsLong ? W65816::LDA_Long : W65816::LDA_Abs);
@ -709,7 +749,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
Lda.addOperand(Addr); Lda.addOperand(Addr);
EmitToStreamer(*OutStreamer, Lda); EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -717,14 +757,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// i8 indexed-global load: SEP #0x20 ; LDA <addr>, X ; REP #0x20 // i8 indexed-global load: SEP #0x20 ; LDA <addr>, X ; REP #0x20
// X holds the index (set up by CopyToReg before this MI). // X holds the index (set up by CopyToReg before this MI).
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Lda; MCInst Lda;
Lda.setOpcode(W65816::LDA_AbsX); Lda.setOpcode(W65816::LDA_AbsX);
Lda.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering)); Lda.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering));
EmitToStreamer(*OutStreamer, Lda); EmitToStreamer(*OutStreamer, Lda);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -732,14 +772,14 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// i8 indexed-global store: SEP #0x20 ; STA <addr>, X ; REP #0x20 // i8 indexed-global store: SEP #0x20 ; STA <addr>, X ; REP #0x20
// A holds the value, X holds the index. // A holds the value, X holds the index.
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Sta; MCInst Sta;
Sta.setOpcode(W65816::STA_AbsX); Sta.setOpcode(W65816::STA_AbsX);
Sta.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering)); Sta.addOperand(lowerOperand(MI->getOperand(0), MCInstLowering));
EmitToStreamer(*OutStreamer, Sta); EmitToStreamer(*OutStreamer, Sta);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -764,7 +804,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
SkipNextSta8Wrap = false; SkipNextSta8Wrap = false;
if (!UsesAcc8 && !SkipOpenSep) { if (!UsesAcc8 && !SkipOpenSep) {
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
} }
MCInst Sta; MCInst Sta;
@ -784,7 +824,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, Sta); EmitToStreamer(*OutStreamer, Sta);
if (!UsesAcc8) { if (!UsesAcc8) {
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
} }
return; return;
@ -825,7 +865,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// i8 immediate compare — needs M=1 so the CPU only reads 1 byte // i8 immediate compare — needs M=1 so the CPU only reads 1 byte
// for the immediate. See LDAi8imm comment for the wrap rationale. // for the immediate. See LDAi8imm comment for the wrap rationale.
MCInst Sep; Sep.setOpcode(W65816::SEP); MCInst Sep; Sep.setOpcode(W65816::SEP);
Sep.addOperand(MCOperand::createImm(0x20)); Sep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Sep); EmitToStreamer(*OutStreamer, Sep);
MCInst Cmp; MCInst Cmp;
Cmp.setOpcode(W65816::CMP_Imm8); Cmp.setOpcode(W65816::CMP_Imm8);
@ -833,7 +873,7 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
Cmp.addOperand(MCOperand::createImm(Val)); Cmp.addOperand(MCOperand::createImm(Val));
EmitToStreamer(*OutStreamer, Cmp); EmitToStreamer(*OutStreamer, Cmp);
MCInst Rep; Rep.setOpcode(W65816::REP); MCInst Rep; Rep.setOpcode(W65816::REP);
Rep.addOperand(MCOperand::createImm(0x20)); Rep.addOperand(MCOperand::createImm(kPStatusM));
EmitToStreamer(*OutStreamer, Rep); EmitToStreamer(*OutStreamer, Rep);
return; return;
} }
@ -965,12 +1005,12 @@ void W65816AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Size is in A on entry — but we need A=SP after TSC, so first // Size is in A on entry — but we need A=SP after TSC, so first
// stash the size to DP scratch. // stash the size to DP scratch.
MCInst Sta1; Sta1.setOpcode(W65816::STA_DP); MCInst Sta1; Sta1.setOpcode(W65816::STA_DP);
Sta1.addOperand(MCOperand::createImm(0xE0)); Sta1.addOperand(MCOperand::createImm(kDpScratch0));
EmitToStreamer(*OutStreamer, Sta1); EmitToStreamer(*OutStreamer, Sta1);
MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc); MCInst Tsc; Tsc.setOpcode(W65816::TSC); EmitToStreamer(*OutStreamer, Tsc);
MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec); MCInst Sec; Sec.setOpcode(W65816::SEC); EmitToStreamer(*OutStreamer, Sec);
MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP); MCInst Sbc; Sbc.setOpcode(W65816::SBC_DP);
Sbc.addOperand(MCOperand::createImm(0xE0)); Sbc.addOperand(MCOperand::createImm(kDpScratch0));
EmitToStreamer(*OutStreamer, Sbc); EmitToStreamer(*OutStreamer, Sbc);
MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs); MCInst Tcs; Tcs.setOpcode(W65816::TCS); EmitToStreamer(*OutStreamer, Tcs);
MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina); MCInst Ina; Ina.setOpcode(W65816::INA); EmitToStreamer(*OutStreamer, Ina);

View file

@ -162,8 +162,7 @@ static unsigned estimateDistance(MachineFunction &MF,
// sliced after each non-final conditional, so every MBB ends up with // sliced after each non-final conditional, so every MBB ends up with
// at most one conditional terminator. Returns true if any MBB was // at most one conditional terminator. Returns true if any MBB was
// split. // split.
static bool splitMultiBranchMBBs(MachineFunction &MF, static bool splitMultiBranchMBBs(MachineFunction &MF) {
const TargetInstrInfo *TII) {
bool Changed = false; bool Changed = false;
// Snapshot MBBs first (we mutate the list during iteration). // Snapshot MBBs first (we mutate the list during iteration).
SmallVector<MachineBasicBlock *, 16> MBBs; SmallVector<MachineBasicBlock *, 16> MBBs;
@ -233,7 +232,6 @@ static bool splitMultiBranchMBBs(MachineFunction &MF,
// see if another split is needed (multi-multi-branch case). // see if another split is needed (multi-multi-branch case).
Changed = true; Changed = true;
Sliced = true; Sliced = true;
(void)TII; // unused for now
} }
} }
return Changed; return Changed;
@ -354,7 +352,7 @@ bool W65816BranchExpand::runOnMachineFunction(MachineFunction &MF) {
AnyChanged |= dropDeadConditionalsToBRATarget(MF); AnyChanged |= dropDeadConditionalsToBRATarget(MF);
// Step 1: split multi-conditional-terminator MBBs. // Step 1: split multi-conditional-terminator MBBs.
AnyChanged |= splitMultiBranchMBBs(MF, TII); AnyChanged |= splitMultiBranchMBBs(MF);
// Step 2: iterate to fixed-point. Each expansion adds 3 bytes // Step 2: iterate to fixed-point. Each expansion adds 3 bytes
// (bridge BRA), which may push another previously-OK branch over // (bridge BRA), which may push another previously-OK branch over

View file

@ -68,10 +68,6 @@ char W65816I32IncFold::ID = 0;
INITIALIZE_PASS(W65816I32IncFold, DEBUG_TYPE, INITIALIZE_PASS(W65816I32IncFold, DEBUG_TYPE,
"W65816 i32 += 1 fold", false, false) "W65816 i32 += 1 fold", false, false)
namespace llvm {
void initializeW65816I32IncFoldPass(PassRegistry &);
}
// Match the 6-instruction sequence; returns the post-pattern iterator // Match the 6-instruction sequence; returns the post-pattern iterator
// and fills in the lo/hi stack-rel offsets if the pattern matches. // and fills in the lo/hi stack-rel offsets if the pattern matches.
// Tolerates intervening TAX/TXA pairs (which regalloc inserts as // Tolerates intervening TAX/TXA pairs (which regalloc inserts as

View file

@ -87,10 +87,6 @@ char W65816ImgCalleeSave::ID = 0;
INITIALIZE_PASS(W65816ImgCalleeSave, DEBUG_TYPE, INITIALIZE_PASS(W65816ImgCalleeSave, DEBUG_TYPE,
"W65816 IMG8..IMG15 callee save/restore", false, false) "W65816 IMG8..IMG15 callee save/restore", false, false)
namespace llvm {
void initializeW65816ImgCalleeSavePass(PassRegistry &);
}
FunctionPass *llvm::createW65816ImgCalleeSave() { FunctionPass *llvm::createW65816ImgCalleeSave() {
return new W65816ImgCalleeSave(); return new W65816ImgCalleeSave();
} }
@ -188,7 +184,7 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) {
// //
// copyPhysReg lowers `COPY $imgN = $a` to `STA_DP imm:0xCx`, so we // copyPhysReg lowers `COPY $imgN = $a` to `STA_DP imm:0xCx`, so we
// check both the physreg-DEF form AND the DP-imm-store form. // check both the physreg-DEF form AND the DP-imm-store form.
bool WrittenSlot[8] = {false}; bool UsedSlot[8] = {false};
bool AnyWritten = false; bool AnyWritten = false;
for (auto &MBB : MF) { for (auto &MBB : MF) {
for (auto &MI : MBB) { for (auto &MI : MBB) {
@ -197,7 +193,7 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) {
if (!MO.isReg() || MO.getReg() == 0 || !MO.isDef()) continue; if (!MO.isReg() || MO.getReg() == 0 || !MO.isDef()) continue;
int idx = classifyImgReg(MO.getReg()); int idx = classifyImgReg(MO.getReg());
if (idx >= 0) { if (idx >= 0) {
WrittenSlot[idx] = true; UsedSlot[idx] = true;
AnyWritten = true; AnyWritten = true;
} }
} }
@ -205,15 +201,12 @@ bool W65816ImgCalleeSave::runOnMachineFunction(MachineFunction &MF) {
auto [idx, mode] = classifyDpImmAsImg(MI); auto [idx, mode] = classifyDpImmAsImg(MI);
if (idx >= 0 && if (idx >= 0 &&
(mode == DpAccess::Write || mode == DpAccess::ReadWrite)) { (mode == DpAccess::Write || mode == DpAccess::ReadWrite)) {
WrittenSlot[idx] = true; UsedSlot[idx] = true;
AnyWritten = true; AnyWritten = true;
} }
} }
} }
if (!AnyWritten) return false; if (!AnyWritten) return false;
// Rename for downstream Step 2/3/4 readability — they use UsedSlot.
bool (&UsedSlot)[8] = WrittenSlot;
(void)AnyWritten;
// Step 2: allocate one frame slot per used IMG. Size = 2 bytes (each // Step 2: allocate one frame slot per used IMG. Size = 2 bytes (each
// Img16 holds a 16-bit value). Mark as a spill slot so PEI accounts // Img16 holds a 16-bit value). Mark as a spill slot so PEI accounts

View file

@ -215,14 +215,10 @@ namespace llvm {
class W65816Layer2StampPass : public PassInfoMixin<W65816Layer2StampPass> { class W65816Layer2StampPass : public PassInfoMixin<W65816Layer2StampPass> {
public: public:
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
bool Changed = false;
for (Function &F : M) { for (Function &F : M) {
Changed |= stampFunction(F); stampFunction(F);
} }
if (!Changed) { // We only add a function attribute, no IR-level effects. Preserve
return PreservedAnalyses::all();
}
// We only added a function attribute, no IR-level effects. Preserve
// everything; the inliner et al. will copy the attribute on inline. // everything; the inliner et al. will copy the attribute on inline.
return PreservedAnalyses::all(); return PreservedAnalyses::all();
} }

View file

@ -189,7 +189,6 @@ bool W65816NarrowI32Mul::runOnFunction(Function &F) {
// low-16 bits as the original i32 add at every observable point // low-16 bits as the original i32 add at every observable point
// (the back-edge value can wrap on the exit iteration but is // (the back-edge value can wrap on the exit iteration but is
// never observed — exit takes the trip-end branch first). // never observed — exit takes the trip-end branch first).
bool NarrowedAny = false;
SmallVector<PHINode *, 4> PhiWorklist; SmallVector<PHINode *, 4> PhiWorklist;
for (BasicBlock &BB : F) { for (BasicBlock &BB : F) {
for (PHINode &PN : BB.phis()) { for (PHINode &PN : BB.phis()) {
@ -282,7 +281,6 @@ bool W65816NarrowI32Mul::runOnFunction(Function &F) {
Incr->replaceAllUsesWith(UndefValue::get(Incr->getType())); Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
Incr->eraseFromParent(); Incr->eraseFromParent();
PN->eraseFromParent(); PN->eraseFromParent();
NarrowedAny = true;
} }
return true; return true;
} }

View file

@ -41,6 +41,13 @@
using namespace llvm; using namespace llvm;
// DP scratch byte used to park X when the negative-Y inserter needs to
// route through TAX/TXA. Lives in the project-wide DP scratch range
// that begins at $E0; $E0 itself is reserved for ADJCALLSTACKUP's
// A-preserve, so we use $E2 here. Coordinate with W65816AsmPrinter /
// W65816ISelLowering / W65816RegisterInfo if the layout changes.
static constexpr unsigned kDpScratchX = 0xE2;
#define DEBUG_TYPE "w65816-neg-y-indy" #define DEBUG_TYPE "w65816-neg-y-indy"
namespace { namespace {
@ -110,9 +117,9 @@ bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) {
if (XLive || xDef) break; if (XLive || xDef) break;
} }
if (XLive) { if (XLive) {
// Save X to DP $E2 (don't use $E0 — that's the A-preserve // Save X to DP kDpScratchX ($E2) -- $E0 is reserved as the
// slot in call-frame teardown and may be live). // A-preserve slot in call-frame teardown and may be live.
BuildMI(MBB, MI, DL, TII->get(W65816::STX_DP)).addImm(0xE2); BuildMI(MBB, MI, DL, TII->get(W65816::STX_DP)).addImm(kDpScratchX);
} }
if (IsLDA) { if (IsLDA) {
// LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X // LDA disp,S ; CLC ; ADC #neg ; TAX ; LDA $0000,X
@ -154,7 +161,7 @@ bool W65816NegYIndY::runOnMachineFunction(MachineFunction &MF) {
} }
if (XLive) { if (XLive) {
// Restore X from DP $E2. // Restore X from DP $E2.
BuildMI(MBB, MI, DL, TII->get(W65816::LDX_DP)).addImm(0xE2); BuildMI(MBB, MI, DL, TII->get(W65816::LDX_DP)).addImm(kDpScratchX);
} }
// Erase original LDY and the (sr,s),Y op. // Erase original LDY and the (sr,s),Y op.
if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; } if (LastLDY) { LastLDY->eraseFromParent(); LastLDY = nullptr; }

View file

@ -99,64 +99,29 @@ FunctionPass *llvm::createW65816PromoteFiToImg() {
} }
// Returns the operand index of the FrameIndex for the given FI pseudo
// opcode, or -1 if this opcode isn't a promotable FI carrier.
static int getFiOperandIdx(unsigned Opc) {
switch (Opc) {
case W65816::LDAfi: return 1;
case W65816::STAfi: return 1;
case W65816::CMPfi: return 1;
case W65816::ADCfi:
case W65816::SBCfi:
case W65816::ANDfi:
case W65816::ORAfi:
case W65816::EORfi: return 2;
default: return -1;
}
}
// Map a promotable FI pseudo to the corresponding DP MC opcode.
static unsigned getDpOpcode(unsigned Opc) {
switch (Opc) {
case W65816::LDAfi: return W65816::LDA_DP;
case W65816::STAfi: return W65816::STA_DP;
case W65816::CMPfi: return W65816::CMP_DP;
case W65816::ADCfi: return W65816::ADC_DP;
case W65816::SBCfi: return W65816::SBC_DP;
case W65816::ANDfi: return W65816::AND_DP;
case W65816::ORAfi: return W65816::ORA_DP;
case W65816::EORfi: return W65816::EOR_DP;
default: return 0;
}
}
// IMG8..IMG15 sit at DP addresses 0xC0, 0xC2, ..., 0xCE. IMG0..IMG7
// are at 0xD0..0xDE. Returns the DP byte for IMGn.
static uint8_t dpAddrForImg(unsigned ImgIdx) {
assert(ImgIdx < 16 && "IMG index out of range");
if (ImgIdx < 8) return 0xD0 + 2 * ImgIdx;
return 0xC0 + 2 * (ImgIdx - 8);
}
bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) { bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) {
// DISABLED again 2026-05-13 (3rd-attempt write-up). Two new findings: // DISABLED 2026-05-13 (3rd-attempt write-up). Two findings blocked
// re-enable:
// 1. With kMaxPromote=2 and IMG0..7 (caller-save, skip ImgCalleeSave), // 1. With kMaxPromote=2 and IMG0..7 (caller-save, skip ImgCalleeSave),
// sumSquares regressed 56 72 inst because the FIs picked by // sumSquares regressed 56 -> 72 inst because the FIs picked by
// access-count (fi#2, fi#3) are intermediate spill temps, not // access-count are intermediate spill temps, not the i32-accumulator
// the i32-accumulator's halves (which are different FIs). The // halves (which are different FIs). Loop body ends up using BOTH
// loop body ends up using BOTH IMG and stack slots for related // IMG and stack slots for related values.
// values. // 2. To pick the RIGHT FIs (those corresponding to PHI-cycled values
// 2. To pick the RIGHT FIs (those corresponding to PHI-cycled // like the i32 accumulator), we need either IR-level analysis
// values like the i32 accumulator), we need either: // BEFORE FI assignment, or post-RA dataflow analysis to identify
// (a) IR-level analysis BEFORE FI assignment, or // long-lived FIs (active across the loop back-edge with no def/use
// (b) Post-RA dataflow analysis to identify "long-lived" FIs // boundary).
// (active across the loop back-edge with no def/use boundary). // The pass framework is retained so the pipeline slot stays documented;
// This is the next blocker. Disabled until either (a) or (b) is // see git history for the disabled prototype body.
// implemented. (void)MF;
return false; return false;
}
#if 0
// Disabled prototype body retained for reference; see comment above.
bool W65816PromoteFiToImg::runOnMachineFunctionDisabled(MachineFunction &MF) {
if (skipFunction(MF.getFunction())) return false; if (skipFunction(MF.getFunction())) return false;
const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>(); const W65816Subtarget &STI = MF.getSubtarget<W65816Subtarget>();
const W65816InstrInfo *TII = STI.getInstrInfo(); const W65816InstrInfo *TII = STI.getInstrInfo();
@ -396,3 +361,4 @@ bool W65816PromoteFiToImg::runOnMachineFunction(MachineFunction &MF) {
} }
return Changed; return Changed;
} }
#endif

View file

@ -50,6 +50,9 @@ using namespace llvm;
#define DEBUG_TYPE "w65816-sep-rep-cleanup" #define DEBUG_TYPE "w65816-sep-rep-cleanup"
// W65816 processor status M-bit mask (set/clear via SEP/REP #$20).
static constexpr int kMBit = 0x20;
namespace { namespace {
class W65816SepRepCleanup : public MachineFunctionPass { class W65816SepRepCleanup : public MachineFunctionPass {
@ -276,7 +279,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
for (auto It = MBB.begin(); It != MBB.end(); ++It) { for (auto It = MBB.begin(); It != MBB.end(); ++It) {
if (It->getOpcode() != W65816::SEP) continue; if (It->getOpcode() != W65816::SEP) continue;
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
if (It->getOperand(0).getImm() != 0x20) continue; if (It->getOperand(0).getImm() != kMBit) continue;
// Walk forward looking for LDAi8imm before any STAfi_indY // Walk forward looking for LDAi8imm before any STAfi_indY
// or REP at this nesting level. // or REP at this nesting level.
auto Walker = std::next(It); auto Walker = std::next(It);
@ -312,7 +315,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
if (Back->getOpcode() == W65816::SEP && if (Back->getOpcode() == W65816::SEP &&
Back->getNumOperands() >= 1 && Back->getNumOperands() >= 1 &&
Back->getOperand(0).isImm() && Back->getOperand(0).isImm() &&
Back->getOperand(0).getImm() == 0x20) { Back->getOperand(0).getImm() == kMBit) {
OuterSep = &*Back; OuterSep = &*Back;
break; break;
} }
@ -409,7 +412,7 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
if (Op1 != W65816::REP && Op1 != W65816::SEP) continue; if (Op1 != W65816::REP && Op1 != W65816::SEP) continue;
if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue; if (It->getNumOperands() < 1 || !It->getOperand(0).isImm()) continue;
int Imm1 = It->getOperand(0).getImm(); int Imm1 = It->getOperand(0).getImm();
if (Imm1 != 0x20) continue; // M-bit only if (Imm1 != kMBit) continue; // M-bit only
// Walk forward across mode-neutral ops looking for the matching // Walk forward across mode-neutral ops looking for the matching
// opposite toggle. Bail at calls, asm, ALU ops on A, etc. // opposite toggle. Bail at calls, asm, ALU ops on A, etc.
unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP; unsigned WantOp = (Op1 == W65816::REP) ? W65816::SEP : W65816::REP;
@ -1119,361 +1122,12 @@ bool W65816SepRepCleanup::runOnMachineFunction(MachineFunction &MF) {
} }
} }
// Store forwarding (disabled — CRC32 regressed and I couldn't // Three prototype peepholes were tried here and removed once shown
// nail down the safety hole in time). Even with PHP-wrap guards // to regress benchmarks; design notes in
// and SP-modifier bails, the first fire (in memmove) silently // feedback_close_gap_attempts_round2.md / feedback_cmp_zero_elim.md:
// miscompiles something that CRC32 later depends on. Pattern // - PHI store-forwarding (CRC32 regression / memmove safety hole).
// is sound; safety analysis isn't complete. See // - Redundant CMP #0 elimination (VLA sum_n carry-flag bookkeeping).
// feedback_close_gap_attempts_round2.md for details. // - Narrow PHI-copy slot collapse (qsort regression).
#if 0
// Store forwarding for PHI memory copies. Pattern (sumSquares
// loop body):
//
// STA X,s ; A → slot X (some intermediate result)
// [code that modifies A but doesn't touch slot X or slot Y]
// LDA X,s ; reload A from slot X
// STA Y,s ; A → slot Y (the PHI copy)
//
// Transform: insert `STA Y,s` right after the first `STA X,s` (A
// still holds the same value at that point), then drop the LDA-
// STA pair. Net: -1 inst per pattern occurrence.
//
// Safety constraints (all between STA X and the LDA-STA pair, in
// the same MBB, in straight-line code):
// - No instruction writes slot X (else the LDA would see a
// different value than the original STA).
// - No instruction reads OR writes slot Y (else our early STA Y
// would be observed mid-flight with a different value than
// before, or our inserted store would be overwritten and the
// intervening read of Y in the original would have seen the
// overwrite).
// - No call / inline asm / branch (conservatively: those can
// touch memory we don't model).
{
auto isStackRelMC2 = [](unsigned Op) {
return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel ||
Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel ||
Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel ||
Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel;
};
auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) -> bool {
if (!isStackRelMC2(MI.getOpcode())) return false;
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
Off = MI.getOperand(0).getImm();
return true;
};
auto isStaSr = [](const MachineInstr &MI) {
return MI.getOpcode() == W65816::STA_StackRel;
};
auto isLdaSr = [](const MachineInstr &MI) {
return MI.getOpcode() == W65816::LDA_StackRel;
};
SmallVector<MachineInstr *, 4> ToErase;
SmallVector<std::tuple<MachineInstr *, int64_t>, 4> ToInsert;
static int g_fireLimit = -1;
static int g_fireCount = 0;
static bool initd = false;
if (!initd) {
if (const char *e = getenv("STORE_FWD_LIMIT")) g_fireLimit = atoi(e);
initd = true;
}
for (MachineBasicBlock &MBB : MF) {
for (auto It = MBB.begin(); It != MBB.end(); ++It) {
if (!isStaSr(*It)) continue;
int64_t X;
if (!srAccess2(*It, X)) continue;
MachineInstr *StaX = &*It;
// Check if StaX is INSIDE an open PHP/PLP wrap. In that case
// its operand offset has been pre-bumped by +1, and inserting
// a sibling STA Y immediately after writes at the WRONG slot
// (the un-bumped Y). Walk backward: if we find a PHP without
// a matching PLP first, bail.
{
bool insideWrap = false;
int depth = 0;
auto B = It;
while (B != MBB.begin()) {
--B;
if (B->getOpcode() == W65816::PLP) depth++;
else if (B->getOpcode() == W65816::PHP) {
if (depth > 0) depth--;
else { insideWrap = true; break; }
}
}
if (insideWrap) continue;
}
// Walk forward looking for LDA X ; STA Y. Conservative bail
// on any non-tracked memory op (indirect pointer access,
// DP/abs ops, etc.) which could alias slot Y via memory.
bool ok = true;
int64_t Y = -1;
MachineInstr *LdaX = nullptr;
MachineInstr *StaY = nullptr;
for (auto Walker = std::next(It); Walker != MBB.end(); ++Walker) {
if (Walker->isDebugInstr()) continue;
if (Walker->isCall() || Walker->isInlineAsm() ||
Walker->isBranch() || Walker->isReturn()) {
ok = false; break;
}
// Found LDA X?
int64_t Off;
if (isLdaSr(*Walker) && srAccess2(*Walker, Off) && Off == X) {
LdaX = &*Walker;
auto Next = std::next(Walker);
while (Next != MBB.end() && Next->isDebugInstr()) ++Next;
if (Next == MBB.end() || !isStaSr(*Next) ||
!srAccess2(*Next, Y) || Y == X) {
ok = false;
} else {
StaY = &*Next;
}
break;
}
// Stack-rel access to X (write or read): bail.
if (srAccess2(*Walker, Off) && Off == X) {
ok = false; break;
}
// Any memory-touching op that's NOT a tracked stack-rel
// access — bail. Indirect pointer stores/loads (DPIndY /
// DPIndLong / abs / etc.) could alias slot Y via a pointer
// we can't trace, and the safety check below would miss it.
if ((Walker->mayLoad() || Walker->mayStore()) &&
!isStackRelMC2(Walker->getOpcode())) {
ok = false; break;
}
// SP-modifying ops shift the stack-rel addressing window —
// a later `lda X, s` reads a DIFFERENT byte than the earlier
// `sta X, s` (or worse, the new stack pointer points into
// saved P/retaddr). Bail on TCS (direct SP write) and on
// any stack push/pop (PHx/PLx/PEA/PEI/COP/BRK). Also bail
// on PHP/PLP because the wrap pass already bumped in-wrap
// stack-rel ops by +1 — our inserted STA after STA X writes
// at the un-bumped offset which gets the WRONG slot.
{
unsigned WO = Walker->getOpcode();
if (WO == W65816::TCS || WO == W65816::PHA ||
WO == W65816::PLA || WO == W65816::PHX ||
WO == W65816::PLX || WO == W65816::PHY ||
WO == W65816::PLY || WO == W65816::PHP ||
WO == W65816::PLP || WO == W65816::PHB ||
WO == W65816::PLB || WO == W65816::PHD ||
WO == W65816::PLD || WO == W65816::PHK ||
WO == W65816::PEA || WO == W65816::PEI_DP) {
ok = false; break;
}
}
}
if (!ok || !LdaX || !StaY) continue;
if (g_fireLimit >= 0 && g_fireCount >= g_fireLimit) continue;
g_fireCount++;
errs() << "SF FIRE " << g_fireCount << " in " << MF.getName()
<< " MBB " << MBB.getNumber()
<< " X=" << X << " Y=" << StaY->getOperand(0).getImm()
<< "\n";
// Now re-walk from std::next(It) up to LdaX and verify no
// access to slot Y in that gap.
ok = true;
for (auto W2 = std::next(It); W2 != LdaX->getIterator(); ++W2) {
if (W2->isDebugInstr()) continue;
int64_t Off;
if (srAccess2(*W2, Off) && Off == Y) { ok = false; break; }
}
if (!ok) continue;
// Safe to apply: schedule the StaY-after-StaX insert, and
// erase LdaX and StaY.
ToInsert.push_back({StaX, Y});
ToErase.push_back(LdaX);
ToErase.push_back(StaY);
Changed = true;
}
}
// Apply (insertions first; iterators stay valid through erase).
for (auto &P : ToInsert) {
MachineInstr *StaX = std::get<0>(P);
int64_t Y = std::get<1>(P);
MachineBasicBlock *MBB = StaX->getParent();
DebugLoc DL = StaX->getDebugLoc();
auto NextIt = std::next(StaX->getIterator());
BuildMI(*MBB, NextIt, DL, TII.get(W65816::STA_StackRel))
.addImm(Y);
}
for (MachineInstr *MI : ToErase) MI->eraseFromParent();
}
#endif
// (Redundant CMP #0 elimination — disabled, hit VLA sum_n
// regression. Carry-flag bookkeeping across the CMP turned out to
// have more cases than my forward-walk modeled. See
// feedback_cmp_zero_elim.md.)
#if 0
{
auto isNZSetOnA = [](unsigned Op) {
switch (Op) {
case W65816::DEA_PSEUDO: case W65816::INA_PSEUDO:
case W65816::ADC_StackRel: case W65816::ADC_DP: case W65816::ADC_Imm16:
case W65816::SBC_StackRel: case W65816::SBC_DP: case W65816::SBC_Imm16:
case W65816::AND_StackRel: case W65816::AND_DP: case W65816::AND_Imm16:
case W65816::ORA_StackRel: case W65816::ORA_DP: case W65816::ORA_Imm16:
case W65816::EOR_StackRel: case W65816::EOR_DP: case W65816::EOR_Imm16:
case W65816::LDA_StackRel: case W65816::LDA_DP:
case W65816::LDAi16imm: case W65816::LDA_Imm16:
case W65816::TXA: case W65816::TYA:
case W65816::ADCi16imm: case W65816::ADCEi16imm:
case W65816::SBCi16imm: case W65816::SBCEi16imm:
return true;
default:
return false;
}
};
auto isCmpZero = [](const MachineInstr &MI) {
if (MI.getOpcode() != W65816::CMPi16imm) return false;
// Operand layout: lhs (Acc16), imm. Find the imm.
for (const MachineOperand &MO : MI.operands()) {
if (MO.isImm()) return MO.getImm() == 0;
}
return false;
};
auto modifiesA = [](const MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.getReg() == W65816::A && MO.isDef())
return true;
}
return false;
};
auto readsC = [](const MachineInstr &MI) {
// We don't model individual flag bits; approximate by checking
// if the MI reads $p AND is one of the carry-consuming ops.
unsigned Op = MI.getOpcode();
switch (Op) {
case W65816::ADC_StackRel: case W65816::ADC_DP: case W65816::ADC_Imm16:
case W65816::SBC_StackRel: case W65816::SBC_DP: case W65816::SBC_Imm16:
case W65816::ADCEi16imm: case W65816::SBCEi16imm:
case W65816::BCC: case W65816::BCS:
case W65816::ROL_A: case W65816::ROR_A:
return true;
default:
return false;
}
};
SmallVector<MachineInstr *, 4> CmpsToErase;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (!isCmpZero(MI)) continue;
// Walk backward, skipping flag-preserving instructions.
bool foundProducer = false;
auto Back = MI.getIterator();
while (Back != MBB.begin()) {
--Back;
if (Back->isDebugInstr()) continue;
if (Back->isCall() || Back->isInlineAsm()) break;
if (modifiesA(*Back)) {
foundProducer = isNZSetOnA(Back->getOpcode());
break;
}
bool defsP = false;
for (const MachineOperand &MO : Back->operands()) {
if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) {
defsP = true; break;
}
}
if (defsP) break;
}
if (!foundProducer) continue;
// Walk FORWARD from CMP: until the next C-defining MI, no MI
// reads C.
bool cConsumed = false;
for (auto Fwd = std::next(MI.getIterator()); Fwd != MBB.end(); ++Fwd) {
if (Fwd->isDebugInstr()) continue;
if (readsC(*Fwd)) { cConsumed = true; break; }
// Next def of $p: subsequent reads aren't ours.
bool defsP = false;
for (const MachineOperand &MO : Fwd->operands()) {
if (MO.isReg() && MO.getReg() == W65816::P && MO.isDef()) {
defsP = true; break;
}
}
if (defsP) break;
}
if (cConsumed) continue;
CmpsToErase.push_back(&MI);
}
}
for (MachineInstr *MI : CmpsToErase) MI->eraseFromParent();
if (!CmpsToErase.empty()) Changed = true;
}
#endif
// (Narrow PHI-copy slot collapse — disabled, qsort regression.)
#if 0
{
auto isStackRelMC2 = [](unsigned Op) {
return Op == W65816::LDA_StackRel || Op == W65816::STA_StackRel ||
Op == W65816::ADC_StackRel || Op == W65816::SBC_StackRel ||
Op == W65816::AND_StackRel || Op == W65816::ORA_StackRel ||
Op == W65816::EOR_StackRel || Op == W65816::CMP_StackRel;
};
auto srAccess2 = [&](const MachineInstr &MI, int64_t &Off) {
if (!isStackRelMC2(MI.getOpcode())) return false;
if (MI.getNumOperands() < 1 || !MI.getOperand(0).isImm()) return false;
Off = MI.getOperand(0).getImm();
return true;
};
DenseMap<int64_t, unsigned> Refs;
DenseMap<int64_t, MachineInstr *> StaInst, LdaInst;
DenseMap<int64_t, unsigned> NSta, NLda;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
int64_t Off;
if (!srAccess2(MI, Off)) continue;
Refs[Off]++;
if (MI.getOpcode() == W65816::STA_StackRel) {
NSta[Off]++; StaInst[Off] = &MI;
} else if (MI.getOpcode() == W65816::LDA_StackRel) {
NLda[Off]++; LdaInst[Off] = &MI;
}
}
}
SmallVector<MachineInstr *, 4> ToErase;
for (auto &P : Refs) {
int64_t X = P.first;
if (P.second != 2) continue; // exactly 2 references
if (NSta[X] != 1 || NLda[X] != 1) continue;
MachineInstr *Sta = StaInst[X];
MachineInstr *Lda = LdaInst[X];
if (Sta->getParent() != Lda->getParent()) continue;
MachineBasicBlock *MBB = Sta->getParent();
// Sta must be before Lda.
bool staBefore = false;
for (auto It = MBB->begin(); It != MBB->end(); ++It) {
if (&*It == Sta) { staBefore = true; break; }
if (&*It == Lda) break;
}
if (!staBefore) continue;
// Next after Lda must be STA Y where Y != X.
auto NextIt = std::next(Lda->getIterator());
while (NextIt != MBB->end() && NextIt->isDebugInstr()) ++NextIt;
if (NextIt == MBB->end()) continue;
int64_t Y;
if (NextIt->getOpcode() != W65816::STA_StackRel ||
!srAccess2(*NextIt, Y) || Y == X) continue;
// Between Sta and Lda, no read/write of slot Y, no call, no
// anything that would re-set slot Y's value mid-flight.
bool ok = true;
for (auto It = std::next(Sta->getIterator()); It != Lda->getIterator();
++It) {
if (It->isDebugInstr()) continue;
if (It->isCall() || It->isInlineAsm()) { ok = false; break; }
int64_t Off;
if (srAccess2(*It, Off) && Off == Y) { ok = false; break; }
}
if (!ok) continue;
// Redirect the original STA to write to Y; delete the LDA-STA pair.
Sta->getOperand(0).setImm(Y);
ToErase.push_back(Lda);
ToErase.push_back(&*NextIt);
Changed = true;
}
for (MachineInstr *MI : ToErase) MI->eraseFromParent();
}
#endif
return Changed; return Changed;
} }

View file

@ -127,7 +127,7 @@ static bool touchesX(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
return xEffect(MI, TRI) != XNone; return xEffect(MI, TRI) != XNone;
} }
// Returns true if MI is `STAfi $a, slot, 0`. // Returns FI if MI is `STAfi $a, slot, 0`, else -1.
static int matchSTAfi(const MachineInstr &MI) { static int matchSTAfi(const MachineInstr &MI) {
if (MI.getOpcode() != W65816::STAfi) return -1; if (MI.getOpcode() != W65816::STAfi) return -1;
if (MI.getNumOperands() < 3) return -1; if (MI.getNumOperands() < 3) return -1;

View file

@ -800,33 +800,6 @@ bool W65816StackRelToImg::runOnMachineFunction(MachineFunction &MF) {
// unhandled — they can shift SP arbitrarily. Caller must bail. // unhandled — they can shift SP arbitrarily. Caller must bail.
return 0; return 0;
}; };
auto miBailsAnalysis = [](const MachineInstr &MI) -> bool {
// We don't bail on TCS or ADJCALLSTACK*. TCS in prologue/epilogue
// resets SP to a known value (the "canonical" SP for that region);
// since stack-rel accesses don't span TCS in well-formed code (the
// prologue allocates, body uses, epilogue deallocates), treating
// SP as continuing across TCS gives correct relative offsets for
// accesses inside each region. ADJCALLSTACK* aren't usually
// present at pre-emit time (PEI eliminates them or AsmPrinter
// handles). If they're still present, treat as 0 SP-shift —
// the actual PUSH16 ops carry the real shift.
return false;
};
auto miSpDeltaWithAdj = [&](const MachineInstr &MI) -> int {
if (MI.getOpcode() == W65816::ADJCALLSTACKDOWN ||
MI.getOpcode() == W65816::ADJCALLSTACKUP) {
// Skip — the actual PUSH16/PEA/PHA ops inside the call seq
// carry the SP delta.
return 0;
}
if (MI.getOpcode() == W65816::TCS) {
// TCS sets SP; we treat it as a "reset to canonical SP" point.
// Return 0 here; the calling code can do the reset.
return 0;
}
return 0;
};
(void)miSpDeltaWithAdj;
while (!Worklist.empty() && SpAdjValid) { while (!Worklist.empty() && SpAdjValid) {
MachineBasicBlock *MBB = Worklist.pop_back_val(); MachineBasicBlock *MBB = Worklist.pop_back_val();
if (!Visited.insert(MBB).second) continue; if (!Visited.insert(MBB).second) continue;

View file

@ -166,20 +166,26 @@ static bool semanticallyDefsA(const MachineInstr &MI) {
// Walk backward from MI in its MBB looking for the most recent A-define. // Walk backward from MI in its MBB looking for the most recent A-define.
// Returns the MI that defines A, or nullptr if none in the same MBB. // Returns the MI that defines A, or nullptr if none in the same MBB.
// Skips debug instructions. Stops at MBB boundary, calls, branches, // Skips debug instructions. When BailOnCall is true, also stops at
// inline asm. // calls / inline asm (used by the Case (3) twin check where call effects
static MachineInstr *findPriorADef(MachineInstr *MI) { // invalidate the value-equivalence reasoning).
static MachineInstr *findADefBackward(MachineInstr *MI, bool BailOnCall) {
MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock *MBB = MI->getParent();
auto It = MI->getIterator(); auto It = MI->getIterator();
while (It != MBB->begin()) { while (It != MBB->begin()) {
--It; --It;
if (It->isDebugInstr()) continue; if (It->isDebugInstr()) continue;
if (It->isCall() || It->isInlineAsm()) return nullptr; if (BailOnCall && (It->isCall() || It->isInlineAsm())) return nullptr;
if (semanticallyDefsA(*It)) return &*It; if (semanticallyDefsA(*It)) return &*It;
} }
return nullptr; return nullptr;
} }
// Convenience: Case (3) twin matcher (bails on calls/inline asm).
static MachineInstr *findPriorADef(MachineInstr *MI) {
return findADefBackward(MI, /*BailOnCall=*/true);
}
// Walk forward from `Start` (exclusive) up to (but not including) `End` // Walk forward from `Start` (exclusive) up to (but not including) `End`
// in the same MBB, tracking whether slot `WatchSlot` is written. // in the same MBB, tracking whether slot `WatchSlot` is written.
@ -252,17 +258,9 @@ static bool usesFlagsP(const MachineInstr &MI) {
} }
// Returns the MOST RECENT A-defining MI strictly before MI in its MBB, // Convenience: Case (2) twin matcher (does NOT bail on calls/inline asm).
// skipping debug instructions. Returns nullptr if none in the same MBB.
static MachineInstr *findMostRecentADef(MachineInstr *MI) { static MachineInstr *findMostRecentADef(MachineInstr *MI) {
MachineBasicBlock *MBB = MI->getParent(); return findADefBackward(MI, /*BailOnCall=*/false);
auto It = MI->getIterator();
while (It != MBB->begin()) {
--It;
if (It->isDebugInstr()) continue;
if (semanticallyDefsA(*It)) return &*It;
}
return nullptr;
} }
@ -283,7 +281,6 @@ static MachineInstr *findMostRecentADef(MachineInstr *MI) {
static MachineInstr *findTwin(MachineInstr *StaX, static MachineInstr *findTwin(MachineInstr *StaX,
ArrayRef<MachineInstr *> StasY) { ArrayRef<MachineInstr *> StasY) {
MachineBasicBlock *MBBStaX = StaX->getParent(); MachineBasicBlock *MBBStaX = StaX->getParent();
int64_t XOff = StaX->getOperand(0).getImm();
// Cases (1) + (2): same MBB. // Cases (1) + (2): same MBB.
for (MachineInstr *StaY : StasY) { for (MachineInstr *StaY : StasY) {
if (StaY->getParent() != MBBStaX) continue; if (StaY->getParent() != MBBStaX) continue;
@ -342,7 +339,6 @@ static MachineInstr *findTwin(MachineInstr *StaX,
} }
if (XConst == YConst) return StaY; if (XConst == YConst) return StaY;
} }
(void)XOff;
return nullptr; return nullptr;
} }

View file

@ -8,10 +8,10 @@
// //
// Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi, // Pre-regalloc pass: when a tied-def Acc16 instruction (ADCfi, SBCfi,
// ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm, // ANDfi, ORAfi, EORfi, ADCi16imm, SBCi16imm, ANDi16imm, ORAi16imm,
// EORi16imm, ADCabs, SBCabs, ANDabs, ORAabs, EORabs, INA_PSEUDO, // EORi16imm, ADCabs, SBCabs -- see isTiedAcc16Consumer below for the
// DEA_PSEUDO, ASLA16, LSRA16, NEGA16, SHL8A, SRL8A, SRA15A, etc.) has // authoritative list) has a source vreg whose value is *also* needed
// a source vreg whose value is *also* needed past the consumer, fast // past the consumer, fast regalloc fails to insert the necessary
// regalloc fails to insert the necessary save/restore on its own. // save/restore on its own.
// (Acc16 has exactly one physical register, so the consumer's // (Acc16 has exactly one physical register, so the consumer's
// tied-def overwrites the source; with multiple consumers/post-uses // tied-def overwrites the source; with multiple consumers/post-uses
// the source must be spilled and reloaded.) // the source must be spilled and reloaded.)

View file

@ -59,10 +59,6 @@ using namespace llvm;
#define DEBUG_TYPE "w65816-un-lsr" #define DEBUG_TYPE "w65816-un-lsr"
// Forward declaration of this pass's registry initializer so it can be
// referenced from this file without a shared header.
// NOTE(review): the definition is presumably generated by an
// INITIALIZE_PASS-style macro elsewhere -- confirm before relying on it.
namespace llvm {
void initializeW65816UnLSRPass(PassRegistry &);
}
namespace { namespace {
class W65816UnLSR : public FunctionPass { class W65816UnLSR : public FunctionPass {
@ -84,7 +80,6 @@ public:
private: private:
bool processLoop(Loop *L); bool processLoop(Loop *L);
bool processCounterToPtrPHIs(Loop *L); bool processCounterToPtrPHIs(Loop *L);
bool processReturnedCounter(Loop *L);
}; };
} // namespace } // namespace
@ -107,7 +102,6 @@ bool W65816UnLSR::runOnFunction(Function &F) {
for (Loop *L : LI) { for (Loop *L : LI) {
Changed |= processLoop(L); Changed |= processLoop(L);
Changed |= processCounterToPtrPHIs(L); Changed |= processCounterToPtrPHIs(L);
// processReturnedCounter remains disabled — see note above.
SmallVector<Loop *, 4> Worklist(L->begin(), L->end()); SmallVector<Loop *, 4> Worklist(L->begin(), L->end());
while (!Worklist.empty()) { while (!Worklist.empty()) {
Loop *Sub = Worklist.pop_back_val(); Loop *Sub = Worklist.pop_back_val();
@ -120,241 +114,6 @@ bool W65816UnLSR::runOnFunction(Function &F) {
} }
// strLen-style undo: LSR converts `return p - s` into a counter PHI
// `%lsr.iv` that increments per iter and is returned directly:
//   %lsr.iv = phi i16 [-1, %entry], [%lsr.iv.next, %latch]
//   %p.0 = phi ptr [%s, %entry], [%incdec.ptr, %latch]
//   %incdec.ptr = getelementptr i8, %p.0, i32 1
//   %lsr.iv.next = add i16 %lsr.iv, 1
//   br ..., %exit, %loop
//   %exit:
//   ret i16 %lsr.iv.next
//
// LSR's reasoning: cheaper to maintain a counter than compute (p - s)
// at exit.  On W65816 the opposite is true: counter inc per iter costs
// 5 cyc/iter * N iters; one-time sub at exit costs ~10 cyc total.
//
// This undo finds the counter PHI, verifies that its only out-of-loop
// use is a return (directly or through a single-incoming LCSSA PHI in
// the exit block), finds a sibling byte-stride pointer PHI, and
// replaces the returned value with an expression over the pointer.
// The counter PHI and its increment are then erased.
//
// Why the replacement is exact.  Both PHIs live in the loop header and
// advance by exactly 1 per back edge, so on every iteration
//   counter      == K_init + (p - base)
//   counter.next == K_init + (p - base) + 1
// where p is the pointer PHI's value for that iteration.  Example,
// strLen("a"): the exit iteration has p == s + 1, counter == 0,
// counter.next == 1 == -1 + 1 + 1.  Hence:
//   returning counter      -> (p_lcssa - base) + K_init
//   returning counter.next -> (p_lcssa - base) + K_init + 1
//
// All legality checks run before the first IR mutation so that a
// `false` return always means "IR untouched".
//
// Returns true iff the loop was rewritten.  At the time of writing
// runOnFunction does not call this function (it is kept disabled).
bool W65816UnLSR::processReturnedCounter(Loop *L) {
  BasicBlock *Header = L->getHeader();
  if (!L->getLoopLatch() || !L->getLoopPreheader()) return false;

  // Single exit edge.  getExitBlocks reports one entry per exit edge,
  // so size() == 1 also implies a unique exiting block.
  SmallVector<BasicBlock *, 2> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  if (ExitBlocks.size() != 1) return false;
  BasicBlock *Exit = ExitBlocks[0];

  // Find a candidate counter PHI: integer, init=ConstantInt, step=+1.
  PHINode *CounterPHI = nullptr;
  ConstantInt *KInit = nullptr;
  BinaryOperator *CounterStep = nullptr;
  for (PHINode &PN : Header->phis()) {
    if (!PN.getType()->isIntegerTy()) continue;
    if (PN.getNumIncomingValues() != 2) continue;
    Value *Init = nullptr, *Step = nullptr;
    for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
      BasicBlock *Pred = PN.getIncomingBlock(i);
      if (L->contains(Pred)) Step = PN.getIncomingValue(i);
      else Init = PN.getIncomingValue(i);
    }
    if (!Init || !Step) continue;
    auto *InitC = dyn_cast<ConstantInt>(Init);
    if (!InitC) continue;
    auto *StepBO = dyn_cast<BinaryOperator>(Step);
    if (!StepBO || StepBO->getOpcode() != Instruction::Add) continue;
    Value *Other = nullptr;
    if (StepBO->getOperand(0) == &PN) Other = StepBO->getOperand(1);
    else if (StepBO->getOperand(1) == &PN) Other = StepBO->getOperand(0);
    if (!Other) continue;
    auto *StepCI = dyn_cast<ConstantInt>(Other);
    if (!StepCI || !StepCI->isOne()) continue;
    CounterPHI = &PN;
    KInit = InitC;
    CounterStep = StepBO;
    break;
  }
  if (!CounterPHI) return false;

  // The pointer difference is computed in i32 below and then narrowed
  // to the counter type; a wider counter cannot be reconstructed.
  if (CounterPHI->getType()->getIntegerBitWidth() > 32) return false;

  // Inside the loop the counter chain may only feed itself: the PHI is
  // used by the increment, and the increment by the PHI's back edge.
  auto isOnlyInLoopUseTheStep = [&](Value *V) {
    for (User *U : V->users()) {
      auto *UI = dyn_cast<Instruction>(U);
      if (!UI) return false;
      if (!L->contains(UI)) continue;   // out-of-loop is handled separately
      if (UI == CounterStep) continue;
      if (UI == CounterPHI) continue;
      return false;
    }
    return true;
  };
  if (!isOnlyInLoopUseTheStep(CounterPHI)) return false;
  if (!isOnlyInLoopUseTheStep(CounterStep)) return false;

  // Find the return that consumes CounterPHI or CounterStep.  The use
  // may be direct, or go through a single-incoming, single-use LCSSA
  // PHI in the exit block.
  ReturnInst *Ret = nullptr;
  PHINode *ExitLCSSA = nullptr;   // optional LCSSA PHI to erase
  bool fromNext = false;          // true if the returned value is CounterStep
  auto findRet = [&](Value *V, bool isNext) -> bool {
    for (User *U : V->users()) {
      auto *UI = dyn_cast<Instruction>(U);
      if (!UI) continue;
      // Skip in-loop uses (those are the counter increment chain).
      if (L->contains(UI->getParent())) continue;
      if (auto *R = dyn_cast<ReturnInst>(UI)) {
        if (R->getReturnValue() != V) continue;
        Ret = R; fromNext = isNext; return true;
      }
      if (auto *PN = dyn_cast<PHINode>(UI)) {
        if (PN->getParent() != Exit) continue;
        if (PN->getNumIncomingValues() != 1) continue;
        if (PN->getIncomingValue(0) != V) continue;
        if (!PN->hasOneUse()) continue;
        auto *R = dyn_cast<ReturnInst>(PN->user_back());
        if (!R || R->getReturnValue() != PN) continue;
        Ret = R; fromNext = isNext; ExitLCSSA = PN;
        return true;
      }
    }
    return false;
  };
  if (!findRet(CounterStep, true) && !findRet(CounterPHI, false))
    return false;

  Type *RetTy = Ret->getReturnValue()->getType();
  if (!RetTy->isIntegerTy()) return false;

  // The counter chain is erased at the end, so the return (or the LCSSA
  // PHI feeding it) must be the ONLY out-of-loop user of either value.
  // Any other user would be left reading a poisoned value.
  Instruction *SoleExternalUser = Ret;
  if (ExitLCSSA) SoleExternalUser = ExitLCSSA;
  auto hasOnlyExpectedExternalUse = [&](Value *V) {
    for (User *U : V->users()) {
      auto *UI = dyn_cast<Instruction>(U);
      if (!UI) return false;
      if (L->contains(UI)) continue;
      if (UI != SoleExternalUser) return false;
    }
    return true;
  };
  if (!hasOnlyExpectedExternalUse(CounterPHI)) return false;
  if (!hasOnlyExpectedExternalUse(CounterStep)) return false;

  // Find a sibling pointer PHI: init=Base, latch incoming is a
  // `getelementptr i8, %ptr, 1` of itself.
  PHINode *PtrPHI = nullptr;
  Value *Base = nullptr;
  for (PHINode &PN : Header->phis()) {
    if (!PN.getType()->isPointerTy()) continue;
    if (PN.getNumIncomingValues() != 2) continue;
    Value *Init = nullptr, *Step = nullptr;
    for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) {
      BasicBlock *Pred = PN.getIncomingBlock(i);
      if (L->contains(Pred)) Step = PN.getIncomingValue(i);
      else Init = PN.getIncomingValue(i);
    }
    if (!Init || !Step) continue;
    auto *StepGEP = dyn_cast<GetElementPtrInst>(Step);
    if (!StepGEP) continue;
    if (StepGEP->getPointerOperand() != &PN) continue;
    if (StepGEP->getNumIndices() != 1) continue;
    if (!StepGEP->getSourceElementType()->isIntegerTy(8)) continue;
    auto *StrideCI = dyn_cast<ConstantInt>(StepGEP->getOperand(1));
    if (!StrideCI || !StrideCI->isOne()) continue;
    PtrPHI = &PN;
    Base = Init;
    break;
  }
  if (!PtrPHI) return false;

  // Reuse an existing LCSSA PHI for the pointer if the exit block has
  // one.  Otherwise one is created below, which is only well-formed when
  // the exit block has a single predecessor (necessarily the exiting
  // block, since the loop has an edge into Exit).
  PHINode *PtrLCSSA = nullptr;
  for (PHINode &EPN : Exit->phis()) {
    if (EPN.getNumIncomingValues() != 1) continue;
    if (EPN.getIncomingValue(0) == PtrPHI) {
      PtrLCSSA = &EPN; break;
    }
  }
  BasicBlock *ExitPred = Exit->getSinglePredecessor();
  if (!PtrLCSSA && !ExitPred) return false;

  // ---- All checks passed; IR is mutated from here on. ----

  if (!PtrLCSSA) {
    IRBuilder<> PhiB(&Exit->front());
    PtrLCSSA = PhiB.CreatePHI(PtrPHI->getType(), 1, "unlsr.p.lcssa");
    PtrLCSSA->addIncoming(PtrPHI, ExitPred);
  }

  // Emit the arithmetic after every PHI of the exit block when the
  // return goes through an LCSSA PHI (non-PHI instructions must not be
  // interleaved with PHIs); otherwise right before the return itself.
  Instruction *InsertPt = ExitLCSSA ? &*Exit->getFirstInsertionPt() : Ret;
  IRBuilder<> B(InsertPt);

  // (p_lcssa - base), computed as integers and narrowed to the counter
  // type.
  Type *I32 = Type::getInt32Ty(Header->getContext());
  Value *PLcssaInt = B.CreatePtrToInt(PtrLCSSA, I32, "unlsr.plcssa.i");
  Value *BaseInt = B.CreatePtrToInt(Base, I32, "unlsr.base.i");
  Value *Diff = B.CreateSub(PLcssaInt, BaseInt, "unlsr.diff");
  Value *DiffI =
      B.CreateZExtOrTrunc(Diff, CounterPHI->getType(), "unlsr.diff.trunc");

  // K_init (+1 when the returned value is counter.next); see the
  // derivation in the function header.  APInt arithmetic wraps at the
  // counter's width, matching the `add` being replaced.
  APInt Adjust = KInit->getValue();
  if (fromNext) ++Adjust;
  Value *Result = DiffI;
  if (Adjust != 0) {
    Result = B.CreateAdd(DiffI,
                         ConstantInt::get(CounterPHI->getType(), Adjust),
                         "unlsr.result");
  }

  // Cast to return type if different.
  if (Result->getType() != RetTy)
    Result = B.CreateZExtOrTrunc(Result, RetTy);

  // Redirect the return.  If it went through an LCSSA PHI, that PHI is
  // now dead and is removed.
  if (ExitLCSSA) {
    ExitLCSSA->replaceAllUsesWith(Result);
    ExitLCSSA->eraseFromParent();
  } else {
    Ret->setOperand(0, Result);
  }

  // Erase the counter PHI and its increment.  They now only use each
  // other; break that cycle with poison before erasing.
  CounterStep->replaceAllUsesWith(PoisonValue::get(CounterPHI->getType()));
  CounterPHI->replaceAllUsesWith(PoisonValue::get(CounterPHI->getType()));
  CounterStep->eraseFromParent();
  CounterPHI->eraseFromParent();
  return true;
}
// strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and // strcpy-style undo: LSR converts two pointer PHIs (`src.addr.0` and
// `d.0` each stepping by 1) into a single counter PHI (`lsr.iv`) plus // `d.0` each stepping by 1) into a single counter PHI (`lsr.iv`) plus
// GEPs `(base, counter)` per iter. On 65816 the counter+GEP form // GEPs `(base, counter)` per iter. On 65816 the counter+GEP form

View file

@ -84,27 +84,6 @@ static bool flowsToIncompatiblePhysReg(Register VReg,
return false; return false;
} }
// Reports whether any definition of VReg is a COPY whose source is a
// physical register other than A or IMG0..IMG15, i.e. a physreg that is
// not Wide16-compatible.  copyPhysReg only handles a fixed set of
// source/dest pairs; an incompatible source physreg (e.g., DPF0, the
// i64-return high-half carrier) lowered to an IMG dest would crash with
// an "unhandled copyPhysReg" assertion at AsmPrinter time.  (Only the
// Phase-2 PHI widening uses this; that is disabled, hence
// [[maybe_unused]].)
[[maybe_unused]] static bool comesFromIncompatiblePhysReg(Register VReg,
    const MachineRegisterInfo &MRI) {
  for (auto &DefMI : MRI.def_instructions(VReg)) {
    if (DefMI.isCopy()) {
      const MachineOperand &From = DefMI.getOperand(1);
      if (From.isReg() && From.getReg().isPhysical()) {
        Register Phys = From.getReg();
        const bool IsAcc = Phys == W65816::A;
        const bool IsImg = Phys >= W65816::IMG0 && Phys <= W65816::IMG15;
        // Anything that is neither the accumulator nor an IMG slot has
        // no supported copy into a Wide16 destination.
        if (!IsAcc && !IsImg) return true;
      }
    }
  }
  return false;
}
// Returns true if the vreg is used by any PHI. PHI input/result must // Returns true if the vreg is used by any PHI. PHI input/result must
// share the same register class (verifier requirement). Rather than // share the same register class (verifier requirement). Rather than
// also widen the PHI's result and recursively all of its uses, we skip // also widen the PHI's result and recursively all of its uses, we skip
@ -212,196 +191,9 @@ bool W65816WidenAcc16::runOnMachineFunction(MachineFunction &MF) {
Changed = true; Changed = true;
} }
// Phase 2: PHI cycle widening. EXPERIMENTAL, currently disabled — // Phase 2: PHI cycle widening was prototyped here but never landed.
// see end of pass for explanation. // The prototype body lived in an #if 0 block that was removed once
#if 0 // we settled on Phase 1 as the only effective half of the pass.
// PHIs whose def class is Acc16 keep
// the value pinned to $a across iterations, forcing stack spills
// when the PHI is live across calls or other A-clobbering ops.
// For sumSquares-style loops with an i32 accumulator, this manifests
// as per-iter `LDA slot ; ADC ; STA slot ; LDA slot ; STA slot` (the
// last LDA/STA pair is the PHI-back-edge copy). If we widen the
// PHI's def to Wide16, regalloc can keep it in an IMG slot and the
// back-edge PHI copy collapses to a register coalesce.
//
// To widen a PHI:
// 1. Compute the SCC of Acc16 vregs connected by PHI edges (PHI
// def ↔ PHI incoming vreg). This catches mutually-recursive
// PHIs in nested loops.
// 2. For every member: verify all non-PHI uses accept Wide16, no
// flow to a physreg, single def.
// 3. For each PHI in the SCC, walk its incoming list. Each
// incoming vreg is either ALREADY in the SCC (another PHI, no
// bridge needed) or an external Acc16 vreg whose value flows
// into the SCC — bridge it by inserting `WWide = COPY W` at
// the end of the predecessor block and pointing the PHI's
// incoming at WWide.
// 4. Change every SCC member's register class to Wide16.
auto worklistInsertIfAcc16 = [&MRI](Register V,
DenseSet<Register> &Seen,
SmallVectorImpl<Register> &WL) {
if (!V.isVirtual()) return;
if (MRI.getRegClass(V) != &W65816::Acc16RegClass) return;
if (!Seen.insert(V).second) return;
WL.push_back(V);
};
SmallVector<MachineInstr *, 16> AcctPhis;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB.phis()) {
Register DefV = MI.getOperand(0).getReg();
if (MRI.getRegClass(DefV) == &W65816::Acc16RegClass) {
AcctPhis.push_back(&MI);
}
}
}
DenseSet<Register> ProcessedPhiVregs;
for (MachineInstr *Seed : AcctPhis) {
Register SeedDef = Seed->getOperand(0).getReg();
if (ProcessedPhiVregs.count(SeedDef)) continue;
// Build SCC by following PHI edges in both directions.
DenseSet<Register> Comp;
SmallVector<Register, 8> Stack;
worklistInsertIfAcc16(SeedDef, Comp, Stack);
while (!Stack.empty()) {
Register V = Stack.pop_back_val();
// Forward: V flows into other PHIs as an incoming → include those PHI defs.
for (auto &U : MRI.use_nodbg_instructions(V)) {
if (!U.isPHI()) continue;
Register PhiDef = U.getOperand(0).getReg();
worklistInsertIfAcc16(PhiDef, Comp, Stack);
}
// Backward: if V is itself a PHI def, include the incoming vregs.
MachineInstr *DM = &*MRI.def_instructions(V).begin();
if (!DM || !DM->isPHI()) continue;
for (unsigned i = 1, e = DM->getNumOperands(); i < e; i += 2) {
MachineOperand &MO = DM->getOperand(i);
if (!MO.isReg() || !MO.getReg().isVirtual()) continue;
worklistInsertIfAcc16(MO.getReg(), Comp, Stack);
}
}
for (Register V : Comp) ProcessedPhiVregs.insert(V);
// Validate every member. PHI uses are ACCEPTED when the consumer
// PHI is itself in the SCC (those PHIs are being widened in
// lock-step). Narrow-class uses (e.g., INA_PSEUDO's tied-def
// input requires Acc16) are ALSO accepted — we'll insert a
// Wide16→Acc16 COPY at the use site after widening. The only
// unrecoverable cases are: PHI uses where the consumer PHI is
// outside the SCC (forcing cross-SCC class merging), and physreg
// flow to $x/$y/etc. (handled separately above).
auto usesAcceptInSCC = [&](Register V,
SmallVectorImpl<MachineOperand *> *NarrowSites)
-> bool {
for (auto &MO : MRI.use_nodbg_operands(V)) {
MachineInstr *UMI = MO.getParent();
if (UMI->isCopy()) continue;
if (UMI->isPHI()) {
Register PhiDef = UMI->getOperand(0).getReg();
if (Comp.count(PhiDef)) continue; // co-widened
return false;
}
unsigned OpIdx = UMI->getOperandNo(&MO);
const TargetRegisterClass *Expected =
TII->getRegClass(UMI->getDesc(), OpIdx);
if (!Expected) continue;
if (Expected == &W65816::Wide16RegClass) continue;
if (Expected->hasSubClassEq(&W65816::Wide16RegClass)) continue;
// Expected is narrower than Wide16 (e.g., Acc16-only tied
// input). Mark for runtime narrowing — we'll insert a COPY
// at apply time.
if (NarrowSites) NarrowSites->push_back(&MO);
}
return true;
};
bool ok = true;
SmallVector<MachineOperand *, 8> NarrowSites;
for (Register V : Comp) {
if (!MRI.hasOneDef(V)) { ok = false; break; }
if (flowsToIncompatiblePhysReg(V, MRI)) { ok = false; break; }
if (comesFromIncompatiblePhysReg(V, MRI)) { ok = false; break; }
if (!usesAcceptInSCC(V, &NarrowSites)) { ok = false; break; }
}
if (!ok) continue;
// Apply widening. First insert bridge COPYs at predecessor edges
// for external (non-Comp) Acc16 incomings to each PHI in Comp.
SmallVector<std::pair<MachineInstr *, unsigned>, 16> BridgeSites;
for (Register V : Comp) {
MachineInstr *DM = &*MRI.def_instructions(V).begin();
if (!DM->isPHI()) continue;
for (unsigned i = 1, e = DM->getNumOperands(); i < e; i += 2) {
MachineOperand &MO = DM->getOperand(i);
if (!MO.isReg() || !MO.getReg().isVirtual()) continue;
Register Inc = MO.getReg();
if (Comp.count(Inc)) continue; // in-SCC, no bridge needed
// External incoming: ensure it's currently Acc16; if so, we'll
// insert a COPY at the predecessor block's end.
if (MRI.getRegClass(Inc) != &W65816::Acc16RegClass &&
MRI.getRegClass(Inc) != &W65816::Wide16RegClass) {
ok = false;
break;
}
BridgeSites.push_back({DM, i});
}
if (!ok) break;
}
if (!ok) continue;
// Insert bridges.
for (auto &Site : BridgeSites) {
MachineInstr *PhiMI = Site.first;
unsigned OpIdx = Site.second;
Register Inc = PhiMI->getOperand(OpIdx).getReg();
MachineBasicBlock *PredMBB = PhiMI->getOperand(OpIdx + 1).getMBB();
// If already Wide16 (e.g., another candidate widened it already),
// no bridge needed — but we still need the PHI incoming to use
// a Wide16 vreg. Use Inc directly.
if (MRI.getRegClass(Inc) == &W65816::Wide16RegClass) {
continue;
}
// Insert COPY before the predecessor's terminator(s).
auto InsertPos = PredMBB->getFirstTerminator();
DebugLoc DL = (InsertPos == PredMBB->end())
? PredMBB->findBranchDebugLoc()
: InsertPos->getDebugLoc();
Register WideInc = MRI.createVirtualRegister(&W65816::Wide16RegClass);
BuildMI(*PredMBB, InsertPos, DL, TII->get(TargetOpcode::COPY),
WideInc)
.addReg(Inc);
PhiMI->getOperand(OpIdx).setReg(WideInc);
PhiMI->getOperand(OpIdx).setIsKill(false);
}
// Force every SCC member to Img16 (IMG-only, no A). Using Wide16
// (A + IMG) doesn't work here: the Register Coalescer joins our
// Wide16 vregs with adjacent Acc16 vregs (intersection = Acc16)
// and narrows them back to A-only, defeating the widening. Img16
// intersects Acc16 to ∅, so the coalescer can't merge — the PHI
// stays in IMG. This is correct anyway for the common case (PHI
// live across a call): A is JSL-clobbered, so it can't carry the
// value through, and IMG8..15 is the right home.
for (Register V : Comp) {
MRI.setRegClass(V, &W65816::Img16RegClass);
}
// Insert narrowing COPYs at each narrow-class use site. Each site
// is `... = OP V, ...` where the operand requires Acc16 but V is
// now Wide16. Replace with `%Vacc = COPY V (Acc16); ... = OP %Vacc, ...`.
for (MachineOperand *MO : NarrowSites) {
MachineInstr *UMI = MO->getParent();
Register OldReg = MO->getReg();
Register NarrowReg =
MRI.createVirtualRegister(&W65816::Acc16RegClass);
DebugLoc DL = UMI->getDebugLoc();
BuildMI(*UMI->getParent(), UMI, DL, TII->get(TargetOpcode::COPY),
NarrowReg)
.addReg(OldReg);
MO->setReg(NarrowReg);
MO->setIsKill(false);
}
Changed = true;
}
#endif
// Why disabled (2026-05-13 attempt): // Why disabled (2026-05-13 attempt):
// - Widening PHI cycles to Wide16 (= A + IMG0..15) is undone by the // - Widening PHI cycles to Wide16 (= A + IMG0..15) is undone by the
// Register Coalescer: it joins our Wide16 vregs with adjacent // Register Coalescer: it joins our Wide16 vregs with adjacent

View file

@ -1,20 +1,26 @@
# tests/ubsan — UBSan-min smoke probe (Phase 6.2) # tests/ubsan — UBSan-min smoke probe (Phase 6.2)
Three-case probe that exercises the `-fsanitize=undefined Nine-case probe that exercises the `-fsanitize=undefined
-fsanitize-minimal-runtime` instrumentation end-to-end on the W65816 -fsanitize-minimal-runtime` instrumentation end-to-end on the W65816
target: target:
| Kind | UB | Sentinel | | Kind | UB | Sentinel |
|-----------------------|----------------------------------|--------------| |------------------------|----------------------------------|----------------------|
| `add-overflow` | i16 `INT_MAX + 1` | `$025000=0xC0DE` | | `add-overflow` | i16 `INT_MAX + 1` | `$025000=0xC0DE` |
| `shift-out-of-bounds` | u16 `1 << 17` | `$025002=0xC0DF` | | `shift-out-of-bounds` | u16 `1 << 17` | `$025002=0xC0DF` |
| `divrem-overflow` | i16 `n / 0` | `$025004=0xC0E0` | | `divrem-overflow` | i16 `n / 0` | `$025004=0xC0E0` |
| (liveness) | tail of `main` reached | `$025006=0xC0DA` | | `sub-overflow` | i16 `INT_MIN - 1` | `$025006=0xC0E1` |
| `mul-overflow` | i16 `INT_MAX * 2` | `$025008=0xC0E2` |
| `negate-overflow` | i16 `-INT_MIN` | `$02500A=0xC0E3` |
| `pointer-overflow` | `(char*)0xFFFFFFF0 + 0x40` | `$02500C=0xC0E4` |
| `load-invalid-value` | `_Bool` loaded from byte = 2 | `$02500E=0xC0E5` |
| `out-of-bounds` | `arr[idx>=N]` on static array | `$025010=0xC0E6` |
| (liveness) | tail of `main` reached | `$025012=0xC0DA` |
The probe ships strong override defs for the three `__ubsan_handle_*_minimal` The probe ships strong override defs for the nine `__ubsan_handle_*_minimal`
recovering handlers it exercises; the remaining 22 are pulled in from recovering handlers it exercises; the remaining handlers are pulled in
`runtime/ubsan.o` so any extra UB site clang emits (e.g. constant-fold from `runtime/ubsan.o` so any extra UB site clang emits (e.g. constant-
overflow at `-O2`) still resolves cleanly. fold overflow at `-O2`) still resolves cleanly.
## Build + run ## Build + run
@ -27,8 +33,14 @@ Expected output:
MAME-READ addr=0x025000 val=0xc0de MAME-READ addr=0x025000 val=0xc0de
MAME-READ addr=0x025002 val=0xc0df MAME-READ addr=0x025002 val=0xc0df
MAME-READ addr=0x025004 val=0xc0e0 MAME-READ addr=0x025004 val=0xc0e0
MAME-READ addr=0x025006 val=0xc0da MAME-READ addr=0x025006 val=0xc0e1
MAME OK: 4 reads matched MAME-READ addr=0x025008 val=0xc0e2
MAME-READ addr=0x02500a val=0xc0e3
MAME-READ addr=0x02500c val=0xc0e4
MAME-READ addr=0x02500e val=0xc0e5
MAME-READ addr=0x025010 val=0xc0e6
MAME-READ addr=0x025012 val=0xc0da
MAME OK: 10 reads matched
``` ```
## What this probe is NOT ## What this probe is NOT
@ -39,9 +51,14 @@ MAME OK: 4 reads matched
overrides the handlers so it can verify the *call edge* without overrides the handlers so it can verify the *call edge* without
pulling in console code. A separate diagnostic-format probe would pulling in console code. A separate diagnostic-format probe would
link `libc.o` + `libcGno.o` + GNO crt0 and assert on stderr. link `libc.o` + `libcGno.o` + GNO crt0 and assert on stderr.
- It is **not** a sweep of all 25 handler kinds. The user-spec scope - It is **not** a sweep of all 25 handler kinds. The kinds covered
is "3 representative kinds". The other 22 are link-tested are all the cheap-to-trigger recoverable handlers that clang emits
implicitly by `runtime/ubsan.o`'s symbol set being available. at `-O2` for the W65816 target. Aborting-only kinds (e.g.
`builtin_unreachable_minimal`, `missing_return_minimal`) cannot be
exercised here because returning from the handler after the IR
`unreachable` is itself UB. Float-cast-overflow / VLA-not-positive
/ type-mismatch / CFI / Objective-C kinds are linked but not
triggered.
## Files ## Files

View file

@ -7,12 +7,14 @@
# What this verifies: # What this verifies:
# - clang accepts -fsanitize=undefined -fsanitize-minimal-runtime on # - clang accepts -fsanitize=undefined -fsanitize-minimal-runtime on
# the w65816 target. # the w65816 target.
# - The three exercised UB kinds (add-overflow / shift-out-of-bounds / # - Nine exercised UB kinds (add-overflow / shift-out-of-bounds /
# divrem-overflow) instrument as expected — the handler-fired byte # divrem-overflow / sub-overflow / mul-overflow / negate-overflow /
# flips inside the per-kind handler override. # pointer-overflow / load-invalid-value / out-of-bounds) instrument
# as expected -- the handler-fired byte flips inside the per-kind
# handler override.
# - The recovering minimal runtime returns to the caller cleanly, so # - The recovering minimal runtime returns to the caller cleanly, so
# the probe continues writing sentinels past each UB site. # the probe continues writing sentinels past each UB site.
# - runtime/ubsan.o links + resolves the other 22 handler kinds without # - runtime/ubsan.o links + resolves the other handler kinds without
# pulling in console code that the probe doesn't need. # pulling in console code that the probe doesn't need.
set -eu set -eu
@ -27,7 +29,7 @@ bash "$SCRIPT_DIR/build.sh"
# Link. crt0.o + the probe + ubsan.o + libgcc.o (for the i16 div+rem # Link. crt0.o + the probe + ubsan.o + libgcc.o (for the i16 div+rem
# helpers triggerDivByZero needs). We deliberately do NOT link libc.o # helpers triggerDivByZero needs). We deliberately do NOT link libc.o
# the probe sets memory sentinels directly, doesn't call printf, and # -- the probe sets memory sentinels directly, doesn't call printf, and
# pulling libc.o in would also pull snprintf.o (~9 KB) for no benefit. # pulling libc.o in would also pull snprintf.o (~9 KB) for no benefit.
"$PROJECT_ROOT/tools/link816" -o ubsanProbe.bin \ "$PROJECT_ROOT/tools/link816" -o ubsanProbe.bin \
--text-base 0x1000 --bss-base 0xA000 --map ubsanProbe.map \ --text-base 0x1000 --bss-base 0xA000 --map ubsanProbe.map \
@ -39,11 +41,22 @@ bash "$SCRIPT_DIR/build.sh"
ls -la ubsanProbe.bin ls -la ubsanProbe.bin
echo "" echo ""
# Sentinels: # Sentinels (one per recoverable handler exercised, plus a tail
# liveness sentinel). Each is a 16-bit write at $025000+kind*2.
# $025000 = 0xC0DE add-overflow handler fired # $025000 = 0xC0DE add-overflow handler fired
# $025002 = 0xC0DF shift-out-of-bounds handler fired # $025002 = 0xC0DF shift-out-of-bounds handler fired
# $025004 = 0xC0E0 divrem-overflow handler fired # $025004 = 0xC0E0 divrem-overflow handler fired
# $025006 = 0xC0DA all three recovered and main reached its tail # $025006 = 0xC0E1 sub-overflow handler fired
# $025008 = 0xC0E2 mul-overflow handler fired
# $02500A = 0xC0E3 negate-overflow handler fired
# $02500C = 0xC0E4 pointer-overflow handler fired
# $02500E = 0xC0E5 load-invalid-value handler fired
# $025010 = 0xC0E6 out-of-bounds handler fired
# $025012 = 0xC0DA all nine recovered and main reached its tail
bash "$PROJECT_ROOT/scripts/runInMame.sh" \ bash "$PROJECT_ROOT/scripts/runInMame.sh" \
"$SCRIPT_DIR/ubsanProbe.bin" \ "$SCRIPT_DIR/ubsanProbe.bin" \
--check 0x025000=C0DE 0x025002=C0DF 0x025004=C0E0 0x025006=C0DA --check \
0x025000=C0DE 0x025002=C0DF 0x025004=C0E0 \
0x025006=C0E1 0x025008=C0E2 0x02500A=C0E3 \
0x02500C=C0E4 0x02500E=C0E5 0x025010=C0E6 \
0x025012=C0DA

View file

@ -1,35 +1,47 @@
// Phase 6.2 UBSan-min smoke probe. // Phase 6.2 UBSan-min smoke probe.
// //
// Three UB cases (one each from the spec): // Nine UB cases — one per recoverable handler kind we exercise:
// kind 0 (sentinel 0xC0DE): signed-overflow add (i16 INT_MAX + 1) // kind 0 (sentinel 0xC0DE): add-overflow (i16 INT_MAX + 1)
// kind 1 (sentinel 0xC0DF): shift-out-of-bounds (1 << 17 on a u16) // kind 1 (sentinel 0xC0DF): shift-out-of-bounds (1 << 17 on a u16)
// kind 2 (sentinel 0xC0E0): divide-by-zero (n / 0) // kind 2 (sentinel 0xC0E0): divrem-overflow (n / 0)
// kind 3 (sentinel 0xC0E1): sub-overflow (INT_MIN - 1)
// kind 4 (sentinel 0xC0E2): mul-overflow (INT_MAX * 2)
// kind 5 (sentinel 0xC0E3): negate-overflow (-INT_MIN)
// kind 6 (sentinel 0xC0E4): pointer-overflow (ptr + huge offset)
// kind 7 (sentinel 0xC0E5): load-invalid-value (_Bool from byte=2)
// kind 8 (sentinel 0xC0E6): out-of-bounds (arr[idx>=N])
// //
// The probe overrides the three relevant `__ubsan_handle_*_minimal` // The probe overrides each relevant `__ubsan_handle_*_minimal` recovering
// recovering handlers with strong definitions that record their // handler with a strong definition that records its firing in a static
// firing in a static state byte. After each UB, the probe writes // state byte. After each UB, the probe writes 0xC0DE+kind to a per-kind
// 0xC0DE + kind to $025000 to prove (a) the instrumentation fired and // 16-bit slot at 0x025000+kind*2 to prove (a) the instrumentation fired
// (b) execution recovered cleanly past the UB. The recover handler // and (b) execution recovered cleanly past the UB. The recover handler
// returning normally is the whole point of -fsanitize-minimal-runtime // returning normally is the whole point of -fsanitize-minimal-runtime
// + -fsanitize-recover; this probe is what proves the round-trip. // + -fsanitize-recover; this probe is what proves the round-trip.
// //
// To verify all three at once we cascade the sentinel writes through a // To verify all nine at once we cascade the sentinel writes through a
// staircase of $025000 / $025002 / $025004 word stores so the smoke // staircase of word stores so the smoke harness can read independent
// harness can read three independent 16-bit values back from MAME. // 16-bit values back from MAME.
// //
// Compile with -fsanitize=undefined -fsanitize-minimal-runtime. // Compile with -fsanitize=undefined -fsanitize-minimal-runtime.
#include <stdint.h> #include <stdint.h>
// Bank-2 BSS at $025000-$025006 — outside the SHR shadow and outside // Bank-2 BSS at $025000-$025014 -- outside the SHR shadow and outside
// $C000-$CFFF IO window. link816 places .bss at the user-specified // $C000-$CFFF IO window. link816 places .bss at the user-specified
// --bss-base (we pass 0xA000) so these constant addresses are // --bss-base (we pass 0xA000) so these constant addresses are
// independent of BSS layout. // independent of BSS layout.
#define MARK_ADD_OVF ((volatile uint16_t *)0x025000UL) #define MARK_ADD_OVF ((volatile uint16_t *)0x025000UL)
#define MARK_SHIFT_OOB ((volatile uint16_t *)0x025002UL) #define MARK_SHIFT_OOB ((volatile uint16_t *)0x025002UL)
#define MARK_DIV_ZERO ((volatile uint16_t *)0x025004UL) #define MARK_DIV_ZERO ((volatile uint16_t *)0x025004UL)
#define DONE_SENTINEL ((volatile uint16_t *)0x025006UL) #define MARK_SUB_OVF ((volatile uint16_t *)0x025006UL)
#define MARK_MUL_OVF ((volatile uint16_t *)0x025008UL)
#define MARK_NEG_OVF ((volatile uint16_t *)0x02500AUL)
#define MARK_PTR_OVF ((volatile uint16_t *)0x02500CUL)
#define MARK_LOAD_INVAL ((volatile uint16_t *)0x02500EUL)
#define MARK_OUT_OF_BNDS ((volatile uint16_t *)0x025010UL)
#define DONE_SENTINEL ((volatile uint16_t *)0x025012UL)
// Strong overrides win over runtime/ubsan.o's weak-by-link defaults. // Strong overrides win over runtime/ubsan.o's weak-by-link defaults.
@ -39,6 +51,12 @@
static volatile uint8_t handlerFiredAdd = 0; static volatile uint8_t handlerFiredAdd = 0;
static volatile uint8_t handlerFiredShift = 0; static volatile uint8_t handlerFiredShift = 0;
static volatile uint8_t handlerFiredDiv = 0; static volatile uint8_t handlerFiredDiv = 0;
static volatile uint8_t handlerFiredSub = 0;
static volatile uint8_t handlerFiredMul = 0;
static volatile uint8_t handlerFiredNeg = 0;
static volatile uint8_t handlerFiredPtr = 0;
static volatile uint8_t handlerFiredLoadInv = 0;
static volatile uint8_t handlerFiredOob = 0;
void __ubsan_handle_add_overflow_minimal(void) { void __ubsan_handle_add_overflow_minimal(void) {
@ -56,6 +74,36 @@ void __ubsan_handle_divrem_overflow_minimal(void) {
} }
// Minimal-runtime hook for the signed-subtraction overflow check.
// Sets handlerFiredSub and returns, so the instrumented code carries on
// and main() can observe that the check tripped.
void __ubsan_handle_sub_overflow_minimal(void) {
    handlerFiredSub = 1;
}
// Minimal-runtime hook for the signed-multiplication overflow check.
// Sets handlerFiredMul and returns, so the instrumented code carries on
// and main() can observe that the check tripped.
void __ubsan_handle_mul_overflow_minimal(void) {
    handlerFiredMul = 1;
}
// Minimal-runtime hook for the signed-negation overflow check.
// Sets handlerFiredNeg and returns, so the instrumented code carries on
// and main() can observe that the check tripped.
void __ubsan_handle_negate_overflow_minimal(void) {
    handlerFiredNeg = 1;
}
// Minimal-runtime hook for the pointer-arithmetic overflow check.
// Sets handlerFiredPtr and returns, so the instrumented code carries on
// and main() can observe that the check tripped.
void __ubsan_handle_pointer_overflow_minimal(void) {
    handlerFiredPtr = 1;
}
// Minimal-runtime hook for the invalid-value load check (e.g. a _Bool
// whose stored byte is neither 0 nor 1). Sets handlerFiredLoadInv and
// returns, so main() can observe that the check tripped.
void __ubsan_handle_load_invalid_value_minimal(void) {
    handlerFiredLoadInv = 1;
}
// Minimal-runtime hook for the array-index bounds check.
// Sets handlerFiredOob and returns, so the instrumented code carries on
// and main() can observe that the check tripped.
void __ubsan_handle_out_of_bounds_minimal(void) {
    handlerFiredOob = 1;
}
// Each UB site goes through a noinline wrapper so the optimizer // Each UB site goes through a noinline wrapper so the optimizer
// cannot constant-fold the operation away. __attribute__((noinline)) // cannot constant-fold the operation away. __attribute__((noinline))
// + volatile inputs blocks the obvious folding paths; we also wrap // + volatile inputs blocks the obvious folding paths; we also wrap
@ -79,6 +127,47 @@ static int16_t triggerDivByZero(int16_t a, int16_t b) {
} }
// UB site for the sub-overflow check: returns a - b.
// Kept out of line so the subtraction is performed on run-time operands
// rather than being folded at the call site.
__attribute__((noinline))
static int16_t triggerSubOverflow(int16_t a, int16_t b) {
    const int16_t difference = a - b;
    return difference;
}
// UB site for the mul-overflow check: returns a * b.
// Kept out of line so the multiplication is performed on run-time
// operands rather than being folded at the call site.
__attribute__((noinline))
static int16_t triggerMulOverflow(int16_t a, int16_t b) {
    const int16_t product = a * b;
    return product;
}
// UB site for the negate-overflow check: returns -a.
// The unary minus is deliberate; it is the negation itself that the
// probe wants instrumented. Kept out of line so the operand is only
// known at run time.
__attribute__((noinline))
static int16_t triggerNegateOverflow(int16_t a) {
    const int16_t negated = -a;
    return negated;
}
// UB site for the pointer-overflow check: returns p advanced by o
// bytes. Kept out of line so both the base pointer and the offset are
// only known at run time.
__attribute__((noinline))
static char *triggerPointerOverflow(char *p, int32_t o) {
    char *const advanced = p + o;
    return advanced;
}
// UB site for the load-invalid-value check: reads the byte at p as a
// _Bool and returns 1 if it is truthy, 0 otherwise.
//
// p: address of the byte to reinterpret. A stored value other than 0 or
//    1 is what the instrumented load is expected to flag.
//
// Kept out of line so the pointee is only known at run time.
__attribute__((noinline))
static int triggerLoadInvalidValue(volatile uint8_t *p) {
    // Preserve the volatile qualifier across the reinterpreting cast.
    // Dropping it would read a volatile-defined object through a
    // non-volatile lvalue (undefined in its own right) and would let
    // the compiler treat the access as an ordinary, removable load.
    _Bool v = *(volatile _Bool *)p;
    // Consume the value so the load has a use. The exact 0/1 result for
    // an invalid byte is not relied on -- what matters is that the load
    // itself executed under instrumentation.
    return v ? 1 : 0;
}
// UB site for the out-of-bounds check: returns element idx of a fixed
// four-entry table { 10, 20, 30, 40 }. Valid indices are 0..3; the
// index is taken as-is with no range check of its own. Kept out of
// line so idx is only known at run time.
__attribute__((noinline))
static int16_t triggerOutOfBounds(int16_t idx) {
    static int16_t table[4] = { 10, 20, 30, 40 };
    const int16_t picked = table[idx];
    return picked;
}
int main(void) { int main(void) {
// --- case 0: signed-overflow add (INT16_MAX + 1) --- // --- case 0: signed-overflow add (INT16_MAX + 1) ---
volatile int16_t aMax = 0x7FFF; volatile int16_t aMax = 0x7FFF;
@ -104,12 +193,58 @@ int main(void) {
*MARK_DIV_ZERO = 0xC0E0; *MARK_DIV_ZERO = 0xC0E0;
} }
// Final liveness sentinel — only written if we got past all three // --- case 3: sub-overflow (INT16_MIN - 1) ---
volatile int16_t aMin = (int16_t)0x8000;
(void)triggerSubOverflow(aMin, aOne);
if (handlerFiredSub) {
*MARK_SUB_OVF = 0xC0E1;
}
// --- case 4: mul-overflow (INT16_MAX * 2 wraps) ---
volatile int16_t aTwo = 2;
(void)triggerMulOverflow(aMax, aTwo);
if (handlerFiredMul) {
*MARK_MUL_OVF = 0xC0E2;
}
// --- case 5: negate-overflow (-INT16_MIN) ---
(void)triggerNegateOverflow(aMin);
if (handlerFiredNeg) {
*MARK_NEG_OVF = 0xC0E3;
}
// --- case 6: pointer-overflow (signed-wrap on i16 addr) ---
// Cast a high address to char* and add a positive offset that
// overflows the address calculation. -fsanitize=pointer-overflow
// fires on signed-overflow of the offset add.
volatile uint32_t hiAddr = 0xFFFFFFF0UL;
volatile int32_t big = 0x40;
char *p = (char *)(uintptr_t)hiAddr;
(void)triggerPointerOverflow(p, big);
if (handlerFiredPtr) {
*MARK_PTR_OVF = 0xC0E4;
}
// --- case 7: load-invalid-value (_Bool from byte=2) ---
volatile uint8_t boolByte = 2;
(void)triggerLoadInvalidValue(&boolByte);
if (handlerFiredLoadInv) {
*MARK_LOAD_INVAL = 0xC0E5;
}
// --- case 8: out-of-bounds (static arr[idx>=N]) ---
volatile int16_t badIdx = 7;
(void)triggerOutOfBounds(badIdx);
if (handlerFiredOob) {
*MARK_OUT_OF_BNDS = 0xC0E6;
}
// Final liveness sentinel -- only written if we got past all nine
// UB sites without the runtime aborting (which would have spun on // UB sites without the runtime aborting (which would have spun on
// a BRK_pseudo at $70 instead of reaching here). // a BRK_pseudo at $70 instead of reaching here).
*DONE_SENTINEL = 0xC0DA; *DONE_SENTINEL = 0xC0DA;
// Halt — crt0's return-from-main path hits a BRK that headless // Halt -- crt0's return-from-main path hits a BRK that headless
// MAME wild-jumps from, so spin-wait instead. // MAME wild-jumps from, so spin-wait instead.
while (1) { while (1) {
} }

View file

@ -13,7 +13,7 @@
"num": 1, "num": 1,
"name": "SEG1", "name": "SEG1",
"base": "0x001000", "base": "0x001000",
"size": 3432, "size": 5084,
"image": "ubsanProbe.bin", "image": "ubsanProbe.bin",
"entry_offset": "0x0000" "entry_offset": "0x0000"
} }
@ -22,6 +22,12 @@
{"addr": "0x025000", "expect": "0xC0DE", "label": "add-overflow handler fired"}, {"addr": "0x025000", "expect": "0xC0DE", "label": "add-overflow handler fired"},
{"addr": "0x025002", "expect": "0xC0DF", "label": "shift-out-of-bounds handler fired"}, {"addr": "0x025002", "expect": "0xC0DF", "label": "shift-out-of-bounds handler fired"},
{"addr": "0x025004", "expect": "0xC0E0", "label": "divrem-overflow handler fired"}, {"addr": "0x025004", "expect": "0xC0E0", "label": "divrem-overflow handler fired"},
{"addr": "0x025006", "expect": "0xC0DA", "label": "main reached tail after all three recoveries"} {"addr": "0x025006", "expect": "0xC0E1", "label": "sub-overflow handler fired"},
{"addr": "0x025008", "expect": "0xC0E2", "label": "mul-overflow handler fired"},
{"addr": "0x02500A", "expect": "0xC0E3", "label": "negate-overflow handler fired"},
{"addr": "0x02500C", "expect": "0xC0E4", "label": "pointer-overflow handler fired"},
{"addr": "0x02500E", "expect": "0xC0E5", "label": "load-invalid-value handler fired"},
{"addr": "0x025010", "expect": "0xC0E6", "label": "out-of-bounds handler fired"},
{"addr": "0x025012", "expect": "0xC0DA", "label": "main reached tail after all nine recoveries"}
] ]
} }