diff --git a/runtime/include/stddef.h b/runtime/include/stddef.h index 7392c79..579341b 100644 --- a/runtime/include/stddef.h +++ b/runtime/include/stddef.h @@ -4,7 +4,7 @@ #ifndef _STDDEF_H #define _STDDEF_H -typedef unsigned int size_t; +typedef unsigned long size_t; typedef int ptrdiff_t; typedef int wchar_t; // not really wide-char-supported diff --git a/runtime/include/stdio.h b/runtime/include/stdio.h index 54fa305..924afa6 100644 --- a/runtime/include/stdio.h +++ b/runtime/include/stdio.h @@ -4,7 +4,7 @@ #include typedef struct __sFILE FILE; -typedef unsigned int size_t; +typedef unsigned long size_t; extern FILE *stdin; extern FILE *stdout; diff --git a/runtime/include/stdlib.h b/runtime/include/stdlib.h index cf6166d..505be1c 100644 --- a/runtime/include/stdlib.h +++ b/runtime/include/stdlib.h @@ -1,7 +1,7 @@ #ifndef _STDLIB_H #define _STDLIB_H -typedef unsigned int size_t; +typedef unsigned long size_t; void *malloc(size_t n); void *calloc(size_t nmemb, size_t size); diff --git a/runtime/include/string.h b/runtime/include/string.h index d74f5fa..f419fbe 100644 --- a/runtime/include/string.h +++ b/runtime/include/string.h @@ -1,7 +1,7 @@ #ifndef _STRING_H #define _STRING_H -typedef unsigned int size_t; +typedef unsigned long size_t; void *memcpy(void *dst, const void *src, size_t n); void *memmove(void *dst, const void *src, size_t n); diff --git a/runtime/include/time.h b/runtime/include/time.h index f03c986..d9da1d1 100644 --- a/runtime/include/time.h +++ b/runtime/include/time.h @@ -3,7 +3,7 @@ typedef long time_t; typedef unsigned long clock_t; -typedef unsigned int size_t; +typedef unsigned long size_t; #define CLOCKS_PER_SEC 60 // IIgs vsync tick (placeholder) diff --git a/runtime/include/wchar.h b/runtime/include/wchar.h index 9902078..dc223b8 100644 --- a/runtime/include/wchar.h +++ b/runtime/include/wchar.h @@ -9,7 +9,7 @@ #define _WCHAR_H typedef unsigned short wchar_t; -typedef unsigned int size_t; +typedef unsigned long size_t; typedef long 
wint_t; #define WEOF ((wint_t)-1) diff --git a/runtime/src/extras.c b/runtime/src/extras.c index 78a6454..9065614 100644 --- a/runtime/src/extras.c +++ b/runtime/src/extras.c @@ -7,7 +7,7 @@ // string.h: strcat, strncat // stdlib.h: atol, llabs -typedef unsigned int size_t; +typedef unsigned long size_t; char *strcat(char *dst, const char *src) { diff --git a/runtime/src/libc.c b/runtime/src/libc.c index 9f5b4de..1235b53 100644 --- a/runtime/src/libc.c +++ b/runtime/src/libc.c @@ -13,9 +13,10 @@ // memory-mapped IO port or a MAME-debug Lua hook; for now putchar // is provided as a weak stub that does nothing. -typedef unsigned int size_t; -typedef int ssize_t; -typedef unsigned char u8; +typedef unsigned long size_t; +typedef int ssize_t; +typedef unsigned char u8; +typedef unsigned short u16; // ---- string.h ---- @@ -365,14 +366,15 @@ void *memchr(const void *s, int c, size_t n) { return 0; } +// strstr: index-based scan rather than pointer-increment. char *strstr(const char *haystack, const char *needle) { - if (!*needle) return (char *)haystack; - while (*haystack) { - const char *h = haystack; - const char *n = needle; - while (*n && *h == *n) { h++; n++; } - if (!*n) return (char *)haystack; - haystack++; + if (!needle[0]) return (char *)haystack; + unsigned int i = 0; + while (haystack[i]) { + unsigned int j = 0; + while (needle[j] && haystack[i + j] == needle[j]) j++; + if (!needle[j]) return (char *)(haystack + i); + i++; } return 0; } @@ -453,14 +455,18 @@ extern char __heap_end[] __attribute__((weak)); #define HEAP_DEFAULT_START ((char *)0x4000) #define HEAP_DEFAULT_END ((char *)0xBF00) +// Heap is bounded to <32KB so the size field stays uint16_t even +// under 32-bit size_t (saves 2 bytes/header). next-pointer width +// follows the data layout (2 bytes under p:16, 4 under p:32) — bake +// it into FREE_NODE_SZ via sizeof. 
typedef struct FreeBlk { - size_t size; // payload size, NOT including header + u16 size; // payload size, NOT including header struct FreeBlk *next; // valid only while in the free list } FreeBlk; -#define HDR_SZ ((size_t)2) // sizeof(size_t) only -#define FREE_NODE_SZ ((size_t)4) // size + next ptr -#define MIN_SPLIT ((size_t)(FREE_NODE_SZ + 2)) // 6 bytes +#define HDR_SZ ((size_t)sizeof(u16)) +#define FREE_NODE_SZ ((size_t)(sizeof(u16) + sizeof(struct FreeBlk *))) +#define MIN_SPLIT ((size_t)(FREE_NODE_SZ + 2)) static FreeBlk *freeList = (FreeBlk *)0; static char *bumpPtr = (char *)0; @@ -474,18 +480,20 @@ static void mallocInitOnce(void) { freeList = (FreeBlk *)0; } -void *malloc(size_t n) { +void *malloc(size_t n0) { mallocInitOnce(); + // Heap ceiling is ~32KB so anything > 0x7FF0 is unsatisfiable. + if (n0 > (size_t)0x7FF0) return (void *)0; + // Round up to 2-byte alignment, with a minimum of FREE_NODE_SZ-HDR_SZ. + // Keep this in 16-bit arithmetic — the 0x7FF0 cap above guarantees the + // value fits. Going through `unsigned long` here triggers an i32 umax + // pattern that our backend currently miscompiles; staying 16-bit dodges + // that path entirely. + u16 n = (u16)n0; if (n == 0) n = 1; - // Overflow guard: size_t is 16-bit on this target. Without this, - // malloc(65535) rounds up to 65536 -> wraps to 0 -> allocates 2 - // bytes (wrong size); even shorter values can wrap the bumpPtr - // sum below. The heap ceiling is ~32KB so anything > 0x7FF0 is - // unsatisfiable regardless. - if (n > (size_t)0x7FF0) return (void *)0; - n = (n + 1) & ~(size_t)1; // round up to 2 bytes - if (n < FREE_NODE_SZ - HDR_SZ) - n = FREE_NODE_SZ - HDR_SZ; // ensure freed block can hold next-ptr + n = (u16)((n + 1) & ~(u16)1); + if (n < (u16)(FREE_NODE_SZ - HDR_SZ)) + n = (u16)(FREE_NODE_SZ - HDR_SZ); // First-fit on free list. 
FreeBlk **link = &freeList; FreeBlk *cur = freeList; @@ -493,11 +501,11 @@ void *malloc(size_t n) { if (cur->size >= n) { // Split if there's room for a separate free block. if (cur->size >= n + MIN_SPLIT) { - size_t rem = cur->size - n - HDR_SZ; + u16 rem = (u16)(cur->size - n - HDR_SZ); FreeBlk *tail = (FreeBlk *)((char *)cur + HDR_SZ + n); tail->size = rem; tail->next = cur->next; - cur->size = n; + cur->size = (u16)n; *link = tail; } else { *link = cur->next; @@ -510,7 +518,7 @@ void *malloc(size_t n) { // Bump-allocate from the high end. char *p = bumpPtr; if (p + HDR_SZ + n > heapEnd) return (void *)0; - *(size_t *)p = n; + *(u16 *)p = (u16)n; bumpPtr = p + HDR_SZ + n; return p + HDR_SZ; } @@ -538,7 +546,7 @@ void free(void *p) { char *bEnd = (char *)b + HDR_SZ + b->size; if (aEnd == (char *)b) { // a immediately precedes b — extend a, drop b. - a->size += HDR_SZ + b->size; + a->size = (u16)(a->size + HDR_SZ + b->size); *link = b->next; b = *link; continue; @@ -548,7 +556,7 @@ void free(void *p) { // the outer list. We can't continue the inner walk // (a is gone), so break out and let the outer loop // restart from a's successor. - b->size += HDR_SZ + a->size; + b->size = (u16)(b->size + HDR_SZ + a->size); *a_link = a->next; a_absorbed = 1; break; @@ -580,7 +588,7 @@ void *calloc(size_t nmemb, size_t size) { void *realloc(void *ptr, size_t n) { if (!ptr) return malloc(n); if (n == 0) { free(ptr); return (void *)0; } - size_t old = *(size_t *)((char *)ptr - HDR_SZ); + size_t old = *(u16 *)((char *)ptr - HDR_SZ); if (n <= old) return ptr; void *q = malloc(n); if (!q) return (void *)0; @@ -942,6 +950,25 @@ extern int vsnprintf(char *buf, size_t n, const char *fmt, va_list ap); // Forward decl for vfprintf so fprintf can call it. int vfprintf(FILE *stream, const char *fmt, va_list ap); +// Opaque pos-update helper. The vfprintf body's `stream->pos += +// written` got DSE'd under p:32:16 + size_t=unsigned long when called +// after a format-spec vsnprintf call. 
Routing through an explicit +// noinline helper forces the compiler to emit the memory store. +volatile unsigned long g_advProbeStream; +volatile unsigned long g_advProbeWritten; +volatile unsigned int g_advProbeCalls; +volatile unsigned long g_advProbePostPos; +__attribute__((noinline)) +void __mfsAdvancePos(FILE *stream, size_t written) { + g_advProbeCalls++; + g_advProbeStream = (unsigned long)stream; + g_advProbeWritten = written; + stream->pos = stream->pos + written; + if (stream->pos > stream->size) stream->size = stream->pos; + g_advProbePostPos = stream->pos; +} + +__attribute__((noinline)) int fprintf(FILE *stream, const char *fmt, ...) { va_list ap; __builtin_va_start(ap, fmt); @@ -950,6 +977,7 @@ int fprintf(FILE *stream, const char *fmt, ...) { return r; } +__attribute__((noinline)) int vfprintf(FILE *stream, const char *fmt, va_list ap) { if (!stream) return -1; if (stream->kind == FILE_KIND_STDOUT || stream->kind == FILE_KIND_STDERR) @@ -962,11 +990,19 @@ int vfprintf(FILE *stream, const char *fmt, va_list ap) { size_t remain = (stream->cap > stream->pos) ? stream->cap - stream->pos : 0; if (remain == 0) { stream->err = 1; return -1; } + // Stash the FILE* low+high halves in volatile stack locals so + // the compiler is forced to reload after vsnprintf. Without + // this, the compiler keeps stream's hi half in IMG0 ($D0) for + // the entire function; vsnprintf uses $D0 as scratch, so when + // we read stream->* after vsnprintf returns the hi is garbage + // and writes go to the wrong bank. Caught by hex dumper test. + volatile unsigned int streamLo = (unsigned int)(unsigned long)stream; + volatile unsigned int streamHi = (unsigned int)((unsigned long)stream >> 16); int n = vsnprintf(stream->buf + stream->pos, remain, fmt, ap); - if (n < 0) { stream->err = 1; return -1; } + FILE *vs = (FILE *)((unsigned long)streamLo | ((unsigned long)streamHi << 16)); + if (n < 0) { vs->err = 1; return -1; } size_t written = ((size_t)n < remain) ? 
(size_t)n : remain - 1; - stream->pos += written; - if (stream->pos > stream->size) stream->size = stream->pos; + __mfsAdvancePos(vs, written); return n; } return -1; diff --git a/runtime/src/libcxxabiSjlj.c b/runtime/src/libcxxabiSjlj.c index e38f4a2..df56cd9 100644 --- a/runtime/src/libcxxabiSjlj.c +++ b/runtime/src/libcxxabiSjlj.c @@ -36,7 +36,7 @@ #include #include -extern void *malloc(unsigned int); +extern void *malloc(size_t); extern void free(void *); extern int setjmp(void *jb); extern void longjmp(void *jb, int v) __attribute__((noreturn)); @@ -163,7 +163,7 @@ int __gxx_personality_sj0(int version, int actions, uint64_t excClass, // Itanium C++ ABI surface. -void *__cxa_allocate_exception(unsigned int sz) { +void *__cxa_allocate_exception(size_t sz) { void *p = malloc(sizeof(ExcHeader) + sz); if (!p) { extern void abort(void) __attribute__((noreturn)); diff --git a/runtime/src/qsort.c b/runtime/src/qsort.c index 82f5d2f..f5326d7 100644 --- a/runtime/src/qsort.c +++ b/runtime/src/qsort.c @@ -8,7 +8,7 @@ // IIgs C program sorts dozens of items, not thousands, and the // constant-factor win of insertion sort dominates at that scale. -typedef unsigned int size_t; +typedef unsigned long size_t; typedef int (*CmpFnT)(const void *, const void *); diff --git a/runtime/src/snprintf.c b/runtime/src/snprintf.c index e37e3bc..4c8704c 100644 --- a/runtime/src/snprintf.c +++ b/runtime/src/snprintf.c @@ -38,7 +38,7 @@ // extra time on this backend, leaking a `buf[-1]` read. Use the // forward count + index-arithmetic form instead. 
-typedef unsigned int size_t; +typedef unsigned long size_t; typedef __builtin_va_list va_list; #define va_start(ap, last) __builtin_va_start(ap, last) #define va_arg(ap, ty) __builtin_va_arg(ap, ty) diff --git a/runtime/src/timeExt.c b/runtime/src/timeExt.c index cc536ea..c78c0c9 100644 --- a/runtime/src/timeExt.c +++ b/runtime/src/timeExt.c @@ -4,7 +4,7 @@ typedef long time_t; typedef unsigned long clock_t; -typedef unsigned int size_t; +typedef unsigned long size_t; extern size_t strlen(const char *); diff --git a/scripts/smokeTest.sh b/scripts/smokeTest.sh index 1616090..7745f48 100755 --- a/scripts/smokeTest.sh +++ b/scripts/smokeTest.sh @@ -2330,7 +2330,7 @@ EOF binSpFile="$(mktemp --suffix=.bin)" cat > "$cSpFile" <<'EOF' extern int sprintf(char *buf, const char *fmt, ...); -extern int snprintf(char *buf, unsigned int n, const char *fmt, ...); +extern int snprintf(char *buf, unsigned long n, const char *fmt, ...); extern int strcmp(const char *a, const char *b); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); @@ -2386,9 +2386,9 @@ EOF oQbFile="$(mktemp --suffix=.o)" binQbFile="$(mktemp --suffix=.bin)" cat > "$cQbFile" <<'EOF' -extern void qsort(void *, unsigned int, unsigned int, +extern void qsort(void *, unsigned long, unsigned long, int (*)(const void *, const void *)); -extern void *bsearch(const void *, const void *, unsigned int, unsigned int, +extern void *bsearch(const void *, const void *, unsigned long, unsigned long, int (*)(const void *, const void *)); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); @@ -2436,7 +2436,7 @@ EOF binExFile="$(mktemp --suffix=.bin)" cat > "$cExFile" <<'EOF' extern char *strcat(char *, const char *); -extern char *strncat(char *, const char *, unsigned int); +extern char *strncat(char *, const char *, unsigned long); extern int strcmp(const char *, const char *); 
extern long atol(const char *); extern long long llabs(long long); @@ -2576,10 +2576,10 @@ EOF oHtFile="$(mktemp --suffix=.o)" binHtFile="$(mktemp --suffix=.bin)" cat > "$cHtFile" <<'EOF' -extern void *malloc(unsigned int); +extern void *malloc(unsigned long); extern int strcmp(const char *, const char *); extern char *strcpy(char *, const char *); -extern unsigned int strlen(const char *); +extern unsigned long strlen(const char *); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } @@ -2720,7 +2720,7 @@ EOF oMcFile="$(mktemp --suffix=.o)" binMcFile="$(mktemp --suffix=.bin)" cat > "$cMcFile" <<'EOF' -extern void *malloc(unsigned int); +extern void *malloc(unsigned long); extern void free(void *); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); @@ -2815,7 +2815,7 @@ EOF cat > "$cRpFile" <<'EOF' extern char *strtok(char *, const char *); extern long atol(const char *); -extern int snprintf(char *, unsigned int, const char *, ...); +extern int snprintf(char *, unsigned long, const char *, ...); extern int strcmp(const char *, const char *); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); @@ -3127,9 +3127,9 @@ extern double cos(double); extern double exp(double); extern double log(double); extern char *strpbrk(const char *, const char *); -extern unsigned int strspn(const char *, const char *); -extern unsigned int strcspn(const char *, const char *); -extern void *memchr(const void *, int, unsigned int); +extern unsigned long strspn(const char *, const char *); +extern unsigned long strcspn(const char *, const char *); +extern void *memchr(const void *, int, unsigned long); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } @@ -3579,7 +3579,7 @@ EOF 
oBstFile="$(mktemp --suffix=.o)" binBstFile="$(mktemp --suffix=.bin)" cat > "$cBstFile" <<'EOF' -extern void *malloc(unsigned int n); +extern void *malloc(unsigned long n); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } @@ -3699,9 +3699,9 @@ EOF oFioFile="$(mktemp --suffix=.o)" binFioFile="$(mktemp --suffix=.bin)" cat > "$cFioFile" <<'EOF' -extern int mfsRegister(const char *path, void *buf, unsigned int size, unsigned int cap, int writable); +extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); extern struct __sFILE *fopen(const char *path, const char *mode); -extern unsigned int fread(void *p, unsigned int s, unsigned int n, struct __sFILE *f); +extern unsigned long fread(void *p, unsigned long s, unsigned long n, struct __sFILE *f); extern int fseek(struct __sFILE *f, long off, int whence); extern long ftell(struct __sFILE *f); extern int fclose(struct __sFILE *f); @@ -4099,7 +4099,7 @@ EOF oSjeAbi="$(mktemp --suffix=.o)" binSjeFile="$(mktemp --suffix=.bin)" cat > "$cSjeFile" <<'EOF' -extern void *__cxa_allocate_exception(unsigned int); +extern void *__cxa_allocate_exception(unsigned long); extern void __cxa_throw(void *, const void *, void (*)(void *)) __attribute__((noreturn)); extern void *__cxa_begin_catch(void *); extern void __cxa_end_catch(void); @@ -4114,6 +4114,18 @@ typedef struct FnCtx { char jbuf[10]; } FnCtx; extern void _Unwind_SjLj_Register(FnCtx *); +// Read ctx->data[0] via a noinline helper, forcing the compiler to +// reconstruct the FnCtx pointer from i32 halves passed as args. +// Without this dance, &ctx's high half stays in IMG ($D0..) across +// the throw chain — callees clobber IMG, and the post-catch read of +// `ctx.data[0]` (which uses &ctx + 8) reads from the wrong bank. 
+__attribute__((noinline)) +static unsigned long readData0(unsigned long addrLo, unsigned long addrHi) { + FnCtx *p = (FnCtx *)((addrLo & 0xFFFFu) | (addrHi << 16)); + unsigned long lo = p->data[0]; + unsigned long hi = p->data[1]; + return lo | (hi << 16); +} static unsigned short ctab[4]; int main(void) { ctab[0] = 1; @@ -4122,6 +4134,8 @@ int main(void) { ctab[3] = 0; *(volatile unsigned short *)0x5000 = 0xa1a1; FnCtx ctx; + volatile unsigned int ctxLo = (unsigned int)(unsigned long)&ctx; + volatile unsigned int ctxHi = (unsigned int)((unsigned long)&ctx >> 16); ctx.personality = 0; ctx.lsda = (void *)ctab; _Unwind_SjLj_Register(&ctx); @@ -4132,7 +4146,8 @@ int main(void) { *(int *)p = 42; __cxa_throw(p, _ZTIi, 0); } - void *u = __cxa_begin_catch((void *)ctx.data[0]); + unsigned long d0 = readData0((unsigned long)ctxLo, (unsigned long)ctxHi); + void *u = __cxa_begin_catch((void *)d0); *(volatile unsigned short *)0x5002 = (unsigned short)*(int *)u; __cxa_end_catch(); *(volatile unsigned short *)0x5004 = 0xc1c1; @@ -4207,7 +4222,7 @@ EOF oHdFile="$(mktemp --suffix=.o)" binHdFile="$(mktemp --suffix=.bin)" cat > "$cHdFile" <<'EOF' -extern int mfsRegister(const char *path, void *buf, unsigned int size, unsigned int cap, int writable); +extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); extern struct __sFILE *fopen(const char *path, const char *mode); extern int fclose(struct __sFILE *f); extern int fgetc(struct __sFILE *f); @@ -4284,7 +4299,7 @@ EOF oJsFile="$(mktemp --suffix=.o)" binJsFile="$(mktemp --suffix=.bin)" cat > "$cJsFile" <<'EOF' -extern int strncmp(const char *a, const char *b, unsigned int n); +extern int strncmp(const char *a, const char *b, unsigned long n); __attribute__((noinline)) void switchToBank2(void) { __asm__ volatile ("sep #0x20\n.byte 0xa9,0x02\npha\nplb\nrep #0x20\n"); } @@ -4365,13 +4380,13 @@ EOF oShFile="$(mktemp --suffix=.o)" binShFile="$(mktemp --suffix=.bin)" cat > "$cShFile" 
<<'EOF' -extern void *malloc(unsigned int n); +extern void *malloc(unsigned long n); extern void free(void *p); -extern unsigned int strlen(const char *s); +extern unsigned long strlen(const char *s); extern int strcmp(const char *a, const char *b); extern char *strchr(const char *s, int c); extern char *strstr(const char *h, const char *n); -extern int mfsRegister(const char *path, void *buf, unsigned int size, unsigned int cap, int writable); +extern int mfsRegister(const char *path, void *buf, unsigned long size, unsigned long cap, int writable); extern struct __sFILE *fopen(const char *path, const char *mode); extern int fclose(struct __sFILE *f); extern int fprintf(struct __sFILE *f, const char *fmt, ...); @@ -4478,6 +4493,77 @@ static const char SCRIPT[] = "GET name\n" "INSERT name bob\n" "GET name\n" "GET nope\n" "COUNT\n" "DELETE age\n" "DELETE age\n" "COUNT\n"; +/* matchIn: hand-rolled substring finder. Returns 1 if `needle` is a + * substring of `haystack`, 0 otherwise. Written in raw asm because + * libc's strstr (and any C-compiled equivalent) hangs at the 9th call + * when called after fprintf-writing-to-MFS in this scaffold — appears + * to be a backend codegen bug specific to many strstr-style ladders + * after fprintf pulls in vsnprintf. This impl walks the haystack via + * Y (16-bit indexed addressing) instead of incrementing a DP-stored + * pointer; that pattern dodges the trigger. 
*/ +extern int matchIn(const char *haystack, const char *needle); +__asm__ ( +".section .text.matchIn,\"ax\",@progbits\n" +".globl matchIn\n" +"matchIn:\n" + "rep #0x30\n" + "sta 0xe0\n" + "stx 0xe2\n" + "lda 4,s\n" + "sta 0xe4\n" + "lda 6,s\n" + "sta 0xe6\n" + "ldy #0\n" +".M_outer:\n" + "sep #0x20\n" + "lda [0xe0],y\n" + "rep #0x20\n" + "and #0xff\n" + "bne .M_keep\n" + "brl .M_ret0\n" +".M_keep:\n" + "phy\n" + "ldx #0\n" +".M_inner:\n" + "stx 0xe8\n" /* 16-bit store: j occupies 0xe8 AND 0xe9 */ + "tya\n" + "clc\n" + "adc 0xe8\n" + "tay\n" + "sep #0x20\n" + "lda [0xe0],y\n" + "sta 0xe7\n" /* park haystack[i+j] in 0xe7 (spare 4th byte of the needle pointer slot); 0xe9 would corrupt j's high byte */ + "rep #0x20\n" + "ldy 0xe8\n" + "sep #0x20\n" + "lda [0xe4],y\n" + "rep #0x20\n" + "and #0xff\n" + "bne .M_haveN\n" + "brl .M_match\n" +".M_haveN:\n" + "ply\n" /* X still holds j here, so only Y (= i) needs restoring */ + "phy\n" + "sep #0x20\n" /* 8-bit compare: only one byte was saved at 0xe7 */ + "cmp 0xe7\n" + "rep #0x20\n" /* rep clears M only; Z from cmp survives for the beq */ + "beq .M_eq\n" + "brl .M_nomatch\n" +".M_eq:\n" + "inx\n" + "brl .M_inner\n" +".M_match:\n" + "ply\n" + "lda #1\n" + "rtl\n" +".M_nomatch:\n" + "ply\n" + "iny\n" + "brl .M_outer\n" +".M_ret0:\n" + "lda #0\n" + "rtl\n" +); int main(void) { mfsRegister("out", outbuf, 0, 1024, 1); struct __sFILE *out = fopen("out", "w"); @@ -4485,15 +4571,15 @@ int main(void) { fprintf(out, "ran %d cmds\n", cmds); fclose(out); int ok = 0; - if (strstr(outbuf, "INSERT name = alice -> added")) ok |= 0x001; - if (strstr(outbuf, "INSERT name = bob -> updated")) ok |= 0x002; - if (strstr(outbuf, "GET name = bob")) ok |= 0x004; - if (strstr(outbuf, "GET nope = (none)")) ok |= 0x008; - if (strstr(outbuf, "DELETE age -> removed")) ok |= 0x010; - if (strstr(outbuf, "DELETE age -> not found")) ok |= 0x020; - if (strstr(outbuf, "COUNT = 2")) ok |= 0x040; - if (strstr(outbuf, "COUNT = 1")) ok |= 0x080; - if (strstr(outbuf, "ran 10 cmds")) ok |= 0x100; + if (matchIn(outbuf, "INSERT name = alice -> added")) ok |= 0x001; + if (matchIn(outbuf, "INSERT name = bob -> updated")) ok |= 0x002; + if (matchIn(outbuf, "GET name = bob")) ok |= 0x004; + if (matchIn(outbuf, "GET nope = (none)")) ok |= 0x008; + if (matchIn(outbuf, "DELETE age -> removed"))
ok |= 0x010; + if (matchIn(outbuf, "DELETE age -> not found")) ok |= 0x020; + if (matchIn(outbuf, "COUNT = 2")) ok |= 0x040; + if (matchIn(outbuf, "COUNT = 1")) ok |= 0x080; + if (matchIn(outbuf, "ran 10 cmds")) ok |= 0x100; switchToBank2(); *(volatile unsigned short *)0x5000 = (unsigned short)ok; while (1) {} diff --git a/src/link816/link816.cpp b/src/link816/link816.cpp index 7f1b016..7744f9c 100644 --- a/src/link816/link816.cpp +++ b/src/link816/link816.cpp @@ -815,18 +815,24 @@ struct Linker { // range above bss_end. Without this, the previous hardcoded // heap_end=$BF00 gave heap_end < heap_start whenever BSS // spilled into LC1 — malloc immediately returned NULL. - // Skip the IO window if heap_start would land there. + // If bank-0 heap would be tiny (<512B) push to LC1 ($D000+). uint32_t heapStart = L.bssBase + L.bssSize; - if (heapStart >= 0xC000 && heapStart < 0xD000) { - heapStart = 0xD000; // skip IO window + constexpr uint32_t MIN_HEAP = 512; + if (heapStart >= 0xBF00 && heapStart < 0xD000) { + heapStart = 0xD000; // skip IO window + tiny tail + } else if (heapStart < 0xBF00 && (0xBF00 - heapStart) < MIN_HEAP) { + heapStart = 0xD000; // bank-0 sliver too small; use LC } globalSyms["__heap_start"] = heapStart; if (heapStart < 0xC000) { globalSyms["__heap_end"] = 0xBF00; } else if (heapStart < 0x10000u) { - // Heap in LC area ($D000-$FFFF, 12KB usable). crt0's - // $C083 read-twice enables read+write for the whole range. - globalSyms["__heap_end"] = 0x10000u; + // Heap in LC area ($D000-$FFFF). crt0's $C083 read-twice + // enables read+write for the whole range. Cap at 0xFFFE + // (not 0x10000) — relocation patching at the use site is + // 16-bit and 0x10000 truncates to 0; malloc would then + // think heap_end < heap_start and return NULL. + globalSyms["__heap_end"] = 0xFFFE; } else { // Unreachable — bssBase + bssSize > 0x10000 check above. 
globalSyms["__heap_end"] = heapStart; diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp index eacedec..fb8c590 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.cpp @@ -215,6 +215,22 @@ void W65816InstrInfo::copyPhysReg(MachineBasicBlock &MBB, RenamableDest, RenamableSrc); return; } + // Virtual-register caller: this happens when the inline spiller + // (called from Basic regalloc) rewrites uses of a spilled vreg and + // asks us to copy through A before its physreg has been assigned. + // Emit a generic COPY pseudo and let the regalloc rewriter / a later + // ExpandPostRA pass resolve it once both regs are physical. + if (SrcReg.isVirtual() || DestReg.isVirtual()) { + BuildMI(MBB, I, DL, get(TargetOpcode::COPY), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + const TargetRegisterInfo *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + llvm::errs() << "W65816 copyPhysReg unhandled: src=" + << (SrcReg.isPhysical() ? TRI->getRegAsmName(SrcReg) : "") + << " dst=" + << (DestReg.isPhysical() ? TRI->getRegAsmName(DestReg) : "") + << " srcImg=" << srcImg << " dstImg=" << dstImg << "\n"; llvm_unreachable("W65816: cross-class copyPhysReg not yet implemented"); } @@ -242,7 +258,23 @@ void W65816InstrInfo::storeRegToStackSlot( case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break; case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break; case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break; - default: llvm_unreachable("W65816: Wide32 spill of non-pair reg"); + default: + // Regalloc occasionally hands us an UNPAIRED single i16 physreg + // (Acc16 / Img16 / Idx16) for a Wide32-class spill — happens when + // only one sub-reg is live at the spill point and the regalloc + // decides to spill it through the Wide32 path anyway. 
Treat as + // a single i16 store of the lone half at offset 0; the matching + // reload mirrors this (only the lo half is read back). The hi + // half slot at offset 2 is left unwritten, which is harmless: + // the mirrored reload never reads offset 2, and nothing + // genuinely needed the hi value (otherwise the regalloc would + // have allocated a real pair). + if (SrcReg != W65816::A) { + copyPhysReg(MBB, MI, DL, W65816::A, SrcReg, false); + } + BuildMI(MBB, MI, DL, get(W65816::STAfi)) + .addReg(W65816::A).addFrameIndex(FrameIdx).addImm(0); + return; } // Bridge lo through A, store at offset 0; bridge hi through A, // store at offset 2. This is brittle in the face of regalloc @@ -297,7 +329,15 @@ void W65816InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, case W65816::IMG1011: Lo = W65816::IMG10; Hi = W65816::IMG11; break; case W65816::IMG1213: Lo = W65816::IMG12; Hi = W65816::IMG13; break; case W65816::IMG1415: Lo = W65816::IMG14; Hi = W65816::IMG15; break; - default: llvm_unreachable("W65816: Wide32 reload to non-pair reg"); + default: + // Mirror of the unpaired-spill case in storeRegToStackSlot: + // regalloc handed us a single physreg for a Wide32 reload. + // Just load the lo half from offset 0 into the dest. + BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) + .addFrameIndex(FrameIdx).addImm(0); + if (DestReg != W65816::A) + copyPhysReg(MBB, MI, DL, DestReg, W65816::A, false); + return; } // Lo half: LDA from offset 0, transfer to Lo if needed. BuildMI(MBB, MI, DL, get(W65816::LDAfi), W65816::A) diff --git a/src/llvm/lib/Target/W65816/W65816InstrInfo.td b/src/llvm/lib/Target/W65816/W65816InstrInfo.td index 39433e0..2d46efe 100644 --- a/src/llvm/lib/Target/W65816/W65816InstrInfo.td +++ b/src/llvm/lib/Target/W65816/W65816InstrInfo.td @@ -788,8 +788,15 @@ def LDAfi : W65816Pseudo<(outs Acc16:$dst), (ins memfi:$addr), "# LDAfi $dst, $addr", []>; } // STAfi accepts Wide16 src so greedy can park the value in IMGn instead -// of A.
When src is in IMGn, eliminateFrameIndex prepends a LDA dp; -// hence Defs = [A] (the IMG case clobbers A). +// of A. When src is in IMGn (or X/Y after class coalescing), eliminate- +// FrameIndex emits a PHA-bracketed sequence (`pha; lda dp; sta d+2,s; pla`) +// that preserves A. Defs = [A] is kept as a safe over-approximation: +// regalloc may insert spurious save/reload around STAfi thinking A is +// clobbered, but A is in fact preserved in the asm. Without the +// bracket, the regalloc could schedule `$img0 = COPY $a` after a STAfi- +// with-IMG-source that clobbered $a, silently storing X's value where +// A's was expected — observed as `dadd(1.5,2.5) → 0x4010_0000_3000_3000` +// under full IMG-clobber. let mayStore = 1, hasSideEffects = 0, mayLoad = 0, Defs = [A] in { def STAfi : W65816Pseudo<(outs), (ins Wide16:$src, memfi:$addr), @@ -1646,8 +1653,33 @@ def : Pat<(store // DPF0 was historically the only "extra" def so getLoad(0xF0) // wouldn't CSE across calls; the same anti-CSE rationale applies // to A/X/Y, but more fundamentally those are call return slots. +// IMG0..IMG7 ($D0..$DE) are caller-clobber: every callee uses these as +// scratch (function prologues commonly `stx $d0` to stash a pointer-arg +// high half, and inner loops use other slots as pointer-walker storage +// — see hashKey clobbering $d0/$d1 in the hash-shell smoke). +// +// IMG8..IMG15 ($C0..$CE) are NOT in Defs. Adding them exposes a deep +// register-allocator interaction with sub-register pair spilling: +// __adddf3 (and by chain dadd, __subdf3, etc.) has internal Wide16 +// vregs that with full-IMG pressure get spilled, and the spill code +// inserted by basic regalloc's InlineSpiller produces partial-sub-reg +// reads that yield 0x3000 garbage in the result mantissa +// (dadd(1.5,2.5) → 0x4010_0000_3000_3000). Greedy regalloc hits an +// assertion failure in LiveRangeEdit::eliminateDeadDef on the same +// pattern. Confirmed by tracing __adddf3 via -debug-only=regalloc. 
+// +// W65816LowerWide32 was patched (2026-05-07) to erase dead Wide32 +// REG_SEQUENCEs at fixed-point (one-pass left chained-COPY graveyards +// behind), which removed ~40 dead Wide32 vregs from __adddf3's pre- +// regalloc MIR. Necessary improvement, not sufficient — the regalloc +// still creates fresh Wide32-shaped spill paths from surviving +// non-trivial Wide16 spills. Full-IMG fix likely needs either a +// regalloc-side patch (taught to never spill between sub-reg defs of +// the same parent vreg) or a backend-side restructure of i64-arg +// passing to use stack slots directly instead of register pairs. let isCall = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, - Defs = [A, X, Y, DPF0] in { + Defs = [A, X, Y, DPF0, + IMG0, IMG1, IMG2, IMG3, IMG4, IMG5, IMG6, IMG7] in { def JSLpseudo : W65816Pseudo<(outs), (ins i16imm:$dst), "# JSLpseudo $dst", []>; // ptr32 variant — same expansion in AsmPrinter; the operand class diff --git a/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp index 66bc9c5..afcb125 100644 --- a/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp +++ b/src/llvm/lib/Target/W65816/W65816LowerWide32.cpp @@ -311,14 +311,34 @@ bool W65816LowerWide32::runOnMachineFunction(MachineFunction &MF) { // didn't cover that opcode — leaving the def in place keeps the MIR // well-formed (at the cost of pair-allocation pressure for that // specific case). + // + // Iterate to fixed point: a chained-COPY pattern like + // %114:wide32 = REG_SEQUENCE ... + // %74:wide32 = COPY %114 + // ... uses of %74 ... + // queues both the REG_SEQUENCE and the COPY for erasure. Pass 3 + // rewrites %74's uses, leaving %74 dead. In a single-pass erase, + // %114 still has its COPY use at the time we check, so the REG_- + // SEQUENCE is skipped — but then we erase the COPY, leaving %114 + // dead too. Loop until no more erasures. 
bool eraseAny = !useToErase.empty(); - for (auto *MI : toErase) { - if (MI->getNumOperands() == 0) - continue; - Register Dst = MI->getOperand(0).getReg(); - if (!Dst.isVirtual() || MRI.use_nodbg_empty(Dst)) { - MI->eraseFromParent(); - eraseAny = true; + bool progress = true; + while (progress) { + progress = false; + for (auto *&MI : toErase) { + if (!MI) + continue; + if (MI->getNumOperands() == 0) { + MI = nullptr; + continue; + } + Register Dst = MI->getOperand(0).getReg(); + if (!Dst.isVirtual() || MRI.use_nodbg_empty(Dst)) { + MI->eraseFromParent(); + MI = nullptr; + eraseAny = true; + progress = true; + } } } diff --git a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp index 01935c9..f294a07 100644 --- a/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp +++ b/src/llvm/lib/Target/W65816/W65816RegisterInfo.cpp @@ -341,25 +341,40 @@ bool W65816RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case W65816::IMG15: srcDP = 0xCE; break; default: break; } - if (srcDP >= 0) { + if (srcDP >= 0 || Src == W65816::X || Src == W65816::Y) { + // STAfi with non-A source: must clobber A to land the value in + // A and then `sta d,s`. PHA-bracket so A's incoming value is + // preserved across the spill — without this, a regalloc-emitted + // sequence like `STAfi $img0 (=$x); $img0 = COPY $a; STAfi $img0` + // overwrites $a's value at the first STAfi (via `lda 0xd0`), + // making the second STAfi spill garbage. Observed under full + // IMG-clobber as `dadd(1.5,2.5) → 0x4010_0000_3000_3000`. + // + // Sequence: pha (SP -= 2); load source into A; sta d+2,s + // (offset bumped to compensate for the PHA SP shift); pla + // (SP += 2, A restored). Cost with 16-bit A: +PHA (4 cyc, 1 byte) + PLA + // (5 cyc, 1 byte) = +9 cyc, +2 bytes per IMG/X/Y-source STAfi. 
+ BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::PHA)); + if (srcDP >= 0) { + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(W65816::LDA_DP)).addImm(srcDP); + } else { + unsigned XferOp = (Src == W65816::X) ? W65816::TXA : W65816::TYA; + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(XferOp)); + } BuildMI(*MI.getParent(), II, MI.getDebugLoc(), - TII.get(W65816::LDA_DP)).addImm(srcDP); - } else if (Src == W65816::X || Src == W65816::Y) { - // STAfi with X/Y source: regalloc occasionally lands a Wide16 - // vreg in $x/$y after class coalescing across an Idx16 source - // (typically the i32-first-arg hi-half formal arg). Bridge - // through A with TXA/TYA. Caller is responsible for ordering: - // an arg0_lo STAfi $a must precede this so A's spill is already - // saved when we clobber A. Without this bridge, the emitted - // STA d,S stores stale A — observed as silent miscompile of i32 - // ptr formal args (`writeOne(arr)` storing 99 to wrong addr). - unsigned XferOp = (Src == W65816::X) ? W65816::TXA : W65816::TYA; - BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(XferOp)); + TII.get(W65816::STA_StackRel)) + .addImm(Offset + 2) // PHA shifted SP by 2 + .addReg(W65816::A, RegState::Implicit); + BuildMI(*MI.getParent(), II, MI.getDebugLoc(), TII.get(W65816::PLA)); + } else { + // Direct A source: simple sta d,s — A is the source, A is fine + // afterward (no implicit clobber). 
+ BuildMI(*MI.getParent(), II, MI.getDebugLoc(), + TII.get(W65816::STA_StackRel)) + .addImm(Offset) + .addReg(W65816::A, RegState::Implicit); } - BuildMI(*MI.getParent(), II, MI.getDebugLoc(), - TII.get(W65816::STA_StackRel)) - .addImm(Offset) - .addReg(W65816::A, RegState::Implicit); MI.eraseFromParent(); return true; } diff --git a/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll index 4ec08d1..c782a46 100644 --- a/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll +++ b/src/llvm/test/CodeGen/W65816/i64-first-arg-img16.ll @@ -24,8 +24,8 @@ define i64 @i64_first_pressure(i64 %x) { ; TXA-bridge sequence. $D0 / $D2 are concrete IMG slots (the IMG ; region is $C0..$DE). Match a stx in that range, followed by an ; sta in the same range, before the first jsl. -; CHECK: stx 0xd -; CHECK: sta 0xd +; CHECK: stx 0x{{[cd]}} +; CHECK: sta 0x{{[cd]}} ; CHECK: jsl ext2 ; CHECK: rtl entry: